Scrapy download middleware: the proxy middleware HttpProxyMiddleware

The simplest option is to use the built-in proxy middleware:

# -*- coding: utf-8 -*-
import scrapy
from wyb.items import WybItem
from scrapy.dupefilters import RFPDupeFilter
from scrapy.http.response.html import HtmlResponse
from scrapy.http.cookies import CookieJar
from urllib.parse import urlencode
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # only crawl pages under this domain
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        import os
        # proxy settings picked up by the download middleware HttpProxyMiddleware
        os.environ['HTTP_PROXY'] = '1.1.1.2'
        os.environ['HTTPS_PROXY'] = 'http://root:[email protected]:8888/'
        # option 1
        for url in self.start_urls:
            yield Request(url=url)
        # option 2
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list

    def parse(self, response):
        """
        Handle the response of the first request.
        :param response:
        :return:
        """
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # keep requesting the pagination links, with parse as the callback
            yield Request(url=page, callback=self.parse, meta={"proxy": "http://root:[email protected]:8888/"})

 Source code analysis
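The built-in middleware lives in scrapy/downloadermiddlewares/httpproxy.py. Below is a lightly annotated sketch based on the Scrapy 1.x source (details vary between versions): __init__ collects proxies from the environment via getproxies(), from_crawler switches the middleware off when HTTPPROXY_ENABLED is False, and process_request honors an explicit meta['proxy'] before falling back to the environment proxies, attaching a Proxy-Authorization header whenever credentials are embedded in the proxy URL.

import base64

from six.moves.urllib.request import getproxies, proxy_bypass
from six.moves.urllib.parse import unquote, urlunparse
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy

from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes


class HttpProxyMiddleware(object):

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        # read HTTP_PROXY / HTTPS_PROXY etc. from the environment,
        # which is why setting os.environ in the spider works
        self.proxies = {}
        for type_, url in getproxies().items():
            self.proxies[type_] = self._get_proxy(url, type_)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass)

    def _get_proxy(self, url, orig_type):
        # split "http://user:pass@host:port" into credentials and a bare proxy URL
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
        creds = self._basic_auth_header(user, password) if user else None
        return creds, proxy_url

    def process_request(self, request, spider):
        # a proxy set in request.meta takes precedence over the environment
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds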


To sum up:

If you only need something quick and simple, define the proxy environment variables with os.environ at the start of the process:

import os
# proxy settings picked up by the download middleware HttpProxyMiddleware
os.environ['HTTP_PROXY'] = '1.1.1.2'
os.environ['HTTPS_PROXY'] = 'http://root:[email protected]:8888/'


yield Request(url=page, callback=self.parse, meta={"proxy": "http://root:[email protected]:8888/"})    # the proxy in meta has higher priority than the environment variables

If you want to write a custom proxy download middleware, you need to define these methods:

def __init__(self, auth_encoding='latin-1'):

@classmethod
def from_crawler(cls, crawler):

def _basic_auth_header(self, username, password):

def _get_proxy(self, url, orig_type):

def process_request(self, request, spider):

def _set_proxy(self, request, scheme):

Custom proxy download middleware, option one

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import base64
import random
from six.moves.urllib.parse import unquote, urlunparse

try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from scrapy.utils.python import to_bytes


class WybProxyMiddleware(object):

    def _basic_auth_header(self, username, password):
        # build the value for the Proxy-Authorization header: base64("user:pass")
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass)

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:[email protected]:8888/",
            "http://root:[email protected]:8888/",
            "http://root:[email protected]:8888/",
            "http://root:[email protected]:8888/",
            "http://root:[email protected]:8888/",
            "http://root:[email protected]:8888/",
        ]
        # url = "http://root:[email protected]:8888/"
        # pick a random proxy for each request
        url = random.choice(PROXIES)
        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
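To activate a middleware like this, register it in DOWNLOADER_MIDDLEWARES; see the configuration file section below.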

Custom proxy download middleware, option two — after reading the built-in source, it is easy to write your own:

import base64
import random

from scrapy.utils.python import to_bytes


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '220.230.240.200:80', 'user_pass': 'woshinizuzong'},
            {'ip_port': '220.230.240.201:80', 'user_pass': 'woshinidie'},
            {'ip_port': '220.230.240.202:8888', 'user_pass': 'woshiniye'},
            {'ip_port': '220.230.240.203:80', 'user_pass': 'caonidaba'},
            {'ip_port': '220.230.240.204:80', 'user_pass': 'jiaowolaoba'},
            {'ip_port': '220.230.240.205:8888', 'user_pass': 'shuowodiaoda'},
        ]
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass'] is not None:
            # base64-encode the credentials for HTTP Basic proxy auth
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass

 Configuration file
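To enable a custom middleware, register it in the project's settings.py and disable the built-in one. A minimal sketch, assuming the class above lives in wyb/middlewares.py (the module path is an assumption):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # disable the built-in proxy middleware
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    # enable the custom proxy middleware at the order the built-in one used
    'wyb.middlewares.WybProxyMiddleware': 750,
}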


If you are curious, take a look at Scrapy's default configuration:
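The built-in middleware is wired up in scrapy/settings/default_settings.py through DOWNLOADER_MIDDLEWARES_BASE; the excerpt below reflects Scrapy 1.x and may differ in other versions:

# scrapy/settings/default_settings.py (Scrapy 1.x excerpt)
DOWNLOADER_MIDDLEWARES_BASE = {
    # ...
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    # ...
}

HTTPPROXY_ENABLED = True
HTTPPROXY_AUTH_ENCODING = 'latin-1'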



Source: www.cnblogs.com/Alexephor/p/11440483.html