tip:
The general idea: fetch proxy IPs and ports from http://ip.jiangxianli.com/api/proxy_ips and keep them in a list; for each request, pick a random IP from the list and set a download timeout; when an exception is caught, delete the failed proxy IP from the list and assign a new one.
You also need to enable the downloader middleware in settings.py:
DOWNLOADER_MIDDLEWARES = { 'tianmao.middlewares.TestDownloaderMiddleware': 543, }
code:
from scrapy import signals

import json
import random

import requests


class TestDownloaderMiddleware(object):
    """Downloader middleware that assigns a random proxy to each request.

    On startup it fetches a list of free proxies from
    http://ip.jiangxianli.com/api/proxy_ips and keeps them in memory.
    Each outgoing request gets a randomly chosen proxy; when a download
    fails, the proxy that was used is removed from the pool and the
    request is retried with a fresh one.
    """

    def __init__(self):
        # Fetch the proxy list from the public API once at startup.
        res = requests.get('http://ip.jiangxianli.com/api/proxy_ips')
        # The API payload nests the proxy entries under data -> data.
        self.ip_list = json.loads(res.content)['data']['data']
        # Index of the proxy assigned to the most recent request, so that
        # process_exception() can remove exactly that entry on failure.
        self.random_int = 0
        print('init method is running ...')

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.  Returning None lets the request continue through
        # the rest of the chain with the proxy attached.
        print('process request is running ...')
        # Attach a randomly chosen proxy to the request.
        self.get_proxy(request)
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Pass the response through unchanged.
        print('process_response is running ...')
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from another downloader middleware) raises an exception.
        # Returning a Request object re-schedules the request.
        print('exception is %s' % exception)
        if exception:
            # Drop the proxy that just failed so it is not chosen again.
            # Guard against the index having become stale (e.g. pool
            # already shrunk) before popping.
            if 0 <= self.random_int < len(self.ip_list):
                self.ip_list.pop(self.random_int)
            # Pick a new proxy and re-schedule the request with it.
            request = self.get_proxy(request)
            return request

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def get_proxy(self, request):
        """Attach a random proxy from the pool to *request* and return it."""
        num_ip = len(self.ip_list)
        print('now a total of %d ip address' % num_ip)
        if num_ip == 0:
            # Pool exhausted: nothing to assign; let the request proceed
            # (or fail) without a proxy rather than crashing in randint.
            return request
        # random.randint is inclusive on BOTH ends, so the upper bound must
        # be num_ip - 1 to stay inside the list (the original code used
        # num_ip and then indexed random_int + 1 — both out-of-range bugs).
        self.random_int = random.randint(0, num_ip - 1)
        print('random integer is %d' % self.random_int)
        ip_dic = self.ip_list[self.random_int]
        print('random ip address is: %s' % ip_dic)
        ip = ip_dic.get('ip')
        port = ip_dic.get('port')
        # The port may come back as an int in the JSON payload.
        ip_address = 'http://' + ip + ':' + str(port)
        # Scrapy reads the proxy from request.meta (lowercase key 'proxy').
        request.meta['proxy'] = ip_address
        # Short timeout so dead proxies fail fast and get rotated out.
        request.meta['download_timeout'] = 5
        return request