Recording a proxy IP pool downloader middleware here, so it can be reused directly in future projects.
middlewares.py:

# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectionRefusedError, TimeoutError


class RandomProxyMiddleware(object):

    def __init__(self, settings):
        # Step 3: read the configuration and initialize state.
        # PROXIES is a list of proxy addresses defined in settings.py.
        self.proxies = settings.getlist("PROXIES")
        self.stats = defaultdict(int)   # per-proxy failure counter, defaults to 0
        self.max_failed = 3             # tolerate at most three failures per proxy

    @classmethod
    def from_crawler(cls, crawler):
        # Step 1: create the middleware object. First check whether the proxy
        # feature is enabled at all; if not, disable this middleware.
        if not crawler.settings.getbool("HTTPPROXY_ENABLED"):
            raise NotConfigured
        # Step 2: cls(...) calls __init__(), so pass it the settings it expects
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # Step 4: assign a random proxy to every request object, except the
        # initial start_urls requests, which go out without a proxy.
        if self.proxies and not request.meta.get("proxy") \
                and request.url not in spider.start_urls:
            request.meta["proxy"] = random.choice(self.proxies)

    def process_response(self, request, response, spider):
        # Step 5 (success path): the request returned; check whether the
        # target site has banned this proxy.
        cur_proxy = request.meta.get("proxy")
        if response.status > 400:
            # add 1 to this proxy's failure count
            self.stats[cur_proxy] += 1
            print("proxy {} has now failed {} times".format(
                cur_proxy, self.stats[cur_proxy]))
            if self.stats[cur_proxy] >= self.max_failed:
                # the proxy failed more than three times: assume the site has
                # banned it, drop it from the pool, and retry the request
                print("status code {}, proxy {} may be banned".format(
                    response.status, cur_proxy))
                self.remove_proxy(cur_proxy)
                del request.meta["proxy"]
                # hand the request back to the scheduler for a re-download
                return request
        # normal status code: return the response as usual
        return response

    def process_exception(self, request, exception, spider):
        # Step 5 (failure path): the request raised a network error.
        cur_proxy = request.meta.get("proxy")
        # if the request went through a proxy and the error suggests the
        # proxy itself is the problem, drop it from the pool
        if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError)):
            print("got {} while using proxy {}".format(exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta["proxy"]
            # re-schedule the request for another download
            return request

    def remove_proxy(self, proxy):
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print("removed {} from the proxy pool".format(proxy))
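To sanity-check that the pool is actually rotating, a throwaway spider can log which proxy each response came through. This is a minimal sketch, not part of the original project: the spider name and the httpbin.org URLs are placeholder assumptions. Note that process_request deliberately skips start_urls, so the spider has to follow one extra request before a proxy shows up in meta.

import scrapy


class ProxyCheckSpider(scrapy.Spider):
    # hypothetical spider, used only to verify RandomProxyMiddleware
    name = "proxy_check"
    start_urls = ["https://httpbin.org/ip"]

    def parse(self, response):
        # start_urls requests bypass the pool by design, so issue a
        # follow-up request to see a proxy get attached
        yield scrapy.Request("https://httpbin.org/ip?via=pool",
                             callback=self.check_proxy)

    def check_proxy(self, response):
        # the proxy chosen by process_request, if any
        self.logger.info("proxy used: %s, origin seen by server: %s",
                         response.request.meta.get("proxy"), response.text)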
Then enable it in settings.py:
settings file:

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # priority 749 runs this just before Scrapy's built-in
    # HttpProxyMiddleware (750), which consumes request.meta["proxy"]
    'tutorial.middlewares.RandomProxyMiddleware': 749,
}
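The middleware also reads two more settings: HTTPPROXY_ENABLED (checked in from_crawler) and the PROXIES list (read in __init__). A sketch of what those entries might look like, with placeholder addresses standing in for a real pool:

# also in settings.py -- the addresses below are placeholders,
# substitute proxies from your own pool
HTTPPROXY_ENABLED = True
PROXIES = [
    'http://proxy1.example.com:8080',
    'http://proxy2.example.com:8080',
    'http://proxy3.example.com:8080',
]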
Finished