Scrapy proxy IP pool middleware

A proxy IP pool middleware is recorded here so that it can be reused directly in future projects.

The middleware code (this lives in the project's middlewares.py):


# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random
from collections import defaultdict

from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectionRefusedError, TimeoutError


# A downloader middleware that implements a proxy IP pool
class RandomProxyMiddleware(object):

    def __init__(self, settings):
        # Step 3: read the configuration and initialise the state.
        # PROXIES is a list defined in the project settings.
        self.proxies = settings.getlist("PROXIES")
        self.stats = defaultdict(int)  # per-proxy failure count, defaults to 0
        self.max_failed = 3            # give up on a proxy after three failures

    @classmethod
    def from_crawler(cls, crawler):
        # Step 1: create the middleware object.
        # First check HTTPPROXY_ENABLED to see whether proxies are enabled.
        if not crawler.settings.getbool("HTTPPROXY_ENABLED"):
            raise NotConfigured
        # auth_encoding = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")  # optional, unused here
        # Step 2: cls(...) calls __init__, so pass in the settings it expects.
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # Step 4: assign a random proxy to each outgoing request.
        # The initial start URLs are requested without a proxy.
        if (self.proxies and not request.meta.get("proxy")
                and request.url not in spider.start_urls):
            request.meta["proxy"] = random.choice(self.proxies)

    def process_response(self, request, response, spider):
        # Step 5: the request came back; decide whether the proxy was banned.
        cur_proxy = request.meta.get("proxy")
        if cur_proxy is None:
            # no proxy was used for this request
            return response
        if response.status > 400:
            # add one to this proxy's failure count
            self.stats[cur_proxy] += 1
            print("current ip {}, error status code seen {} times".format(
                cur_proxy, self.stats[cur_proxy]))
        if self.stats[cur_proxy] >= self.max_failed:
            # the proxy has failed too many times
            print("current status code is {}, proxy {} may be banned".format(
                response.status, cur_proxy))
            # assume the site blocked this IP and delete it from the pool
            self.remove_proxy(cur_proxy)
            del request.meta["proxy"]
            # send the request back to the scheduler to be downloaded again;
            # dont_filter keeps the dupefilter from dropping the retry
            request.dont_filter = True
            return request

        # status code is normal, return the response as usual
        return response

    def process_exception(self, request, exception, spider):
        # Step 5 (failure path): the request failed with a network error.
        cur_proxy = request.meta.get("proxy")
        # if this request went through a proxy and the error is
        # connection-level, assume the proxy itself is the problem
        if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError)):
            print("got {} with proxy {}".format(exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta["proxy"]
            # re-schedule the request for another download
            request.dont_filter = True
            return request

    def remove_proxy(self, proxy):
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print("deleted {} from the proxy list".format(proxy))

Then enable it in the project settings.

The settings file (settings.py):

# Enable or disable downloader middlewares 
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 
DOWNLOADER_MIDDLEWARES = {
    'tutorial.middlewares.RandomProxyMiddleware': 749,  # 749 runs just before the built-in HttpProxyMiddleware (750)
}
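
The middleware also depends on two settings that the snippet above does not show: from_crawler checks HTTPPROXY_ENABLED, and __init__ reads the PROXIES list. A minimal sketch, with made-up proxy addresses as placeholders:

HTTPPROXY_ENABLED = True  # RandomProxyMiddleware raises NotConfigured when this is off

PROXIES = [
    # placeholder addresses, replace with your own pool
    'http://proxy1.example.com:8080',
    'http://proxy2.example.com:8080',
    'http://user:password@proxy3.example.com:8080',
]

Because the custom middleware sits at priority 749, the meta['proxy'] value it sets is then picked up by Scrapy's built-in HttpProxyMiddleware at 750, which also handles any user:password credentials in the proxy URL.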

Finished
