Implementation process
1. The engine finds the spider to run, calls its start_requests() method, and gets back an iterator (a generator).
2. Looping over that iterator yields Request objects; each Request encapsulates the URL to visit and a callback function.
3. Every Request object (task) is placed into the scheduler, waiting for the downloader to pick it up.
4. The downloader pulls tasks (Request objects) from the scheduler and downloads them; when a download finishes, the callback is invoked.
5. Control returns to the spider's callback, which may then:
yield Request()
yield Item()
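To make these steps concrete, here is a minimal spider sketch of our own; the spider name and parsing logic are placeholders (the chouti URL is borrowed from the middleware example further down), not code from the original project:

# -*- coding:utf-8 -*-
import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'  # placeholder spider name

    def start_requests(self):
        # Step 1: the engine calls this and receives a generator.
        yield Request(url='https://dig.chouti.com/', callback=self.parse)

    def parse(self, response):
        # Step 5: runs after the downloader fetches the page.
        # It can yield new Requests (sent back to the scheduler) ...
        yield Request(url='https://dig.chouti.com/r/tec/hot/1', callback=self.parse)
        # ... or items (sent on to the pipelines).
        yield {'title': response.css('title::text').extract_first()}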
Downloader middleware (the proxy we set up earlier is done in a downloader middleware)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.http import HtmlResponse
from scrapy.http import Request


class Md1(object):

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        print('md1.process_request', request)
        # Returns None by default.

        # 1. Return a Response object: md2's process_request is then skipped and
        #    Scrapy jumps to the process_response method of the last middleware,
        #    unwinding back through the layers. (Django differs here: it returns
        #    directly from the *current* middleware's process_response.)
        #    This lets the middleware intercept the URL before download and forge
        #    the result, e.g. download the body itself:
        # url = request.url
        # return HtmlResponse(url='www.xxx.com', status=200, headers=None, body=b'innerjob')
        # import requests
        # result = requests.get(request.url)
        # return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)

        # 2. Return a Request object: this loops -- the request goes back to the
        #    scheduler and comes around again.
        # return Request(url='https://dig.chouti.com/r/tec/hot/1')

        # 3. Raise IgnoreRequest: the process_exception() methods must then be
        #    implemented, otherwise an error is raised.
        # from scrapy.exceptions import IgnoreRequest
        # raise IgnoreRequest

        # 4. Most commonly this method just enriches the request: adding headers,
        #    cookies, and so on.
        request.headers['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                                         'Chrome/74.0.3729.169 Safari/537.36')

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        print('md1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass


class Md2(object):

    def process_request(self, request, spider):
        # Same contract as above: return None, a Response, a Request,
        # or raise IgnoreRequest.
        print('md2.process_request', request)
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        print('md2.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        pass
Application (enable in settings.py)
DOWNLOADER_MIDDLEWARES = {
    # 'redisdepth.middlewares.RedisdepthDownloaderMiddleware': 543,
    'redisdepth.md.Md1': 666,
    'redisdepth.md.Md2': 667,
}
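The proxy mentioned in the heading above would be attached in this same process_request hook. A minimal sketch, assuming Scrapy's default HttpProxyMiddleware is enabled (it is what actually reads request.meta['proxy']); the class name and proxy address are placeholders:

class ProxyMd(object):
    # Hypothetical example: attach a proxy to every outgoing request.
    # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'].
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://127.0.0.1:8888'  # placeholder address
        return None  # continue processing as usual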
flow chart:
Spider middleware (the depth tracking we saw earlier is done in a spider middleware)
#!/usr/bin/env python
# -*- coding:utf-8 -*-


class Sd1(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        # crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # Signals belong to the extension system -- no need to dig into them
        # here. (Django has a signals mechanism too.)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        # Runs after the download completes, before the response reaches parse().
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        # Returning None lets the remaining middleware keep handling the
        # exception; returning an iterable hands its contents to the
        # scheduler or the pipelines.
        pass

    def process_start_requests(self, start_requests, spider):
        # Runs only once, when the crawler starts: start_requests() returns
        # a generator, which is looped over here one request at a time.
        # Works similarly to process_spider_output(), except that it has
        # no response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Sd2(object):
    # Same hooks as Sd1; comments omitted for brevity.

    @classmethod
    def from_crawler(cls, crawler):
        return cls()

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
Application (enable in settings.py)
SPIDER_MIDDLEWARES = {
    # 'redisdepth.middlewares.RedisdepthSpiderMiddleware': 543,
    'redisdepth.sd.Sd1': 666,
    'redisdepth.sd.Sd2': 667,
}
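The depth handling mentioned in the heading above works through exactly this process_spider_output hook. Below is our simplified sketch of the idea behind Scrapy's built-in scrapy.spidermiddlewares.depth.DepthMiddleware, not its actual source; the class name is a placeholder:

from scrapy.http import Request


class DepthSd(object):
    # Simplified sketch: tag each request with its crawl depth and drop
    # anything beyond DEPTH_LIMIT before it reaches the scheduler.
    def __init__(self, maxdepth):
        self.maxdepth = maxdepth

    @classmethod
    def from_crawler(cls, crawler):
        return cls(maxdepth=crawler.settings.getint('DEPTH_LIMIT'))

    def process_spider_output(self, response, result, spider):
        depth = response.meta.get('depth', 0)
        for obj in result:
            if isinstance(obj, Request):
                obj.meta['depth'] = depth + 1
                if self.maxdepth and obj.meta['depth'] > self.maxdepth:
                    continue  # filtered out: never reaches the scheduler
            yield obj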
flow chart:
Built-in downloader middleware
Built-in spider middleware
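The built-in chains can also be printed directly: Scrapy keeps the defaults and their priority numbers in DOWNLOADER_MIDDLEWARES_BASE and SPIDER_MIDDLEWARES_BASE inside scrapy.settings.default_settings.

from scrapy.settings import default_settings

# Print the built-in middleware chains and their priority numbers.
print(default_settings.DOWNLOADER_MIDDLEWARES_BASE)
print(default_settings.SPIDER_MIDDLEWARES_BASE)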
Screenshot from running the experiment: