How start_urls works internally
Steps
1. The Scrapy engine takes the spider's initial URLs: it calls start_requests and grabs the return value.
2. v = iter(return value)
3. req1 = v.__next__()
   req2 = v.__next__()
   req3 = v.__next__()
   ...
4. All resulting requests are put into the scheduler.
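To make steps 2 and 3 concrete, here is a minimal sketch (not the engine's actual code, just the same consumption pattern) showing that whatever start_requests returns is run through iter() and drained one request at a time:

```python
import scrapy


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    start_urls = ['https://dig.chouti.com/']


# start_requests returns an iterable (the default is a generator);
# iter() turns it into an iterator and requests come out via __next__().
spider = ChoutiSpider()
v = iter(spider.start_requests())
req1 = v.__next__()   # first Request, ready to hand to the scheduler
print(req1.url)       # https://dig.chouti.com/
```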
How to write it
```python
import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # only crawl pages within this domain
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def start_requests(self):
        # Option 1: return a generator
        # for url in self.start_urls:
        #     yield Request(url=url)
        # Option 2: return an iterable
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list
        pass
```
Applying this knowledge
Since start_requests only needs to return an iterable or a generator (the engine simply calls iter() on the return value), you can override it freely: instead of the built-in default GET requests you can send your own POST requests directly, and the start URLs can just as well be fetched from a Redis cache.
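For example, a sketch of both ideas combined. Assumptions are flagged: the Redis key start_urls, the host/port, and the login endpoint with its form body are all made-up placeholders, and redis-py must be installed:

```python
import redis                     # assumption: redis-py is available
import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']

    def start_requests(self):
        # Fetch start URLs from a Redis list instead of hard-coding them
        # ('start_urls' is a hypothetical key name).
        r = redis.Redis(host='127.0.0.1', port=6379)
        for url in r.lrange('start_urls', 0, -1):
            yield Request(url=url.decode('utf-8'))

        # Or skip the default GET and post a login form directly
        # (endpoint and body are placeholders).
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body='phone=8613100000000&password=xxx',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            callback=self.parse,
        )

    def parse(self, response):
        print(response.text)
```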
The relevant source:
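The default implementation in scrapy.Spider boils down to the loop below (simplified from the Scrapy 1.x source; the real method also handles the deprecated make_requests_from_url):

```python
from scrapy.http import Request


# Simplified from scrapy/spiders/__init__.py; warnings and the
# make_requests_from_url compatibility branch are stripped out.
class Spider:
    start_urls = []

    def start_requests(self):
        for url in self.start_urls:
            # dont_filter=True: start URLs are not checked by the dupe filter
            yield Request(url, dont_filter=True)
```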
Depth and priority
Source code flow analysis
For the depth part I am also drawing on this post: https://www.cnblogs.com/Alexephor/p/11437061.html
```python
# -*- coding: utf-8 -*-
import scrapy
from wyb.items import WybItem
from scrapy.dupefilters import RFPDupeFilter
from scrapy.http.response.html import HtmlResponse
from scrapy.http.cookies import CookieJar
from urllib.parse import urlencode
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # only crawl pages within this domain
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        import os
        # proxy settings picked up by HttpProxyMiddleware in the downloader middleware
        os.environ['HTTP_PROXY'] = '1.1.1.2'
        os.environ['https_proxy'] = 'http://root:[email protected]:8888/'
        # Option 1
        for url in self.start_urls:
            yield Request(url=url)
        # Option 2
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list

    def parse(self, response):
        """
        Handles the response of the first request.
        :param response:
        :return:
        """
        from scrapy.spidermiddlewares.depth import DepthMiddleware
        from scrapy.http import Response
        # response.request
        # response.request.meta  -> None at this point
        print(response.meta.get('depth', 0))
        # response.request.meta['depth'] = 0

        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # keep issuing requests, with parse as the callback
            yield Request(url=page, callback=self.parse,
                          meta={"proxy": "http://root:[email protected]:8888/"})
```
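The depth behaviour printed above is controlled by real Scrapy settings; as a quick reference (the values here are just examples):

```python
# settings.py (or per-spider via custom_settings)
DEPTH_LIMIT = 3             # drop requests nested deeper than 3 levels (0 = unlimited)
DEPTH_PRIORITY = 1          # adjust priority per level: priority -= depth * DEPTH_PRIORITY
DEPTH_STATS_VERBOSE = True  # record request counts per depth in the stats
```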
A look at the source:
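Below is a condensed sketch of DepthMiddleware from scrapy/spidermiddlewares/depth.py (Scrapy 1.x era; stats, logging and the from_crawler wiring are stripped out):

```python
from scrapy.http import Request


class DepthMiddleware:
    def __init__(self, maxdepth, prio=1):
        self.maxdepth = maxdepth   # DEPTH_LIMIT
        self.prio = prio           # DEPTH_PRIORITY

    def process_spider_output(self, response, result, spider):
        def _filter(request):
            if isinstance(request, Request):
                # a child request is one level deeper than its parent response
                depth = response.meta['depth'] + 1
                request.meta['depth'] = depth
                if self.prio:
                    request.priority -= depth * self.prio
                if self.maxdepth and depth > self.maxdepth:
                    return False   # drop requests beyond DEPTH_LIMIT
            return True

        # base case: a response to a start request gets depth 0
        if 'depth' not in response.meta:
            response.meta['depth'] = 0

        return (r for r in result or () if _filter(r))
```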
To sum up:
- Depth starts at 0; every request yielded from a response gets its parent's depth plus 1.
- Priority is adjusted as priority -= depth * DEPTH_PRIORITY (DEPTH_PRIORITY comes from the settings file), and a request's default priority is 0.