Python web crawler -- CrawlSpider

- CrawlSpider
  - Purpose: full-site data crawling
  - CrawlSpider is a subclass of Spider
  - How to create a CrawlSpider-based spider file (a skeleton sketch follows this list)
    - scrapy genspider -t crawl xxx www.xxx.com
  - Example: choutiPro

  - LinkExtractor (link extractor): extracts links from a page according to the specified rule (a regular expression)
  - Rule (rule parser): sends a request for every link the link extractor extracted, then parses the fetched page according to the specified rule (the callback)
  - One link extractor corresponds to exactly one rule parser
    - Example: CrawlSpider depth (full-site) crawling, sunLineCrawl [example]
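
A minimal sketch of the skeleton that scrapy genspider -t crawl generates (the spider name, domain, and allow pattern here are illustrative placeholders, not part of these notes):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class XxxSpider(CrawlSpider):
    name = 'xxx'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    # one Rule pairs one LinkExtractor with the callback that parses the pages it finds
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item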

- Distributed crawling (rarely needed unless the data volume is enormous; distributing the work mainly saves time)
  - Concept: run the crawler program on a group of machines (a distributed cluster) so that they crawl the data jointly
  - Can the native scrapy framework implement distributed crawling on its own?
    No: out of the box each machine has its own scheduler and its own pipeline, so the request queue and the crawled items cannot be shared across the cluster (a common workaround is sketched below)
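
The usual workaround (not covered in these notes, so treat everything below as an assumption) is the scrapy-redis component: it swaps in a Redis-backed scheduler and dupe filter so every machine in the cluster pulls from one shared request queue. A minimal settings sketch:

# settings.py additions for scrapy-redis (illustrative values)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # shared, Redis-backed scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # shared request fingerprint filter
SCHEDULER_PERSIST = True                                     # keep the queue between runs
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,             # store items in the shared Redis
}
REDIS_HOST = '127.0.0.1'   # address of the shared Redis server
REDIS_PORT = 6379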

choutiPro:

# Spider file

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ChoutiSpider(CrawlSpider):
    name = 'chouti'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.chouti.com/1']

    # Link extractor: extracts every link on the start pages that matches the rule; allow=regex
    # If allow is empty, all links on the page are extracted
    link = LinkExtractor(allow=r'\d+')
    rules = (
        # Rule parser: parses the page behind each extracted link according to the specified rule (callback)
        # Rule automatically sends a request for every extracted link
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages fetched from the extracted links
    )

    def parse_item(self, response):
        item = {}
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['name'] = response.xpath('//div[@id="name"]').get()
        # item['description'] = response.xpath('//div[@id="description"]').get()
        return item

 

sunLineCrawl (Sunshine Hotline network):

# 1. spider file

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunLineCrawl.items import SunlinecrawlItem,ContentItem

class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    link = LinkExtractor(allow=r'type=4&page=\d+')  # extract pagination links
    link1 = LinkExtractor(allow=r'question/2019\d+/\d+\.shtml')  # extract detail-page links
    rules = (
        Rule(link, callback='parse_item', follow=False),
        Rule(link1, callback='parse_detail'),
    )

    # parse the title and the name of the user who posted it
    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            net_friend = tr.xpath('./td[4]/text()').extract_first()
            item = SunlinecrawlItem()
            item['title'] = title
            item['net_friend'] = net_friend

            yield item

    # parse the content of the news post
    def parse_detail(self,response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/div[2]//text()').extract()
        content = ''.join(content)
        item = ContentItem()
        item['content'] = content

        yield item
--------------------------------------------------------------------------------
# 2. items file

import scrapy

class SunlinecrawlItem(scrapy.Item):
    title = scrapy.Field()
    net_friend = scrapy.Field()

class ContentItem(scrapy.Item):
    content = scrapy.Field()
--------------------------------------------------------------------------------
# 3. pipelines file

class SunlinecrawlPipeline(object):
    def process_item(self, item, spider):
        # determine which type of item was received (SunlinecrawlItem / ContentItem)
        if item.__class__.__name__ == 'SunlinecrawlItem':
            print(item['title'], item['net_friend'])

        else:
            print(item['content'])

        return item
--------------------------------------------------------------------------------
# 4. settings file

BOT_NAME = 'sunLineCrawl'

SPIDER_MODULES = ['sunLineCrawl.spiders']
NEWSPIDER_MODULE = 'sunLineCrawl.spiders'

LOG_LEVEL = 'ERROR'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
   'sunLineCrawl.pipelines.SunlinecrawlPipeline': 300,
}

 
