Crawling the Sunshine Hotline complaints site with a CrawlSpider-based spider
CrawlSpider: whole-site data crawling
- CrawlSpider is another kind of crawler; it is a subclass of Spider.
- Create a CrawlSpider-based spider file (the generated skeleton is shown below):
- scrapy genspider -t crawl spiderName www.xxx.com
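For reference, the crawl template generates a skeleton roughly like the following (the exact output varies by Scrapy version; spiderName and www.xxx.com are the placeholder arguments from the command above):

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SpidernameSpider(CrawlSpider):
    name = 'spiderName'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    # One Rule per link pattern to follow; the callback parses the matched pages
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item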
sun.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, SunProDetail

# First version: crawl only the paginated list pages
# class SunSpider(CrawlSpider):
#     name = 'sun'
#     # allowed_domains = ['www.xxx.com']
#     start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
#
#     # Link extractor: extracts links according to the specified rule (allow: a regex)
#     link = LinkExtractor(allow=r'type=4&page=\d+')
#
#     rules = (
#         # Rule parser: parses the page source behind each extracted link
#         # with the specified callback
#         Rule(link, callback='parse_item', follow=True),
#         # follow=True: keep applying the link extractor to the pages the
#         # extracted links point to (so every page of the pagination is crawled)
#     )
#
#     def parse_item(self, response):
#         print(response)

# Depth crawling: also scrape each complaint's detail page
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Link extractor: extracts links according to the specified rule (allow: a regex)
    link = LinkExtractor(allow=r'type=4&page=\d+')
    # A second link extractor that picks up the detail-page links
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')

    rules = (
        # Rule parser: parses the page source behind each extracted link
        # with the specified callback
        Rule(link, callback='parse_item', follow=False),
        # follow=True would keep applying the link extractor to the extracted pages
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SunproItem()
            item['title'] = title
            item['num'] = num
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/div[2]/text()').extract_first()
        num = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        num = num.split(':')[-1]
        item = SunProDetail()
        item['content'] = content
        item['num'] = num
        yield item
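Before wiring a LinkExtractor into a Rule, it can help to check what it actually matches. A quick sketch using scrapy shell against the listing page above (the variable names here are just for illustration):

scrapy shell "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
>>> from scrapy.linkextractors import LinkExtractor
>>> le = LinkExtractor(allow=r'type=4&page=\d+')
>>> [l.url for l in le.extract_links(response)][:5]  # first few pagination URLs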
items.py
import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    num = scrapy.Field()


class SunProDetail(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()
pipelines.py
class SunproPipeline(object):
    def process_item(self, item, spider):
        # Both item classes arrive through the same pipeline;
        # tell them apart by class name
        if item.__class__.__name__ == 'SunProDetail':
            content = item['content']
            num = item['num']
        else:
            title = item['title']  # fixed: was item['content'], a field SunproItem does not have
            num = item['num']
        return item
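Note that SunproItem (title, num) and SunProDetail (content, num) reach the pipeline independently, so the two streams have to be matched up on the shared num field somewhere downstream. A minimal in-memory sketch of that idea (the class name SunproJoinPipeline is made up here; a real project would more likely write both streams into database tables and join them on num there):

class SunproJoinPipeline(object):
    def __init__(self):
        # num -> title from the list pages, num -> content from the detail pages
        self.titles = {}
        self.contents = {}

    def process_item(self, item, spider):
        if item.__class__.__name__ == 'SunProDetail':
            self.contents[item['num']] = item['content']
        else:
            self.titles[item['num']] = item['title']
        return item

    def close_spider(self, spider):
        # pair the two streams on the shared num field once the crawl ends
        for num, title in self.titles.items():
            if num in self.contents:
                print(num, title, self.contents[num])

As with any pipeline, it only runs if it is registered under ITEM_PIPELINES in settings.py.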