Scrapy CrawlSpider

A CrawlSpider-based crawler for the Sunshine Hotline complaints site (wz.sun0769.com)

 

Full-site data crawling with CrawlSpider
- CrawlSpider is another kind of crawler; it is a subclass of Spider.
- Create a CrawlSpider-based spider file (the command below generates a skeleton like the sketch that follows):
- scrapy genspider -t crawl spiderName www.xxx.com
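
For orientation, this is roughly the skeleton that genspider -t crawl produces (a sketch; the exact template, class name, and placeholder regex vary by Scrapy version):

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SpidernameSpider(CrawlSpider):
    name = 'spiderName'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    # each Rule pairs a LinkExtractor with the callback that parses
    # the pages behind the links the extractor pulls from the response
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item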

 

 

sun.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, SunProDetail

# class SunSpider(CrawlSpider):
#     name = 'sun'
#     # allowed_domains = ['www.xxx.com']
#     start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
#     # link extractor:
#     #   role: extract links according to the specified rule (allow: a regex)
#     link = LinkExtractor(allow=r'type=4&page=\d+')
#     rules = (
#         # rule parser
#         #   role: fetch the pages behind the extracted links and send them
#         #   to the specified callback for parsing
#         Rule(link, callback='parse_item', follow=True),
#         # follow=True: keep applying the link extractor to the pages
#         # that the extracted links lead to (full-site pagination)
#     )
#
#     def parse_item(self, response):
#         print(response)

# depth crawling (list pages plus detail pages)
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    # link extractor:
    #   role: extract links according to the specified rule (allow: a regex)
    link = LinkExtractor(allow=r'type=4&page=\d+')
    # a second link extractor that extracts the detail-page links
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')
    rules = (
        # rule parser
        #   role: fetch the pages behind the extracted links and send them
        #   to the specified callback for parsing
        Rule(link, callback='parse_item', follow=False),
        # follow=True would keep applying the link extractor to the pages
        # that the extracted links lead to
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SunproItem()
            item['title'] = title
            item['num'] = num

            yield item

    def parse_detail(self,response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/div[2]/text()').extract_first()
        num = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        num = num.split(':')[-1]

        item = SunProDetail()
        item['content'] = content
        item['num'] = num

        yield item
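
Note the design here: CrawlSpider rules issue their requests automatically, so there is no built-in way to pass data from a list row to its detail page; the spider therefore yields two item types and relies on the shared num field as a join key. A common alternative is a manual request that carries the half-filled item through response.meta. The sketch below is an assumption, not part of the original post, and it presumes a single item class that also holds a content field:

    # sketch: a meta-passing variant of the two callbacks above
    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            item = SunproItem()  # assumes a 'content' field was added to this class
            item['title'] = tr.xpath('./td[2]/a[2]/text()').extract_first()
            item['num'] = tr.xpath('./td[1]/text()').extract_first()
            detail_url = tr.xpath('./td[2]/a[2]/@href').extract_first()
            # hand the half-filled item to the detail callback
            yield scrapy.Request(response.urljoin(detail_url),
                                 callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        item['content'] = response.xpath(
            '/html/body/div[9]/table[2]//tr[1]/td/div[2]/text()').extract_first()
        yield item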

 

items.py

import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    num = scrapy.Field()

class SunProDetail(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()

 

pipelines.py

class SunproPipeline(object):
    def process_item(self, item, spider):
        # distinguish the two item types by class name
        if item.__class__.__name__ == 'SunProDetail':
            content = item['content']
            num = item['num']
        else:
            title = item['title']
            num = item['num']
        return item
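
As written, the pipeline only unpacks the fields and returns the item. A minimal sketch of a merge step, assuming the num values extracted from the list page and the detail page match (the in-memory dict is illustrative, not part of the original post):

class SunproPipeline(object):
    def __init__(self):
        self.records = {}  # num -> partial record (illustrative in-memory store)

    def process_item(self, item, spider):
        record = self.records.setdefault(item['num'], {})
        if item.__class__.__name__ == 'SunProDetail':
            record['content'] = item['content']
        else:
            record['title'] = item['title']
        # once both halves have arrived, the record is complete
        if 'title' in record and 'content' in record:
            print(record)  # a real pipeline would persist this instead
        return item

Whichever version is used, it must be enabled in settings.py, e.g. ITEM_PIPELINES = {'sunPro.pipelines.SunproPipeline': 300}, before running scrapy crawl sun.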

 
