Scrapy: crawling the Daomu Biji novel across three page levels

# Today's target

**Scrapy: crawl the Daomu Biji novel across three page levels**

Today we crawl the novel Daomu Biji (The Grave Robbers' Chronicles). Analysis of the site shows that the main text of the novel sits on third-level pages, so we need to parse each of the three page levels one by one.

**Code implementation**

daomu.py

```
import scrapy
from ..items import DaomuItem


class DaomuSpider(scrapy.Spider):
    name = 'daomu'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://www.daomubiji.com/']

    # parse function for the first-level page
    def parse(self, response):
        # link_list: ['http://xxx/dao-mu-bi-ji-1', ...]
        link_list = response.xpath('//ul[@class="sub-menu"]/li/a/@href').extract()
        for link in link_list:
            # hand the request over to the scheduler
            yield scrapy.Request(
                url=link,
                callback=self.parse_two_html
            )

    # parse function for the second-level page
    # (volume name, chapter number, chapter name, chapter link)
    def parse_two_html(self, response):
        # baseline xpath
        article_list = response.xpath('//article')
        for article in article_list:
            # create the item object
            item = DaomuItem()
            # info_list: [volume name, chapter number, chapter name]
            # (the chapter number may be missing)
            info_list = article.xpath('./a/text()').get().split()
            if len(info_list) == 3:
                item['volume_name'] = info_list[0]
                item['zh_num'] = info_list[1]
                item['zh_name'] = info_list[2]
            else:
                item['volume_name'] = info_list[0]
                item['zh_name'] = info_list[1]
                item['zh_num'] = ''

            # extract the chapter link and hand it to the scheduler queue
            item['zh_link'] = article.xpath('./a/@href').get()
            yield scrapy.Request(
                url=item['zh_link'],
                # meta parameter: pass the item object on to the next parse function
                meta={'item': item},
                callback=self.parse_three_html
            )

    # parse function for the third-level page (novel content)
    def parse_three_html(self, response):
        # get the item object passed in from the previous function
        item = response.meta['item']
        # content_list: ['paragraph 1', 'paragraph 2', ...]
        content_list = response.xpath(
            '//article[@class="article-content"]//p/text()'
        ).extract()

        item['zh_content'] = '\n'.join(content_list)

        yield item
```
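
For the items yielded by `parse_three_html` to reach the pipeline (shown in pipelines.py below), the pipeline has to be registered in settings.py. A minimal sketch, assuming the Scrapy project module is itself named `daomu`:

```
# settings.py (sketch; assumes the project module is named daomu)
ITEM_PIPELINES = {
    # route yielded items through DaomuPipeline; 300 is just a mid-range priority
    'daomu.pipelines.DaomuPipeline': 300,
}
```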

items.py 

```
import scrapy


class DaomuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # volume name
    volume_name = scrapy.Field()
    # chapter number
    zh_num = scrapy.Field()
    # chapter name
    zh_name = scrapy.Field()
    # chapter link
    zh_link = scrapy.Field()
    # novel content
    zh_content = scrapy.Field()
```

pipelines.py

```
class DaomuPipeline(object):
    def process_item(self, item, spider):
        # write each chapter to its own file: volume_chapternum_chaptername
        filename = '/home/tarena/daomu/{}_{}_{}'.format(
            item['volume_name'],
            item['zh_num'],
            item['zh_name']
        )

        with open(filename, 'w') as f:
            f.write(item['zh_content'])

        return item

```
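
One caveat: `open()` raises an error if the target directory `/home/tarena/daomu` does not exist. A small extension of the pipeline above, as a sketch, that creates the directory once when the spider starts (it assumes the same hard-coded path as `process_item`):

```
import os


class DaomuPipeline(object):

    def open_spider(self, spider):
        # runs once at spider start-up: make sure the output directory exists
        os.makedirs('/home/tarena/daomu', exist_ok=True)
```

With the pipeline enabled in settings.py (see the sketch above), the crawl is started from the project root with `scrapy crawl daomu`, and each chapter is saved as its own file.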

 
