爬取中华网科技新闻

爬取 http://tech.china.com/articles/

抓取新闻列表中所有分页的新闻详情,包括标题、正文、时间、来源等信息。

创建项目
scrapy startproject China

scrapy genspider -t crawl chinatech tech.china.com

items.py

 1 from scrapy import Field, Item
 2 
 3 
 4 class ChinaItem(Item):
 5     # define the fields for your item here like:
 6     # name = scrapy.Field()
 7 
 8     title = Field()
 9     text = Field()
10     datetime = Field()
11     source = Field()
12     url = Field()
13     website = Field()

chinatech.py

 1 import scrapy
 2 from scrapy.linkextractors import LinkExtractor
 3 from scrapy.spiders import CrawlSpider, Rule
 4 from China.items import *
 5 from China.loaders import *
 6 
 7 class ChinatechSpider(CrawlSpider):
 8     name = 'chinatech'
 9     allowed_domains = ['tech.china.com']
10     start_urls = ['http://tech.china.com/articles/']
11 
12     rules = (
13         Rule(LinkExtractor(allow='article\/.*\.html', restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
14              callback='parse_item'),
15         Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
16     )
17 
18     def parse_item(self, response):
19         loader = ChinaLoader(item=ChinaItem(), response=response)
20         loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
21         loader.add_value('url', response.url)
22         loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
23         loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re='(\d+-\d+-\d+\s\d+:\d+:\d+)')
24         loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
25         loader.add_value('website', '中华网')
26         yield loader.load_item()

loaders.py

 1 from scrapy.loader import ItemLoader
 2 from scrapy.loader.processors import TakeFirst, Join, Compose
 3 
 4 
 5 class NewsLoader(ItemLoader):
 6     default_output_processor = TakeFirst()
 7 
 8 
 9 class ChinaLoader(NewsLoader):
10     text_out = Compose(Join(), lambda s: s.strip())
11     source_out = Compose(Join(), lambda s: s.strip())

pipelines.py

 1 import json
 2 
 3 class ChinaPipeline(object):
 4 
 5     def __init__(self):
 6         self.filename = open("china.json", "w")
 7 
 8     def process_item(self, item, spider):
 9         text = json.dumps(dict(item), ensure_ascii = False) + ",\n"
10         self.filename.write(text)
11         return item
12 
13     def close_spider(self, spider):
14         self.filename.close()
settings.py


 1 BOT_NAME = 'China'
 2 
 3 SPIDER_MODULES = ['China.spiders']
 4 NEWSPIDER_MODULE = 'China.spiders'
 5 
 6 ROBOTSTXT_OBEY = False
 7 
 8 ITEM_PIPELINES = {
 9     'China.pipelines.ChinaPipeline': 300,
10 }

猜你喜欢

转载自www.cnblogs.com/wanglinjie/p/9240854.html