使用scrapy爬取新浪新闻

使用scrapy爬取新浪新闻
思路：通过观察，获取某节点作为当前节点，然后依次遍历大类链接小类链接子链接
要点：注意item和meta参数的使用。详情见代码newsina.py里面相关的注释
总结：个人因为item的位置，导致浪费了好多时间。
流程如下：

创建爬虫项目sina2
```
scrapy startproject sina2
```

设置items.py文件，存储要爬取的数据类型及字段名字


# -*- coding: utf-8 -*-

import scrapy


class Sina2Item(scrapy.Item):
    # 大类链接和url,如新闻　体育等
    parentUrl = scrapy.Field()
    parentTitle = scrapy.Field()

    # 小类链接和url，如新闻下面的国内　社会等
    subUrl = scrapy.Field()
    subTitle = scrapy.Field()
    # 小类存储路径
    subpath = scrapy.Field()

    # 子链接
    sonUrl = scrapy.Field()
    # 子链接里面的标题和内容
    head = scrapy.Field()
    content = scrapy.Field()

创建爬虫文件newsina.py


# -*- coding: utf-8 -*-

import os
import scrapy
from sina2.items import Sina2Item


class NewsinaSpider(scrapy.Spider):
    name = 'newsina'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://news.sina.com.cn/guide/']

    def parse(self, response):

        #　通过某节点作为根节点进行大类链接遍历
        for each in response.xpath("//div[@id='tab01']/div[@data-sudaclick!='citynav']"):
            # 获取大类链接和大类标题
            parentUrl = each.xpath('./h3/a/@href').extract()[0].encode('utf-8')
            parentTitle = each.xpath('./h3/a/text()').extract()[0].encode('utf-8')

            # 设置大类存储路径
            parentpath = './data/' + parentTitle
            if not os.path.exists(parentpath):
                os.makedirs(parentpath)

            #　遍历小类链接
            for other in each.xpath("./ul/li/a"):

                #　获取以大类链接开头的小类链接
                if other.xpath('./@href').extract()[0].startswith(parentUrl):
                    #　注意item的位置，不同的位置会导致不同的结果。尽量不要把item的数据在外循环和内循环里面分别获取，如必须这样做，则创建空列表添加item来解决。
                    item = Sina2Item()
                    subUrl = other.xpath('./@href').extract()[0].encode('utf-8')
                    subTitle = other.xpath('./text()').extract()[0].encode('utf-8')
                    subpath = parentpath + '/' + subTitle
                    item['parentUrl'] = parentUrl
                    item['parentTitle'] = parentTitle
                    item['subUrl'] = subUrl
                    item['subTitle'] = subTitle
                    item['subpath'] = subpath

                    if not os.path.exists(subpath):
                        os.makedirs(subpath)

                    #　发送小类链接请求，使用meta参数把item数据传递到回调函数里面，通过response.meta['']得到数据
                    yield scrapy.Request(url=item['subUrl'],meta={'meta_1':item},callback=self.second_parse)


    def second_parse(self,response):

        # 获取meta参数里面键为'meta_1'的值
        meta_1 = response.meta['meta_1']
        items = []
        #　遍历小类里面的子链接
        for each in response.xpath('//a/@href'):
            #　获取的子链接，以大类链接开头，以.shtml结尾
            if each.extract().encode('utf-8').startswith(meta_1['parentUrl']) and each.extract().encode('utf-8').endswith('.shtml'):
                item = Sina2Item()
                item['parentUrl'] = meta_1['parentUrl']
                item['parentTitle'] = meta_1['parentTitle']
                item['subUrl'] = meta_1['subUrl']
                item['subTitle'] = meta_1['subTitle']
                item['subpath'] = meta_1['subpath']
                item['sonUrl'] = each.extract().encode('utf-8')
                items.append(item)


        # 发送子链接请求
        for each in items:
            yield scrapy.Request(each['sonUrl'],meta={'meta_2':each},callback=self.detail_parse)

    def detail_parse(self,response):
        item = response.meta['meta_2']
        # 获取标题和内容不为空的子链接
        # if len(response.xpath("//h1[@class='main-title']/text()")) != 0 and len(response.xpath("//div[@class='article']/p/text()")) != 0:
        #   item['head'] = response.xpath("//h1[@class='main-title']/text()").extract()[0].encode('utf-8')
        #   item['content'] = ''.join(response.xpath("//div[@class='article']/p/text()").extract()).encode('utf-8')
        #   yield item
        item['head'] = response.xpath("//h1[@class='main-title']")
        item['content'] = ''.join(response.xpath("//div[@id='artibody']/p/text()").extract()).encode('utf-8')
        yield item

创建管道pipelines.py文件


# -*- coding: utf-8 -*-


class Sina2Pipeline(object):
    def process_item(self, item, spider):
        # 设置保存的文件名，把子链接去掉'http://'和'.shtml'，把'/'替换成‘_’，保存为txt文件格式
        self.filename = item['sonUrl'][7:-6].replace('/','_') + '.txt'
        self.file = open(item['subpath'] + '/' + self.filename,'w')
        self.file.write(item['sonUrl'] + '\n' + item['content'])
        self.file.close()
        return item

修改配置settings.py文件


# -*- coding: utf-8 -*-



BOT_NAME = 'sina2'

SPIDER_MODULES = ['sina2.spiders']
NEWSPIDER_MODULE = 'sina2.spiders'

USER_AGENT = 'User-Agent:Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0;'


# Disable cookies (enabled by default)

COOKIES_ENABLED = False


# Configure item pipelines


# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

ITEM_PIPELINES = {
   'sina2.pipelines.Sina2Pipeline': 300,
}

终端测试
```
scrapy crawl newsina
```
测试图片如下：结果里面也有空的，那是因为没有过滤内容和标题，关于过滤的看爬虫文件最下面的注释代码

使用scrapy爬取新浪新闻

猜你喜欢