scrapy简单实战

本程序有部分错误，请见谅。但其中所涉及的知识点尤为重要。
本爬虫爬取的是 http://quotes.toscrape.com/ 的文本内容，具体请参考代码注释。
quotes.py

import scrapy
from items import QuoteItem

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes from quotes.toscrape.com, page by page.

    Run it from the project directory with ``scrapy crawl quotes``. To save
    the scraped items, add an output file:

    * ``scrapy crawl quotes -o quotes.json`` -- a single JSON array
    * ``scrapy crawl quotes -o quotes.jl``   -- JSON Lines, one item per line
    * ``scrapy crawl quotes -o quotes.csv``  -- CSV
    """

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Yield a QuoteItem for every quote on the page, then follow "next".

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            One ``QuoteItem`` per quote, followed by a ``scrapy.Request`` for
            the next page when the page has a "next" link.
        """
        for quote in response.css('.quote'):
            item = QuoteItem()
            # ``::text`` selects text nodes; use ``::attr(name)`` for an
            # attribute. extract_first() returns the first match as a string
            # (or None when nothing matches).
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            # A quote has several tags, so use extract(), which returns a
            # list. The tag names live in the ``.tag`` links inside ``.tags``.
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item

        next_page = response.css('.pager .next a::attr(href)').extract_first()
        # The last page has no "next" link. Without this guard urljoin(None)
        # would return the current URL and the spider would re-request it.
        if next_page is not None:
            # The href is relative, so resolve it against the current URL.
            # parse() is its own callback, so every page is handled the same
            # way. dont_filter is not needed: each page URL is distinct, and
            # the default duplicate filter protects against request loops.
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse)

items.py

import scrapy


class QuoteItem(scrapy.Item):
    """Item holding one scraped quote: its text, author and tags."""

    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
    # Defines the storage structure used while crawling: a single, uniform
    # data structure with named fields, so that each scraped result can be
    # stored as one unit.

    # define the fields for your item here like:
    # name = scrapy.Field()

猜你喜欢