Incremental crawling with Scrapy and Redis

Incremental crawler: as the name suggests, pages that have already been crawled are not crawled again; only pages that have not been crawled yet are fetched.
Requirement: crawl the titles and synopses of the action movies on https://www.4567kan.com/.
Analysis: specify the start URL, create the project, cd into it, and create the spider file; a link extractor pulls the pagination links, the Rule defines how to follow them, then data parsing and persistent storage follow. All of that is the same as any CrawlSpider project; the key question is how to crawl only newly added movies. The setup commands are sketched below.
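For reference, the project skeleton described above can be created with Scrapy's standard CLI; the project name moviepro and spider name movie match the imports used in the code further down:

#shell: create the project and a CrawlSpider-based spider
scrapy startproject moviepro
cd moviepro
scrapy genspider -t crawl movie www.4567kan.com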

Core idea: check whether a movie detail page's URL has been crawled before.
Store every crawled detail-page URL.
Store the URLs in Redis's set data structure (it deduplicates automatically); a minimal sketch of that contract follows.
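A minimal sketch of the dedup contract everything below relies on, assuming a Redis server on localhost at the default port (the example URL is made up): sadd returns 1 when the member is new and 0 when it is already in the set.

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
# first insert: the member is new, sadd returns 1
print(conn.sadd('urls', 'https://www.4567kan.com/example.html'))  # 1
# second insert: already present, sadd returns 0, so we would skip it
print(conn.sadd('urls', 'https://www.4567kan.com/example.html'))  # 0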
(To wipe the Redis database: log in with the redis client and run: flushall)
View the stored urls set: smembers urls
View the stored movieDdata list: lrange movieDdata 0 -1
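The same checks can be done from Python via redis-py, equivalent to the redis-cli commands above (decode_responses makes the client return strings instead of bytes):

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379, decode_responses=True)
print(conn.smembers('urls'))             # set of crawled detail-page URLs
print(conn.lrange('movieDdata', 0, -1))  # every stored movie record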

#spider file: movie.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviepro.items import MovieproItem
class MovieSpider(CrawlSpider):
    name = 'movie'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567kan.com/index.php/vod/show/id/5/page/1.html']

    rules = (
        Rule(LinkExtractor(allow=r'\d+\.html'), callback='parse_item', follow=True),
    )
    #create the Redis connection object
    conn = Redis(host='127.0.0.1',port=6379)
    #parse out the detail-page URL of every movie on each list page
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            detail_url = 'https://www.4567kan.com'+li.xpath('./div/a/@href').extract_first()
            #add the detail-page URL to the redis set
            ex = self.conn.sadd('urls',detail_url)  #sadd returns 1 if the URL is new, 0 if it is already stored
            if ex == 1:
                print('This URL has not been crawled yet; crawling its data')
                yield scrapy.Request(url=detail_url,callback=self.detail_parse)
            else:
                print('This URL has already been crawled; nothing new to fetch')
    def detail_parse(self,response):
        movie_name = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        #the synopsis may span several text nodes, so extract them all and join
        jianjie = ''.join(response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract())
        item = MovieproItem()
        item['movie_name'] = movie_name
        item['jianjie']  = jianjie
        yield item
#items
import scrapy


class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    movie_name = scrapy.Field()
    jianjie = scrapy.Field()
#pipelines
import json

class MovieproPipeline(object):
    conn = None
    def open_spider(self,spider):
        self.conn = spider.conn  #the Redis connection was already created in the spider, so reuse it
    def process_item(self, item, spider):
        dic = {
            'movie_name':item['movie_name'],
            'jianjie':item['jianjie']
        }
        print(dic)
        #redis cannot store a dict directly, so serialize it before pushing
        self.conn.lpush('movieDdata',json.dumps(dic))  #store the scraped record
        return item
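For the pipeline to receive items, it has to be enabled in the project's settings.py. A minimal sketch of the relevant entries (the USER_AGENT value here is a placeholder, not from the original post):

#settings.py (relevant entries only)
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
USER_AGENT = 'Mozilla/5.0'  #placeholder; use a real browser UA string
ITEM_PIPELINES = {
    'moviepro.pipelines.MovieproPipeline': 300,
}

Then run the spider with scrapy crawl movie. On the first run every detail page is fetched; on later runs only URLs not yet in the Redis urls set are requested.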

Reposted from blog.csdn.net/qwerty1372431588/article/details/107311417