Scrapy combined with Redis for an incremental crawler

Incremental crawler: as the name suggests, pages that have already been crawled are skipped, and only pages that have not been crawled yet are fetched.
Requirement: crawl the title and synopsis of the action movies on https://www.4567kan.com/
Analysis: specify the URL, create the project, cd into the project, and create the crawler file; the link extractor extracts the page-number links, and the rule parser applies the rules. Data parsing and persistent storage work the same as in an ordinary CrawlSpider; the key point is how to crawl only newly added movies.
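A minimal sketch of the scaffolding commands for the steps above, assuming the project name moviepro and the spider name movie used in the code below (-t crawl generates a CrawlSpider template):

scrapy startproject moviepro
cd moviepro
scrapy genspider -t crawl movie www.4567kan.com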

Core idea: check whether the URL of a movie detail page has been crawled before.
Store the URLs of the crawled movie detail pages in a Redis set data structure (automatic deduplication).
(To empty the Redis database, log in to the client and run: flushall)
View the stored URLs: smembers urls
View the stored movie data: lrange movieDdata 0 -1
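The dedup check relies on the return value of Redis's sadd, which reports how many members were actually added: 1 means the URL is new, 0 means it was already in the set. A minimal sketch with redis-py, assuming a Redis server on 127.0.0.1:6379 (the detail URL here is a made-up example):

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

detail_url = 'https://www.4567kan.com/index.php/vod/detail/id/1.html'  # hypothetical example URL
print(conn.sadd('urls', detail_url))  # 1 -> first time seen, safe to crawl
print(conn.sadd('urls', detail_url))  # 0 -> already stored, skip
print(conn.smembers('urls'))          # inspect everything stored so far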

# main file (the spider)
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviepro.items import MovieproItem
class MovieSpider(CrawlSpider):
    name = 'movie'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567kan.com/index.php/vod/show/id/5/page/1.html']

    rules = (
        # extract pagination links such as .../page/2.html and follow them
        Rule(LinkExtractor(allow=r'\d+\.html'), callback='parse_item', follow=True),
    )
    # create the Redis connection object (shared with the pipeline via spider.conn)
    conn = Redis(host='127.0.0.1',port=6379)
    # parse the movie detail-page URLs on every listing page
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            detail_url = 'https://www.4567kan.com'+li.xpath('./div/a/@href').extract_first()
            # add the detail-page URL to the Redis set
            ex = self.conn.sadd('urls', detail_url)  # the 'urls' set is used for the dedup comparison
            if ex == 1:
                print('This URL has not been crawled before; crawling its data')
                yield scrapy.Request(url=detail_url, callback=self.detail_parse)
            else:
                print('This URL has already been crawled; nothing new to crawl')
    def detail_parse(self,response):
        movie_name = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        # the synopsis may span several text nodes, so join all of them
        jianjie = ''.join(response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract())
        item = MovieproItem()
        item['movie_name'] = movie_name
        item['jianjie']  = jianjie
        yield item
# items.py
import scrapy


class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    movie_name = scrapy.Field()
    jianjie = scrapy.Field()
# pipelines.py
import json

class MovieproPipeline(object):
    conn = None
    def open_spider(self, spider):
        # the connection object was already created in the spider, so reuse it
        self.conn = spider.conn
    def process_item(self, item, spider):
        dic = {
            'movie_name': item['movie_name'],
            'jianjie': item['jianjie']
        }
        print(dic)
        # redis-py cannot push a dict directly, so serialize it before storing
        self.conn.lpush('movieDdata', json.dumps(dic, ensure_ascii=False))  # store the scraped data
        return item
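For the pipeline to actually run, it has to be enabled in settings.py. A minimal sketch of the relevant settings, assuming the default layout of a project created with scrapy startproject moviepro (the USER_AGENT value is only a placeholder):

# settings.py (excerpt)
BOT_NAME = 'moviepro'
USER_AGENT = 'Mozilla/5.0'   # placeholder; use a real browser UA string
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'          # keep the console output readable
ITEM_PIPELINES = {
    'moviepro.pipelines.MovieproPipeline': 300,
}

Run the spider with scrapy crawl movie. On the first run every detail-page URL is new, so all movies are fetched; on later runs only movies whose detail URLs are not yet in the urls set get crawled.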
