Scrapy: incremental crawling with a Bloom filter

Straight to the code:

import os
from pybloom_live import BloomFilter
from scrapy.exceptions import DropItem

class BloomCheckPipeline(object):
    def __init__(self):
        self.file_name = 'bloomfilter.blm'

    def open_spider(self, spider):
        # Reload the persisted filter from the last run if it exists,
        # otherwise start from an empty one.
        if os.path.exists(self.file_name):
            with open(self.file_name, 'rb') as f:
                self.bf = BloomFilter.fromfile(f)
            print('open blm file success')
        else:
            self.bf = BloomFilter(capacity=100000, error_rate=0.001)
            print("didn't find the blm file")

    def process_item(self, item, spider):
        # I filter out items with duplicate URLs; adapt the key to your own needs.
        if item['url'] in self.bf:
            print('drop one item for exist')
            raise DropItem('drop an item for exist')
        else:
            self.bf.add(item['url'])
            print('add one success')
            return item

    def close_spider(self, spider):
        # Persist the filter so the next run skips everything seen so far.
        with open(self.file_name, 'wb') as f:
            self.bf.tofile(f)
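One caveat: a fixed-size pybloom_live BloomFilter stops accepting adds once its capacity (100000 here) is reached, so a crawl that outgrows it will start raising errors. If that is a risk for you, the same library's ScalableBloomFilter grows on demand; a minimal sketch (the URL is a made-up example):

from pybloom_live import ScalableBloomFilter

# Grows automatically instead of hitting a hard capacity ceiling.
sbf = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)
sbf.add('https://example.com/page/1')
print('https://example.com/page/1' in sbf)  # True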

That run was already a second crawl; I crawl several times so nothing gets missed.
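The incremental part comes entirely from the .blm file surviving between runs. A minimal standalone sketch of that save/load round trip (the file name and URLs here are made-up examples):

from pybloom_live import BloomFilter

bf = BloomFilter(capacity=100000, error_rate=0.001)
bf.add('https://example.com/page/1')

# Save, then reload -- mirrors what close_spider/open_spider do above.
with open('demo.blm', 'wb') as f:
    bf.tofile(f)
with open('demo.blm', 'rb') as f:
    bf2 = BloomFilter.fromfile(f)

print('https://example.com/page/1' in bf2)  # True
print('https://example.com/page/2' in bf2)  # False (barring a 0.1% false positive)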


Oh, and don't forget to register this pipeline in settings.py.
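A minimal sketch of that registration, assuming a project named myproject with this class in myproject/pipelines.py (swap in your own dotted path; the priority 300 is an arbitrary choice between 0 and 1000):

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.BloomCheckPipeline': 300,
}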


Reprinted from blog.csdn.net/qq_33042187/article/details/78929834