scrapy-redis 自定义去重规则

############### xxx.py  ######

from scrapy_redis import defaults
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis.dupefilter import RFPDupeFilter


class RedisDupeFilter(RFPDupeFilter):
    """Redis-backed request dupefilter that stores fingerprints under a fixed key.

    Only ``from_settings`` is overridden; everything else is inherited from
    ``RFPDupeFilter``.
    """

    @classmethod
    def from_settings(cls, settings):
        """Build the dupefilter from the crawler settings.

        Args:
            settings: Scrapy settings object. It is passed to
                ``get_redis_from_settings`` to obtain the Redis client and
                queried for the ``DUPEFILTER_DEBUG`` boolean.

        Returns:
            An instance of ``cls`` bound to the Redis client and the fixed key.
        """
        server = get_redis_from_settings(settings)
        # The 'timestamp' placeholder is filled with a constant label, so the
        # resulting key is identical on every run (presumably the template is
        # 'dupefilter:%(timestamp)s' -- verify against scrapy_redis.defaults).
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'myScrapy'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)


# Backward-compatible alias for the original, misspelled class name.
RdisDupeFilter = RedisDupeFilter

到settings.py中配置

# ######################### scrapy-redis connection ##############
REDIS_HOST = "129.28.96.43"  # Redis server host
REDIS_PORT = 6379            # Redis server port
REDIS_PARAMS = {'password': "beta"}
# The setting name is REDIS_ENCODING; the previous spelling (REDIS_ENCODEING)
# is not a name scrapy-redis looks up, so it had no effect.
REDIS_ENCODING = "utf-8"     # Redis encoding

# REDIS_URL = 'redis://user:pwd@hostname:9001'  # connection URL; takes precedence over the host/port settings above

DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'

# Scrapy reads DUPEFILTER_CLASS; the previous spelling (DUPEFLITER_CLASS) was
# silently ignored, so the custom dupefilter was never used.
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
DUPEFILTER_CLASS = 'myscrapy.xxx.RedisDupeFilter'

猜你喜欢

转载自www.cnblogs.com/erhao9767/p/10623210.html