Python notes (scrapy crawler framework: Redis queues and stacks, priority)

I. Redis queues and stacks

  1. Method 1: a LIFO queue (stack) backed by a Redis list

    import redis
    
    class LifoQueue(object):
        """Per-spider LIFO queue."""
        def __init__(self):
            self.server = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    
        def push(self, request):
            """Push a request (LPUSH prepends it to the head of the list)"""
            self.server.lpush("USERS", request)
    
        def pop(self, timeout=0):
            """Pop a request (LPOP takes from the same end LPUSH adds to, so last in, first out)"""
            data = self.server.lpop('USERS')
            return data
    
    # After push(11), push(22), push(33) the list is [33, 22, 11]; lpop then returns 33 first (LIFO).
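
  Whether a Redis list behaves as a queue or as a stack depends only on which end you pop from. A minimal sketch of the difference, reusing the connection details above (the key name 'DEMO' is just an example):

    import redis

    server = redis.Redis(host='140.143.227.206', port=8888, password='beta')
    server.delete('DEMO')

    for item in [11, 22, 33]:
        server.lpush('DEMO', item)      # list is now [33, 22, 11]

    print(server.lpop('DEMO'))          # b'33' -> LPUSH + LPOP = LIFO (stack)
    print(server.rpop('DEMO'))          # b'11' -> LPUSH + RPOP = FIFO (queue)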
    
  2. Method 2: a priority queue backed by a Redis sorted set

    import redis
    
    class PriorityQueue(object):
        """Per-spider priority queue abstraction using redis' sorted set"""
        def __init__(self):
            self.server = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    
        def push(self, request,score):
            """Push a request"""
            # data = self._encode_request(request)
            # score = -request.priority
            # We don't use zadd method as the order of arguments change depending on
            # whether the class is Redis or StrictRedis, and the option of using
            # kwargs only accepts strings, not bytes.
            self.server.execute_command('ZADD', 'xxxxxx', score, request)
    
        def pop(self, timeout=0):
            """
            Pop a request
            timeout not support in this queue class
            """
            # use atomic range/remove using multi/exec
            pipe = self.server.pipeline()
            pipe.multi()
            pipe.zrange('xxxxxx', 0, 0).zremrangebyrank('xxxxxx', 0, 0)
            results, count = pipe.execute()
            if results:
                return results[0]
    
    
    q = PriorityQueue()
    
    q.push('alex',99)
    q.push('oldboy',56)
    q.push('eric',77)
    
    
    v1 = q.pop()
    print(v1)   # b'oldboy' -- the lowest score (56) is popped first
    v2 = q.pop()
    print(v2)   # b'eric'   -- 77
    v3 = q.pop()
    print(v3)   # b'alex'   -- 99
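
  Note that zrange returns members in ascending score order, so the lowest score is popped first. scrapy_redis itself pushes with score = -request.priority (see the commented-out line in push()), which is how higher-priority requests end up being popped before lower-priority ones.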
    

II. Dedup rules

  1. Method 1 (a Redis set)

    from scrapy.dupefilter import BaseDupeFilter
    from scrapy.utils.request import request_fingerprint
    import redis


    class DupFilter(BaseDupeFilter):
        def __init__(self):
            self.conn = redis.Redis(host='140.143.227.206',port=8888,password='beta')

        def request_seen(self, request):
            """
            Check whether this request has already been seen.
            :param request:
            :return: True if it was seen before; False if it is new.
            """
            fid = request_fingerprint(request)
            # SADD returns 1 if the fingerprint was newly added, 0 if it already existed.
            result = self.conn.sadd('visited_urls', fid)
            if result == 1:
                return False
            return True
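
  To have Scrapy actually use this filter, point DUPEFILTER_CLASS at it in settings.py; the module path below is hypothetical and depends on where the class lives in your project:

    # settings.py -- 'myproject.dupefilters' is a placeholder module path
    DUPEFILTER_CLASS = 'myproject.dupefilters.DupFilter'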
    
  2. Method 2 (scrapy_redis's RFPDupeFilter)

    from scrapy_redis.dupefilter import RFPDupeFilter
    from scrapy_redis.connection import get_redis_from_settings
    from scrapy_redis import defaults
    
    class RedisDupeFilter(RFPDupeFilter):
        @classmethod
        def from_settings(cls, settings):
            """Returns an instance from given settings.
    
            This uses by default the key ``dupefilter:<timestamp>``. When using the
            ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
            it needs to pass the spider name in the key.
    
            Parameters
            ----------
            settings : scrapy.settings.Settings
    
            Returns
            -------
            RFPDupeFilter
                A RFPDupeFilter instance.
    
    
            """
            server = get_redis_from_settings(settings)
            # XXX: This creates one-time key. needed to support to use this
            # class as standalone dupefilter with scrapy's default scheduler
            # if scrapy passes spider on open() method this wouldn't be needed
            # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
            key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
            debug = settings.getbool('DUPEFILTER_DEBUG')
            return cls(server, key=key, debug=debug)
    

    settings.py

    REDIS_HOST = '140.143.227.206'                      # host
    REDIS_PORT = 8888                                   # port
    REDIS_PARAMS = {'password': 'beta'}                 # extra Redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
    REDIS_ENCODING = "utf-8"                            # redis encoding, default 'utf-8'

    # REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes precedence over the settings above)
    
    ################ Dedup ######################
    DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
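
  If you want Scrapy's default scheduler to use the RedisDupeFilter subclass above (a fixed key instead of a per-run timestamp), point DUPEFILTER_CLASS at it; again the module path is only a placeholder. As the docstring above notes, scrapy_redis's own Scheduler builds the dupefilter key itself, so this from_settings override is bypassed in that case.

    # DUPEFILTER_CLASS = 'myproject.dupefilters.RedisDupeFilter'   # placeholder module path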
    

III. The scheduler

  1. Run the spider: scrapy crawl chouti --nolog

  2. Scrapy finds the SCHEDULER = "scrapy_redis.scheduler.Scheduler" setting and instantiates the scheduler object:
    - Scheduler.from_crawler is called
    - Scheduler.from_settings is called
    - read from the settings file:

        SCHEDULER_PERSIST            # keep the scheduler queue and dedup records when the spider closes? True = keep, False = flush
        SCHEDULER_FLUSH_ON_START     # flush the scheduler queue and dedup records before starting? True = flush, False = keep
        SCHEDULER_IDLE_BEFORE_CLOSE  # when popping from the scheduler queue, the maximum time to wait if it is empty
     - read from the settings file:
        SCHEDULER_QUEUE_KEY          # %(spider)s:requests
        SCHEDULER_QUEUE_CLASS        # scrapy_redis.queue.FifoQueue
        SCHEDULER_DUPEFILTER_KEY     # '%(spider)s:dupefilter'
        DUPEFILTER_CLASS             # 'scrapy_redis.dupefilter.RFPDupeFilter'
        SCHEDULER_SERIALIZER         # "scrapy_redis.picklecompat"

     - read from the settings file:
        REDIS_HOST = '140.143.227.206'       # host
        REDIS_PORT = 8888                    # port
        REDIS_PARAMS = {'password': 'beta'}  # extra Redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
        REDIS_ENCODING = "utf-8"             # redis encoding, default 'utf-8'
     - instantiate the Scheduler object (a simplified sketch follows)
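
    A simplified, paraphrased sketch of what that instantiation roughly looks like (not the actual scrapy_redis source; local names are illustrative):

     from scrapy_redis import connection

     @classmethod
     def from_settings(cls, settings):
         kwargs = {
             'persist': settings.getbool('SCHEDULER_PERSIST'),
             'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
             'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
         }
         # Optional settings fall back to scrapy_redis defaults when absent.
         for arg, name in [('queue_key', 'SCHEDULER_QUEUE_KEY'),
                           ('queue_cls', 'SCHEDULER_QUEUE_CLASS'),
                           ('dupefilter_key', 'SCHEDULER_DUPEFILTER_KEY'),
                           ('dupefilter_cls', 'DUPEFILTER_CLASS'),
                           ('serializer', 'SCHEDULER_SERIALIZER')]:
             val = settings.get(name)
             if val:
                 kwargs[arg] = val
         server = connection.from_settings(settings)   # builds the redis.Redis client from the REDIS_* settings
         return cls(server=server, **kwargs)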
    
  3. The spider starts executing its start URLs

    The engine calls scheduler.enqueue_request() for each request:
    
     def enqueue_request(self, request):
         # Should this request be filtered at all?
         # Has the dedup filter already seen it? (request_seen also records it if it is new.)
         if not request.dont_filter and self.df.request_seen(request):
             self.df.log(request, self.spider)
             # Already visited, so do not schedule it again.
             return False

         if self.stats:
             self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
         # print('not seen before, pushing to the scheduler queue', request)
         self.queue.push(request)
         return True
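
    As the check above shows, a request created with dont_filter=True skips the dedup filter entirely and is always queued. A minimal spider-side illustration (the class and URL below are placeholders, not from the original post):

     import scrapy

     class ChoutiSpider(scrapy.Spider):
         name = 'chouti'
         start_urls = ['https://example.com/']   # placeholder URL

         def parse(self, response):
             # dont_filter=True bypasses request_seen(), so this request is
             # pushed to the scheduler queue even if its URL was seen before.
             yield scrapy.Request('https://example.com/page', callback=self.parse_page, dont_filter=True)

         def parse_page(self, response):
             pass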
    
  4. The downloader fetches the next request from the scheduler and downloads it

     - scheduler.next_request() is called:
        def next_request(self):
            block_pop_timeout = self.idle_before_close
            request = self.queue.pop(block_pop_timeout)
            if request and self.stats:
                self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
            return request
    

IV. Depth-first and breadth-first

  1. What is depth-first? What is breadth-first?

     Depth-first keeps following newly discovered links first, going as deep as possible before backtracking; breadth-first finishes every page at the current depth level before moving one level deeper.

  2. How do you get depth-first or breadth-first crawling in scrapy?

     FIFO queue  -> breadth-first
     LIFO queue  -> depth-first

     Priority queue (see the sketch below):
        DEPTH_PRIORITY = 1   # breadth-first
        DEPTH_PRIORITY = -1  # depth-first
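
     The priority-queue variant works because Scrapy's DepthMiddleware adjusts each request's priority by its depth; roughly (a paraphrase, not the actual source):

        # Inside scrapy's DepthMiddleware (paraphrased):
        #     request.priority -= depth * DEPTH_PRIORITY
        #
        # DEPTH_PRIORITY = 1  -> deeper requests get a lower priority  -> shallow pages first (breadth-first)
        # DEPTH_PRIORITY = -1 -> deeper requests get a higher priority -> deepest pages first (depth-first)
        # scrapy_redis's PriorityQueue stores each request with score = -priority,
        # so the request with the highest priority is popped first.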
    
  3. How do the scheduler, the queue, and the dupefilter relate in scrapy?

     Scheduler: decides which request gets enqueued or handed out next.
     Queue: stores the requests.
     Dupefilter: the record of which requests have already been seen.
    
  4. Configuration

     Redis connection settings:
        REDIS_HOST = '140.143.227.206'       # host
        REDIS_PORT = 8888                    # port
        REDIS_PARAMS = {'password': 'beta'}  # extra Redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
        REDIS_ENCODING = "utf-8"             # redis encoding, default 'utf-8'

     Dedup settings:
        DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
        DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

     Scheduler settings:
        SCHEDULER = "scrapy_redis.scheduler.Scheduler"

        DEPTH_PRIORITY = 1    # breadth-first
        # DEPTH_PRIORITY = -1 # depth-first
        SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # the default; alternatives: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)

        # breadth-first
        # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
        # depth-first
        # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
        SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # redis key under which the scheduler stores pending requests

        SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for data saved to redis, pickle by default

        SCHEDULER_PERSIST = False  # keep the scheduler queue and dedup records when the spider closes? True = keep, False = flush
        SCHEDULER_FLUSH_ON_START = True  # flush the scheduler queue and dedup records before starting? True = flush, False = keep
        # SCHEDULER_IDLE_BEFORE_CLOSE = 10  # when popping from the scheduler queue, the maximum time to wait if it is empty

        SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # redis key under which the dedup records are stored

        # DUPEFILTER_CLASS takes precedence; SCHEDULER_DUPEFILTER_CLASS is used only if DUPEFILTER_CLASS is not set
        SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedup rule
    

Reposted from blog.csdn.net/qq_41433183/article/details/90214581