scrapy_redis distributed Spider: source code analysis and implementation

scrapy_redis provides a complete set of components for distributed crawling, including its own spider classes. RedisSpider is a slight modification of Scrapy's original Spider: instead of reading its initial URLs from the start_urls list, it reads them from a Redis queue.

The source lives in scrapy_redis/spiders.py and implements not only RedisSpider (a distributed spider) but also RedisCrawlSpider (a distributed crawling spider based on CrawlSpider); the two share most of their logic through the RedisMixin class.
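Before diving into the source, here is a minimal sketch of how such a spider is typically used; the spider name, Redis key and parse logic are hypothetical and not taken from the source below:

from scrapy_redis.spiders import RedisSpider


class DemoSpider(RedisSpider):
    """Hypothetical spider whose start URLs come from a Redis key."""
    name = 'demo'
    # Optional; if omitted, the key defaults to '%(name)s:start_urls', i.e. 'demo:start_urls'.
    redis_key = 'demo:start_urls'

    def parse(self, response):
        # Normal Scrapy parsing; follow-up Requests and items are handled as usual.
        yield {'url': response.url, 'title': response.css('title::text').get()}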

The source is as follows:

from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from . import connection


# Default batch size matches default concurrent requests setting.
DEFAULT_START_URLS_BATCH_SIZE = 16
DEFAULT_START_URLS_KEY = '%(name)s:start_urls'


class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""

    # Per spider redis key, default to DEFAULT_START_URLS_KEY.
    redis_key = None
    # Fetch this amount of start urls when idle. Default to DEFAULT_START_URLS_BATCH_SIZE.
    redis_batch_size = None
    # Redis client instance.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s)", self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET')
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        # By default, data is an URL.
        if '://' in data:
            return self.make_requests_from_url(data)
        else:
            self.logger.error("Unexpected URL from '%s': %r", self.redis_key, data)

    def schedule_next_requests(self):
        """Schedules a request if available"""
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle."""

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle."""

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj

On startup, start_requests simply returns next_requests, which pops a batch of URLs from the Redis queue and turns each one into a Request through make_requests_from_url(url). These requests are handed to the engine and then to the scheduler, where they are checked against the duplicate filter; only requests that have not been seen before are queued for downloading. From this point on, the flow is the same as in an ordinary Scrapy crawler.
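The queue itself is seeded from outside the spider. A rough sketch with redis-py (the key name and URL are examples; the key must match the spider's redis_key):

import redis

r = redis.StrictRedis(host='localhost', port=6379)

# Default key is '<spider name>:start_urls'; 'demo' is the hypothetical spider above.
r.lpush('demo:start_urls', 'http://example.com/page1')

# If REDIS_START_URLS_AS_SET is enabled, next_requests pops with spop instead of lpop,
# so the key has to be seeded as a set instead:
# r.sadd('demo:start_urls', 'http://example.com/page1')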

When the scheduler's request queue runs empty, Scrapy fires the spider_idle signal, which invokes the spider_idle method bound in setup_redis. That method calls schedule_next_requests, which iterates over next_requests to pull another batch of start URLs from Redis and submit them to the engine, and the crawl continues until the Redis queue itself is exhausted.

    def schedule_next_requests(self):
        """Schedules a request if available"""
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider
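Note that spider_idle always raises DontCloseSpider, so an idle spider keeps waiting for new URLs instead of shutting down; the XXX comment hints that a sentinel-based shutdown is left to the user. A hedged sketch of one way to stop after the queue has stayed empty for several idle cycles (the class, names and threshold are hypothetical, not part of scrapy_redis):

from scrapy.exceptions import DontCloseSpider
from scrapy_redis.spiders import RedisSpider


class BoundedRedisSpider(RedisSpider):
    """Hypothetical variant: close once the Redis queue stays empty for a few idle cycles."""
    name = 'demo_bounded'
    max_idle_cycles = 5
    idle_cycles = 0

    def spider_idle(self):
        scheduled = 0
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)
            scheduled += 1
        if scheduled:
            self.idle_cycles = 0          # the queue produced work, keep going
        else:
            self.idle_cycles += 1
            if self.idle_cycles >= self.max_idle_cycles:
                return                    # do not raise, so Scrapy closes the spider
        raise DontCloseSpider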

RedisSpider and RedisCrawlSpider themselves are very simple: they inherit from RedisMixin together with Spider or CrawlSpider respectively, and in from_crawler they call setup_redis, which initializes the Redis connection and redis_key according to the crawler's settings and binds the spider's spider_idle handler to the spider_idle signal. Both the classes and setup_redis are quoted again below:

class RedisSpider(RedisMixin, Spider):
    """Spider that reads urls from redis queue when idle."""

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    """Spider that reads urls from redis queue when idle."""

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


    def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s)", self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
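setup_redis takes its configuration from the crawler settings, so the relevant options normally live in settings.py. A sketch of the settings it reads (values are illustrative; REDIS_URL is consumed by connection.from_settings, and the scheduler/dupefilter lines are the usual companions for a fully distributed crawl):

# settings.py (illustrative values)

# Redis connection used by connection.from_settings().
REDIS_URL = 'redis://localhost:6379'

# Settings consumed by setup_redis / next_requests.
REDIS_START_URLS_KEY = '%(name)s:start_urls'   # key template, formatted with the spider name
REDIS_START_URLS_BATCH_SIZE = 16               # URLs popped per batch when idle
REDIS_START_URLS_AS_SET = False                # False: list + lpop, True: set + spop

# Usual companion settings for a fully distributed crawl.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'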

Summary: RedisSpider pops start URLs from the Redis queue, turns them into initial Requests, and submits them through the engine to the scrapy_redis scheduler, which deduplicates and enqueues them. When the scheduler hands a Request to the downloader, the resulting Response is passed back to the spider for parsing, which may yield new Requests that keep the crawl going. Once no new Requests are produced, the spider goes idle and fetches another batch of start URLs from Redis, repeating the cycle; when the Redis queue is empty and nothing is left to schedule, the crawl is effectively finished, although spider_idle raises DontCloseSpider so the spider stays alive waiting for more URLs.
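To run the crawl distributed, the same spider is simply started on several machines that all point at the same Redis instance; URLs pushed to the start key are then shared among them. A hypothetical driver script (assuming a Scrapy project containing the DemoSpider sketched earlier):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Start the spider in this process; the same script can run on several nodes,
# all reading start URLs and scheduling requests through the shared Redis server.
process = CrawlerProcess(get_project_settings())
process.crawl('demo')   # spider name of the hypothetical DemoSpider
process.start()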


Origin: www.cnblogs.com/pypypy/p/12121827.html