Scrapy Practice Guide 1 - Common Configurations

This article is based on Scrapy 2.6.

Foreword

After half a month of tuning, I have sorted out the configurations and practices I use most often in Scrapy (all of them lessons paid for in blood and tears TAT)

Configuration instructions

settings.py common configuration

It is recommended to keep global configuration here: database connections, third-party secrets, mail/webhook settings, and so on. Configuration that belongs to a single Spider should not go in this file; put it in that Spider's custom_settings instead (see below).
# Whether to obey robots.txt rules (recommended: disable)
ROBOTSTXT_OBEY = False
# Note: the effective maximum concurrency against a site is
# min(CONCURRENT_REQUESTS, CONCURRENT_REQUESTS_PER_DOMAIN, CONCURRENT_REQUESTS_PER_IP),
# so set all three consistently or you may not reach the concurrency you expect
# (when CONCURRENT_REQUESTS_PER_IP is non-zero, it takes precedence over the per-domain limit)
# Maximum number of concurrent requests, default: 16
CONCURRENT_REQUESTS = 16
# Maximum number of concurrent requests per domain
CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Maximum number of concurrent requests per IP
CONCURRENT_REQUESTS_PER_IP = 16
# Download delay, to avoid putting too much load on the site's server; the actual delay
# is randomized over 0.5 * DOWNLOAD_DELAY ~ 1.5 * DOWNLOAD_DELAY
DOWNLOAD_DELAY = 0


# Disable request deduplication; whether to do so depends on the scenario. For a
# long-running spider whose URL diversity keeps growing, configure this to stop the
# requests.seen dedup file from growing without bound and eating disk (learned the hard
# way: before setting this, 20 spiders running for 3 days produced tens of GB)
# DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'

# URL length limit; strongly recommended to raise it (another hard-learned lesson),
# unless you can guarantee no URL ever exceeds 2083 characters, in which case ignore it
URLLENGTH_LIMIT = 2083

# JOBDIR persists the pending request queue so a spider can be paused; suitable for
# jobs that must be paused or interrupted and later resumed from where the last run stopped
# JOBDIR = ''
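# Pause/resume sketch (as documented by Scrapy): start the crawl with a job directory,
# stop it with Ctrl-C, then rerun the same command to resume:
#   scrapy crawl somespider -s JOBDIR=crawls/somespider-1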


# Whether retries are enabled, default: True
RETRY_ENABLED = True
# Whether cookies are enabled, default: True
COOKIES_ENABLED = True
# Whether redirects are followed, default: True
REDIRECT_ENABLED = True
# Fixed User-Agent; rarely needed. Use a random-UA Scrapy extension or write your own
# random-UA middleware instead (if never set, Scrapy's built-in default is
# "Scrapy/VERSION (+https://scrapy.org)")
USER_AGENT = None


# Global log level; one of: CRITICAL, ERROR, WARNING, INFO, DEBUG
LOG_LEVEL = 'DEBUG'
# Log format; in practice a JSON-structured format makes log collection easier
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
# Log file path
LOG_FILE = None
# LogStats reporting interval, default 60s; adjust as needed or disable with None
LOGSTATS_INTERVAL = 60.0
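# A sketch of a JSON-style format for log collection; note that logging itself does not
# escape quotes inside the message, so treat this as a starting point only:
# LOG_FORMAT = '{"time": "%(asctime)s", "logger": "%(name)s", "level": "%(levelname)s", "message": "%(message)s"}'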


# Global extension configuration
EXTENSIONS = {
}
# Global item pipeline configuration
ITEM_PIPELINES = {
}
# Global default request headers; not recommended here, better set in the spider or in
# a middleware / processing layer
DEFAULT_REQUEST_HEADERS = {
}
# Global spider middleware configuration
SPIDER_MIDDLEWARES = {
}

# Other global application/business configuration can also be defined here and read via settings.get()
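
Any custom key defined here can be read wherever a settings object is available. A minimal sketch, using a hypothetical MYSQL_URI key:

# settings.py
MYSQL_URI = 'mysql://user:pass@localhost:3306/crawler'

# inside a spider (or any component holding the settings object)
uri = self.settings.get('MYSQL_URI')
# typed accessors are also available: getint(), getfloat(), getbool(), getlist()
timeout = self.settings.getint('DOWNLOAD_TIMEOUT', 180)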

spiders/

All spiders inherit from BaseSpider, which overrides the constructor so that each spider is initialized from the settings.
It is recommended to put configuration that belongs to a single spider into that spider's custom_settings: its maximum concurrency, maximum per-IP concurrency, whether cookies are enabled, any extra options, and so on.
import scrapy

class BaseSpider(scrapy.Spider):

    def __init__(self, settings, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Subclasses initialize themselves from settings here

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Pass the (already merged) settings into the spider constructor
        spider = cls(crawler.settings, *args, **kwargs)
        spider._set_crawler(crawler)
        return spider

class DemoSpider(BaseSpider):
    name = 'DemoSpider'
    # HTTP status codes this spider is allowed to handle; by default only
    # responses with a 2xx status reach parse()
    handle_httpstatus_list = []
    # Same keys as settings.py, but with higher priority than settings.py;
    # read them via self.settings.get()
    custom_settings = {
        # Disable retries for this spider only
        'RETRY_ENABLED': False,
    }

    def __init__(self, settings, *args, **kwargs):
        super().__init__(settings, *args, **kwargs)
        # Initialize the spider's logic from the settings here
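
As a concrete illustration, a per-spider option can live in custom_settings and be read during initialization, since custom_settings is merged into the crawler settings before the spider is constructed. A minimal sketch; NewsSpider and the NEWS_CATEGORY key are hypothetical, for illustration only:

class NewsSpider(BaseSpider):
    name = 'NewsSpider'
    custom_settings = {
        # Hypothetical custom key, shown only to demonstrate the pattern
        'NEWS_CATEGORY': 'tech',
    }

    def __init__(self, settings, *args, **kwargs):
        super().__init__(settings, *args, **kwargs)
        # custom_settings has already been merged into `settings` at this point
        self.category = settings.get('NEWS_CATEGORY')
        self.start_urls = ['https://example.com/news/%s' % self.category]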

middlewares.py

All spider middlewares inherit from BaseSpiderMiddleware and all downloader middlewares inherit from BaseDownloaderMiddleware; both override the constructor so that middlewares are initialized from the settings.
from scrapy import signals
from utils import get_single_name

class BaseSpiderMiddleware:
    """
    基础爬虫中间件, 含settings参数的构造方法
    """
    def __init__(self, settings=None):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        spider.logger.warning('SpiderMiddleware %s, Spider %s, process exception: %s' % (get_single_name(self), spider.name, exception))

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('SpiderMiddleware %s, Spider opened: %s' % (get_single_name(self), spider.name))


class BaseDownloaderMiddleware:
    """
    基础下载中间件, 含settings参数的构造方法
    """
    def __init__(self, settings=None):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        spider.logger.warning('DownloadMiddleware %s, Spider %s, process exception: %s' % (get_single_name(self), spider.name, exception))

    def spider_opened(self, spider):
        spider.logger.info('DownloadMiddleware: %s, Spider opened: %s' % (get_single_name(self), spider.name))
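
To show the pattern in use, here is a minimal sketch of a concrete downloader middleware that attaches a proxy taken from the settings; PROXY_URL is a hypothetical custom key, not a built-in Scrapy setting:

class ProxyDownloaderMiddleware(BaseDownloaderMiddleware):

    def __init__(self, settings=None):
        super().__init__(settings)
        # PROXY_URL would be defined in settings.py (hypothetical key)
        self.proxy_url = settings.get('PROXY_URL') if settings else None

    def process_request(self, request, spider):
        # Setting request.meta['proxy'] is the standard way to route a
        # single request through a proxy in Scrapy
        if self.proxy_url:
            request.meta['proxy'] = self.proxy_url
        return None

It is enabled like any other downloader middleware (the module path is illustrative):

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyDownloaderMiddleware': 543,
}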

pipelines.py

All pipelines inherit from BasePipeline and override the constructor so that each pipeline is initialized from the settings.
Loading pipelines from configuration this way makes them easier to manage and maintain.
from scrapy import signals
from utils import get_single_name

class BasePipeline:
    """
    基础管道, 含settings参数的构造方法
    """
    def __init__(self, settings=None):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_item(self, item, spider):
        return item

    def spider_opened(self, spider):
        spider.logger.info('Pipeline: %s, Spider opened: %s' % (get_single_name(self), spider.name))
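
As an example of building on this base, here is a minimal sketch of a pipeline that appends items to a JSON-lines file whose path comes from the settings; ITEMS_OUTPUT_PATH is a hypothetical custom key, and dict-like items are assumed:

import json

class JsonLinesPipeline(BasePipeline):

    def open_spider(self, spider):
        # ITEMS_OUTPUT_PATH would be defined in settings.py (hypothetical key)
        path = self.settings.get('ITEMS_OUTPUT_PATH', 'items.jl')
        self.file = open(path, 'a', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # Assumes the item is dict-like (a dict or a scrapy.Item)
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

Register it in ITEM_PIPELINES with a priority, e.g. 'myproject.pipelines.JsonLinesPipeline': 300 (module path illustrative).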

utils/

Holds miscellaneous helper utilities used by the components above.
__init__.py
def get_single_name(tp):
    # Return a unique, readable name for a component; implement as you see fit.
    # A minimal version returns the class name of the given instance.
    return type(tp).__name__
