[Scrapy Framework] "Version 2.4.0 Source Code" Pipeline (Pipeline) Detailed Chapter

Index of all articles in this source code analysis series:

[Scrapy Framework] Version 2.4.0 Source Code: All Configuration Directory Index

Introduction

Item pipelines are mainly used to process scraped data. Typical uses include:

  1. Clean up HTML data
  2. Validate scraped data (check that the item contains certain fields)
  3. Check for duplicates (and delete them)
  4. Store scraped items in the database

Pipeline methods explained

class SomethingPipeline(object):
    def __init__(self):
        # Optional: do parameter initialization and similar setup here.
        # Put your own business logic here.
        pass

    def process_item(self, item, spider):
        # item (Item object) – the scraped item
        # spider (Spider object) – the spider that scraped the item
        # This method must be implemented; it is called by every item
        # pipeline component. It must return an Item object; dropped
        # items are not processed by any later pipeline components.
        return item

    def open_spider(self, spider):
        # spider (Spider object) – the spider that was opened
        # Optional: called when the spider is opened.
        pass

    def close_spider(self, spider):
        # spider (Spider object) – the spider that was closed
        # Optional: called when the spider is closed.
        pass
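As a concrete illustration of point 2 in the introduction (validating scraped data), here is a minimal sketch of a pipeline that drops items missing a required field. The class name and the `price` field are hypothetical examples, not part of the original post; raising DropItem is how a pipeline discards an item so that later components never see it.

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class RequiredFieldsPipeline:
    REQUIRED_FIELDS = ("price",)  # hypothetical required field names

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # Drop the item if any required field is missing or empty;
        # dropped items are not passed to later pipeline components.
        for field in self.REQUIRED_FIELDS:
            if not adapter.get(field):
                raise DropItem(f"Missing {field} in {item!r}")
        return item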

Usage examples

Enabling the pipeline (required)

Uncomment the ITEM_PIPELINES setting (around line 69) in settings.py, otherwise items never reach the pipeline and nothing is written to the database.

# Uncomment these 3 lines; no other changes are needed.
ITEM_PIPELINES = {
    'your_project_name.pipelines.WwwCjnCnPipeline': 300,
}
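If several pipelines are enabled, the integer assigned to each one determines the order in which items pass through them: lower values run first, and values are conventionally chosen in the 0–1000 range. A sketch combining the pipelines covered in this article ('your_project_name' is a placeholder and the priorities are example choices):

ITEM_PIPELINES = {
    'your_project_name.pipelines.DuplicatesPipeline': 100,   # drop duplicates first
    'your_project_name.pipelines.JsonWriterPipeline': 300,   # then write to items.jl
    'your_project_name.pipelines.WwwCjnCnPipeline': 400,     # finally write to MongoDB
}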

Write data to JSON file

import json

from itemadapter import ItemAdapter

class JsonWriterPipeline:

    def open_spider(self, spider):
        # Open the output file once, when the spider starts.
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        # Close the file when the spider finishes.
        self.file.close()

    def process_item(self, item, spider):
        # Serialize each item as one JSON object per line (JSON Lines format).
        line = json.dumps(ItemAdapter(item).asdict()) + "\n"
        self.file.write(line)
        return item
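The pipeline above produces a JSON Lines file: one JSON object per line. A minimal sketch of reading it back, assuming the items.jl file written above:

import json

# Read the JSON Lines file produced by JsonWriterPipeline,
# yielding one item dictionary per line.
with open('items.jl') as f:
    items = [json.loads(line) for line in f]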

Write data to MongoDB

import pymongo
from scrapy.utils.project import get_project_settings
settings = get_project_settings()

class WwwCjnCnPipeline(object):
    # Replace the entire generated pipeline class with this one.
    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        username = settings["MONGODB_USER"]
        password = settings["MONGODB_PASSWORD"]
        # Create the MongoDB connection
        client = pymongo.MongoClient(host=host, port=port, username=username, password=password)
        # Select the database
        mydb = client[dbname]
        # Collection that will hold the data
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        # insert_one replaces the deprecated Collection.insert
        self.post.insert_one(data)
        return item
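The class above reads its connection details from the project settings. These MONGODB_* keys are not defined by Scrapy itself, so they have to be added to settings.py; the values below are placeholder examples, assuming a local MongoDB instance:

# Custom MongoDB settings read by WwwCjnCnPipeline (example values)
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'scrapy_db'
MONGODB_SHEETNAME = 'items'
MONGODB_USER = 'root'
MONGODB_PASSWORD = 'password'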

Take a screenshot of the item

import hashlib
from urllib.parse import quote

import scrapy
from itemadapter import ItemAdapter

class ScreenshotPipeline:
    """Pipeline that uses Splash to render screenshot of
    every Scrapy item."""

    SPLASH_URL = "http://localhost:8050/render.png?url={}"

    async def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        encoded_item_url = quote(adapter["url"])
        screenshot_url = self.SPLASH_URL.format(encoded_item_url)
        request = scrapy.Request(screenshot_url)
        response = await spider.crawler.engine.download(request, spider)

        if response.status != 200:
            # Error happened, return item.
            return item

        # Save screenshot to file, filename will be hash of url.
        url = adapter["url"]
        url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
        filename = f"{url_hash}.png"
        with open(filename, "wb") as f:
            f.write(response.body)

        # Store filename in item.
        adapter["screenshot_filename"] = filename
        return item
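This example assumes a Splash rendering service is reachable at the SPLASH_URL shown above (localhost:8050). It also expects each item to carry a url field and stores the result in a screenshot_filename field, so the item definition has to declare both. A minimal sketch of such an item (the class name is a hypothetical example):

import scrapy

class PageItem(scrapy.Item):
    # Field the pipeline reads to know which page to render
    url = scrapy.Field()
    # Field the pipeline fills in with the saved PNG's filename
    screenshot_filename = scrapy.Field()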

Duplicate data filtering

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DuplicatesPipeline:

    def __init__(self):
        # Set of ids already seen during this crawl
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter['id'] in self.ids_seen:
            # An item with this id was already processed: drop it
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            self.ids_seen.add(adapter['id'])
            return item
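This pipeline assumes every item carries an id field used as the deduplication key. A quick standalone sketch of its behaviour, assuming the DuplicatesPipeline class above is in scope and feeding it plain dicts (ItemAdapter accepts dicts as well as Item objects):

from scrapy.exceptions import DropItem

pipeline = DuplicatesPipeline()

pipeline.process_item({'id': 1, 'name': 'first'}, spider=None)   # kept
try:
    pipeline.process_item({'id': 1, 'name': 'again'}, spider=None)
except DropItem as e:
    print(e)  # Duplicate item found: {'id': 1, 'name': 'again'}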

Origin blog.csdn.net/qq_20288327/article/details/113778510