Introduction
Item pipelines are mainly used to process the scraped data. Typical uses include:
- cleaning up HTML data
- validating scraped data (checking that items contain certain fields)
- checking for duplicates (and dropping them)
- storing scraped items in a database
Pipeline methods and their parameters
class SomethingPipeline(object):
    def __init__(self):
        # Optional: do parameter initialisation and any other setup
        # your business logic needs here.
        pass

    def process_item(self, item, spider):
        # item (Item object) – the scraped item
        # spider (Spider object) – the spider that scraped this item
        # This method is mandatory and is called by every item pipeline
        # component. It must return an Item object; a dropped item is
        # not processed by any later pipeline component.
        return item

    def open_spider(self, spider):
        # spider (Spider object) – the spider that was opened
        # Optional: called when the spider is opened.
        pass

    def close_spider(self, spider):
        # spider (Spider object) – the spider that was closed
        # Optional: called when the spider is closed.
        pass
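Besides these three methods, a pipeline may also implement the from_crawler(cls, crawler) class method, which Scrapy calls to create the pipeline instance and which gives direct access to the project settings. A minimal sketch, assuming a hypothetical MONGO_URI setting and class name chosen only for illustration:

import pymongo

class MongoSettingsPipeline:
    def __init__(self, mongo_uri):
        self.mongo_uri = mongo_uri

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the pipeline; crawler.settings exposes
        # settings.py, so no get_project_settings() call is needed.
        return cls(mongo_uri=crawler.settings.get("MONGO_URI", "mongodb://localhost:27017"))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        return item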
Usage examples
Activating a pipeline (required)
Uncomment the ITEM_PIPELINES block (around line 69 of settings.py); if the pipeline is not activated, nothing will be written to the database.
# Uncomment these three lines; no other changes are needed.
ITEM_PIPELINES = {
    'your_project_name.pipelines.WwwCjnCnPipeline': 300,
}
Write data to a JSON Lines file
import json

from itemadapter import ItemAdapter

class JsonWriterPipeline:
    def open_spider(self, spider):
        # Open the output file once, when the spider starts.
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # Serialise each item as one JSON object per line (JSON Lines).
        line = json.dumps(ItemAdapter(item).asdict()) + "\n"
        self.file.write(line)
        return item
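To actually run this pipeline, register it in settings.py as well. A sketch, assuming the class lives in your project's pipelines.py (for plain JSON output, Scrapy's built-in Feed exports are usually the simpler choice):

ITEM_PIPELINES = {
    'your_project_name.pipelines.JsonWriterPipeline': 300,
}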
Write data to MongoDB
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

class WwwCjnCnPipeline(object):
    # Replace the whole class body in pipelines.py with this.
    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        username = settings["MONGODB_USER"]
        password = settings["MONGODB_PASSWORD"]
        # Create the MongoDB client connection
        client = pymongo.MongoClient(host=host, port=port, username=username, password=password)
        # Select the database
        mydb = client[dbname]
        # Collection ("table") used to store the scraped data
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        self.post.insert_one(data)
        return item
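The pipeline above reads its connection details from settings.py. The MONGODB_* keys are not built-in Scrapy settings, just the names this example assumes; the values below are placeholders:

# settings.py
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DBNAME = "scrapy_db"
MONGODB_SHEETNAME = "items"
MONGODB_USER = "root"
MONGODB_PASSWORD = "password"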
Take a screenshot of each item's page
import hashlib
from urllib.parse import quote

import scrapy
from itemadapter import ItemAdapter

class ScreenshotPipeline:
    """Pipeline that uses Splash to render a screenshot of
    every Scrapy item."""

    SPLASH_URL = "http://localhost:8050/render.png?url={}"

    async def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        encoded_item_url = quote(adapter["url"])
        screenshot_url = self.SPLASH_URL.format(encoded_item_url)
        request = scrapy.Request(screenshot_url)
        response = await spider.crawler.engine.download(request, spider)

        if response.status != 200:
            # Error happened, return item.
            return item

        # Save screenshot to file, filename will be hash of url.
        url = adapter["url"]
        url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
        filename = f"{url_hash}.png"
        with open(filename, "wb") as f:
            f.write(response.body)

        # Store filename in item.
        adapter["screenshot_filename"] = filename
        return item
Duplicate data filtering
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DuplicatesPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter['id'] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            self.ids_seen.add(adapter['id'])
            return item
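Several pipelines can be enabled at once; the integer values (0–1000) decide the order, and lower numbers run first. A sketch combining the examples from this article, assuming all classes live in your project's pipelines.py:

ITEM_PIPELINES = {
    'your_project_name.pipelines.DuplicatesPipeline': 100,   # drop duplicates first
    'your_project_name.pipelines.ScreenshotPipeline': 200,
    'your_project_name.pipelines.JsonWriterPipeline': 300,
    'your_project_name.pipelines.WwwCjnCnPipeline': 400,     # write to MongoDB last
}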