Introduction
Item pipelines process the data scraped by Scrapy spiders: after a spider extracts an item, it is sent through each enabled pipeline component in turn.
Their main purposes are:
- Cleansing HTML data
- Validating scraped data (checking that items contain certain fields)
- Checking for duplicates (and dropping them)
- Storing scraped items in a database
Basic pipeline methods
Each item pipeline component is a Python class that implements some of the following methods:
process_item(self, item, spider): Called for every item produced by the spider; this is where the pipeline does its actual processing. It must either return an item object or raise DropItem to discard the item.
open_spider(self, spider): Called when the spider is opened.
close_spider(self, spider): Called when the spider is closed.
from_crawler(cls, crawler): A class method called to create the pipeline instance. It must return a new pipeline instance and is typically used to read values from the project settings via crawler.settings.
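Taken together, a minimal pipeline skeleton that wires up all four methods might look roughly like this (MyPipeline and the EXAMPLE_SETTING key are placeholder names used only for illustration):

class MyPipeline:  # placeholder name
    def __init__(self, example_setting):
        self.example_setting = example_setting

    @classmethod
    def from_crawler(cls, crawler):
        # Read a value from the project settings and pass it to the constructor.
        # EXAMPLE_SETTING is a hypothetical key, not a real Scrapy setting.
        return cls(example_setting=crawler.settings.get('EXAMPLE_SETTING'))

    def open_spider(self, spider):
        # Acquire resources (open files, database connections, ...) here.
        pass

    def close_spider(self, spider):
        # Release the resources acquired in open_spider here.
        pass

    def process_item(self, item, spider):
        # Process the item, then return it so later pipelines receive it.
        return item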
Simple pipeline examples
Adjusting the price attribute of scraped items: this pipeline adds VAT when the price excludes it, and drops items that have no price at all.
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class PricePipeline:
    vat_factor = 1.15

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter.get('price'):
            if adapter.get('price_excludes_vat'):
                adapter['price'] = adapter['price'] * self.vat_factor
            return item
        else:
            raise DropItem(f"Missing price in {item}")
Writing items to a JSON file
This pipeline stores all scraped items in a single items.jl file, one JSON-serialized item per line (JSON Lines format).
import json
from itemadapter import ItemAdapter

class JsonWriterPipeline:
    def open_spider(self, spider):
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(ItemAdapter(item).asdict()) + "\n"
        self.file.write(line)
        return item
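The purpose of JsonWriterPipeline here is just to show how item pipelines are written; if you really want to store all scraped items in a JSON file, you should use Scrapy's built-in Feed exports instead.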
Writing items to MongoDB
This example writes items to MongoDB using pymongo. The MongoDB address and database name are read from the project settings via from_crawler(), and connections are opened and closed alongside the spider.
import pymongo
from itemadapter import ItemAdapter

class MongoPipeline:
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        return item
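For this pipeline to find its configuration, the matching keys have to be present in settings.py. The values below are only illustrative; point MONGO_URI at your own MongoDB instance:

MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'items'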
Taking a screenshot of the page
This example renders a screenshot of each item's URL with Splash and stores the resulting filename in the item. It also demonstrates coroutine syntax in process_item().
import hashlib
from urllib.parse import quote

import scrapy
from itemadapter import ItemAdapter

class ScreenshotPipeline:
    """Pipeline that uses Splash to render a screenshot of every scraped item's URL."""

    SPLASH_URL = "http://localhost:8050/render.png?url={}"

    async def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        encoded_item_url = quote(adapter["url"])
        screenshot_url = self.SPLASH_URL.format(encoded_item_url)
        request = scrapy.Request(screenshot_url)
        response = await spider.crawler.engine.download(request, spider)

        if response.status != 200:
            # Error happened, return item.
            return item

        # Save screenshot to file, filename will be hash of url.
        url = adapter["url"]
        url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
        filename = f"{url_hash}.png"
        with open(filename, "wb") as f:
            f.write(response.body)

        # Store filename in item.
        adapter["screenshot_filename"] = filename
        return item
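Note that this pipeline assumes a Splash instance is running and reachable at localhost:8050; adjust SPLASH_URL if yours runs elsewhere. Because process_item() is declared with async def, the engine download can simply be awaited inside the pipeline.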
Filtering duplicate items
A filter that looks for duplicate items and drops those that have already been processed. It assumes that each item carries a unique id field.
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DuplicatesPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter['id'] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            self.ids_seen.add(adapter['id'])
            return item
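Because the seen ids are kept in an in-memory set, the filter starts empty on every crawl run and its memory use grows with the number of unique items.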
Activating an item pipeline
To activate an item pipeline component, add its class to the ITEM_PIPELINES setting in settings.py; otherwise scraped items will not be passed through it. The integer values determine the order in which pipelines run: items go through pipelines from lower to higher values, and the numbers are customarily kept in the 0-1000 range.
ITEM_PIPELINES = {
    'myproject.pipelines.PricePipeline': 300,
    'myproject.pipelines.JsonWriterPipeline': 800,
}
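As a sketch of an alternative, a pipeline can also be enabled for a single spider by overriding the setting in the spider's custom_settings attribute; the spider name, URL, and item values below are placeholders for illustration:

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'  # placeholder spider name
    start_urls = ['https://quotes.toscrape.com/']  # example site

    # Overrides the project-wide ITEM_PIPELINES for this spider only.
    custom_settings = {
        'ITEM_PIPELINES': {
            'myproject.pipelines.PricePipeline': 300,
        },
    }

    def parse(self, response):
        # A minimal parse method; PricePipeline will process the yielded item.
        yield {'price': 10.0, 'price_excludes_vat': True}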