Scrapy之Images Pipeline

items.py

import scrapy

class MyItem(scrapy.Item):
    """Scraped item that carries image URLs and the paths of the saved images."""

    # ... other item fields ...
    # URLs of the images to download; the images pipeline reads this field.
    img_urls = scrapy.Field()
    # Storage paths of the downloaded images; the images pipeline fills this in.
    img_paths = scrapy.Field()

pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class ZhihuImagesPipeline(ImagesPipeline):
    """Images pipeline that downloads ``img_urls`` and records the saved paths.

    The URLs are read from the item's ``img_urls`` field and the storage paths
    of the successfully downloaded images are written to ``img_paths``.
    """

    def get_media_requests(self, item, info):
        """Yield one download request for every URL in ``item['img_urls']``.

        A missing, ``None`` or empty ``img_urls`` field yields no requests
        instead of raising, so ``item_completed`` decides what happens to
        items that carry no images.
        """
        # ``.get(...) or ()`` covers both an unset field and an explicit None.
        for img_url in item.get('img_urls') or ():
            yield scrapy.Request(img_url)

    def item_completed(self, results, item, info):
        """Store the paths of the downloaded images on the item.

        ``results`` is a list of ``(ok, value)`` tuples, one per request; when
        ``ok`` is true, ``value`` is a dict whose ``'path'`` entry is the
        location of the saved image.

        Returns:
            The item, with ``img_paths`` set to the list of saved paths.

        Raises:
            DropItem: if no image was downloaded successfully.
        """
        # Failed downloads (ok is false) carry no 'path', so they are skipped.
        img_paths = [file_info['path'] for ok, file_info in results if ok]
        if not img_paths:
            raise DropItem("Item contains no images")
        item['img_paths'] = img_paths
        return item

注释

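results 是一个由二元组 (success, detail) 组成的列表:success 为 True 时,detail 是包含 checksum、path、url 的字典;为 False 时,detail 是一个 Failure 对象。典型值如下: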

 [(True,
  {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
   'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
   'url': 'http://www.example.com/files/product1.pdf'}),
 (False,
  Failure(...))]

settings.py

# Enable the images pipeline defined in pipelines.py. The dotted path must name
# the actual class (ZhihuImagesPipeline). Lower numbers mean higher priority.
ITEM_PIPELINES = {'myProject.pipelines.ZhihuImagesPipeline': 1}
# Root directory for the downloaded images (placeholder; replace with a real path).
IMAGES_STORE = 'D:\\path\\...'

猜你喜欢

转载自blog.csdn.net/masami269981/article/details/89453276