# scrapy-pipelines的写法 (how to write Scrapy pipelines)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from monday_scrapy.mysqlhelper import MysqlHelper
import os
import requests
import scrapy

from scrapy.pipelines.images import ImagesPipeline

class MondayScrapyPipeline(object):
    """Default no-op pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item


class StoreMysqlScrapyPipeline(object):
    """Pipeline that stores each item in MySQL through ``MysqlHelper``."""

    def process_item(self, item, spider):
        """Execute the item's own insert statement, then pass the item on.

        The item provides the SQL and its data via ``get_insert_sql()``;
        a new ``MysqlHelper`` is created for every item processed.
        """
        statement, params = item.get_insert_sql()
        MysqlHelper().execute_modify_sql(statement, params)
        return item


class StoreImagePipeline(object):
    """Pipeline that downloads an item's first image into ``download/``."""

    def process_item(self, item, spider):
        """Fetch the first URL in ``item['my_image_urls']`` and save it locally.

        The file is written to ``download/<last segment of the URL>`` and
        that path is recorded in ``item['image_file_name']``.  Items that
        carry no image URLs are passed through untouched.

        Raises:
            requests.RequestException: if the download fails, times out, or
                the server answers with an HTTP error status.
        """
        image_urls = item.get('my_image_urls')
        if not image_urls:
            # Nothing to download; let the item continue down the pipeline.
            return item

        image_url = image_urls[0]
        print(image_url)
        # A timeout keeps one stalled server from hanging the crawl, and
        # raise_for_status() stops an error page being saved as an image.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('download', exist_ok=True)

        filename = 'download/' + image_url.split('/')[-1]
        item['image_file_name'] = filename
        with open(filename, 'wb') as f:
            f.write(response.content)

        return item

# Extra HTTP headers for outgoing image requests; currently none are set.
headers = {}

class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that downloads ``my_image_urls`` and records the stored path."""

    def get_media_requests(self, item, info):
        """Yield one download request for each URL in ``item['my_image_urls']``.

        The framework places the yielded ``scrapy.Request`` objects on the
        scheduler.  Every request carries ``meta={'file_path': 'car'}`` and
        the module-level ``headers``.
        """
        for image_url in item['my_image_urls']:
            yield scrapy.Request(image_url, meta={'file_path': 'car'}, headers=headers)

    def item_completed(self, results, item, info):
        """Record download results on the item once all its requests finish.

        Args:
            results: list of ``(ok, value)`` pairs, one per request; on
                success ``value`` is a dict that includes the stored
                ``'path'``, on failure it is the failure object.
            item: the item whose images were requested.
            info: pipeline bookkeeping object (unused here).

        Returns:
            The item, with ``image_file_name`` set to the path of the first
            successfully downloaded image.

        Raises:
            DropItem: if none of the item's images were downloaded.
        """
        from scrapy.exceptions import DropItem

        downloaded = [value for ok, value in results if ok]
        if isinstance(item, dict) or self.images_result_field in item.fields:
            item[self.images_result_field] = downloaded

        if not downloaded:
            # Indexing results[0] blindly would raise IndexError on an empty
            # list, or TypeError when the first download failed.
            raise DropItem('Item contains no images')

        # Use the first *successful* download, not simply the first result.
        item['image_file_name'] = downloaded[0]['path']
        return item

# 猜你喜欢 (you might also like)

# 转载自 (reposted from) blog.csdn.net/weixin_42958164/article/details/82154085