# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
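# For reference, enabling these pipelines in settings.py could look like the
# snippet below (assuming this file lives at monday_scrapy/pipelines.py; the
# priority numbers are arbitrary examples):
#
#     ITEM_PIPELINES = {
#         'monday_scrapy.pipelines.StoreMysqlScrapyPipeline': 300,
#         'monday_scrapy.pipelines.MyImagesPipeline': 200,
#     }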
import os

import requests
import scrapy
from scrapy.pipelines.images import ImagesPipeline

from monday_scrapy.mysqlhelper import MysqlHelper

class MondayScrapyPipeline(object):
    def process_item(self, item, spider):
        return item

class StoreMysqlScrapyPipeline(object):
    def process_item(self, item, spider):
        # Store the item in MySQL
        (insert_sql, data) = item.get_insert_sql()
        # Create a MysqlHelper instance and run the INSERT
        myhelper = MysqlHelper()
        myhelper.execute_modify_sql(insert_sql, data)
        return item
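
# This pipeline relies on two pieces not defined in this file: the item exposes
# a get_insert_sql() method returning (sql, params), and MysqlHelper exposes
# execute_modify_sql(sql, params). A rough, illustrative sketch of that contract
# (table and field names here are assumptions, not the real monday_scrapy code):
#
#     def get_insert_sql(self):
#         sql = 'INSERT INTO car (name, image_file_name) VALUES (%s, %s)'
#         return sql, (self['name'], self['image_file_name'])
#
#     # MysqlHelper.execute_modify_sql(sql, params) is expected to run
#     # cursor.execute(sql, params) and commit the transaction.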

class StoreImagePipeline(object):
    def process_item(self, item, spider):
        # Download the first image synchronously with requests and save it
        # under a local download/ directory
        image_url = item['my_image_urls'][0]
        print(image_url)
        response = requests.get(image_url)
        if not os.path.exists('download'):
            os.mkdir('download')
        filename = 'download/' + image_url.split('/')[-1]
        item['image_file_name'] = filename
        with open(filename, 'wb') as f:
            f.write(response.content)
        return item

# Custom headers (e.g. a User-Agent) applied to the image requests issued by
# MyImagesPipeline below
headers = {
}

class MyImagesPipeline(ImagesPipeline):
    # As the method name suggests, this yields the requests for the images
    # (scrapy.Request); the framework puts these requests into the scheduler
    def get_media_requests(self, item, info):
        for image_url in item['my_image_urls']:
            # req = scrapy.Request(image_url)
            # req.headers['User-Agent'] = "..."
            yield scrapy.Request(image_url, meta={'file_path': 'car'}, headers=headers)
        # for pretty_girl in item['pretty_girls']:
        #     yield scrapy.Request(pretty_girl, meta={'file_path': 'girl'})
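    # The meta={'file_path': ...} passed above only takes effect if file_path()
    # is overridden to read it; a minimal sketch (assuming a recent Scrapy where
    # file_path() receives the request and an optional item) might look like:
    #
    #     def file_path(self, request, response=None, info=None, *, item=None):
    #         sub_dir = request.meta.get('file_path', 'full')
    #         return os.path.join(sub_dir, os.path.basename(request.url))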
    # As the name suggests, this is called when downloading for the item has
    # finished; `results` is a list of (success, info) tuples, where
    # info['path'] is the path the image was stored under
    def item_completed(self, results, item, info):
        # print(results)
        if isinstance(item, dict) or self.images_result_field in item.fields:
            item[self.images_result_field] = [x for ok, x in results if ok]
            # Guard against an empty/failed result set before indexing
            if item[self.images_result_field]:
                item['image_file_name'] = item[self.images_result_field][0]['path']
        # This callback is invoked when the item is done and must return the item
        return item

# In the end we need to return a list, or yield Requests
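
# The stock ImagesPipeline also needs IMAGES_STORE set in settings.py (and the
# Pillow package installed) to know where downloaded files are written, e.g.:
#
#     IMAGES_STORE = 'images'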