MongoPipeline, ImagePipeline, CsvPipeline, JsonPipeline, XmlWritePipeline

  1. Improved MongoPipeline
  2. MongoPipeline
  3. ImagePipeline
  4. CsvPipeline
  5. JsonPipeline
  6. XmlWritePipeline

Improved MongoPipeline

2017/11/5

import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

# Configure MONGO_URI, MONGO_DATABASE and MONGO_COLLECTION in settings.py.
# If they are not set, the pipeline falls back to localhost, the project
# name (BOT_NAME) and the spider name respectively.
class MongoPipeline(object):
    def __init__(self):
        self.mongo_uri = settings.get('MONGO_URI','localhost')
        self.mongo_db = settings.get('MONGO_DATABASE', settings['BOT_NAME'])
        self.mongo_collection = settings.get('MONGO_COLLECTION')
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        if not self.mongo_collection:
            self.mongo_collection = spider.name
        self.db[self.mongo_collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
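
For reference, a minimal settings.py fragment for this pipeline could look like the following; the URI, database and collection values are placeholders and should be adapted to your deployment:

# settings.py -- example placeholder values
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scrapy_data'    # falls back to BOT_NAME when omitted
MONGO_COLLECTION = 'articles'     # falls back to the spider name when omitted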

MongoPipeline

From the official Scrapy documentation; MONGO_URI and MONGO_DATABASE must be defined in settings.py.

import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
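
To confirm that items actually reached MongoDB, a quick standalone check with pymongo is handy; the sketch below assumes the defaults from the code above (database 'items', collection 'scrapy_items') and a local MongoDB instance:

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client['items']['scrapy_items']
# print a few of the stored items
for doc in collection.find().limit(5):
    print(doc)
client.close()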

ImagePipeline

A custom image downloader. The image URLs are stored as a list in the item, the item's name field is used as the folder name, and each image is named by its index + 1.
The storage path must be specified in settings.py:
IMAGES_STORE = 'F:/images'


from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request


class ImagePipeline(ImagesPipeline):
    # Custom image downloader
    def get_media_requests(self, item, info):
        '''Issue a download request for every URL in item['imgs'] (a list of
        image URLs collected by the spider); the item and the 1-based index
        are passed along in the request meta.'''
        for i, image_url in enumerate(item['imgs']):
            yield Request(image_url, meta={'item': item, 'index': i + 1})

    def file_path(self, request, response=None, info=None):
        # Build the relative path under IMAGES_STORE: full/<name>/<name><index>.<ext>
        item = request.meta['item']
        name = item['name']
        index = request.meta['index']
        image_guid = name + str(index) + '.' + request.url.split('.')[-1]
        imagepath = 'full/{}/{}'.format(name, image_guid)
        return imagepath
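
For reference, a minimal item definition and settings fragment matching the imgs and name fields used above might look like this; the module path myproject.pipelines is a placeholder for your own project:

# items.py
import scrapy

class ImageItem(scrapy.Item):
    name = scrapy.Field()   # used as the sub-folder name under IMAGES_STORE
    imgs = scrapy.Field()   # list of image URLs to download

# settings.py
IMAGES_STORE = 'F:/images'
ITEM_PIPELINES = {
    'myproject.pipelines.ImagePipeline': 1,   # placeholder module path
}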

CsvPipeline

import csv


class CsvPipeline(object):
    def __init__(self):
        # newline='' prevents blank lines between rows on Windows
        self.csvfp = open('pipeline.csv', 'w', encoding='utf8', newline='')
        fieldnames = ['tea_hd', 'name', 'title', 'img_url', 'content']
        self.writer = csv.DictWriter(self.csvfp, fieldnames=fieldnames)
        self.writer.writeheader()

    def process_item(self, item, spider):
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.csvfp.close()

JsonPipeline

import json


class JsonPipeline(object):
    def open_spider(self, spider):
        self.fp = open('itcast.json', 'w', encoding='utf8')
        self.fp.write('[')
        self.first_item = True

    def process_item(self, item, spider):
        # Write a comma before every item except the first, so the file is
        # valid JSON without having to seek back and patch it at close time.
        if not self.first_item:
            self.fp.write(',\n')
        self.first_item = False
        self.fp.write(json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        self.fp.write(']')
        self.fp.close()

XmlWritePipeline

from scrapy import signals
from scrapy.exporters import XmlItemExporter


class XmlWritePipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Register the spider_opened/spider_closed signal handlers
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('bbsData.xml', 'wb')
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # process the crawled data, define and call dataProcess function
        # dataProcess('bbsData.xml', 'text.txt')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
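
None of these pipelines run unless they are enabled in settings.py. A sketch enabling several of them together is shown below; 'myproject' is a placeholder project name, and the numbers (0-1000) decide the order in which items pass through the pipelines, lower first:

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,
    'myproject.pipelines.CsvPipeline': 400,
    'myproject.pipelines.JsonPipeline': 500,
    'myproject.pipelines.XmlWritePipeline': 600,
}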


Reposted from blog.csdn.net/newdas123/article/details/78507002