Scrapy crawler framework

Scrapy crawler framework

Using exporter classes to simplify how scraped items are stored

Using the JsonItemExporter class

pipelines.py

from scrapy.exporters import JsonItemExporter


class QsbkspiderPipeline(object):
    """Pipeline that writes scraped items to ``duanzi2.json`` as one JSON array.

    Uses ``JsonItemExporter``, which collects every exported item and emits
    them as a single JSON list, so ``start_exporting()`` and
    ``finish_exporting()`` must bracket the crawl.
    """

    def open_spider(self, spider):
        # Acquire the file in the Scrapy lifecycle hook (not in __init__)
        # so the resource lifetime matches the crawl and no file is created
        # when the pipeline object is merely instantiated.
        # Exporters write bytes, therefore binary mode: "w" --> "wb".
        self.fp = open("duanzi2.json", "wb")
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()
        print("爬虫开始了!")

    def process_item(self, item, spider):
        # Hand the item to the exporter, then return it so any later
        # pipelines in ITEM_PIPELINES still receive it.
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish_exporting() writes the closing bracket of the JSON array;
        # only then is it safe to close the underlying file.
        self.exporter.finish_exporting()
        self.fp.close()
        print("爬虫结束了!")

mark

mark

Using the JsonLinesItemExporter class

The difference: each item is written as its own JSON object on a separate line, so the output file is not one syntactically complete JSON document (no enclosing array), but items can be appended one at a time with no start/finish bookkeeping.

from scrapy.exporters import JsonLinesItemExporter


class QsbkspiderPipeline(object):
    """Pipeline that streams items into ``duanzi3.json``, one JSON object per line.

    ``JsonLinesItemExporter`` serializes each item independently, so no
    ``start_exporting()``/``finish_exporting()`` calls are required.
    """

    def __init__(self):
        # Exporters emit bytes, hence binary mode: "w" --> "wb".
        self.fp = open("duanzi3.json", "wb")
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf-8'
        )
        # NOTE: start_exporting() is intentionally omitted — the line-based
        # exporter has no array header to write.

    def open_spider(self, spider):
        print("爬虫开始了!")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish_exporting() is likewise unnecessary here; just release the file.
        self.fp.close()
        print("爬虫结束了!")

mark

Guess you like

Origin www.cnblogs.com/senup/p/12319290.html