Scrapy --- saving data as JSON --- MongoDB --- MySQL --- Excel

JSON

Method 1: via a command entered at the command line (cmd)

scrapy crawl novel -o novel.json -s FEED_EXPORT_ENCODING=utf-8

Here novel is the spider name (its name attribute).

To have Scrapy save the data as a CSV file instead: scrapy crawl novel -o novel.csv -s FEED_EXPORT_ENCODING=utf-8
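If you would rather not pass the encoding on every run, the same export can be configured in settings.py. A minimal sketch (the FEEDS setting needs Scrapy 2.1+):

# settings.py -- a minimal sketch (FEEDS requires Scrapy 2.1+)
FEED_EXPORT_ENCODING = 'utf-8'    # applies to every exported feed

FEEDS = {
    'novel.json': {'format': 'json'},
    'novel.csv': {'format': 'csv'},
}

With this in place, a plain scrapy crawl novel writes both files without the -o and -s flags.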

Method 2: via pipelines

1. Define your own pipeline

pipelines.py


import json


class JsonWriterPipeline(object):

    def __init__(self):
        self.file = open('jobbole.json', 'wb')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider):
        # close the file when the spider finishes
        self.file.close()

2. Enable your pipeline in settings

settings.py

ITEM_PIPELINES = {
    # 'Jobbole.pipelines.JobbolePipeline': 300,
    # save data as JSON
    'Jobbole.pipelines.JsonWriterPipeline': 1,
}
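All pipelines in this post read the same seven item fields (artitle_item, release_time, fenlei, dianzan, num, comment, content). For reference, a minimal items.py sketch with those fields; the class name JobboleItem is an assumption, adjust it to whatever your spider yields:

# items.py -- a minimal sketch; the class name JobboleItem is assumed
import scrapy

class JobboleItem(scrapy.Item):
    artitle_item = scrapy.Field()   # article title
    release_time = scrapy.Field()   # publish time
    fenlei = scrapy.Field()         # category
    dianzan = scrapy.Field()        # like count
    num = scrapy.Field()            # extra numeric field used by the spider
    comment = scrapy.Field()        # comment count
    content = scrapy.Field()        # article body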

MongoDB

1. Define your own pipeline in pipelines.py


import pymongo

# save items into MongoDB
class MongoPipeline(object):
    def __init__(self, client, db):
        self.client = pymongo.MongoClient(client)
        self.db = self.client[db]

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection values defined in settings.py
        obj = cls(
            client=crawler.settings.get('MONGOCLIENT', 'localhost'),
            db=crawler.settings.get('DB', 'jobbole')
        )
        return obj

    def process_item(self, item, spider):
        # upsert=True inserts the document if no record with this title exists yet
        self.db['jobbole'].update_one({'artitle_item': item['artitle_item']}, {'$set': dict(item)}, upsert=True)
        return item

    def close_spider(self, spider):
        # release the connection when the spider finishes
        self.client.close()

 

2. Enable your pipeline in settings.py

ITEM_PIPELINES = {
    # save to MongoDB
    'Jobbole.pipelines.MongoPipeline': 1,
}

MONGOCLIENT = 'localhost'
DB = 'jobbole'
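To confirm the upsert worked, the collection can be queried directly with pymongo (a quick check, reusing the connection values from the settings above):

# quick check -- print one document from the jobbole collection
import pymongo

client = pymongo.MongoClient('localhost')
print(client['jobbole']['jobbole'].find_one())
client.close()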


MySQL

1. Define your own pipeline in pipelines.py


import pymysql

class DBPipeline(object):
    def __init__(self, host, port, db, user, passwd, charset):
        self.db = pymysql.connect(host=host, port=port, db=db, user=user, passwd=passwd, charset=charset)
        self.cursor = self.db.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # connect to the database using the values defined in settings.py
        obj = cls(
            host=crawler.settings.get('MYSQL_HOST', 'localhost'),
            port=3306,
            db=crawler.settings.get('MYSQL_DBNAME', 'jobbole'),
            user=crawler.settings.get('MYSQL_USER', 'root'),
            passwd=crawler.settings.get('MYSQL_PASSWD', '123456'),
            charset='utf8')
        return obj

    def process_item(self, item, spider):
        # insert one row per item
        try:
            self.cursor.execute(
                """insert into jobbole (artitle_item, release_time, fenlei, dianzan, num, comment, content)
                values (%s, %s, %s, %s, %s, %s, %s)""",
                (item['artitle_item'], item['release_time'], item['fenlei'], item['dianzan'],
                 item['num'], item['comment'], item['content']))

            # commit the SQL statement
            self.db.commit()
            return item

        except Exception as e:
            print(e)
            self.db.rollback()
            return item

    def close_spider(self, spider):
        # close the cursor and the connection when the spider finishes
        self.cursor.close()
        self.db.close()


2. Enable your pipeline in settings

ITEM_PIPELINES = {
    # save to MySQL
    'Jobbole.pipelines.DBPipeline': 1,
}


MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'jobbole'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
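The INSERT in process_item expects a jobbole table with seven columns. A one-off setup script along these lines should work; the column types and lengths are assumptions, adjust them to your data:

# create_table.py -- a sketch; column types/lengths are assumptions
import pymysql

db = pymysql.connect(host='localhost', user='root', passwd='123456', db='jobbole', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    create table if not exists jobbole (
        artitle_item varchar(255),
        release_time varchar(64),
        fenlei       varchar(64),
        dianzan      varchar(32),
        num          varchar(32),
        comment      varchar(32),
        content      text
    ) default charset=utf8
""")
cursor.close()
db.close()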



Excel

1. Define your own pipeline in pipelines.py


from openpyxl import Workbook

# save items into an Excel workbook
class ExcelPipeline(object):
    wb = Workbook()    # create the workbook
    ws = wb.active     # activate the default worksheet
    # header row: article title, publish time, category, likes, comments, content
    # (note: the data rows below have seven fields, so you may want a seventh label)
    ws.append(['文章标题', '发布时间', '分类', '点赞次数', '评论次数', '内容'])

    def process_item(self, item, spider):
        # collect the item fields into one row
        line = [item['artitle_item'], item['release_time'], item['fenlei'], item['dianzan'],
                item['num'], item['comment'], item['content']]
        self.ws.append(line)            # append the row to the worksheet
        self.wb.save('jobbole.xlsx')    # save the xlsx file after every item
        return item

2. Enable your pipeline in settings.py

ITEM_PIPELINES = {
    # save to Excel
    'Jobbole.pipelines.ExcelPipeline': 1,
}
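To verify the export, the workbook can be read back with openpyxl (values_only needs openpyxl 2.6+):

# quick check -- print every row of the saved workbook
from openpyxl import load_workbook

wb = load_workbook('jobbole.xlsx')
for row in wb.active.iter_rows(values_only=True):
    print(row)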






Reprinted from blog.csdn.net/weixin_42312791/article/details/80948797