The standard way to write MongoDB and MySQL pipelines in the Scrapy crawler framework

1. The standard way to write a MongoDB pipeline class

# MongoDB interaction
import pymongo

# Pipeline class
class MongoDBPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        # __init__ initializes the instance; the constructor __new__ is what
        # actually allocates the object in memory
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri, 27017)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Insert the item into the 'goods' collection
        self.db['goods'].insert_one(dict(item))
        # A project may register several pipeline classes; if pipelines after
        # this one also need to store the data, this method must return item
        return item

    def close_spider(self, spider):
        self.client.close()

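To check that items actually landed in MongoDB, here is a quick standalone script (a sketch assuming the settings shown later: MONGO_URI = 'localhost' and a database named your_database_name):

import pymongo

# Connect with the same host and port the pipeline uses
client = pymongo.MongoClient('localhost', 27017)
db = client['your_database_name']
# The pipeline writes into the 'goods' collection
print(db['goods'].count_documents({}))
print(db['goods'].find_one())
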
2. The standard way to write a MySQL pipeline class

import pymysql

# 1. Connect to the database

# 2. Get a cursor

# 3. Prepare the SQL statement

# 4. Execute the SQL statement

# 5. Commit
class MysqlPipeline(object):
    def __init__(self, host, port, user, password, db):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('HOST'),
            port=crawler.settings.get('PORT'),
            user=crawler.settings.get('USER'),
            password=crawler.settings.get('PASSWORD'),
            db=crawler.settings.get('DB'),
        )

    def open_spider(self, spider):
        self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                    password=self.password, db=self.db, charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        data = dict(item)
        # Pass the values as query parameters instead of formatting them into
        # the string: string formatting breaks on quotes in the data and opens
        # the door to SQL injection
        sql = "insert into zhegoods values (%s, %s, %s)"
        try:
            self.cursor.execute(sql, (data['name'], data['price'], data['imgname']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

settings.py configuration

# Add this extra code
MONGO_URI = 'localhost'
MONGO_DB = 'your_database_name'

# MySQL configuration
HOST = 'localhost'
PORT = 3306
USER = 'root'
PASSWORD = ''
DB = 'your_database_name'
ps: with MongoDB you can simply name the database and collection in code; both are created automatically as data is stored. MySQL, on the other hand, requires the database and table to exist beforehand, as sketched below.
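
A minimal DDL sketch for that table, assuming the three columns match the fields used in process_item (the column types and lengths are assumptions; adjust them to your data):

create database if not exists your_database_name charset utf8;
use your_database_name;
create table if not exists zhegoods (
    name varchar(255),      -- assumed type for the item's name field
    price varchar(64),      -- stored as text to match the original insert; use DECIMAL for numeric prices
    imgname varchar(255)    -- assumed type for the image file name
);

Note that "insert into zhegoods values (%s, %s, %s)" only works if the table has exactly these three columns, in this order; otherwise list the column names explicitly in the insert statement.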

ITEM_PIPELINES = {
   # 300 is the priority; the lower the number, the earlier the engine runs the pipeline
   # 'z8b.pipelines.Z8BPipeline': 300,          # format: project.pipelines.ClassName: priority
   'z8b.pipelines.MongoDBPipeline': 300,
   'z8b.pipelines.MysqlPipeline': 301,
}
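
Both pipeline classes assume the item exposes name, price, and imgname fields. A minimal items.py sketch under that assumption (the GoodsItem class name is hypothetical, not from the original post):

import scrapy

class GoodsItem(scrapy.Item):
    # Field names must match the keys the pipelines read
    name = scrapy.Field()      # product name
    price = scrapy.Field()     # product price
    imgname = scrapy.Field()   # image file name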

Reposted from www.cnblogs.com/wonderlandlove/p/12812551.html