Python3 ~ Scraping web data into MySQL with the Scrapy framework ~ pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import pymysql

class Py062018316Pipeline(object):
    def process_item(self, item, spider):
        return item

# Base class for database access
class MysqlPipeline(object):
    def __init__(self):
        # Connect to the database
        # self.conn = pymysql.connect(host, user, password, database, charset)
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password='zwl123',
                                    database='python01', charset='utf8')  # open the connection
        self.cursor = self.conn.cursor()  # get a cursor

    def process_item(self, item, spider):
        # Subclasses override this to do the actual storage
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

class QiuShiPipeline(object):
    def __init__(self):
        self.file = open('qiushi.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Store the data: one JSON object per line
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()

# Tencent recruitment, version 1: write items to a JSON lines file
class TencentPipeline(object):

    def __init__(self):
        self.file = open('tencent.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Store the data: one JSON object per line
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
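
The Tencent pipelines (this JSON version and the MySQL version below) assume the item exposes the fields name, type, num, location, date, url, duty and rq. A minimal sketch of a matching Item class for items.py; the class name TencentItem and the field meanings are inferred, not taken from the original project:

import scrapy

class TencentItem(scrapy.Item):
    # Fields consumed by the Tencent pipelines; meanings inferred from the field names
    name = scrapy.Field()      # position title
    type = scrapy.Field()      # position category
    num = scrapy.Field()       # number of openings
    location = scrapy.Field()  # work location
    date = scrapy.Field()      # publish date
    url = scrapy.Field()       # detail page URL
    duty = scrapy.Field()      # job duties
    rq = scrapy.Field()        # job requirements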

# Tencent recruitment, version 2: store items in MySQL by inheriting the MysqlPipeline base class
# NOTE: this class reuses the name TencentPipeline; being defined later in the module,
# it replaces the JSON version above when the pipelines are loaded.
class TencentPipeline(MysqlPipeline):

    def process_item(self, item, spider):
        # Insert one row per item; id is generated by MySQL's auto_increment (hence DEFAULT)
        sql = ("insert into tencent_scrapy03(id,name,type,num,location,date,url,duty,rq) "
               "VALUES (DEFAULT,%s,%s,%s,%s,%s,%s,%s,%s)")
        data = (item['name'], item['type'], item['num'], item['location'],
                item['date'], item['url'], item['duty'], item['rq'])
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception as e:
            print('Failed to insert data:', e)
            self.conn.rollback()

        return item
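
None of these pipelines run unless they are enabled in settings.py. A minimal sketch, assuming the Scrapy project package is called py06_2018_3_16 (the package name is an assumption; use your own project's dotted path):

# settings.py -- enable the pipeline(s) you want; the number controls run order (lower runs first)
ITEM_PIPELINES = {
    'py06_2018_3_16.pipelines.TencentPipeline': 300,
}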

On macOS:

1. System Preferences -> start the MySQL service

2. Open the Navicat for MySQL GUI (convenient for browsing the data)

3. Open MySQL in a terminal: mysql -u root -p (enter the password when prompted)

4. show databases; (list all databases)

5. use python01; (switch to the database you need)

6. show tables; (list all tables in the python01 database)

7. Create the table:

create table tencent_scrapy02(id int(4) not null primary key auto_increment,name varchar(300),type varchar(300))auto_increment=1 charset=utf8; (create table statement)
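
Note that the MySQL pipeline above inserts into tencent_scrapy03, which needs all nine columns used in the insert statement. A sketch of a matching create statement, with column types assumed rather than taken from the original post:

create table tencent_scrapy03(
    id int(4) not null primary key auto_increment,  -- filled by DEFAULT in the insert
    name varchar(300),      -- position title
    type varchar(300),      -- position category
    num varchar(100),       -- number of openings
    location varchar(300),  -- work location
    date varchar(100),      -- publish date
    url varchar(500),       -- detail page URL
    duty text,              -- job duties
    rq text                 -- job requirements
) auto_increment=1 charset=utf8;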


8. View the scraped data from the command line:

select * from <table_name>\G  (\G prints each row vertically, which is easier to read in the terminal)


9. If for some reason you delete all the rows and want id to auto-increment from 1 again:

ALTER TABLE <table_name> auto_increment = 1;



Reposted from blog.csdn.net/zbrj12345/article/details/80662690