[python] Scrapy pipeline: persistent local storage and MySQL storage

Local storage

In pipelines.py:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

class NewsBotPipeline:
    def __init__(self):
        self.fp = None

    # Override the hook that runs when the spider opens
    def open_spider(self, spider):
        print('Spider started...')
        self.fp = open('./123.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['news_title']
        department = item['news_department']
        date = item['news_date']
        sort = item['news_sort']
        read = item['news_read']
        self.fp.write(title+'\n'+department+'\t'+date+'\n'+sort+'\n'+read +'\n')
        return item

    # Override the hook that runs when the spider closes
    def close_spider(self, spider):
        print("Spider finished!")
        self.fp.close()

In settings.py, change:

ITEM_PIPELINES = {
   'news_bot.pipelines.NewsBotPipeline': 300,
}

Scrapy's generated settings.py already contains this entry commented out; just uncomment it.
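
The pipeline reads five fields from each item, so the project's items.py must define them. A minimal sketch, assuming the item class is called NewsBotItem (the class name is not shown in the original post; only the field names come from the pipeline code above):

# items.py -- minimal sketch; the class name NewsBotItem is an assumption
import scrapy

class NewsBotItem(scrapy.Item):
    news_title = scrapy.Field()
    news_department = scrapy.Field()
    news_date = scrapy.Field()
    news_sort = scrapy.Field()
    news_read = scrapy.Field()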

MySQL storage (REPLACE INTO inserts the row if it does not exist and replaces it if it already does)

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql

class NewsBotPipeline(object):
    def __init__(self):
        self.fp = None

    # Override the hook that runs when the spider opens
    def open_spider(self, spider):
        print('Spider started...')
        self.fp = open('./123.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['news_title']
        department = item['news_department']
        date = item['news_date']
        sort = item['news_sort']
        read = item['news_read']
        t = "标题:"
        self.fp.write(t + title+'\n'+department+'\t'+date+'\n'+sort+'\n'+read)
        self.fp.write("\n")
        return item

    # Override the hook that runs when the spider closes
    def close_spider(self, spider):
        print("Spider finished!")
        self.fp.close()

# The corresponding pipeline that stores items in the database
class BotSpiderMysql(object):
    conn = None

    def open_spider(self, spider):
        print("Connecting to the database...")
        self.conn = pymysql.connect(host="localhost", port=3306, charset="utf8", user="root", password="123456", database="xjsql")

    def process_item(self, item, spider):
        # Use a parameterized query instead of string formatting to avoid quoting and injection problems
        sql = "REPLACE INTO news VALUES(%s, %s, %s, %s, %s)"
        data = [item['news_title'], item['news_department'], item['news_date'], item['news_sort'], item['news_read']]
        self.cursor = self.conn.cursor()

        try:
            self.cursor.execute(sql, data)
            self.conn.commit()  # commit manually: the row either succeeds or is rolled back as a whole
        except Exception as e:
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        # self.cursor.close()
        self.conn.close()
        print("Database connection closed...")

In settings.py, add:

ITEM_PIPELINES = {
   #'news_bot.pipelines.NewsBotPipeline': 300,
   'news_bot.pipelines.BotSpiderMysql': 300,
}
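
With BotSpiderMysql enabled, running the spider pushes every yielded item through process_item and into the news table. The spider's name is not shown in the original post, so "news" below is a placeholder:

scrapy crawl news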

Origin blog.csdn.net/Sgmple/article/details/112687581