Local storage
In pipelines.py:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class NewsBotPipeline:
    def __init__(self):
        self.fp = None

    # override the spider-opened hook
    def open_spider(self, spider):
        print('Spider started...')
        self.fp = open('./123.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['news_title']
        department = item['news_department']
        date = item['news_date']
        sort = item['news_sort']
        read = item['news_read']
        self.fp.write(title + '\n' + department + '\t' + date + '\n' + sort + '\n' + read + '\n')
        return item

    # override the spider-closed hook
    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()
In settings.py, change
ITEM_PIPELINES = {
    'news_bot.pipelines.NewsBotPipeline': 300,
}
It is already in the generated settings.py; just uncomment it.
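The pipeline assumes an Item exposing the five news_* fields. A minimal items.py sketch (the field names come from the pipeline above; the class name NewsBotItem is an assumption):

# items.py -- a minimal sketch; the field names are taken from the pipeline
# above, while the class name NewsBotItem is an assumption
import scrapy


class NewsBotItem(scrapy.Item):
    news_title = scrapy.Field()
    news_department = scrapy.Field()
    news_date = scrapy.Field()
    news_sort = scrapy.Field()
    news_read = scrapy.Field()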
MySQL storage (REPLACE INTO inserts the row if it is new and overwrites it if the key already exists)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class NewsBotPipeline(object):
    def __init__(self):
        self.fp = None

    # override the spider-opened hook
    def open_spider(self, spider):
        print('Spider started...')
        self.fp = open('./123.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['news_title']
        department = item['news_department']
        date = item['news_date']
        sort = item['news_sort']
        read = item['news_read']
        t = 'Title: '
        self.fp.write(t + title + '\n' + department + '\t' + date + '\n' + sort + '\n' + read)
        self.fp.write('\n')
        return item

    # override the spider-closed hook
    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()
# store the same item into the database
class BotSpiderMysql(object):
    conn = None

    def open_spider(self, spider):
        print('Connecting to the database...')
        self.conn = pymysql.connect(host='localhost', port=3306, charset='utf8',
                                    user='root', password='123456', database='xjsql')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query: pymysql quotes the values itself, which also
        # guards against SQL injection
        sql = "REPLACE INTO news VALUES(%s, %s, %s, %s, %s)"
        data = [item['news_title'], item['news_department'], item['news_date'],
                item['news_sort'], item['news_read']]
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()  # commit manually: all columns succeed or fail together
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
        print('Database connection closed...')
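Note that REPLACE INTO only overwrites an existing row when the table has a PRIMARY KEY or UNIQUE index to match on; otherwise it behaves like a plain INSERT. A one-off setup sketch for the table (the column sizes and the choice of news_title as the key are assumptions):

# one-off table setup sketch: run once before the spider. REPLACE INTO only
# deduplicates when the table has a PRIMARY KEY or UNIQUE index; using
# news_title as the key and the column sizes below are assumptions.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, charset='utf8',
                       user='root', password='123456', database='xjsql')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS news (
            news_title      VARCHAR(255) PRIMARY KEY,
            news_department VARCHAR(100),
            news_date       VARCHAR(50),
            news_sort       VARCHAR(50),
            news_read       VARCHAR(50)
        )
    """)
conn.commit()
conn.close()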
And in settings.py:
ITEM_PIPELINES = {
    # 'news_bot.pipelines.NewsBotPipeline': 300,
    'news_bot.pipelines.BotSpiderMysql': 300,
}
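Both pipelines can also be enabled at once; the integer sets the order, and lower numbers run first:

ITEM_PIPELINES = {
    'news_bot.pipelines.NewsBotPipeline': 300,  # write ./123.txt first
    'news_bot.pipelines.BotSpiderMysql': 400,   # then store into MySQL
}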