数据存储第一种:本地json
import json


class JsonPipeline(object):
    """Scrapy item pipeline that writes every item to ``job.json``, one JSON object per line."""

    def __init__(self):
        # Mode 'w' truncates any output left over from a previous run; the
        # handle stays open until close_spider() is called.
        self.file = open('job.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and return the item unchanged.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.
        """
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable
        # instead of emitting \uXXXX escapes.
        serialized = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(serialized + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.file.close()
在settings中添加管道 ITEM_PIPELINES = {'Lagou.pipelines.JsonPipeline': 100}
数据存储第二种:mongoDB
import pymongo


class MongoPipeline(object):
    """Scrapy item pipeline that stores every item as a document in MongoDB."""

    def __init__(self):
        # The host, port and database name are hard-coded. They could instead
        # be read from the Scrapy settings, e.g. settings.get('MONGO_URI')
        # (such as localhost:27017) and settings.get('MONGO_DB') (database name).
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.client['lagou']

    def process_item(self, item, spider):
        """Insert ``item`` as a new document and return the item unchanged.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.
        """
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        # dict(item) is a fresh copy, so the '_id' PyMongo adds to the inserted
        # document does not leak back into the item.
        self.db['拉钩关键词招聘信息表'].insert_one(dict(item))
        # To avoid duplicate documents, an upsert keyed on positionId could be
        # used instead of a plain insert:
        #   replace_one({'positionId': item['positionId']}, dict(item), upsert=True)
        return item

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()
在settings中添加管道 ITEM_PIPELINES = {'Lagou.pipelines.MongoPipeline': 100}
数据存储第三种:mysql(同步插入:execute、commit都是同步操作,后面爬取的item多了可能造成堵塞)
import pymysql


class MysqlPipeline(object):
    """Scrapy item pipeline that inserts every item into MySQL synchronously.

    ``execute`` and ``commit`` both block, so a large volume of items may
    hold up the crawl while the database catches up.
    """

    # Item fields in the order of the placeholders in the INSERT statement.
    _FIELDS = (
        'positionId', 'city', 'positionName', 'salary', 'workYear', 'education',
        'companyShortName', 'companyFullName', 'companySize',
        'industryField', 'positionAdvantage', 'createTime',
    )

    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306, db='lagou',
                                    user='root', passwd='123456', charset='utf8')
        # All database operations are issued through the cursor.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one row built from ``item`` into ``job_info`` and commit it.

        Args:
            item: The scraped item; it must provide every field in ``_FIELDS``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.

        Raises:
            KeyError: If ``item`` lacks one of the expected fields.
            pymysql.MySQLError: If the insert or commit fails; the transaction
                is rolled back before the error is re-raised.
        """
        sql = 'INSERT INTO job_info VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        values = tuple(item[field] for field in self._FIELDS)
        try:
            self.cursor.execute(sql, values)
            self.conn.commit()
        except pymysql.MySQLError:
            # Leave the shared connection in a clean state so a single bad row
            # does not affect the inserts for the items that follow.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and then the connection."""
        self.cursor.close()
        self.conn.close()
在settings中添加管道 ITEM_PIPELINES = {'Lagou.pipelines.MysqlPipeline': 100}
数据存储第四种:mysql之异步插入(关系型)
原因:spider的解析速度超过入库(关系型数据库)速度,到后期爬取的URL越来越多,也就是item越来越多,数据库的插入速度跟不上,造成堵塞。
办法:twisted框架提供了一种将mysql(关系型数据库)插入异步化的方式,可以把mysql的execute、commit同步操作变成异步操作。
工具:利用的就是twisted框架提供的工具--连接池(将mysql同步操作转成异步操作)
import logging

import pymysql
# twisted.enterprise.adbapi wraps a blocking DB-API module in a thread-backed
# connection pool, which lets the MySQL calls run asynchronously.
from twisted.enterprise import adbapi

logger = logging.getLogger(__name__)


class MySQLTwistedPipeline(object):
    """Scrapy item pipeline that inserts items into MySQL via a Twisted connection pool."""

    def __init__(self, dbpool):
        """Store the ``adbapi.ConnectionPool`` used for all inserts."""
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_*`` entries in the Scrapy settings.

        Args:
            settings: Mapping that provides ``MYSQL_HOST``, ``MYSQL_DB_NAME``,
                ``MYSQL_USER`` and ``MYSQL_PASSWORD``.

        Returns:
            A new pipeline instance wrapping a ``pymysql`` connection pool.
        """
        # The keyword names are fixed: they are forwarded unchanged to
        # pymysql.connect() by the pool.
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB_NAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
        )
        dbpool = adbapi.ConnectionPool('pymysql', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule an asynchronous insert for ``item`` and return it at once.

        Args:
            item: The scraped item to store.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.
        """
        # runInteraction runs do_insert in a pool thread inside a transaction,
        # committing on success and rolling back if it raises.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, failure):
        """Log a failure raised by an asynchronous insert."""
        logger.error('Asynchronous MySQL insert failed: %s', failure)

    def do_insert(self, cursor, item):
        """Run the INSERT for ``item`` on the cursor supplied by the pool."""
        # NOTE(review): this statement targets an `article` table with
        # title/url/... fields, which differ from the Lagou job fields used
        # elsewhere in this file - confirm it matches the items being scraped.
        sql = 'INSERT INTO article VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        values = (item['title'], item['url'], item['img_url'], item['url_object_id'],
                  item['img_path'], item['tags'], item['like_nums'],
                  item['collection_nums'], item['comment_nums'])
        cursor.execute(sql, values)

    def close_spider(self, spider):
        """Shut down the connection pool."""
        self.dbpool.close()
settings中的设置为:
# Enabled item pipelines. Each key is the dotted path to a pipeline class and
# must match the class name exactly (it is case-sensitive); per the Scrapy
# docs, pipelines with lower values run first.
ITEM_PIPELINES = {
    # 'Lagou.pipelines.JsonPipeline': 100,
    # 'Lagou.pipelines.MongoPipeline': 200,
    # 'Lagou.pipelines.MysqlPipeline': 300,
    'Lagou.pipelines.MySQLTwistedPipeline': 400,
}

# MySQL connection settings read by MySQLTwistedPipeline.from_settings.
MYSQL_DB_NAME = 'lagou'
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'