Generally, it only makes sense to store the data we crawl into the database or generate local files.
First, write the crawled data directly to a local file
1. Method 1: generate the output file directly when running the crawl command
scrapy crawl <spider-name> -o <filename>
2. Method 2, using the file writing method in the pipeline
1. The code of the pipeline
import json


class QuotesPipelines(object):
    """Append items produced by the 'quotes' spider to a local JSON file.

    NOTE: each item is written as an indented JSON object followed by ",\n",
    so the resulting file is a comma-separated stream of objects rather than
    one valid JSON document.
    """

    def process_item(self, item, spider):
        # Only persist items coming from the 'quotes' spider.
        if spider.name == 'quotes':
            serialized = json.dumps(dict(item), indent=2, ensure_ascii=False)
            with open('quotes_test.json', 'a', encoding='utf-8') as fp:
                fp.write(serialized + ',\n')
        return item
2. Configure the pipeline in settings.py
3. Run the spider directly
scrapy crawl quotes --nolog
Third, store the crawled data in the MongoDB database
1. Installation package
pip install pymongo
2. Configure the database connection information in settings.py:
# database connection settings
MONGO_URI = 'localhost'
MONGO_DATABASE = 'test'
3. Write business logic in the pipeline
import pymongo


class MongoPipeline(object):
    """Persist items scraped by the 'quotes' spider into MongoDB."""

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection settings from settings.py; database name
        # falls back to 'items' when MONGO_DATABASE is not set.
        settings = crawler.settings
        return cls(
            mongo_uri=settings.get('MONGO_URI'),
            mongo_db=settings.get('MONGO_DATABASE', 'items'),
        )

    def open_spider(self, spider):
        # One client/database handle per spider run.
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Only items from the 'quotes' spider are inserted.
        if spider.name == 'quotes':
            self.db[self.collection_name].insert_one(dict(item))
        return item
4. Configure the pipeline in settings.py
5. Run the spider directly
scrapy crawl quotes --nolog
Fourth, store the crawled data in the MySQL database
1. Installation package
pip install pymysql
2. Create the target table in the database beforehand (unlike MongoDB, MySQL requires the table schema to exist)
3. Configure the MySQL connection information in settings.py:
# MySQL database configuration
MYSQL_HOST = '120.24.156.230'
MYSQL_DBNAME = 'nodejs'   # database name — change as needed
MYSQL_USER = 'root'       # database account — change as needed
MYSQL_PASSWD = 'root'     # database password — change as needed
MYSQL_PORT = 3306         # database port, used in dbhelper
4. Use the pipeline to store in the database
class XiciPipeline(object):
    """Store proxy items scraped by the 'xici' spider into a MySQL table.

    Connection parameters are read from settings.py via ``from_crawler``.
    """

    def __init__(self, dbparams):
        # Imported lazily so this module can be loaded even when the
        # pymysql driver is not installed.
        import pymysql
        self.connect = pymysql.connect(
            host=dbparams['host'],
            port=dbparams['port'],
            db=dbparams['db'],
            user=dbparams['user'],
            passwd=dbparams['passwd'],
            charset=dbparams['charset'],
            use_unicode=dbparams['use_unicode'],
        )
        # Cursor reused for every insert.
        self.cursor = self.connect.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Read the database configuration from settings.py.
        dbparams = dict(
            host=crawler.settings.get('MYSQL_HOST'),
            db=crawler.settings.get('MYSQL_DBNAME'),
            user=crawler.settings.get('MYSQL_USER'),
            passwd=crawler.settings.get('MYSQL_PASSWD'),
            # BUG FIX: the key was misspelled 'MYSQL_POR', so the port
            # always came back as None and the connection used a default.
            port=crawler.settings.get('MYSQL_PORT'),
            charset='utf8',  # required to avoid mojibake with Chinese text
            use_unicode=False,
        )
        return cls(dbparams)

    def process_item(self, item, spider):
        # Only items from the 'xici' spider are inserted.
        if spider.name == 'xici':
            # Parameterized query — never interpolate item values into SQL.
            sql = ('insert into xici(ip, port, speed, proxy_type, localhost) '
                   'values (%s, %s, %s, %s,%s)')
            self.cursor.execute(sql, (item['ip'], item['port'], item['speed'],
                                      item['proxy_type'], item['localhost']))
            self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()  # release the cursor before the connection
        self.connect.close()
5. Configure the pipeline in settings.py
6. Run the spider directly
scrapy crawl quotes --nolog