items.py用于定义爬取的字段
import scrapy
from scrapy import Field
class PssItem(scrapy.Item):
    """Container for one crawled record.

    Fields:
        pagenum: page number of the record; the pipeline uses it to pick
                 the MongoDB collection and strips it before insertion.
        id:      record identifier (field name kept as ``id`` because the
                 spider and pipeline address it by that key).
        name:    record name; echoed by the pipeline after storage.
        data:    full-text payload extracted from the response JSON.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pagenum = Field()
    id = Field()
    name = Field()
    data = Field()
'''
上述的例子表示,我需要爬取的分别是pagenum,id,name,data
'''
pipelines.py用于存储数据
import pymongo
class PssPipeline(object):
    """Store crawled items in MongoDB, one collection per page number."""

    def __init__(self, mongo_url='127.0.0.1', mongo_db='DB13'):
        # Defaults match the values that were previously hard-coded in
        # open_spider, so behavior is unchanged when no settings exist.
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    # Mostly used to read global settings; must return an instance of the
    # pipeline class. Can also be used to hook up signals.
    @classmethod
    def from_crawler(cls, crawler):
        # BUG FIX: the settings were previously read and then discarded;
        # now they actually configure the connection.
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL', '127.0.0.1'),
            mongo_db=crawler.settings.get('MONGO_DB', 'DB13'),
        )

    # Called automatically when the spider opens; used for setup work
    # such as opening the database connection.
    def open_spider(self, spider):
        self.conn = pymongo.MongoClient(self.mongo_url, 27017)
        self.con = self.conn[self.mongo_db]

    # Called automatically when the spider closes; used for teardown work
    # such as closing the database connection.
    def close_spider(self, spider):
        self.conn.close()

    # Core method: receives every item yielded by the spider.
    def process_item(self, item, spider):
        # One collection per page number; 'pagenum' itself is not stored.
        pagenum = item['pagenum']
        coll = self.con[str(pagenum)]
        doc = dict(item)
        doc.pop('pagenum')
        # insert_one() replaces the deprecated Collection.insert().
        coll.insert_one(doc)
        # A pipeline must return the item (or raise DropItem) so that any
        # later pipelines still receive it; returning item['name'] broke
        # that contract.
        return item
那么item怎么与你的spider对应呢?在spider文件中导入PssItem即可,下面两种写法任选其一:
'''
from ..items import PssItem
from pss.pss.items import PssItem
'''
def parse2(self, response):
    """Parse a full-text JSON response and yield a populated PssItem.

    Expects ``response.meta`` to carry 'pagenum', 'name' and 'id',
    forwarded from the callback that scheduled this request.
    """
    pagenum = response.meta['pagenum']
    name = response.meta['name']
    # Renamed local from 'id' to avoid shadowing the builtin; the item
    # field is still stored under the key 'id'.
    record_id = response.meta['id']
    # BUG FIX: json.loads() no longer accepts an ``encoding`` argument
    # (deprecated since 3.1, removed in Python 3.9); decode the bytes
    # explicitly instead.
    page = json.loads(response.body.decode('utf-8'))
    data = page['fullTextDTO']['literaInfohtml']
    item = PssItem(pagenum=pagenum, name=name, id=record_id, data=data)
    yield item