Scrapy: items.py and pipelines.py

items.py defines the fields you want to scrape.

import scrapy
from scrapy import Field

class PssItem(scrapy.Item):
    # define the fields for your item here, like:
    # name = scrapy.Field()
    pagenum = Field()
    id = Field()
    name = Field()
    data = Field()

'''
The example above declares the four fields I want to scrape:
pagenum, id, name, and data.
'''
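Once defined, an Item behaves much like a dict: you can build it from keyword arguments and read or set fields by key. A quick sketch (the values here are made up):

item = PssItem(pagenum=1, name='example', id=42)
item['data'] = '<html>...</html>'   # fields can also be set after creation
print(item['name'])                 # -> 'example'
# setting a field that was not declared in the Item raises KeyError,
# which catches typos early

That KeyError on undeclared fields is the main practical difference from a plain dict, and the reason to define fields in items.py at all.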

pipelines.py stores the scraped data.

import pymongo

class PssPipeline(object):
    def __init__(self, mongo_url='127.0.0.1', mongo_db='DB13'):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    # Called automatically when the spider opens; used for setup work
    # such as opening a database connection.
    def open_spider(self, spider):
        self.conn = pymongo.MongoClient(self.mongo_url, 27017)
        self.db = self.conn[self.mongo_db]

    # Called automatically when the spider closes; used for teardown work
    # such as closing the database connection.
    def close_spider(self, spider):
        self.conn.close()

    # Mostly used to read global settings; it must return an instance of
    # the class. It is sometimes also used to bind signals.
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL', '127.0.0.1'),
            mongo_db=crawler.settings.get('MONGO_DB', 'DB13'),
        )

    # The core method: it receives every item the spider yields.
    def process_item(self, item, spider):
        pagenum = item['pagenum']
        coll = self.db[str(pagenum)]   # one collection per page number
        doc = dict(item)
        doc.pop('pagenum')             # the page number is already the collection name
        coll.insert_one(doc)           # insert() is deprecated in pymongo 3+
        return item                    # return the item so later pipelines (and the log) see it
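Note that a pipeline only runs if it is enabled in settings.py, and since from_crawler above reads MONGO_URL and MONGO_DB from the settings, those need to be defined there too. A minimal sketch, assuming the project is named pss (the priority number and values are just examples):

# settings.py
ITEM_PIPELINES = {
    'pss.pipelines.PssPipeline': 300,   # 0-1000; lower numbers run earlier
}
MONGO_URL = '127.0.0.1'   # picked up by from_crawler
MONGO_DB = 'DB13'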

So how does the item connect to your spider? Import it first; either a relative or an absolute import works:

from ..items import PssItem         # relative import
# or, spelled out from the project root:
from pss.pss.items import PssItem

def parse2(self, response):
    # values handed over from the previous request via meta
    pagenum = response.meta['pagenum']
    name = response.meta['name']
    id = response.meta['id']
    page = json.loads(response.text)   # requires `import json` in the spider module
    data = page['fullTextDTO']['literaInfohtml']
    item = PssItem(pagenum=pagenum, name=name, id=id, data=data)
    yield item
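parse2 relies on response.meta, which means the previous callback must have attached those values when it scheduled the request. A hypothetical sketch of that earlier request (the URL and meta values are placeholders, not from the original spider):

def parse(self, response):
    # pass context along to parse2 through meta
    yield scrapy.Request(
        url='http://example.com/fullTextDTO?id=1',   # placeholder URL
        callback=self.parse2,
        meta={'pagenum': 1, 'name': 'sample-article', 'id': 1},
    )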


Reposted from blog.csdn.net/rookie_is_me/article/details/85256754