Python3使用Scrapy2.4框架爬取数据,多spider指定pipelines配置

目前环境

python3
scrapy2.4

1、方案1: 多pipelines进行区分(scrapy版本必须是1.1以上)

settings.py

# Priority values range from 0 to 1000; pipelines with smaller
# numbers run earlier in the item-processing chain.
ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300,
    'weather.pipelines.WeatherHourPipeline': 302,
}

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

# 用于保存所抓取的数据的容器
# 定义字段内容

# Daily weather item.
class WeatherItem(scrapy.Item):
    """Container for one day's scraped weather data."""

    # Day label (e.g. "Today").
    name = scrapy.Field()
    # Weather condition description.
    status = scrapy.Field()
    # Calendar date.
    date = scrapy.Field()
    # Daily high temperature.
    max = scrapy.Field()
    # Daily low temperature.
    min = scrapy.Field()


# Hourly weather item.
class WeatherHourItem(scrapy.Item):
    """Container for one hour's scraped weather data."""

    # Unique key identifying the hourly record.
    date = scrapy.Field()

WeatherSpider.py

import scrapy
from weather.items import WeatherItem
# Spider for the daily forecast page.
class WeatherSpider(scrapy.Spider):
    """Crawls the daily weather page and yields WeatherItem objects."""

    # Name used by `scrapy crawl`; must be defined.
    name = "weather"
    allowed_domains = ['weather.com']
    start_urls = [
        'https://weather.com/zh-CN/weather/today/l/7f14186934f484d567841e8646abc61b81cce4d88470d519beeb5e115c9b425a',
    ]
    # Per-spider setting override: route items from this spider
    # only through its own pipeline.
    custom_settings = {
        "ITEM_PIPELINES": {
            'weather.pipelines.WeatherPipeline': 300
        }
    }

    def parse(self, response):
        # NOTE(review): response parsing is not implemented here —
        # an empty daily item is yielded as a placeholder.
        item = WeatherItem()
        yield item

WeatherHourSpider.py

import scrapy
from weather.items import WeatherHourItem

# Spider for the hour-by-hour forecast page.
class WeatherHourSpider(scrapy.Spider):
    """Crawls the hourly weather page and yields WeatherHourItem objects."""

    # Name used by `scrapy crawl`; must be defined.
    name = "weatherHour"
    allowed_domains = ['weather.com']
    start_urls = [
        'https://weather.com/zh-CN/weather/hourbyhour/l/7f14186934f484d567841e8646abc61b81cce4d88470d519beeb5e115c9b425a',
    ]
    # Per-spider setting override: route items from this spider
    # only through its own pipeline.
    custom_settings = {
        "ITEM_PIPELINES": {
            'weather.pipelines.WeatherHourPipeline': 302
        }
    }

    def parse(self, response):
        # NOTE(review): response parsing is not implemented here —
        # an empty hourly item is yielded as a placeholder.
        item = WeatherHourItem()
        yield item

pipelines.py
多个管道

class WeatherPipeline:
    """Pipeline dedicated to the daily-weather spider."""

    def process_item(self, item, spider):
        # Trace which pipeline handled the item, then pass it along.
        print("Weather")
        return item


class WeatherHourPipeline:
    """Pipeline dedicated to the hourly-weather spider."""

    def process_item(self, item, spider):
        # Trace which pipeline handled the item, then pass it along.
        print("WeatherHour")
        return item
        

2、方案2:单pipelines通过spider.name区分

settings.py

# Priority values range from 0 to 1000; pipelines with smaller
# numbers run earlier in the item-processing chain.
ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300
}

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

# 用于保存所抓取的数据的容器
# 定义字段内容

# Daily weather item.
class WeatherItem(scrapy.Item):
    """Container for one day's scraped weather data."""

    # Day label (e.g. "Today").
    name = scrapy.Field()
    # Weather condition description.
    status = scrapy.Field()
    # Calendar date.
    date = scrapy.Field()
    # Daily high temperature.
    max = scrapy.Field()
    # Daily low temperature.
    min = scrapy.Field()

# Hourly weather item.
class WeatherHourItem(scrapy.Item):
    """Container for one hour's scraped weather data."""

    # Unique key identifying the hourly record.
    date = scrapy.Field()

WeatherSpider.py

import scrapy
from weather.items import WeatherItem
# Spider for the daily forecast page (shared-pipeline variant:
# no custom_settings, so the global ITEM_PIPELINES applies).
class WeatherSpider(scrapy.Spider):
    """Crawls the daily weather page and yields WeatherItem objects."""

    # Name used by `scrapy crawl`; must be defined. The shared
    # pipeline branches on this value.
    name = "weather"
    allowed_domains = ['weather.com']
    start_urls = [
        'https://weather.com/zh-CN/weather/today/l/7f14186934f484d567841e8646abc61b81cce4d88470d519beeb5e115c9b425a',
    ]

    def parse(self, response):
        # NOTE(review): response parsing is not implemented here —
        # an empty daily item is yielded as a placeholder.
        item = WeatherItem()
        yield item

WeatherHourSpider.py

import scrapy
from weather.items import WeatherHourItem

# Spider for the hour-by-hour forecast page (shared-pipeline variant:
# no custom_settings, so the global ITEM_PIPELINES applies).
class WeatherHourSpider(scrapy.Spider):
    """Crawls the hourly weather page and yields WeatherHourItem objects."""

    # Name used by `scrapy crawl`; must be defined. The shared
    # pipeline branches on this value.
    name = "weatherHour"
    allowed_domains = ['weather.com']
    start_urls = [
        'https://weather.com/zh-CN/weather/hourbyhour/l/7f14186934f484d567841e8646abc61b81cce4d88470d519beeb5e115c9b425a',
    ]

    def parse(self, response):
        # NOTE(review): response parsing is not implemented here —
        # an empty hourly item is yielded as a placeholder.
        item = WeatherHourItem()
        yield item

pipelines.py
同一个管道

class WeatherPipeline(object):
    """Single shared pipeline that dispatches on the spider's name.

    The spiders declare ``name = "weather"`` and ``name = "weatherHour"``,
    so the comparisons here must match those exact strings.
    """

    def process_item(self, item, spider):
        # BUG FIX: the original compared spider.name against "Weather" /
        # "WeatherHour" (wrong capitalization), which never matched the
        # actual spider names, so neither branch ever executed.
        if spider.name == "weather":
            print("Weather")
        elif spider.name == "weatherHour":
            print("WeatherHour")
        return item

猜你喜欢

转载自blog.csdn.net/qq_26003101/article/details/113662853