21. Distributed Crawling with Scrapy-Redis

1. Install and Connect to a Remote Redis

  • pip install scrapy-redis
  • ./redis-cli -h 192.168.1.27 -p 6379

2. scrapy-redis Architecture

[scrapy-redis architecture diagrams omitted]
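
The diagrams are the usual scrapy-redis architecture sketches: the request scheduler queue and the duplicate-filter fingerprint set live in a shared Redis instance rather than inside each spider process, so any number of spider instances can pull from and push to the same queue, and scraped items can be written back to Redis through RedisPipeline. The settings in section 3.3 below configure exactly these pieces.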

3. Configuring a scrapy-redis Spider

3.1. Export the packages installed in the current environment

pip freeze > requirements.txt

3.2. Create a new environment

conda create -n fang-scrapy_python-3.7 python=3.7
conda activate fang-scrapy_python-3.7
pip install -r requirements.txt 

3.3. Deploy scrapy-redis

  • Make the spider inherit from RedisSpider provided by scrapy-redis
from scrapy_redis.spiders import RedisSpider


class FangSpider(RedisSpider):
  • Replace start_urls with redis_key
    redis_key = "fang:start_urls"
  • Update the settings.py configuration file
# 14. scrapy-redis related settings
# Make the scrapy-redis scheduler store requests in redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Make sure all spiders share the same dedup fingerprints
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Store scraped items in redis via the scrapy-redis item pipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}

# Keep the scrapy-redis queues in redis instead of clearing them, so crawls can be paused and resumed
SCHEDULER_PERSIST = True
# Redis connection info
REDIS_HOST = '192.168.1.27'
REDIS_PORT = 6379
  • Start the spider from the command line
scrapy runspider fang.py
  • From a redis client, push the spider's start URL (a Python alternative is sketched below)
lpush fang:start_urls https://www.fang.com/SoufunFamily.htm 
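
The same seeding step can also be done from Python with the redis-py client instead of redis-cli. A minimal sketch, assuming the redis package is available (it is installed as a dependency of scrapy-redis); host, port, and key match the settings above:

import redis

# connect to the same Redis instance configured in settings.py
r = redis.Redis(host='192.168.1.27', port=6379)

# push the seed URL onto the list that RedisSpider polls (its redis_key)
r.lpush('fang:start_urls', 'https://www.fang.com/SoufunFamily.htm')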

4. Related Source Code

  • settings.py
# 1. Imports
import logging
import datetime
import os

# 2. Project name  TODO: change per project
BOT_NAME = 'fang_project'

# 3. Module names
SPIDER_MODULES = ['{}.spiders'.format(BOT_NAME)]
NEWSPIDER_MODULE = '{}.spiders'.format(BOT_NAME)

# 4. Obey robots.txt (default is True)
ROBOTSTXT_OBEY = False

# 5. User agent (which browser to identify as)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 ' \
             'Safari/537.36 '

# 6. Default request headers (USER_AGENT is configured separately)
DEFAULT_REQUEST_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
}

# 7. Log file name format: generate one log file per minute
time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H-%M')
# LOG_FILE = '{}\\{}\\logs\\{}.log'.format(os.getcwd(), BOT_NAME, time_str)
# LOG_LEVEL = 'DEBUG'

# 8. Custom command module for running multiple spiders
COMMANDS_MODULE = '{}.commands'.format(BOT_NAME)

# 9. Display Chinese correctly in scrapy's exported JSON files (https://www.cnblogs.com/linkr/p/7995454.html)
FEED_EXPORT_ENCODING = 'utf-8'

# 10. Item pipelines: the smaller the value, the earlier items pass through that pipeline  TODO: change per project
# ITEM_PIPELINES = {
#    '{}.pipelines.FangProjectPipeline'.format(BOT_NAME): 300,
# }

# 11. Throttle the crawl speed, in seconds
DOWNLOAD_DELAY = 3

# 12. Downloader middlewares  TODO: change per project
DOWNLOADER_MIDDLEWARES = {
    '{}.middlewares.RandomUserAgent'.format(BOT_NAME): 1,
}

# 13. Disable cookies
COOKIES_ENABLED = False

# 14. scrapy-redis related settings
# Make the scrapy-redis scheduler store requests in redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Make sure all spiders share the same dedup fingerprints
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Store scraped items in redis via the scrapy-redis item pipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}

# Keep the scrapy-redis queues in redis instead of clearing them, so crawls can be paused and resumed
SCHEDULER_PERSIST = True
# Redis connection info
REDIS_HOST = '192.168.1.27'
REDIS_PORT = 6379
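
Item 8 above points COMMANDS_MODULE at a fang_project.commands package, but the post does not include the command itself. Below is a minimal sketch of such a run-all-spiders command; the file name commands/crawlall.py and the implementation are assumptions, not code from the original project (the commands directory also needs an empty __init__.py):

# fang_project/commands/crawlall.py  (hypothetical file)
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):
    requires_project = True

    def short_desc(self):
        return "Run all spiders in the project"

    def run(self, args, opts):
        # schedule every spider known to the project, then start the reactor once
        for name in self.crawler_process.spider_loader.list():
            self.crawler_process.crawl(name)
        self.crawler_process.start()

With this in place, every spider in the project can be started with: scrapy crawlall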

  • The spider class (fang.py)
# -*- coding: utf-8 -*-
import scrapy
import re
from fang_project.items import NewHouseItem

from scrapy_redis.spiders import RedisSpider


class FangSpider(RedisSpider):
    name = 'fang'
    allowed_domains = ['fang.com']
    # start_urls = ['https://www.fang.com/SoufunFamily.htm']
    redis_key = "fang:start_urls"

    def parse(self, response):
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None  # store the province; if the current row has no province cell, reuse the last one
        for tr in trs:
            tds = tr.xpath(".//td[not(@class)]")  # select td elements without a class attribute
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            province_text = re.sub(r"\s", "", province_text)  # strip whitespace characters
            if province_text:
                province = province_text

            # skip listings in overseas cities
            if province == '其它':
                continue

            city_td = tds[1]
            city_links = city_td.xpath(".//a")
            for city_link in city_links:
                city = city_link.xpath(".//text()").get()
                city_url = city_link.xpath(".//@href").get()

                # build the new-house / second-hand-house URLs
                url_model = city_url.split("//")
                scheme = url_model[0]
                domain = url_model[1]

                if 'bj.' in domain:  # Beijing links need special handling
                    newhouse_url = "https://newhouse.fang.com/house/s/"
                    esf_url = "https://esf.fang.com/"
                else:
                    newhouse_url = scheme + "//" + "newhouse." + domain + "house/s/"
                    esf_url = scheme + "//" + "esf." + domain

                # pass province/city info along to the callback via meta
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse, meta={'info': (province, city)})
                # yield scrapy.Request(url=esf_url, callback=self.parse_esf, meta={'info': (province, city)})

    def parse_newhouse(self, response):
        """解析新房的相关信息"""
        province, city = response.meta.get('info')
        lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
        for li in lis:
            # housing estate name
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name:
                name = name.strip()
            # room layouts
            house_type_list = li.xpath(".//div[contains(@class, 'house_type')]/a/text()").getall()
            house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
            rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
            # floor area
            area = "".join(li.xpath(".//div[contains(@class, 'house_type')]/text()").getall())
            area = re.sub(r"\s|/|-", "", area)
            # address / administrative district
            address = li.xpath(".//div[@class='address']/a/@title").get()
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            district = None  # avoid a NameError when no district can be extracted
            if district_text:
                match = re.search(r".*\[(.+)\].*", district_text)
                if match:
                    district = match.group(1)
            # sale status
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            # price
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            if price:
                price = re.sub(r"\s|广告", "", price)
            # detail page URL
            original_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()

            yield NewHouseItem(province=province, city=city, name=name, price=price, rooms=rooms, area=area,
                               district=district, sale=sale, original_url=original_url)

        # fetch the next page
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_newhouse,
                                 meta={'info': (province, city)})

    # def parse_esf(self, response):
    #     """解析二手房的相关信息"""
    #     province, city = response.meta.get('info')
    #     print(province, city)

  • middlewares.py
import random

from fang_project.conf.user_agent import USER_AGENT_LIST


class RandomUserAgent(object):
    """Downloader middleware that assigns a random User-Agent to every outgoing request."""

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(USER_AGENT_LIST)
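
RandomUserAgent draws its pool from fang_project/conf/user_agent.py, which the post does not include either. A minimal sketch of that module: it only needs to define USER_AGENT_LIST, and the specific strings below are illustrative, not the original values:

# fang_project/conf/user_agent.py  (illustrative values)
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
]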

