1. 安装和远程连接redis
pip install scrapy-redis
./redis-cli -h 192.168.1.27 -p 6379
2. scrapy-redis 架构图
3. scrapy-redis爬虫的相关配置
3.1. 生成当前环境已经安装的软件命令
pip freeze > requirements.txt
3.2. 创建新的环境
conda create -n fang-scrapy_python-3.7 python=3.7
conda activate fang-scrapy_python-3.7
pip install -r requirements.txt
3.3. 部署scrapy-redis
- spider 继承scrapy-redis中的 RedisSpider
from scrapy_redis.spiders import RedisSpider
class FangSpider(RedisSpider):
- 替换start_urls 为 redis_key
redis_key = "fang:start_urls"
- 修改配置文件setting.py
# 14. scrapy-redis相关配置
# 确保request存储到redis中
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 确保所有爬虫共享相同的去重指纹
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 设置redis为item pipeline
ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 300,
}
# 在redis中保持scrapy-redis用到的队列,不会清理redis中的队列, 从而可以实现暂停和恢复的功能
SCHEDULER_PERSIST = True
# 设置链接redis信息
REDIS_HOST = '192.168.1.27'
REDIS_PORT = 6379
- 启动spider,通过命令行方式启动
scrapy runspider fang.py
- 在redis的客户端推入爬虫的起始路径
lpush fang:start_urls https://www.fang.com/SoufunFamily.htm
4. 相关源代码
- setting.py类
# Scrapy project settings for the fang.com distributed (scrapy-redis) crawler.
# Scrapy reads these plain module-level constants at startup.
# 1. Imports
import logging
import datetime
import os
# 2. Project name  TODO: change per project
BOT_NAME = 'fang_project'
# 3. Spider module locations, derived from the project name
SPIDER_MODULES = ['{}.spiders'.format(BOT_NAME)]
NEWSPIDER_MODULE = '{}.spiders'.format(BOT_NAME)
# 4. Obey robots.txt (Scrapy's default is True; deliberately disabled here)
ROBOTSTXT_OBEY = False
# 5. User agent (browser identity sent with every request)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 ' \
             'Safari/537.36 '
# 6. Default request headers (USER_AGENT is configured separately above)
DEFAULT_REQUEST_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
}
# 7. Timestamp used to name log files (one file per minute when logging
#    to file is enabled below)
time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H-%M')
# LOG_FILE = '{}\\{}\\logs\\{}.log'.format(os.getcwd(), BOT_NAME, time_str)
# LOG_LEVEL = 'DEBUG'
# 8. Module holding custom commands (e.g. one that runs several spiders)
COMMANDS_MODULE = '{}.commands'.format(BOT_NAME)
# 9. Emit readable (non-escaped) Chinese in exported JSON feeds
#    (https://www.cnblogs.com/linkr/p/7995454.html)
FEED_EXPORT_ENCODING = 'utf-8'
# 10. Item pipelines; a lower value runs earlier in the chain  TODO: change per project
# ITEM_PIPELINES = {
#     '{}.pipelines.FangProjectPipeline'.format(BOT_NAME): 300,
# }
# 11. Throttle: delay between requests, in seconds
DOWNLOAD_DELAY = 3
# 12. Downloader middlewares  TODO: change per project
DOWNLOADER_MIDDLEWARES = {
    '{}.middlewares.RandomUserAgent'.format(BOT_NAME): 1,
}
# 13. Disable cookies
COOKIES_ENABLED = False
# 14. scrapy-redis settings
# Route all requests through the redis-backed scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# All spider instances share one dedupe fingerprint set in redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Store scraped items in redis via the scrapy-redis item pipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
# Keep the scrapy-redis queues in redis after the spider closes,
# which enables pausing and resuming a crawl
SCHEDULER_PERSIST = True
# Redis connection info
REDIS_HOST = '192.168.1.27'
REDIS_PORT = 6379
- spider类
# -*- coding: utf-8 -*-
import scrapy
import re
from fang_project.items import NewHouseItem
from scrapy_redis.spiders import RedisSpider
class FangSpider(RedisSpider):
    """Distributed spider for fang.com.

    Start URLs are not hard-coded: they are popped from the redis list
    named by ``redis_key`` (seed it with
    ``lpush fang:start_urls https://www.fang.com/SoufunFamily.htm``).
    """

    name = 'fang'
    allowed_domains = ['fang.com']
    # start_urls = ['https://www.fang.com/SoufunFamily.htm']
    redis_key = "fang:start_urls"

    def parse(self, response):
        """Parse the province/city directory table and schedule one
        new-house listing request per city.

        The table omits the province cell on continuation rows, so the
        last seen province is carried across iterations.
        """
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None  # last seen province; reused when the cell is empty
        for tr in trs:
            tds = tr.xpath(".//td[not(@class)]")  # td elements without a class attribute
            # Robustness fix: skip rows that lack the expected two cells
            # (e.g. header/spacer rows) instead of raising IndexError.
            if len(tds) < 2:
                continue
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            # Bug fix: .get() returns None for empty cells; the original
            # re.sub(None) raised TypeError.
            province_text = re.sub(r"\s", "", province_text or "")
            if province_text:
                province = province_text
            # Skip overseas listings
            if province == '其它':
                continue
            city_td = tds[1]
            for city_link in city_td.xpath(".//a"):
                city = city_link.xpath(".//text()").get()
                city_url = city_link.xpath(".//@href").get()
                # Build the new-house / second-hand-house URLs from the
                # city's base URL (scheme + domain).
                url_model = city_url.split("//")
                scheme = url_model[0]
                domain = url_model[1]
                if 'bj.' in domain:  # Beijing uses special subdomains
                    newhouse_url = "https://newhouse.fang.com/house/s/"
                    esf_url = "https://esf.fang.com/"
                else:
                    newhouse_url = scheme + "//" + "newhouse." + domain + "house/s/"
                    esf_url = scheme + "//" + "esf." + domain
                # Pass province/city through to the callback via meta
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse, meta={'info': (province, city)})
                # yield scrapy.Request(url=esf_url, callback=self.parse_esf, meta={'info': (province, city)})

    def parse_newhouse(self, response):
        """Parse one page of new-house listings, yield NewHouseItem objects,
        and follow pagination."""
        province, city = response.meta.get('info')
        lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
        for li in lis:
            # Estate name
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name:
                name = name.strip()
            # Room layouts ("X居"), whitespace stripped
            house_type_list = li.xpath(".//div[contains(@class, 'house_type')]/a/text()").getall()
            house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
            rooms = list(filter(lambda x: x.endswith("居"), house_type_list))
            # Floor area
            area = "".join(li.xpath(".//div[contains(@class, 'house_type')]/text()").getall())
            area = re.sub(r"\s|/|-", "", area)
            # NOTE(review): address is extracted but never stored in the
            # item — confirm whether NewHouseItem should carry it.
            address = li.xpath(".//div[@class='address']/a/@title").get()
            # Administrative district, taken from the "[...]" part of the
            # address text.
            # Bug fix: initialize district; previously it was unbound
            # (NameError) when the address text was empty.
            district = None
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            if district_text:
                match = re.search(r".*\[(.+)\].*", district_text)
                if match:
                    district = match.group(1)
            # Sale status (e.g. 在售)
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            # Price, with whitespace and the "广告" marker removed
            price = "".join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            if price:
                price = re.sub(r"\s|广告", "", price)
            # Detail-page link
            original_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            yield NewHouseItem(province=province, city=city, name=name, price=price, rooms=rooms, area=area,
                               district=district, sale=sale, original_url=original_url)
        # Follow pagination
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_newhouse,
                                 meta={'info': (province, city)})

    # def parse_esf(self, response):
    #     """Parse second-hand house listings."""
    #     province, city = response.meta.get('info')
    #     print(province, city)
- middlewares.py类
import random
from fang_project.conf.user_agent import USER_AGENT_LIST
class RandomUserAgent(object):
    """Downloader middleware that stamps a randomly chosen User-Agent
    onto every outgoing request."""

    def process_request(self, request, spider):
        # Pick one UA string at random and set it on the request headers.
        chosen_agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = chosen_agent