目录
4.2 存储操作(以MongoDB为存储数据库)-pipelines.py
4.7 分布式操作的实现-add_category_to_redis.py
一、基础知识及相关安装包
下面是一些博客链接
HTTP基础知识理解:Python网络爬虫-原理及基础知识
Charles抓包:Windows环境下配合Charles进行抓包
JS渲染:如何结合Splash对JS进行渲染
代理池的构建:多线程结合Flask构建免费高匿代理池
MongoDB基础教程:MongoDB基础教程
Redis基础教程:Redis基础教程
scrapy基础项目开发:暂无,后续会做一个小节介绍
二、开发环境及项目结构
软件环境:Pycharm2019 + MongoDB + Redis3.20 + MongoDB+ WIndows10
三、结果展示
四、实战源码
4.1 数据模型-items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class MallSpiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class Category(scrapy.Item):
'''
定义类型数据模型,明确抓取的字段
'''
# 大、中、小商品对应的名称及url
b_category_name = scrapy.Field()
b_category_url = scrapy.Field()
m_category_name = scrapy.Field()
m_category_url = scrapy.Field()
s_category_name = scrapy.Field()
s_category_url = scrapy.Field()
class Product(scrapy.Item):
'''
商品信息数据模型
'''
product_category = scrapy.Field() # 商品类别
product_category_id = scrapy.Field() # 类别ID
product_sku_id = scrapy.Field() # 商品ID
product_name = scrapy.Field() # 商品名称
product_img_url = scrapy.Field() # 商品图片URL
product_book_info = scrapy.Field() # 图书信息,作者,出版社
product_option = scrapy.Field() # 商品选项
product_shop = scrapy.Field() # 商品店铺
product_comments = scrapy.Field() # 商品评论数量
product_ad = scrapy.Field() # 商品促销
product_price = scrapy.Field() # 商品价格
4.2 存储操作(以MongoDB为存储数据库)-pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from mall_spider.spiders.jd_category import JdCategorySpider
from pymongo import MongoClient
from mall_spider.settings import MONGODB_URL
from mall_spider.spiders.jd_product import JdProductSpider
class CategoryPipeline(object):
'''存储操作'''
def open_spider(self,spider):
# 当爬虫启动时执行,执行一次
if isinstance(spider,JdCategorySpider):
# 连接MongoDB数据库,获取要操作的集合
self.client = MongoClient(MONGODB_URL)
self.collection = self.client['jd']['category']
def process_item(self, item, spider):
if isinstance(spider,JdCategorySpider):
# 插入数据,转换成字典
self.collection.insert_one(dict(item))
return item
def close_spider(self,spider):
# 关闭连接
if isinstance(spider, JdCategorySpider):
self.client.close()
class ProductPipeline(object):
'''存储操作'''
def open_spider(self, spider):
# 当爬虫启动时执行,执行一次
if isinstance(spider, JdProductSpider):
# 连接MongoDB数据库,获取要操作的集合
self.client = MongoClient(MONGODB_URL)
self.collection = self.client['jd']['product']
def process_item(self, item, spider):
if isinstance(spider, JdProductSpider):
# 插入数据,转换成字典
self.collection.insert_one(dict(item))
return item
def close_spider(self, spider):
# 关闭连接
if isinstance(spider, JdProductSpider):
self.client.close()
4.3 项目配置-settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for mall_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'mall_spider'
SPIDER_MODULES = ['mall_spider.spiders']
NEWSPIDER_MODULE = 'mall_spider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mall_spider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'mall_spider.middlewares.MallSpiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# 下载中间件进行反爬,随机头和随机IP,最好使用付费的
# DOWNLOADER_MIDDLEWARES = {
# 'mall_spider.middlewares.MallSpiderDownloaderMiddleware': 543,
# 'mall_spider.middlewares.RandomUserAgent':301,
# }
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 数字越小越先执行
'mall_spider.pipelines.CategoryPipeline': 300,
'mall_spider.pipelines.ProductPipeline':301,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# 配置MongoDB的URL
MONGODB_URL = 'mongodb://127.0.0.1:27017'
# 在settings文件中配置scrapy_redis
# REDIS数据链接
REDIS_URL = 'redis://127.0.0.1:6379/0'
# 去重容器类: 用于把已爬指纹存储到基于Redis的set集合中
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 调度器: 用于把待爬请求存储到基于Redis的队列
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 是不进行调度持久化:
# 如果是True, 当程序结束的时候, 会保留Redis中已爬指纹和待爬的请求
# 如果是False, 当程序结束的时候, 会清空Redis中已爬指纹和待爬的请求
SCHEDULER_PERSIST = True
4.4 中间件配置-middlewares.py
注释:这里以随机请求头和代理IP池返回的随机IP构成请求,但是由于京东对免费的IP池进行了反爬,所以我在settings中并没有开启middlewares的配置,仅作参考,若需要抓取万级量度的数据,可以选购付费IP池
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import requests
import re
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
ConnectionRefusedError, ConnectionDone, ConnectError, \
ConnectionLost, TCPTimedOutError
from twisted.web.client import ResponseFailed
from scrapy.core.downloader.handlers.http11 import TunnelError
# 实现代理IP中间件
class MallSpiderDownloaderMiddleware(object):
EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost, TCPTimedOutError, ResponseFailed,
IOError, TunnelError)
def process_request(self, request, spider):
# 从代理池随机获取一个代理IP,协议和域名
response = requests.get('http://localhost:6888/random?protocal=https&domain=jd.com')
request.meta['proxy'] = response.content.decode()
def process_exception(self, request, exception, spider):
if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
# 当请求出现异常的时候, 代理池哪些代理IP在本域名下是不可以用的
url = 'http://localhost:6868/disable_domain'
proxy = request.meta['proxy']
ip = re.findall('https?://(.+?):\d+', proxy)[0]
params = {
'ip': ip,
'domain': 'jd.com'
}
# 发送请求, 告诉代理池这个代理IP在本域名下是不可以用的
requests.get(url, params=params)
#1. 准备User-Agent列表
# 准备请求头
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
# 实现RandomUSerAgent类,UserAgent的中间件
class RandomUserAgent(object):
def process_request(self, request, spider):
# 如果请求是https://cdnware.m.jd.com开头,就是iPhone的user-agent
if request.url.startswith('https://cdnware.m.jd.com'):
request.headers['user-agent'] = 'JD4iPhone/164880 (iPhone; iOS 12.1.2; Scale/2.00)'
else:
request.headers['user-agent'] =random.choice(USER_AGENTS)
4.5 分类信息抓取-jd_category.py
注:抓取主页面的所有分类信息
# -*- coding: utf-8 -*-
import scrapy
import json
from mall_spider.items import Category
class JdCategorySpider(scrapy.Spider):
name = 'jd_category'
allowed_domains = ['3.cn']
start_urls = ['https://dc.3.cn/category/get'] # 修改起始url
def parse(self, response):
# print(response.body.decode('GBK')) # 指定编码方式,防止乱码
result = json.loads(response.body.decode('GBK'))
datas = result['data']
# 遍历数据列表
for data in datas:
# 利用数据模型进行存储
item = Category()
b_category = data['s'][0]
# 大分类信息
b_category_info = b_category['n']
# print("大分类:{}".format(b_category_info))
item['b_category_name'], item['b_category_url'] = self.get_actegory_name_url(b_category_info)
# 中分类信息列表
m_category_s = b_category['s']
# 遍历中分类信息列表
for m_category in m_category_s:
# 中分类信息
m_category_info = m_category['n']
item['m_category_name'], item['m_category_url'] = self.get_actegory_name_url(m_category_info)
# print("中分类:{}".format(m_category_info))
# 获取小分类数据列表
s_category_s = m_category['s']
for s_category in s_category_s:
s_category_info = s_category['n']
# print("小分类:{}".format(s_category_info))
item['s_category_name'], item['s_category_url'] = self.get_actegory_name_url(s_category_info)
# print(item)
# 把数据交给引擎
yield item
def get_actegory_name_url(self,category_info):
'''
根据分类的信息,提取名称和url
:param category_info: 分类信息
:return: 分类名称和url
'''
# 商品分类有三种数据格式,部分url需要进行拼接
# mice.jd.com;1713-9278;4938-12420-12423
category = category_info.split('|')
category_url = category[0] # 分类URL
category_name = category[1] # 分类名称
# 处理第一类url
if category_url.count('jd.com') == 1:
# url进行补全即可
category_url = 'https://'+category_url
elif category_url.count('-')==1:
# 处理第二类url
category_url = 'https://channel.jd.com/{}.html'.format(category_url)
else:
# 处理第三类,把url中-替换为,
category_url = category_url.replace('-',',')
category_url = 'https://list.jd.com/list.html?cat={}'.format(category_url)
return category_name, category_url
4.6 抓取商品详细信息-jd_product.py
注:这里主要是借用Networks面板进行分析,对网络中出现的json数据接口进行分析得出的,并没有使用selenium对页面进行操作
# -*- coding: utf-8 -*-
import time
import scrapy
import json
import pickle
from mall_spider.items import Product
from jsonpath import jsonpath
from scrapy_redis.spiders import RedisSpider
'''
分布式爬虫
1.修改爬虫类,继承RedisSpider
'''
class JdProductSpider(RedisSpider):
name = 'jd_product'
allowed_domains = ['jd.com','3.cn']
# start_urls = ['http://jd.com/']
# 2. 用于指定起始url列表,在redis数据库中的key
redis_key = 'jd_product:category'
# def start_requests(self):
# '''重写start_requests方法,根据分类信息构建列表页的请求'''
# category = {
# "b_category_name":"家用电器",
# "b_category_url":"https://jiadian.jd.com",
# "m_category_name": "电视",
# "m_category_url": "https://list.jd.com/list.html?cat=737,794,798",
# "s_category_name": "超薄电视",
# "s_category_url": "https://list.jd.com/list.html?cat=737,794,798&ev=4155_76344&sort=sort_rank_asc&trans=1&JL=2_1_0#J_crumbsBar"
# }
# # 根据小分类的url构建列表页面请求
# yield scrapy.Request(category['s_category_url'],callback=self.parse,meta={'category':category})
# 3. 重写make_request_from_data
def make_request_from_data(self, data):
'''
根据redis读取的分类信息的二进制数据,构建请求
:param data: 分类信息的二进制数据
:return: 根据小分类URl构建的请求对象
'''
# 把分类的二进制数据转换为字典
category = pickle.loads(data)
# 根据小分类的url构建列表页面请求
# 注意这里必须使用return返回请求,yield会变成生成器
return scrapy.Request(category['s_category_url'],callback=self.parse,meta={'category':category})
def parse(self, response):
category = response.meta['category']
# print(category)
# 解析列表页,提取商品的skuid
sku_ids = response.xpath('//div[contains(@class,"gl-i-wrap")]//@data-sku').extract()
for sku_id in sku_ids:
# 创建一个Product,用于保存商品数据
item = Product()
# product_category = scrapy.Field() # 商品类别
# product_sku_id = scrapy.Field() # 商品ID
# 设置商品类型
item['product_category'] = category
item['product_sku_id'] =sku_id
# 构建商品基本信息请求,通过Charles进行识别获得
# item传入无需深拷贝
product_base_url = 'https://cdnware.m.jd.com/c1/skuDetail/apple/7.3.0/{}.json'.format(sku_id) # 但是只能看前三十个
yield scrapy.Request(product_base_url,callback=self.parse_product_base,meta={'item':item})
# 获取下一页的url,这里源代码不能使用了,已经进行修改
# 最大页面数,下面未提取出来,所以换在上面提取,这里不知道为什么失效了,所以只抓取每个分类的前20页,也就是i为40
max_len = int(response.xpath('//*[@id="J_topPage"]/span/i/text()').extract_first())
# print("hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh",max_len)
i = 1
s = 0
# for i in range(1,max_len+1):
if i < 20:
# 进行url拼接,这里的url只为偶数,但发现为偶数时候,不显示后面30个商品,这里是用&s参数来区分的,s每次加60(s代表页指标不动) p代表上页下页
# 如果i为偶数,则代表不换页,s不变
if i %2 !=0:
s += 30
next_url = category['s_category_url'].split('#')[0]+ "&page={}&s={}".format(i,s)
# print(next_url)
yield scrapy.Request(next_url,callback=self.parse,meta={'category':category})
i += 1
def parse_product_base(self,response):
# 取出传递来的数据
item = response.meta['item']
# print(item)
# print(response.text)
# 把json字符串转化为字典
result = json.loads(response.text)
# 提取数据
# product_name = scrapy.Field() # 商品名称
# product_img_url = scrapy.Field() # 商品图片URL
# product_book_info = scrapy.Field() # 图书信息,作者,出版社
# product_option = scrapy.Field() # 商品选项
# product_shop = scrapy.Field() # 商品店铺
# product_comments = scrapy.Field() # 商品评论数量
# product_ad = scrapy.Field() # 商品促销
# product_price = scrapy.Field() # 商品价格
item['product_name'] = result['wareInfo']['basicInfo']['name']
item['product_img_url'] = result['wareInfo']['basicInfo']['wareImage'][0]['small']
item['product_book_info'] = result['wareInfo']['basicInfo']['bookInfo']
color_size = jsonpath(result,'$..colorSize')
if color_size:
# 注意colorsize值是个列表,jsonpath返回列表
color_size = color_size[0]
product_option = {}
for option in color_size:
title = option['title']
value = jsonpath(option,'$..text') # 这是一个路径表达式
product_option[title] = value
item['product_option'] = product_option
shop = jsonpath(result,'$..shop')
if shop:
shop = shop[0]
if shop:
# 无id则为京东自营
item['product_shop'] = {
'shop_id':shop['shopId'],
'shop_name': shop['name'],
'shop_score': shop['score']
}
else:
item['product_shop'] = {
'shop_name': '京东自营'
}
item['product_category_id'] = result['wareInfo']['basicInfo']['category'].replace(';',',')
# 准备促销信息的URL,搜索promotion可以获得相关api,5051427
ad_url = 'https://cd.jd.com/promotion/v2?skuId={}&area=12_919_922_0&cat={}'.format(item['product_sku_id'],item['product_category_id'])
# print(item)
# 构建促销请求
yield scrapy.Request(ad_url,callback=self.parse_product_ad,meta={'item':item})
def parse_product_ad(self,response):
item = response.meta['item']
print(item)
# print(response.body.decode['GBK'])
# 转化为字典
result = json.loads(response.body)
item['product_ad'] = jsonpath(result,'$..ad')[0] if jsonpath(result,'$..ad') else ''
# 构建评价信息请求,comments刷新页面可见
comments_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'.format(item['product_sku_id'])
yield scrapy.Request(comments_url,callback=self.parse_product_comments,meta={'item':item})
print(item)
def parse_product_comments(self,response):
item = response.meta['item']
# print(item)
# print(response.text)
result = json.loads(response.text)
item['product_comments'] = {
'CommentCount': jsonpath(result,'$..CommentCount')[0],
'GoodCount': jsonpath(result,'$..GoodCount')[0],
'PoorCount': jsonpath(result,'$..PoorCount')[0],
'GoodRate': jsonpath(result,'$..GoodRate')[0]
}
# print(item)
# 构造价格请求,但这里scrapy会过滤非jd.com的域名,所以前面需要加上
price_url = 'https://p.3.cn/prices/mgets?skuIds=J_{}'.format(item['product_sku_id'])
yield scrapy.Request(price_url,callback=self.parse_product_price,meta={'item':item})
def parse_product_price(self,response):
item = response.meta['item']
# print(response.text)
item['product_price'] = json.loads(response.text)[0]['p']
# print(item)
# 把商品数据交给引擎
yield item
4.7 分布式操作的实现-add_category_to_redis.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Author : {Jack Zhao}
@Time : 2020/5/14 12:26
@Contact : {[email protected]}
@Desc : 链接mongodb,链接redis,将mongodb中数据添加到redis_key,关闭mongodb
'''
from pymongo import MongoClient
from redis import StrictRedis
from mall_spider.settings import MONGODB_URL,REDIS_URL
from mall_spider.spiders.jd_product import JdProductSpider
import pickle
def add_category_to_redis():
mongo =MongoClient(MONGODB_URL)
redis =StrictRedis.from_url(REDIS_URL)
# 获取信息
collection = mongo['jd']['category']
cursor = collection.find()
for category in cursor:
# 序列化字典数据
data = pickle.dumps(category)
# 添加到redis
redis.lpush(JdProductSpider.redis_key,data)
mongo.close()
if __name__ == '__main__':
# 调用方法测试
add_category_to_redis()
五、项目使用方法
# WIn+R 输入services.msc
# 先保证已经开启MongoDB,Redis
cd ../mall_spider
scrapy crawl jd_category # 先抓目录,再进行下面的操作
scrapy crawl jd_product # 这里暂时是使用本机的IP进行爬取,不要过分爬取,防止京东封你家IP