JD Books Distributed Crawler

Source: https://blog.csdn.net/sdzhr/article/details/82822882

A distributed crawler for JD Books (book.jd.com) built with scrapy_redis: requests are scheduled and deduplicated through a shared Redis instance, so several Scrapy workers can pull from the same queue and the crawl can be paused and resumed.

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for jdbook project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jdbook'

SPIDER_MODULES = ['jdbook.spiders']
NEWSPIDER_MODULE = 'jdbook.spiders'

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jdbook (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False


# Use the scrapy_redis scheduler so the request queue is stored in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders deduplicate requests through the shared Redis dupefilter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Requests are serialized with pickle by default, but another serializer can be substituted. Note: this works under Python 2.x but not under 3.x.
# SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"

# Do not clear the Redis queues on close, so the crawl can be paused and resumed
SCHEDULER_PERSIST = True

# Schedule requests with a priority queue (the default)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Alternative queue types
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

# Maximum idle time, to keep the distributed spider from closing while it waits for new requests
# SCHEDULER_IDLE_BEFORE_CLOSE = 10

# Store scraped items in Redis for post-processing
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300
}

# Redis server shared by the scheduler, dupefilter, and item pipeline
# (use redis://:password@host:port if the server requires a password)
REDIS_URL = 'redis://127.0.0.1:6379'
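
With RedisPipeline enabled, every item the spider yields is serialized to JSON and pushed onto a Redis list whose default key is "<spider name>:items", i.e. "jd:items" here. Below is a minimal consumer sketch, assuming the redis-py package and the default key name; the file name process_items.py is only illustrative:

# process_items.py -- drain the "jd:items" list that RedisPipeline writes to
import json
import redis

r = redis.from_url('redis://127.0.0.1:6379')  # same server as REDIS_URL above

while True:
    # blpop blocks until an item arrives and returns a (key, value) pair
    _, data = r.blpop('jd:items')
    item = json.loads(data)
    print(item.get('book_title'), item.get('book_price'))
    # ... persist the item to a database or file here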

jd.py

# -*- coding: utf-8 -*-
import scrapy
import copy
import json
import urllib
from scrapy_redis.spiders import RedisSpider


class JdSpider(RedisSpider):
    name = 'jd'
    allowed_domains = ['jd.com', 'p.3.cn']
    # start_urls = ['https://book.jd.com/booksort.html']
    redis_key = "jdbook"

    def parse(self, response):
        # booksort.html lists the categories: each dt is a top-level category,
        # and the following dd holds its sub-categories
        item = {}
        dt_list = response.xpath('//div[@class="mc"]/dl/dt')
        for dt in dt_list:
            item["dt_title"] = dt.xpath('./a/text()').extract_first()
            item["dd_title"] = dt.xpath('./following-sibling::dd[1]/em/a/text()').extract_first()
            # urljoin handles the protocol-relative hrefs on the category page
            dd_url = response.urljoin(dt.xpath('./following-sibling::dd[1]/em/a/@href').extract_first())
            yield scrapy.Request(
                dd_url,
                callback=self.parse_book_list,
                meta={"item": copy.deepcopy(item)}
            )


    def parse_book_list(self, response):
        item = response.meta["item"]
        book_list = response.xpath('//li[@class="gl-item"]')
        for book in book_list:
            item["book_title"] = book.xpath('.//div[@class="p-name"]/a/em/text()').extract_first().strip()
            item["book_author"] = book.xpath('.//span[@class="author_type_1"]/a/text()').extract()
            item["book_author"] = [i.strip() for i in item["book_author"]]
            item["book_store"] = book.xpath('.//span[@class="p-bi-store"]/a/text()').extract_first()
            sku_id = book.xpath('./div/@data-sku').extract_first()
            if sku_id:
                yield scrapy.Request(
                    "https://p.3.cn/prices/mgets?skuIds=J_" + sku_id,
                    callback=self.parse_price,
                    meta={"item": copy.deepcopy(item)}
                )
        # Follow pagination only when a "next page" link exists
        next_href = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_href:
            next_url = urllib.parse.urljoin(response.url, next_href)
            yield scrapy.Request(
                next_url,
                callback=self.parse_book_list,
                meta={"item": item}
            )


    def parse_price(self, response):
        item = response.meta["item"]
        # p.3.cn returns a JSON list; "op" is the listed price for the requested sku
        item["book_price"] = json.loads(response.text)[0].get('op')
        # yield the finished item so RedisPipeline (configured in settings.py) can store it
        yield item
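
Because JdSpider is a RedisSpider, it ignores start_urls and waits for start URLs to appear in the Redis list named by redis_key ("jdbook"). Start one or more workers with "scrapy crawl jd" (they will idle until work arrives), then seed the queue. A minimal seeding sketch using redis-py follows; pushing with redis-cli LPUSH works just as well, and the file name seed.py is only illustrative:

# seed.py -- push the category page onto the "jdbook" start-URL queue
import redis

r = redis.from_url('redis://127.0.0.1:6379')  # must match REDIS_URL in settings.py
r.lpush('jdbook', 'https://book.jd.com/booksort.html')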
