# JD图书信息爬取 (JD book information crawler)

# -*- coding: utf-8 -*-

from …items import JingdongItem
from copy import deepcopy
import scrapy
import json
import urllib.parse
import re

class JingdongspiderSpider(scrapy.Spider):
    """Crawl book listings from JD, one sub-category at a time.

    Flow: ``parse`` walks the category index and requests every sub-category
    list page; ``parse_book_list`` extracts the books of one list page,
    requests each book's price and follows pagination; ``parse_book_price``
    adds the price and appends one line per book to ``jd.txt``.
    """

    name = 'jingdongspider'
    # allowed_domains = ['book.jd.com', 'p.3.cn']  # disabled: later requests leave these domains
    start_urls = ['https://book.jd.com/booksort.html']

    # Prefix of the price endpoint; a book's SKU id is appended to it.
    _PRICE_URL_PREFIX = "https://p.3.cn/prices/mgets?skuIds=J_"
    # Matches every run of digits in a category href.
    _DIGITS_RE = re.compile(r"\d+")

    @staticmethod
    def _clean(text):
        """Return *text* without surrounding whitespace, or ``""`` for ``None``."""
        return text.strip() if text is not None else ""

    @staticmethod
    def _absolute_url(base_url, href):
        """Turn a scraped href into an absolute URL (``""`` when it is empty)."""
        if not href:
            return ""
        if href.startswith("//"):
            # Protocol-relative link: force https, as the spider always did.
            return "https:" + href
        return urllib.parse.urljoin(base_url, href)

    def parse(self, response):
        """Walk the category index and request every sub-category list page.

        Each request carries a fresh ``item`` dict in ``meta`` holding the
        top-level category name (``d_cate``), the list-page URL (``s_href``)
        and the sub-category name (``s_cate``).
        """
        # Every <dt> is a top-level category heading.
        for dt in response.xpath('//div[@class="mc"]/dl/dt'):
            d_cate = dt.xpath('./a/text()').extract_first()
            # The <dd> right after the heading holds its sub-categories.
            for em in dt.xpath('./following-sibling::dd[1]/em'):
                href = em.xpath('./a/@href').extract_first()
                if href is None:
                    continue  # no link, nothing to request
                list_url = self.proces_url(href)
                if list_url is None:
                    self.logger.warning("Skipping unexpected category href: %s", href)
                    continue
                # A new dict per request, so callbacks never share state.
                item = {
                    "d_cate": d_cate,
                    "s_href": list_url,
                    "s_cate": em.xpath('./a/text()').extract_first(),
                }
                yield scrapy.Request(
                    url=list_url,
                    callback=self.parse_book_list,
                    meta={"item": item},
                )

    def proces_url(self, href):
        """Rewrite a sub-category href into a direct list-page URL.

        The category link is answered with a redirect whose result could not
        be scraped, so the list URL is built from the first three numeric ids
        found in *href*.

        Returns the list-page URL, or ``None`` when *href* contains fewer
        than three numeric ids.
        """
        ids = self._DIGITS_RE.findall(href)
        if len(ids) < 3:
            return None
        return "https://list.jd.com/list.html?cat={0},{1},{2}&tid={2}".format(*ids[:3])

    def parse_book_list(self, response):
        """Extract the books of one list page and follow pagination.

        The price is not part of the list page: it is loaded by a second
        request keyed by the book's SKU id, so one price request is issued
        per book and the collected fields travel along in ``meta["item"]``.
        Books without a SKU id are skipped because no price can be requested.
        """
        category = response.meta["item"]  # category fields set by ``parse``
        for li in response.xpath('//div[@id="plist"]/ul/li'):
            # Copy the category fields so every book owns its own dict.
            book = dict(category)
            book["book_img"] = li.xpath(
                './/div[@class="p-img"]//img/@src').extract_first()
            book["book_name"] = self._clean(
                li.xpath('.//div[@class="p-name"]//em/text()').extract_first())
            # There may be several authors, so keep the whole list.
            # NOTE: the original author remarked that authors cannot be
            # obtained at the moment.
            book["book_author"] = li.xpath(
                './/span[@class="p-bi-name"]/a/text()').extract()
            href = self._clean(
                li.xpath('.//div[@class="p-name"]/a/@href').extract_first())
            book["book_href"] = self._absolute_url(response.url, href)

            sku = li.xpath('./div/@data-sku').extract_first()
            book["book_sku"] = sku
            if not sku:
                continue
            yield scrapy.Request(
                self._PRICE_URL_PREFIX + sku,
                callback=self.parse_book_price,
                meta={"item": book},
            )

        # Pagination: resolve the "next page" link against the current URL.
        next_url = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_url is not None:
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(
                next_url,
                callback=self.parse_book_list,
                meta={"item": category},
            )

    def parse_book_price(self, response):
        """Add the price to the book and append one line to ``jd.txt``.

        The response body is expected to be a JSON list whose first element
        carries the price under ``"op"``. When the body has any other shape
        the price is left empty and a warning is logged, so the book is
        still recorded.
        """
        item = response.meta["item"]
        try:
            payload = json.loads(response.body.decode("utf-8"))
            item["book_price"] = payload[0]["op"]
        except (ValueError, LookupError, TypeError):
            # Covers invalid JSON / bad encoding, an empty list, a dict
            # payload and a missing "op" key.
            self.logger.warning("No price in response from %s", response.url)
            item["book_price"] = ""

        book_name = item.get("book_name") or ""
        book_price = item.get("book_price")
        book_price = "" if book_price is None else str(book_price)
        book_href = item.get("book_href") or ""
        with open("jd.txt", "a", encoding="utf-8") as f:
            f.write(book_name + "\t\t价格：" + book_price + "\t\t地址：" + book_href + "\n")

# 猜你喜欢 ("You may also like": leftover page footer, not part of the spider)