Scraping Python book listings from JD.com with Python

Straight to the code.
Libraries used: requests, lxml, pymongo.
Install these libraries and have a MongoDB instance set up and running.
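
If they are not installed yet, the usual pip command pulls in all three:

pip install requests lxml pymongo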

import requests
from lxml import etree
import time
import pymongo

class JdSpider(object):
    url = 'https://search.jd.com/Search?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=2.his.0.0&page=%d&s=%d&click=0'
    next_url = 'https://search.jd.com/s_new.php?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=2.his.0.0&page=%d&s=%d&scrolling=y&log_id=1569070263.40778&tpl=2_M&show_items='
    page = 1
    s = 3
    # Build the URLs for the current result page (the first request covers
    # the first 30 books, the second request the remaining 30)
    def parse_next_page(self):
        next_page_url = self.url % (self.page, self.s)            # starts at page=1, s=3
        next_url = self.next_url % (self.page + 1, self.s + 23)   # second half: page+1, s+23
        self.page += 1
        self.s += 50
        if self.page <= 100:  # stop after 100 half-pages; returns None beyond that
            return next_page_url, next_url
    # Send a request and return both the parsed tree and the raw HTML string
    def start_request(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'referer': url,
        }
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        # the response as a plain string
        response_str = response.text
        # the response as an lxml tree for XPath queries
        response = etree.HTML(response.text)
        return response, response_str


    # Parse the book info out of the HTML, return a list of dicts
    def parse_book(self, response_str):
        # strip the keyword-highlight tags so each title comes out as a single text node
        response_str = response_str.replace('<font class="skcolor_ljg">', '').replace('</font>', '')
        response_xpath1 = etree.HTML(response_str)
        book_names = response_xpath1.xpath("//li//div[@class='p-name']/a/em/text()")

        response_str = response_str.replace('<em>', '').replace('</em>', '').replace('<i>', '').replace('</i>', '')
        response_xpath2 = etree.HTML(response_str)
        book_prices = response_xpath2.xpath("//li//div[@class='p-price']/strong/text()")

        book_message_list = []
        for book_name, book_price in zip(book_names, book_prices):
            book_message = {
                'book_name': book_name,
                'book_price': book_price
            }
            book_message_list.append(book_message)
        return book_message_list


    # Build the URL for the second request (the remaining 30 books); it needs
    # the data-sku ids of the first 30 books appended as the show_items parameter
    def parse_url(self, response):
        url_data = response.xpath("//ul[@class='gl-warp clearfix']/li/@data-sku")
        url_data = ','.join(url_data)
        url, next_url = self.parse_next_page()
        next_url = next_url + url_data
        return next_url

    # Data cleaning: drop the entries whose price came back empty, i.e. the
    # price field holds only the currency sign '¥'; deduplicating via a set
    # is sketched in the helper below
    def data_clear(self, book_message_list):
        new_message_list = []
        for book in book_message_list:
            if book['book_price'] != '¥':
                new_message_list.append(book)
        return new_message_list
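
    # The original note suggested converting the list to a set to remove
    # duplicates, but dicts are unhashable, so that only works on a hashable
    # projection. A minimal sketch of the idea (hypothetical helper, not
    # called anywhere above): track (name, price) tuples in a set.
    def dedup(self, book_message_list):
        seen = set()
        unique_books = []
        for book in book_message_list:
            key = (book['book_name'], book['book_price'])
            if key not in seen:
                seen.add(key)
                unique_books.append(book)
        return unique_books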
        
    # Crawl the info for all 60 books on one result page
    def start_spider(self):
        # first request: the first 30 books
        url, next_url = self.parse_next_page()
        response, response_str = self.start_request(url)
        book_message_list1 = self.parse_book(response_str)
        # second request: the remaining 30 books
        next_url = self.parse_url(response)
        # get the string response and the XPath tree
        response2, response_str = self.start_request(next_url)
        book_message_list2 = self.parse_book(response_str)
        book_message_list = book_message_list1 + book_message_list2
        book_message_list = self.data_clear(book_message_list)
        return book_message_list

        # alternative: store the data locally instead of in MongoDB
        # for book_message in book_message_list:
        #     with open('jd_python_books.csv', 'a+') as fp:
        #         fp.write(str(book_message) + '\n')
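
# The commented-out snippet above writes str(dict) lines, which is not valid
# CSV. A minimal sketch with csv.DictWriter instead (assumes the book_name /
# book_price keys produced by parse_book):
def save_to_csv(book_message_list, path='jd_python_books.csv'):
    import csv
    with open(path, 'a+', newline='', encoding='utf-8') as fp:
        writer = csv.DictWriter(fp, fieldnames=['book_name', 'book_price'])
        if fp.tell() == 0:  # empty file: write the header row first
            writer.writeheader()
        writer.writerows(book_message_list)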

# Store the data in MongoDB
class Save_data(object):
    def __init__(self):
        host = "127.0.0.1"
        port = 27017
        dbname = "jdbooks"
        sheetname = "book"
        # create the MongoDB client; authenticate() was removed in pymongo 4,
        # so pass the credentials directly (fill in your own username/password,
        # or drop them if your local MongoDB runs without authentication)
        client = pymongo.MongoClient(host=host, port=port,
                                     username="username", password="password",
                                     authMechanism='SCRAM-SHA-1')
        # select the database
        mydb = client[dbname]
        # the collection the data goes into
        self.post = mydb[sheetname]

    def process(self, message_list):
        for message in message_list:
            self.post.insert_one(message)  # insert() was removed in pymongo 4
        return True
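
# Note: in newer pymongo the credentials can equivalently be packed into a
# connection URI, in the style sketched below (username and password are
# placeholders):
# client = pymongo.MongoClient("mongodb://username:password@127.0.0.1:27017/")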

a = JdSpider()
b = Save_data()
pages = int(input("Enter the number of result pages to crawl: "))
for _ in range(pages):
    time.sleep(2)  # be polite: pause between pages
    book_message_list = a.start_spider()
    b.process(book_message_list)
    print('page saved')


JD's site is fairly crawler-friendly; there is no especially complex anti-scraping machinery. The main points are: the page loads results dynamically via AJAX, so you have to spot the URL pattern and build the page/s parameters yourself; XPath does the extraction; and the referer header names the URL the request supposedly came from, which you can simply add when constructing the headers. The code above spells out the details; anyone with some Python background should follow it easily.
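
To see the pattern the two URL templates produce, a quick check with the JdSpider class above (page advances by 1 and s by 50 on every call):

spider = JdSpider()
for _ in range(3):
    first_half_url, second_half_url = spider.parse_next_page()
    print(first_half_url)    # page=1, s=3 on the first pass
    print(second_half_url)   # page=2, s=26 on the first pass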

Reposted from blog.csdn.net/weixin_45796250/article/details/102739793