Crawling Jingdong (JD.com) book information with Python

Straight to the code.
Libraries needed: requests, lxml, pymongo.
You need to install these libraries and have a MongoDB database configured.
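
If they are not installed yet, a typical setup (assuming pip; MongoDB itself must be installed and running separately) looks like:

pip install requests lxml pymongo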

import requests
from lxml import etree
import time
import pymongo

class JdSpider(object):
    url = 'https://search.jd.com/Search?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=2.his.0.0&page=%d&s=%d&click=0'
    next_url = 'https://search.jd.com/s_new.php?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=2.his.0.0&page=%d&s=%d&scrolling=y&log_id=1569070263.40778&tpl=2_M&show_items='
    page = 1
    s = 3
    # Build the URLs for one batch: the plain search URL covers the first 30
    # books, and the Ajax URL pattern covers the last 30. Note that this
    # method is called twice per visible page (once from start_spider and
    # once from parse_url), so self.page effectively advances by 2 per page.
    def parse_next_page(self):
        next_page_url = self.url % (self.page, self.s)            # 1, 3 on the first call
        next_url = self.next_url % (self.page + 1, self.s + 23)   # 2, 26 on the first call
        self.page += 1
        self.s += 50
        if self.page <= 100:
            return next_page_url, next_url
    # Send a request; return both an xpath tree and the raw response text.
    def start_request(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            # JD checks the referer header, so set one (here, the requested URL itself).
            'referer': url,
        }
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        # raw string response
        response_str = response.text
        # parsed xpath tree
        response = etree.HTML(response.text)
        return response, response_str


    # Parse the book info out of the raw HTML; return a list of dicts.
    def parse_book(self, response_str):
        # Strip the keyword-highlight tags so the titles come out in one piece.
        response_str = response_str.replace('<font class="skcolor_ljg">', '').replace('</font>', '')
        response_xpath1 = etree.HTML(response_str)
        book_names = response_xpath1.xpath("//li//div[@class='p-name']/a/em/text()")

        # Strip <em>/<i> tags as well so the prices come out in one piece.
        response_str = response_str.replace('<em>', '').replace('</em>', '').replace('<i>', '').replace('</i>', '')
        response_xpath2 = etree.HTML(response_str)
        book_prices = response_xpath2.xpath("//li//div[@class='p-price']/strong/text()")

        book_message_list = []
        for book_name,book_price in zip(book_names,book_prices):
            book_message = {
                '书籍名称':book_name,
                '书籍价格':book_price
            }
            book_message_list.append(book_message)
        return book_message_list


    # Build the Ajax URL for the last 30 books: collect the data-sku ids from
    # the first response and append them as the show_items parameter.
    def parse_url(self, response):
        url_data = response.xpath("//ul[@class='gl-warp clearfix']/li/@data-sku")
        url_data = ','.join(url_data)
        url, next_url = self.parse_next_page()
        next_url = next_url + url_data
        return next_url

    # Data cleaning: drop the entries whose price came back empty.
    def data_clear(self, book_message_list):
        # A couple of entries carry only the bare '¥' symbol with no price;
        # (converting the list to a set would be one way to drop duplicates).
        new_message_list = []
        for book in book_message_list:
            if book['书籍价格'] == '¥':
                book.clear()
            if book != {}:
                new_message_list.append(book)
        return new_message_list
        
    # Crawl the info for the 60 books on one results page.
    def start_spider(self):
        # First request: the first 30 books.
        url, next_url = self.parse_next_page()
        response, response_str = self.start_request(url)
        book_message_list1 = self.parse_book(response_str)
        # Second (Ajax) request: the last 30 books.
        next_url = self.parse_url(response)
        # Get the raw text and the xpath tree for the second response.
        response2, response_str = self.start_request(next_url)
        book_message_list2 = self.parse_book(response_str)
        book_message_list = book_message_list1 + book_message_list2
        book_message_list = self.data_clear(book_message_list)
        return book_message_list

        # Alternatively, save the data to a local CSV (the with-block closes
        # the file on its own, so no explicit close is needed):
        # for book_message in book_message_list:
        #     with open('京东python书籍信息.csv', 'a+') as fp:
        #         fp.write(str(book_message) + '\n')

# Store the data in MongoDB.
class Save_data(object):
    def __init__(self):
        host = "127.0.0.1"
        port = 27017
        dbname = "jdbooks"
        sheetname = "book"
        # Create the MongoDB connection.
        client = pymongo.MongoClient(host=host, port=port)
        # Authenticate (pymongo < 4 API; fill in your own username and password).
        client.admin.authenticate("username", "password", mechanism='SCRAM-SHA-1')
        # Select the database.
        mydb = client[dbname]
        # Collection the data will be written to.
        self.post = mydb[sheetname]

    def process(self, message_list):
        for message in message_list:
            # insert() is deprecated in pymongo 3.x; use insert_one().
            self.post.insert_one(message)
        return True

a = JdSpider()
b = Save_data()
pages = int(input("Enter the number of pages to crawl: "))
for _ in range(pages):
    time.sleep(2)
    book_message_list = a.start_spider()
    b.process(book_message_list)
    print('Page saved successfully')
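
To confirm the documents actually landed, here is a quick read-back (a minimal sketch, assuming the same local MongoDB, database, and collection configured above):

# Sanity check: read back a few stored documents.
import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
# authentication may be needed first, as in Save_data above
for doc in client["jdbooks"]["book"].find().limit(5):
    print(doc)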


JD.com is fairly crawler-friendly; its anti-scraping measures are not particularly complex. The main work is that each results page loads its second half through an Ajax request, so you have to work out the URL pattern and splice the parameters together yourself, and the Ajax endpoint expects a referer header giving the request source, which you can simply add when constructing the headers. The rest is extracting the information with xpath; read through the code above and it is easy to follow.
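
To make the URL pattern concrete, here is a minimal sketch of the page mapping this kind of spider relies on (a sketch only: the 2n-1 / 2n internal page numbering is the commonly observed JD pattern, and the s offsets are illustrative rather than guaranteed):

# Sketch: map a visible results page n to the two request URLs.
# The templates mirror the ones in the spider; offsets are illustrative.
SEARCH = 'https://search.jd.com/Search?keyword=python&enc=utf-8&page=%d&s=%d'
AJAX = ('https://search.jd.com/s_new.php?keyword=python&enc=utf-8'
        '&page=%d&s=%d&scrolling=y&show_items=%s')

def urls_for_page(n, skus):
    # JD serves each visible page as two internal pages: 2n-1 and 2n.
    first_half = SEARCH % (2 * n - 1, 50 * (n - 1) + 3)
    # The Ajax half also wants the data-sku ids of the first 30 items.
    second_half = AJAX % (2 * n, 50 * (n - 1) + 26, ','.join(skus))
    return first_half, second_half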

Origin blog.csdn.net/weixin_45796250/article/details/102739793