Python web crawler: using Selenium to scrape product information from JD.com (Jingdong)

This blog post is only a personal record of my hobby work and is published solely for readers' reference. If it infringes on anyone's rights, please let me know and I will delete it.

The code uses the webdriver module from selenium, so to run the code below you must first download the driver program for your browser.
Google Chrome driver: http://npm.taobao.org/mirrors/chromedriver/
Firefox driver: https://github.com/mozilla/geckodriver/releases
Click the corresponding link, find the driver that matches your own browser version, and download it into your Python installation directory.
Without further ado, here is the code. If anything is unclear or you do not understand something, leave a comment below.

from selenium import webdriver
import time
import csv

class JdSpider(object):
    """Selenium-driven crawler that searches JD.com and appends results to a CSV file.

    Elements are located with the generic ``find_element(by, value)`` /
    ``find_elements(by, value)`` API, passing the locator strategy as a
    string. This form works on both Selenium 3 and Selenium 4, whereas the
    ``find_element_by_*`` shortcuts were removed in Selenium 4.3.

    Args:
        keyword: Text typed into the site's search box.
        output_path: CSV file that scraped rows are appended to.
    """

    def __init__(self, keyword='爬虫书籍', output_path='xxx.csv'):
        # Start page of the site to crawl.
        self.url = 'https://www.jd.com/'
        self.keyword = keyword
        self.output_path = output_path
        # Browser object that drives a Chrome instance.
        self.browser = webdriver.Chrome()

    def get_page(self):
        """Open the start page, type the search keyword and click the search button."""
        self.browser.get(self.url)
        self.browser.find_element(
            'xpath', '//*[@id="key"]').send_keys(self.keyword)
        self.browser.find_element(
            'xpath', '//*[@id="search"]/div/div[2]/button').click()
        # Sleep two seconds so the result page can load.
        time.sleep(2)

    @staticmethod
    def _extract_fields(text):
        """Turn the visible text of one result ``<li>`` into a CSV row.

        The text is split on newlines. When the first line is '单件' the
        fields start two lines later; otherwise they start at the first line.
        The four consecutive lines are read as price, name, comment, seller.

        Args:
            text: The ``.text`` of a result list item.

        Returns:
            ``[price, seller, name, comment]``, or ``None`` when the entry
            has too few lines to contain all four fields.
        """
        lines = text.split('\n')
        offset = 2 if lines[0] == '单件' else 0
        if len(lines) < offset + 4:
            return None
        price, name, comment, market = lines[offset:offset + 4]
        return [price, market, name, comment]

    def parse_page(self):
        """Scrape every product on the current result page and append it to the CSV."""
        # Run a JS snippet that scrolls to the very bottom of the page.
        self.browser.execute_script(
            'window.scrollTo(0,document.body.scrollHeight)')
        # Give the page time to load its products.
        time.sleep(3)

        li_list = self.browser.find_elements(
            'xpath', '//*[@id="J_goodsList"]/ul/li')
        rows = []
        for li in li_list:
            item = self._extract_fields(li.text)
            if item is None:
                # Entry without the expected fields: skip it instead of
                # failing with IndexError.
                continue
            # Echo the row in the same field order that is written to disk.
            print(item)
            rows.append(item)

        if rows:
            # Open the file once per page; newline='' is what the csv module
            # requires to avoid blank lines between rows on Windows.
            with open(self.output_path, 'a', encoding='utf-8', newline='') as f:
                csv.writer(f).writerows(rows)

    def main(self):
        """Run the search, scrape each result page in turn, then close the browser."""
        try:
            self.get_page()
            while True:
                self.parse_page()

                # Decide whether there is a next page to click through to.
                if 'pn-next disabled' in self.browser.page_source:
                    break
                next_buttons = self.browser.find_elements('class name', 'pn-next')
                if not next_buttons:
                    # No next-page control on this page: nothing more to crawl.
                    break
                next_buttons[0].click()
                time.sleep(2)
        finally:
            # Always release the browser and its driver process.
            self.browser.quit()

# Script entry point: when executed directly, build the crawler and run it.
if __name__ == '__main__':
    JdSpider().main()

Based on what was covered in class, I tidied up some of the comments and added a step that saves the results to a file.

Published 34 original articles · won praise 210 · views 20000 +

Guess you like

Origin blog.csdn.net/weixin_45081575/article/details/97023841