使用 Selenium 自动化爬取京东电脑商品信息,用于数据分析

今天使用 Selenium 给别人写了一个自动化爬虫程序。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import json
import re
import time
import os
import csv

# One Firefox session shared by every function in this script.
browser = webdriver.Firefox()
browser.maximize_window()  # maximize the browser window
# Shared explicit wait: each wait.until(...) gives up after 10 seconds.
wait = WebDriverWait(browser, timeout=10)

def search(keyword="电脑"):
    """Open the JD home page and submit a product search.

    Args:
        keyword: Text typed into the search box. Defaults to "电脑", the
            value that was previously hard-coded.

    Raises:
        TimeoutException: If the search box or the search button is not
            available within the shared wait's timeout.
    """
    browser.get("https://www.jd.com/")
    # Named search_box (not ``input``) so the builtin is not shadowed.
    search_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
    )
    search_button = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#search > div > div.form > button > i")
        )
    )
    search_box.send_keys(keyword)
    search_button.click()

def next_page(page_number):
    """Jump to a result page via the pager's page-number box and scrape it.

    Args:
        page_number: Page number to type into the pager's jump box.

    Raises:
        TimeoutException: If the jump box or its confirm button is not
            available within the shared wait's timeout.
    """
    # Named page_box (not ``input``) so the builtin is not shadowed.
    page_box = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
        )
    )
    confirm_button = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
        )
    )
    page_box.clear()
    page_box.send_keys(str(page_number))
    confirm_button.click()
    try:
        # Scrape only once the pager highlights the requested page; parsing
        # straight after the click can read the previous page's items again.
        # NOTE: this condition is a substring match, so "1" is also satisfied
        # by "10"; it is a refresh guard, not an exact page check.
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"),
                str(page_number),
            )
        )
    except TimeoutException:
        # Best effort: fall back to the old behavior and parse whatever is
        # currently loaded instead of aborting the whole crawl.
        print("Timed out waiting for page %s to load; parsing anyway" % page_number)
    jiexi_page()

def write_to_file(content):
    """Append one row to the CSV file ``京东商品.csv``.

    Args:
        content: Iterable of field values written as a single CSV row.
    """
    # The with block closes the file after each row; the handle was
    # previously opened on every call and never closed.
    with open('京东商品.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(content)

def jiexi_page():
    """Scrape every product card on the currently loaded result page.

    Waits until at least one ``#J_searchWrap .gl-item`` element is present,
    parses the page source with PyQuery, then prints each product's values
    (name, price, review text, shop) and appends them via write_to_file.
    """
    item_selector = "#J_searchWrap .gl-item"
    # Block until the result list has loaded.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    doc = pq(browser.page_source)
    for card in doc(item_selector).items():
        name = card.find(".p-name.p-name-type-2").text().replace("\n", "")
        # Drops the first character of the price text (presumably the
        # currency sign -- confirm against the live page).
        price = card.find(".p-price").text()[1:].replace("\n", "")
        # Drops the last three characters of the review text (presumably a
        # fixed suffix -- confirm against the live page).
        reviews = card.find(".p-commit").text()[:-3]
        shop = card.find(".p-shop").text()
        product = {'name': name, 'price': price, '评价': reviews, 'shop': shop}
        row = list(product.values())
        print(row)
        write_to_file(row)

def main(total_pages=100):
    """Run the crawl: search, then scrape result pages 1..total_pages.

    Args:
        total_pages: Number of the last result page to visit. Defaults to
            100, the value that was previously hard-coded.
    """
    try:
        search()
        time.sleep(1)
        next_page(1)
        for page in range(2, total_pages + 1):
            next_page(page)
            # Pause so the page can refresh; without it the next element
            # lookup failed because the element was not found yet.
            time.sleep(2)
    finally:
        # Always end the browser session so no Firefox/driver process is
        # left running, even when the crawl raises.
        browser.quit()


if __name__ == "__main__":
    main()

在这里插入图片描述

发布了60 篇原创文章 · 获赞 6 · 访问量 7772

猜你喜欢

转载自blog.csdn.net/qq_44205272/article/details/103297643