selenium 抓取京东商品数据

代码参考崔庆才老师的selenium课程,实现对京东界面的商品数据抓取,本博客用于记录。

# -*- coding: utf-8 -*-
# @Time     :2020/2/10 20:05
# @Author   :Zhou
# @File     :jd.py
# @Software :PyCharm
import re
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq

# One Chrome session shared by every function in this script.
broswer = webdriver.Chrome()
# Explicit wait bound to that session; each wait.until() gives up after 10 seconds.
wait = WebDriverWait(broswer, timeout=10)


def Search(key):
    """Open the JD home page, search for ``key`` and scrape the first result page.

    Args:
        key: Keyword typed into the JD search box.

    Returns:
        The text of the total-page-count element on the result page (a string).

    Note:
        A ``TimeoutException`` raised by any wait restarts the whole search
        with the same keyword; there is no retry limit.
    """
    try:
        broswer.get('https://www.jd.com/')
        # Wait until the search box and the submit button exist in the DOM.
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
        )
        submit = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#search > div > div.form > button'))
        )
        # Type the keyword and submit the search.
        search_box.send_keys(key)
        submit.click()

        # Locate the element that holds the total number of result pages.
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b'))
        )
        # Scrape the products shown on the first result page.
        GetProducts()
        return total.text
    except TimeoutException:
        # Retry with the same keyword; the keyword must be forwarded because
        # Search() has no default for it.
        return Search(key)

#翻页
def NextPage(page_num):
    """Jump to result page ``page_num`` and scrape the products on it.

    Args:
        page_num: Page number to type into the "skip to page" box.

    Returns:
        None.

    Note:
        A ``TimeoutException`` raised by any wait retries the same page;
        there is no retry limit.
    """
    try:
        # Scroll to the bottom so the pagination bar is rendered, then give
        # the page a moment to finish loading.
        broswer.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(3)
        page_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input'))
        )
        submit = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a'))
        )
        page_input.clear()
        page_input.send_keys(str(page_num))
        # Bring the confirm button into view before clicking it.
        broswer.execute_script("arguments[0].scrollIntoView(true);", submit)
        submit.click()

        # Confirm the jump: the highlighted page link must show the requested
        # page number. The condition needs both the locator and the text.
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'),
                str(page_num),
            )
        )

        # Scrape the products on the page we just landed on.
        GetProducts()
    except TimeoutException:
        return NextPage(page_num)


#获取商品信息
def GetProducts():
    """Parse the current result page and print one dict per matched ``<li>``.

    Waits for a product card to be present, loads the page source into
    PyQuery, then prints the image URL, price text, title and shop name
    found inside every ``#J_goodsList ul li`` element.
    """
    card_locator = (By.CSS_SELECTOR, '#J_goodsList > ul > li > div')
    wait.until(EC.presence_of_element_located(card_locator))

    document = pq(broswer.page_source)
    # Every <li> under the goods list is treated as one product entry.
    entries = document('#J_goodsList ul li').items()
    # Prints the generator object itself, not its contents.
    print(entries)
    for entry in entries:
        record = dict(
            image=entry.find('img').attr('src'),
            price=entry.find('.p-price').text(),
            name=entry.find('.p-name a').attr('title'),
            shop=entry.find('.p-shop .J_im_icon a').text(),
        )
        # Printing stands in for persistence; swap in storage here if needed.
        print(record)


def main():
    """Search JD for '美食' and scrape every result page, then close the browser."""
    try:
        total = Search('美食')
        print(total)
        # Search() returns the page count as text, so convert it before using
        # it as a range bound. Page 1 was already scraped by Search(); the
        # "+ 1" makes the last page inclusive.
        for i in range(2, int(total) + 1):
            NextPage(i)
    finally:
        # Always shut down the Chrome session, even if scraping fails.
        broswer.quit()


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

抓取结果(部分):

<generator object PyQuery.items at 0x000001B882D3F0A0>
{'image': '//img13.360buyimg.com/n7/jfs/t1/109818/12/5745/238873/5e410ebcE652d5a90/ac41889f7cb7f389.jpg', 'price': '¥163.00', 'name': '杉城 情人节休闲零食大礼包一整箱送女友女生儿童礼盒美食品网红好吃的组合装1800g', 'shop': '杉城京东自营食品专区'}
{'image': '//img12.360buyimg.com/n7/jfs/t1/66065/25/13312/171680/5da84a0cE423502e2/5fb16ea276bd1949.jpg', 'price': '¥34.90', 'name': '【百草味宅家囤货】1件减3,2件减10!满减专区满199减100,立即抢购》', 'shop': '百草味京东自营旗舰店'}
{'image': '//img11.360buyimg.com/n7/jfs/t19789/156/1695647084/405881/b96f328/5ad3331aN61d56b5e.jpg', 'price': '¥19.90¥18.80', 'name': '现烤现发,吃起来香酥脆,每袋15个', 'shop': '徽太狼食品专营店'}
{'image': '//img11.360buyimg.com/n7/jfs/t1/93066/10/12117/436643/5e40fd1bEa30dcc21/39084840940dd721.jpg', 'price': '¥69.90', 'name': '杉城 情人节鸭肉肉类休闲零食大礼包一整箱卤味鸭脖鸭掌鸭舌送女友女生礼盒网红美食品好吃的组合装550g', 'shop': '杉城京东自营食品专区'}
{'image': '//img14.360buyimg.com/n7/jfs/t1/82596/17/13947/694702/5db7ac6eEd1138dff/82de3657ffe3144d.jpg', 'price': '¥59.90', 'name': '杉城 情人节肉类休闲零食大礼包一整箱送女友女生儿童礼盒豆干美食品超市网红好吃的组合装1100g', 'shop': '杉城京东自营食品专区'}
{'image': '//img14.360buyimg.com/n7/jfs/t1/91808/26/12032/785150/5e417f7eE3181165a/3f5a4704121a7f0f.png', 'price': '¥15.90', 'name': '【三只松鼠·超级满减】神券爆发ing,先领券再满减,部分商品领券满300减200,限时活动多拍更优惠哦!快来速速抢购吧!,', 'shop': '三只松鼠旗舰店'}
{'image': '//img11.360buyimg.com/n7/jfs/t1/101190/20/11068/300617/5e251816E2542f67e/08f744e4925a1ed3.jpg', 'price': '¥31.80', 'name': '此商品将于2020-02-14,00点结束闪购特卖,良品铺子屯粮行动', 'shop': '良品铺子京东自营旗舰店'}
{'image': '//img14.360buyimg.com/n7/jfs/t1/81464/9/4278/444127/5d27f818E6e7621b5/9248b2bb377ce6d4.jpg', 'price': '¥11.80', 'name': '上好旺金华酥饼20枚 浙江特产美食梅干菜烧饼小吃零食饼干160g', 'shop': ''}
{'image': '//img10.360buyimg.com/n7/jfs/t30313/211/1355903274/123779/af43ccea/5cde44fdNcaaafee0.jpg', 'price': '¥26.80', 'name': '黄心蜜薯5斤装仅19.9元戳此抢购', 'shop': '中国特产·苏陕扶贫馆'}
{'image': '//img14.360buyimg.com/n7/jfs/t1/108859/22/4655/161448/5e218cd4Ec80f045c/cced827be7327d40.jpg', 'price': '¥41.80', 'name': '【春节不打烊,京东快递,传统工艺,香飘四海】大部分地区当/次日达', 'shop': '宝聚源食品官方旗舰店'}
{'image': '//img12.360buyimg.com/n7/jfs/t1/103176/29/11904/151214/5e3e68efEa0ec4dec/f1d686f0e41ba500.jpg', 'price': '¥14.90', 'name': '【三只松鼠·情人节守护季】自营急速达,详情领神券,限时领券满300减200,限2月11日,立即抢购》', 'shop': '三只松鼠京东自营旗舰店'}
{'image': '//img11.360buyimg.com/n7/jfs/t1/102322/30/11041/286620/5e251ab5E1519b189/4946f3baeb3e4695.jpg', 'price': '¥19.90', 'name': '【良品铺子·好物正开抢】【专区爆款限时满199减120】立即抢购》》', 'shop': '良品铺子京东自营旗舰店'}
{'image': '//img13.360buyimg.com/n7/jfs/t1/102451/8/2060/201760/5dca2221E3043db7e/ac08e1d6d48edd87.jpg', 'price': '¥49.80', 'name': '12味大份量1242g', 'shop': '多滋熊食品旗舰店'}
{'image': '//img13.360buyimg.com/n7/jfs/t6370/82/2038752062/214697/da4f5175/595c4cbcN785191b6.jpg', 'price': '¥88.00', 'name': '烧鸡为散装计量熟食,为运输方便简单塑封,非预包装熟食,当天现做,新鲜直达。', 'shop': '刘美食品官方旗舰店'}
{'image': '//img12.360buyimg.com/n7/jfs/t1/85674/9/10821/437595/5e21966cE8eefcb2b/dc78854f7a3c7668.jpg', 'price': '¥49.90', 'name': '葡记 鲜鸡蛋卷凤凰卷 1000g 礼盒装 澳门风味 手工特色美食 曲奇饼干 网红办公室休闲零食', 'shop': '葡记京东自营旗舰店'}
{'image': '//img10.360buyimg.com/n7/jfs/t1/109812/11/5630/197844/5e3e5d88E8c8c26a6/265632f65a571471.jpg', 'price': '¥23.90', 'name': '【三只松鼠·情人节守护季】自营急速达,详情领神券,限时领券满300减200,限2月11日,立即抢购》', 'shop': '三只松鼠京东自营旗舰店'}

网页源代码在我的GitHub上查看。戳这可以联系我。

发布了6 篇原创文章 · 获赞 1 · 访问量 1762

猜你喜欢

转载自blog.csdn.net/msssssss/article/details/104265640
今日推荐