Scraping Taobao Product Data

Using selenium 3 + bs4
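
Setup note (my assumption, not from the original post): the script relies on the selenium 3 find_element_by_* API, which selenium 4 removed, so pinning selenium below 4 keeps it runnable:

pip install "selenium<4" beautifulsoup4 lxml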

Approach: if we open the Taobao site through selenium, it asks us to log in. We can make the program sleep for a while so we can scan the QR code and log in manually; after that the product pages are accessible, and bs4 can parse the product data out of the HTML.
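
The script below uses a fixed time.sleep(10) for the login step. An explicit wait is a more forgiving alternative; this is a minimal sketch, not from the original post, and it assumes the result cards carry the J_MouserOnverReq class (the same hook the parser below uses):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.taobao.com/')
driver.find_element_by_id('q').send_keys('手机')
driver.find_element_by_class_name('btn-search').click()

# Give the user up to two minutes to scan the QR code; the wait returns
# as soon as the first product card appears in the DOM.
WebDriverWait(driver, 120).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'J_MouserOnverReq'))
)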

import json
import time
from selenium import webdriver
from bs4 import BeautifulSoup


class SpiderProduct(object):
    def __init__(self, url, obj, fp, end_page, driver):
        self.fp = fp
        self.end_page = end_page
        self.start_page = 1
        self.url = url
        self.obj = obj
        self.driver = driver

    # Entry point: open the site, search for the keyword, wait for manual login
    def start(self):
        # Headless mode (left commented out) would not work here, since the
        # manual QR-code login below needs a visible browser window.
        # options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        # driver = webdriver.Chrome(options=options)
        self.driver.get(self.url)
        self.driver.implicitly_wait(10)
        self.driver.find_element_by_id('q').send_keys(self.obj)
        self.driver.find_element_by_class_name('btn-search').click()

        # Scan the QR code and log in manually while the script sleeps
        time.sleep(10)
        self.get_html()

    # Fetch the current page's HTML, parse it, then move on to the next page
    def get_html(self):
        page_data = self.driver.page_source
        self.get_product_detail(page_data)
        self.start_page += 1
        if self.start_page > self.end_page:
            return
        self.get_to_next_page()

    # Advance to the next page of results
    def get_to_next_page(self):
        next_page = self.driver.find_element_by_class_name('next')
        # The next-page element doesn't respond to a normal click here,
        # so fire the click through JavaScript instead
        self.driver.execute_script("arguments[0].click()", next_page)
        # Wait for the next page's content to load
        time.sleep(5)
        self.get_html()

    # Extract the product details from one page of results
    def get_product_detail(self, page_data):
        soup = BeautifulSoup(page_data, 'lxml')
        # Every product card on the page carries the J_MouserOnverReq class
        one_page_products = soup.find_all(class_='J_MouserOnverReq')
        products = []
        for one_product in one_page_products:
            productImg = one_product.find(class_="J_ItemPic").get('src')
            # strip the leading currency symbol from the price text
            productPrice = one_product.find(class_="g_price").get_text().strip()[1:]
            productTitle = one_product.find(class_="row-2").get_text().strip()
            productShop = one_product.find(class_="shopname").get_text().strip()
            # drop the trailing three-character "人付款" (buyers) suffix from the sales count
            productStatus = one_product.find(class_="deal-cnt").get_text().strip()[:-3]
            productLocation = one_product.find(class_='row-3').find(class_='location').get_text().strip()
            print(productPrice)
            print(productTitle)
            print(productShop)
            print(productStatus)
            print(productLocation)
            time.sleep(0.5)

            product_one = {
                'image': productImg,
                'price': productPrice,
                'title': productTitle,
                'shop': productShop,
                'sales': productStatus,
                'location': productLocation,
            }
            products.append(product_one)
        self.save_to_file(products)

    # Append one page of products to the output file as a line of JSON
    def save_to_file(self, products):
        print(len(products))
        self.fp.write(json.dumps(products, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    fp = open('taobao.json', 'a', encoding='utf-8')
    # url = 'https://www.tmall.com/'
    url = 'https://www.taobao.com/'
    obj = '手机'  # search keyword: "mobile phone"
    end_page = 10
    driver = webdriver.Chrome()
    try:
        sp = SpiderProduct(url, obj, fp, end_page, driver)
        sp.start()
    finally:
        fp.close()
        # quit() shuts down the whole browser session, not just one window
        driver.quit()
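
Since save_to_file appends one JSON array per line, taobao.json ends up as a JSON-lines style file rather than a single JSON document. A minimal sketch for reading it back; only the filename and the keys come from the script above, the rest is illustrative:

import json

with open('taobao.json', encoding='utf-8') as f:
    for line in f:
        page = json.loads(line)  # one page of products per line
        for item in page:
            print(item['title'], item['price'])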

Reposted from blog.csdn.net/weixin_42825585/article/details/87902909