"""Scrape vip.com product listings with Selenium and store them in MongoDB."""

import time
from random import randint

import pymongo
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class WpShop():
    """Crawl laptop listings ('电脑') from vip.com and store them in MongoDB.

    NOTE(review): the Mongo client and the Chrome driver are created at class
    definition time (an import side effect). Kept as-is for backward
    compatibility with code that reads these class attributes, but lazy
    initialization would be cleaner.
    """

    # MongoDB collection py_spider.wp_shop receives one document per product.
    mongoClient = pymongo.MongoClient()
    db = mongoClient['py_spider']['wp_shop']

    # Chrome options: disable image loading to speed up page rendering.
    options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option('prefs', prefs)

    # Launch the Chrome driver. The attribute name 'brower' (sic) is kept
    # unchanged so existing callers that reference it keep working.
    brower = webdriver.Chrome(options=options)

    @classmethod
    def base(cls):
        """Open the vip.com home page and submit a search for '电脑'."""
        cls.brower.get('https://www.vip.com/')

        wait = WebDriverWait(cls.brower, 10)

        # Type the query into the search box once it appears in the DOM.
        el_input = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//input[@class="c-search-input  J-search-input"]')
        ))
        el_input.send_keys('电脑')

        # Wait until the search button is actually clickable instead of the
        # original presence-check plus a fixed 2-second sleep.
        el_button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//span[@class="c-search-icon"]')
        ))
        el_button.click()

        # Small random pause to look less like a bot.
        time.sleep(randint(1, 3))

    @classmethod
    def dropDown(cls):
        """Scroll the page down in steps so lazy-loaded items render."""
        for step in range(1, 12):
            js_code = f'document.documentElement.scrollTop = {step * 1000}'
            cls.brower.execute_script(js_code)
            time.sleep(randint(1, 3))

    @classmethod
    def parseData(cls):
        """Extract title/price/link for every product on every result page.

        Pages are iterated with a while-loop instead of the original mutual
        recursion (parseData -> nextPage -> parseData), which could exceed
        Python's recursion limit on long result sets.
        """
        while True:
            cls.dropDown()  # scroll so all items on the page are loaded
            div_list = cls.brower.find_elements(
                By.XPATH,
                '//section[@id="J_searchCatList"]/div[@class="c-goods-item  J-goods-item c-goods-item--auto-width"]')

            for div in div_list:
                href = div.find_element(By.XPATH, './a')
                price = div.find_element(
                    By.XPATH,
                    './/div[@class="c-goods-item__sale-price J-goods-item__sale-price"]')
                title = div.find_element(By.XPATH, './/div[2]/div[2]')

                item = {
                    'title': title.text,
                    'price': price.text,
                    'href': href.get_attribute("href"),
                }
                print(item)
                cls.saveData(item)

            if not cls.nextPage():  # stop when there is no further page
                break

    @classmethod
    def saveData(cls, item):
        """Insert one product document into the MongoDB collection."""
        cls.db.insert_one(item)

    @classmethod
    def nextPage(cls):
        """Advance to the next result page if one exists.

        Returns True when the next page was opened, False when the crawl is
        finished. On the final page the driver is shut down with quit()
        (close() only closed the window and leaked the chromedriver process).
        """
        try:
            next_button = cls.brower.find_element(
                By.XPATH, './/div//*[@id="J_nextPage_link"]')
            if next_button.is_displayed():
                next_button.click()
                return True
            cls.brower.quit()
            return False
        # Narrowed from a bare `except Exception`, which also hid real bugs;
        # WebDriverException covers missing-element and click failures.
        except WebDriverException as e:
            print("最后一页", e)
            cls.brower.quit()
            return False

    @classmethod
    def main(cls):
        """Entry point: run the search, then crawl all result pages."""
        cls.base()
        cls.parseData()

if __name__ == '__main__':
    # Every method is a @classmethod, so instantiating WpShop() served no
    # purpose; call the entry point on the class directly.
    WpShop.main()

# Adapted from: https://blog.csdn.net/qq_58158950/article/details/135010661