利用selenium批量获取蓝桥杯历年真题(仅供参考)

"""
爬取蓝桥杯历届试题
tips:
1. 提前登录账号并加入历届试题课程:https://www.lanqiao.cn/courses/2786/learning
2. 本代码还有待改善,因为蓝桥网站反爬太强,到处都是动态加载和请求,导致很多转圈圈的gif动图
3. 使用无头浏览器需要先把登录二维码截取下来并弹窗展示
4. 这里只提供一个思路
5. 有时间和能力再改进
"""
import os
from time import sleep
from urllib import request

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

# Chrome options for an optional headless run (no visible browser window).
chrome_optios = Options()
chrome_optios.add_argument('--headless')
chrome_optios.add_argument('--disable-gpu')

# Root directory that receives the downloaded images.
dirName = 'TestLibs'
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(dirName, exist_ok=True)

# Course page whose courseware is scraped once the user is logged in.
url = 'https://www.lanqiao.cn/courses/2786/learning/'
login_url = 'https://www.lanqiao.cn/login/'

# A visible browser is used so the login QR code can be scanned directly.
# To run headless instead, also pass chrome_options=chrome_optios here; the
# QR code would then have to be captured and displayed separately.
bro = webdriver.Chrome(executable_path='chromedriver.exe')

# Open the login page and leave 20 seconds for the user to log in by hand.
bro.get(login_url)
sleep(20)

# Move on to the courseware page.
bro.get(url)


# XPath of the button that advances to the next courseware. It is used both
# to detect that the page has rendered and to move on to the next page.
NEXT_BUTTON_XPATH = '//*[@id="__layout"]/div/div[2]/div[1]/div[4]/div/div/div[2]/div/button'
# XPath of the <div> wrappers that each hold one courseware image.
IMAGE_DIV_XPATH = '//*[@id="__layout"]/div/div[2]/div[1]/div[4]/div/div/div[1]/div/div/div[2]/div/div/div/div'

num = 1       # 1-based index of the courseware currently being scraped
flag = True   # cleared once no "next" button shows up, i.e. on the last page

try:
    while flag:
        # Paging too quickly yields nothing but the "loading" spinner GIFs.
        sleep(10)
        # Images are lazy-loaded: wait up to 10 s (polling every 0.5 s) for
        # the "next" button to appear.
        try:
            WebDriverWait(bro, 10, 0.5).until(
                lambda driver: driver.find_element(By.XPATH, NEXT_BUTTON_XPATH))
        except TimeoutException:
            # No button within the timeout: scrape this page, then stop.
            flag = False

        tree = etree.HTML(bro.page_source)

        title = '课件' + str(num)
        page_dir = dirName + '/' + title

        # Collect the image URLs, skipping wrapper divs without an <img src>.
        img_urls = []
        for img in tree.xpath(IMAGE_DIV_XPATH):
            srcs = img.xpath('./img/@src')
            if srcs:
                img_urls.append(srcs[0])

        # Create the per-courseware directory only when there is something
        # to store in it.
        if img_urls:
            os.makedirs(page_dir, exist_ok=True)

        for n, img_url in enumerate(img_urls, start=1):
            imgPath = page_dir + '/' + str(n) + '.png'
            print(imgPath)
            print(img_url)
            request.urlretrieve(img_url, filename=imgPath)

        if flag:
            bro.find_element(By.XPATH, NEXT_BUTTON_XPATH).click()
        num += 1
        print(num)
finally:
    # Always shut the browser down, even if scraping fails part-way.
    bro.quit()


猜你喜欢

转载自blog.csdn.net/qq_31910669/article/details/114292828
今日推荐