pyppeteer打开网页、截屏、定位、上下文执行、获取元素属性值、拦截请求和响应

pyppeteer是Puppeteer的Python版,是通过CDP协议(Chrome DevTools Protocol)来控制 Chromium/Chrome 浏览器的行为。

打开网页、截屏

import asyncio
from pyppeteer import launch


async def main():
    """Launch a visible Chromium window, open the page, screenshot it, exit.

    NOTE(review): the original comment on headless claimed "headless mode",
    but headless=False actually SHOWS the browser window; the comment is
    corrected here and the behavior kept as-is.
    """
    browser = await launch(
        headless=False,  # show the browser window (NOT headless)
        defaultViewport={"width": 1920, "height": 1080},  # viewport size
        isMobile=True,
        args=['--start-maximized', '--no-sandbox'],  # maximize window, disable sandbox
        ignoreDefaultArgs=['--enable-automation']  # hide the "controlled by automation" banner
    )
    page = await browser.newPage()
    await page.goto('http://www.dzwww.com/')
    await page.waitFor(20000)  # wait 20 s for the page to settle (milliseconds)
    await page.screenshot({'path': 'example.png'})
    await browser.close()


asyncio.get_event_loop().run_until_complete(main())

css定位查找第一个元素

# Jeval is shorthand for querySelectorEval: runs the JS function against the
# FIRST element matching the CSS selector and returns the JS result.
title = await page.Jeval("#politics h2 a", 'node => node.title')

css定位查找所有元素

# JJeval is shorthand for querySelectorAllEval: runs the JS function against
# the array of ALL elements matching the CSS selector.
urls = await page.JJeval(".city a", '(nodes => nodes.map(n => n.href))')
print(urls)

xpath定位查找

elements = await page.Jx('//div[@id="politics"]//div[@class="mid"]//ul/li/a')

获取属性

第一种方法:

elements = await page.Jx('//div[@id="politics"]//div[@class="mid"]//ul/li/a')
for element in elements:
    url_context = await element.getProperty("href")
    u = await url_context.jsonValue()
    print(u)

第二种方法:

for element in elements:
    href = await page.evaluate('item => item.href', element)
    print(href)

拦截请求和响应

page对象支持on方法用来绑定事件,如请求和响应事件,这里以响应事件为例:

import asyncio
import datetime
from urllib.parse import urlparse
import openpyxl
import pyppeteer.errors
from pyppeteer import launch

df = datetime.datetime.now().strftime("%Y%m%d%H%M%S")


def save_excel(data):
    """Append one crawl result as a row to data.xlsx.

    :param data: dict with keys "url" and "status" (see intercept_response)
    :return: None
    """
    try:
        work_book = openpyxl.load_workbook("data.xlsx")
    except FileNotFoundError:
        # Robustness fix: the original crashed when data.xlsx did not yet
        # exist; start a fresh workbook instead.
        work_book = openpyxl.Workbook()
    try:
        # Always write to the first sheet; rows are appended at the bottom.
        work_sheet = work_book[work_book.sheetnames[0]]
        work_sheet.append([data["url"], data["status"]])
        work_book.save("data.xlsx")
        print("saved")
    finally:
        work_book.close()


def get_domain(url):
    """Return the network-location (host[:port]) component of *url*."""
    return urlparse(url).netloc


async def intercept_response(rep):
    """Response hook: log 403/404 responses and append them to the Excel file."""
    if rep.status in (403, 404):
        print(rep.status, rep.url)
        save_excel({"url": rep.url, "status": rep.status})


async def _page_links(page):
    """Collect the href of every <a> element on the current page."""
    return await page.JJeval("a", '(nodes => nodes.map(n => n.href))')


def _is_internal(link):
    """True for absolute http(s) links whose host contains xxx.com."""
    return link.startswith("http") and "xxx.com" in get_domain(link)


async def _visit(page, link):
    """Best-effort navigation: swallow navigation timeouts and continue."""
    try:
        await page.goto(link)
    except pyppeteer.errors.TimeoutError:
        pass


async def main():
    """Crawl the start page plus two levels of same-domain links.

    Every HTTP response (including sub-resources) passes through
    intercept_response, which records 403/404 hits to Excel.
    """
    browser = await launch(
        headless=True,  # headless mode
        defaultViewport={"width": 1920, "height": 1080},  # viewport size
        isMobile=True,
        args=['--start-maximized', '--no-sandbox'],  # maximize window, disable sandbox
        ignoreDefaultArgs=['--enable-automation']  # hide the "controlled by automation" banner
    )
    page = await browser.newPage()
    # page.on expects a sync callback, so schedule the async hook as a task.
    page.on('response', lambda rep: asyncio.ensure_future(intercept_response(rep)))
    url = "http://www.xxx.com/"
    print(url)
    await page.goto(url)

    # Level 1: links found on the start page.
    for url1 in await _page_links(page):
        print(url1)
        if _is_internal(url1):
            await _visit(page, url1)
            # Level 2: links found on each first-level page.
            for url2 in await _page_links(page):
                print(url2)
                if _is_internal(url2):
                    await _visit(page, url2)
    await browser.close()


asyncio.get_event_loop().run_until_complete(main())

参考

https://miyakogi.github.io/pyppeteer/
https://blog.csdn.net/lilongsy/article/details/82754046
https://miyakogi.github.io/pyppeteer/reference.html

猜你喜欢

转载自blog.csdn.net/lilongsy/article/details/129243814