pyppeteer 是 Puppeteer 的 Python 版本,通过 CDP 协议(Chrome DevTools Protocol)控制 Chromium/Chrome 浏览器的行为。
打开网页、截屏
import asyncio
from pyppeteer import launch


async def main():
    """Open a page in Chromium, wait for it to settle, save a screenshot, close."""
    browser = await launch(
        headless=False,  # show the browser window (set True for headless mode)
        defaultViewport={"width": 1920, "height": 1080},  # viewport size
        isMobile=True,
        # Maximize the window; --no-sandbox is required when running as root (e.g. in Docker).
        args=['--start-maximized', '--no-sandbox'],
        # Hide the "Chrome is being controlled by automated software" infobar.
        ignoreDefaultArgs=['--enable-automation'],
    )
    page = await browser.newPage()
    await page.goto('http://www.dzwww.com/')
    await page.waitFor(20000)  # wait 20 s so the page can finish loading
    await page.screenshot({'path': 'example.png'})
    await browser.close()


asyncio.get_event_loop().run_until_complete(main())
css定位查找第一个元素
# Jeval is equivalent to querySelectorEval: runs the JS function on the first
# element matching the CSS selector and returns the (serialized) result.
title = await page.Jeval("#politics h2 a", 'node => node.title')
css定位查找所有元素
# JJeval is equivalent to querySelectorAllEval: runs the JS function on the
# array of all elements matching the CSS selector.
urls = await page.JJeval(".city a", '(nodes => nodes.map(n => n.href))')
print(urls)
xpath定位查找
# Jx (XPath query) returns the list of element handles matching the expression.
elements = await page.Jx('//div[@id="politics"]//div[@class="mid"]//ul/li/a')
获取属性
第一种方法:
# Method 1: read the "href" property of each handle via getProperty.
elements = await page.Jx('//div[@id="politics"]//div[@class="mid"]//ul/li/a')
for element in elements:
    # getProperty returns a JSHandle; jsonValue() serializes it to a Python value.
    url_context = await element.getProperty("href")
    u = await url_context.jsonValue()
    print(u)
第二种方法:
# Method 2: evaluate a JS function with the element handle passed as its argument.
for element in elements:
    href = await page.evaluate('item => item.href', element)
    print(href)
拦截请求和响应
page对象支持on方法用来绑定事件,如请求和响应事件,这里以响应事件为例:
import asyncio
import datetime
from urllib.parse import urlparse
import openpyxl
import pyppeteer.errors
from pyppeteer import launch
# Timestamp string in YYYYmmddHHMMSS form.
# NOTE(review): not referenced anywhere in the visible code — possibly leftover.
df = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
def save_excel(data):
    """Append one row (url, status) to the first sheet of data.xlsx.

    :param data: dict with keys "url" and "status".
    :return: None
    """
    work_book = openpyxl.load_workbook("data.xlsx")
    try:
        work_sheet = work_book[work_book.sheetnames[0]]
        work_sheet.append([data["url"], data["status"]])
        work_book.save("data.xlsx")
        print("saved")
    finally:
        # Release workbook resources even if append/save raises.
        work_book.close()
def get_domain(url):
    """Return the network location (host[:port]) component of *url*."""
    return urlparse(url).netloc
async def intercept_response(rep):
    """Log and persist any response whose HTTP status is 403 or 404."""
    if rep.status in (403, 404):
        print(rep.status, rep.url)
        save_excel({"url": rep.url, "status": rep.status})
    return None
async def main():
    """Crawl same-domain links two levels deep; intercept_response records 403/404s."""
    browser = await launch(
        headless=True,  # headless mode
        defaultViewport={"width": 1920, "height": 1080},  # viewport size
        isMobile=True,
        # Maximize the window; --no-sandbox is required when running as root.
        args=['--start-maximized', '--no-sandbox'],
        # Hide the "controlled by automated software" infobar.
        ignoreDefaultArgs=['--enable-automation'],
    )
    page = await browser.newPage()
    # Every response on every navigation is funneled through intercept_response.
    page.on('response', lambda rep: asyncio.ensure_future(intercept_response(rep)))

    async def collect_links(pg):
        # Hrefs of all <a> elements on the page currently loaded in *pg*.
        return await pg.JJeval("a", '(nodes => nodes.map(n => n.href))')

    async def visit(pg, link):
        # Best-effort navigation: a timeout is ignored, not fatal.
        try:
            await pg.goto(link)
        except pyppeteer.errors.TimeoutError:
            pass

    url = "http://www.xxx.com/"
    print(url)
    await page.goto(url)
    for url1 in await collect_links(page):
        print(url1)
        if url1.startswith("http") and "xxx.com" in get_domain(url1):
            await visit(page, url1)
            # Second level: links found on the page we just navigated to.
            for url2 in await collect_links(page):
                print(url2)
                if url2.startswith("http") and "xxx.com" in get_domain(url2):
                    await visit(page, url2)
    await browser.close()


asyncio.get_event_loop().run_until_complete(main())
参考
https://miyakogi.github.io/pyppeteer/
https://blog.csdn.net/lilongsy/article/details/82754046
https://miyakogi.github.io/pyppeteer/reference.html