Playwright: installation and basic usage
1. Download playwright library
pip install playwright==1.33.0
playwright install
2. playwright use
2.1 Import library
from playwright.sync_api import sync_playwright
2.2 Basic use
- Start the engine, instantiate the object
browser = playwright.chromium.launch( headless=False,channel='chrome')
- create context
context
context = browser.new_context()
- create page object
page= context.new_page()
- page request
page.goto('https://www.baidu.com/')
- page close
# Close the context first, then the browser that owns it.
context.close()
browser.close()
- Do not load images
import re


# Block image requests so pages load faster.
def cancel_request(route):
    """Abort the intercepted request instead of letting it reach the network."""
    route.abort()


# The dot is escaped so only real ".png" / ".jpg" URLs are intercepted;
# an unescaped "." would also match any character followed by "png"/"jpg".
page.route(re.compile(r"\.(png|jpg)"), cancel_request)
- maximize window
- set parameters
args=['--start-maximized']
and no_viewport=True
# NOTE: `url` must be defined before this snippet runs
# (e.g. url = 'https://www.baidu.com/').
with sync_playwright() as playwright:
    # Start the engine and instantiate the browser object.
    browser = playwright.chromium.launch(
        headless=False,
        args=['--start-maximized'],  # open the window at maximum size
    )
    # no_viewport=True lets the page use the maximized window.
    context = browser.new_context(no_viewport=True)
    page = context.new_page()
    page.goto(url)
    context.close()
    browser.close()
- set parameters
- Request Baidu total code
from playwright.sync_api import sync_playwright

url = 'https://www.baidu.com/'

with sync_playwright() as playwright:
    # Start the engine and instantiate the browser object.
    browser = playwright.chromium.launch(
        headless=True,  # headless mode
        args=['--start-maximized'],  # open the window at maximum size
        channel='chrome',  # open with Google Chrome
    )
    # no_viewport=True shows the maximized window.
    # A context must be created; this is a new concept compared with
    # selenium, and it makes managing cookies and IP proxies easier later.
    context = browser.new_context(no_viewport=True)
    page = context.new_page()
    page.goto(url)
    context.close()
    browser.close()
3. XPATH element positioning method
- xpath tutorial ·
3.1 xpath positioning syntax
xpath=
Just add the `xpath=` prefix in front of the XPath expression:
input_xpath = 'xpath= //input[@id="su"]'
3.2 Related operations
-
click element
page.locator(input_xpath).click()
-
Get element attributes
# Read individual attributes of the element selected by input_xpath.
attribute1 = page.get_attribute(input_xpath, 'value')  # the element's value attribute
attribute2 = page.get_attribute(input_xpath, 'href')  # the element's href attribute
attribute3 = page.get_attribute(input_xpath, 'src')  # the element's src attribute
attribute4 = page.get_attribute(input_xpath, 'id')  # the element's id attribute
-
get element text
# NOTE: `ul` is not defined in these notes; presumably a selector string
# for the target list element -- confirm before use.
text1 = page.locator(ul).all_inner_texts()  # list of texts, one entry per matched element
text2 = page.locator(ul).inner_text()  # all the text joined into one single string
4. Waiting and caching
4.1 Waiting for operation
- forced wait (blocking sleep)
import time

time.sleep(n)  # n is the number of seconds to block
- playwright force wait
page.wait_for_timeout(1000) # 单位ms
- Set the maximum waiting time for access
- Set a timeout on the page request
page.goto(url, timeout=1000) # 单位ms
- Set a timeout on a click
page.locator(href_xpath).click(timeout=1000) # 单位ms
- Request Action Set Duration
4.2 Add cache
- Add cache to improve loading speed
import os

from playwright.sync_api import Playwright, sync_playwright

# Base directory under which the browser profile (cache) is stored.
# The original snippet used root_path without defining it; the current
# working directory is used here as a default -- adjust as needed.
root_path = os.getcwd()


def run(playwright: Playwright) -> None:
    """Open Baidu in a persistent Firefox context so cached data is reused.

    Args:
        playwright: The Playwright instance yielded by sync_playwright().
    """
    # launch_persistent_context returns a browser context (not a Browser),
    # backed by the profile stored in user_data_dir.
    context = playwright.firefox.launch_persistent_context(
        user_data_dir=os.path.join(root_path, 'user_data'),  # where the cache is saved
        headless=False,
        args=['--start-maximized'],
        accept_downloads=True,
        slow_mo=250,
    )
    page = context.new_page()
    page.goto('https://www.baidu.com/')
    context.close()


with sync_playwright() as playwright:
    run(playwright)