1. Description
While crawling a certain website I ran into encrypted request parameters. The JS code is obfuscated and compiled, so it is not easy to reverse-engineer; instead, Selenium is used to obtain the parameters. However, the data we normally get from Selenium is based on the rendered page. For requests that the site issues asynchronously, we can extract what we need from the browser log.
2. Set driver parameters
First we need to enable browser logging through the Options object (such as ChromeOptions). Older versions of Selenium configured this through DesiredCapabilities; the following shows how to write it in the new version.
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
# Build the Chrome options used by the driver, including the logging setup.
options = ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--disable-single-click-autofill")
# The switch name must not carry a trailing "[8]" (a footnote marker copied
# along with the flag); with it Chrome only sees an unknown switch.
options.add_argument("--disable-autofill-keyboard-accessory-view")
options.add_argument("--disable-full-form-autofill-ios")

# Performance-log settings passed inside the Chrome-specific options:
# record Network events, skip Page events.
options.add_experimental_option('perfLoggingPrefs', {
    'enableNetwork': True,
    'enablePage': False,
})

# Collect both the browser log and the performance log at the "ALL" level.
options.set_capability("goog:loggingPrefs", {
    'browser': 'ALL',
    'performance': 'ALL',
})

# NOTE(review): this repeats the perfLoggingPrefs above as a top-level
# capability; it is presumably redundant with the experimental option --
# confirm against the ChromeDriver capability docs before removing it.
options.set_capability("goog:perfLoggingPrefs", {
    'enableNetwork': True,
    'enablePage': False,
    'enableTimeline': False,
})
3. Request a web page
Now instantiate a driver and request the web page. Here I use an explicit wait (WebDriverWait) to wait for an element to appear; you can also use an implicit wait or simply sleep. If you don't wait, the logs will be read before the asynchronous requests have finished loading, and you may not get the data you want.
from selenium.common.exceptions import TimeoutException

# NOTE: `executable_path` (chromedriver location) and `page_url` are not
# defined in this snippet; set them before running it.
service = Service(executable_path=executable_path)
driver = Chrome(service=service, options=options)

# Register a script that runs in every new document and makes
# navigator.webdriver report undefined.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""},
)

driver.get(page_url)

# Poll every 0.5 s, for at most 15 s, until the target element is present.
wait = WebDriverWait(driver, 15, 0.5)
try:
    wait.until(expected_conditions.presence_of_element_located((By.CLASS_NAME, "item ")))
except TimeoutException as e:
    # Only a timeout is tolerated here: carry on with whatever has loaded.
    # Any other driver failure is a real error and is left to propagate.
    print("WebDriverWait.until timeout error: {}".format(e))

html = driver.execute_script("return document.documentElement.outerHTML")
4. Processing logs
Read the driver's log_types property to get all the log types, iterate over them, fetch the corresponding logs with the get_log() method, and then filter out the logs you want.
For example, here I keep only the Network.requestWillBeSent logs, i.e. the data sent by the asynchronous requests, because I need the request headers of those requests. A response-type log (Network.responseReceived) only contains the response headers. For the specific event types that are supported, please refer to the Google DevTools documentation.
If you need to keep only Ajax (XHR) requests, you can check the type field in the params of the log (as in the code below), or use another field of the log entry.
# Requires `import json` and the `driver` created in the previous step.
sign_dict = {}  # holds the data we want to keep, keyed by request endpoint
for log_type in driver.log_types:
    for row_log in driver.get_log(log_type):
        try:
            message_log = json.loads(row_log['message'])['message']
        except (ValueError, KeyError, TypeError):
            # Not a JSON-wrapped event, so it cannot be a network event: skip it.
            continue
        if not isinstance(message_log, dict):
            continue
        if message_log.get('method') != 'Network.requestWillBeSent':
            continue

        params = message_log.get('params') or {}
        if str(params.get('type', '')).upper() != 'XHR':
            continue

        request = params.get('request') or {}
        headers = request.get('headers')
        if not headers:
            continue
        x_sign = headers.get('X-Sign')
        if not x_sign:
            continue
        x_app_id = headers.get('X-AppID')
        x_ts = headers.get('X-Ts')
        print("success:", x_sign, x_app_id, x_ts)

        req_url = request.get('url')
        if not req_url:
            # Without a URL there is no key to store the headers under.
            continue
        # Key = last path segment of the URL with the query string removed.
        key = req_url.split("?")[0].rsplit("/", 1)[-1]
        sign_dict[key] = {"X-AppID": x_app_id, "X-Sign": x_sign, "X-Ts": x_ts}
Note that if you want the response body: the response field of a Network.responseReceived log does not contain the response body. You need to fetch it using the requestId field in params. The reference code is as follows:
from selenium.common.exceptions import WebDriverException

# Requires `import json` and the `driver` created in the previous step.
res_body_dict = {}  # parsed JSON body (or None) keyed by request endpoint
for log_type in driver.log_types:
    for row_log in driver.get_log(log_type):
        try:
            message_log = json.loads(row_log['message'])['message']
        except (ValueError, KeyError, TypeError):
            # Not a JSON-wrapped event, so it cannot be a network event: skip it.
            continue
        if not isinstance(message_log, dict):
            continue
        if message_log.get('method') != 'Network.responseReceived':
            continue

        params = message_log.get('params') or {}
        if str(params.get('type', '')).upper() != 'XHR':
            continue
        request_id = params.get('requestId')
        if not request_id:
            continue
        req_url = (params.get('response') or {}).get('url')
        if not req_url:
            # Without a URL there is no key to store the body under.
            continue
        # Key = last path segment of the URL with the query string removed.
        key = req_url.split("?")[0].rsplit("/", 1)[-1]

        try:
            content = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        except WebDriverException as e:
            # One unavailable body must not abort the scan of the remaining entries.
            print("Network.getResponseBody error: {}, requestId:{}".format(e, request_id))
            continue

        body = None
        try:
            body = json.loads(content["body"])
        except (ValueError, KeyError, TypeError) as e:
            # A body that is not JSON is recorded as None.
            print("get_unisat_data_by_selenium() json loads error: {}, content:{}".format(e, content))
        res_body_dict[key] = body
5. Complete code
The complete reference code for the above is as follows
import json
import os.path
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
def get_selenium_driver(executable_path=r"E:\webdriver\chromedriver.exe"):
    """Start a Chrome driver with browser and performance logging enabled.

    Args:
        executable_path: Location of the chromedriver binary.

    Returns:
        The started ``Chrome`` driver. This function never quits it, so the
        caller is responsible for calling ``quit()``.
    """
    options = ChromeOptions()
    for flag in (
        "--no-sandbox",
        "--allow-running-insecure-content",
        "--ignore-certificate-errors",
        "--disable-single-click-autofill",
        # The switch name must not carry a trailing "[8]" (a footnote marker
        # copied along with the flag); with it Chrome only sees an unknown switch.
        "--disable-autofill-keyboard-accessory-view",
        "--disable-full-form-autofill-ios",
    ):
        options.add_argument(flag)

    # Performance-log settings passed inside the Chrome-specific options:
    # record Network events, skip Page events.
    options.add_experimental_option('perfLoggingPrefs', {
        'enableNetwork': True,
        'enablePage': False,
    })
    # Collect both the browser log and the performance log at the "ALL" level.
    options.set_capability("goog:loggingPrefs", {
        'browser': 'ALL',
        'performance': 'ALL',
    })
    # NOTE(review): this repeats the perfLoggingPrefs above as a top-level
    # capability; it is presumably redundant with the experimental option --
    # confirm against the ChromeDriver capability docs before removing it.
    options.set_capability("goog:perfLoggingPrefs", {
        'enableNetwork': True,
        'enablePage': False,
        'enableTimeline': False,
    })

    service = Service(executable_path=executable_path)
    driver = Chrome(service=service, options=options)
    # Register a script that runs in every new document and makes
    # navigator.webdriver report undefined.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""},
    )
    return driver
def get_sign_by_selenium(page_url):
    """Load a page in Chrome and collect the signing headers of its XHR requests.

    The driver logs are scanned for ``Network.requestWillBeSent`` events of
    type XHR whose request headers contain ``X-Sign``.

    Args:
        page_url: URL of the page to open.

    Returns:
        A dict mapping the last path segment of each matching request URL
        (query string removed) to a dict with the keys ``"X-AppID"``,
        ``"X-Sign"`` and ``"X-Ts"``. A later request to the same endpoint
        overwrites an earlier one.
    """
    from selenium.common.exceptions import TimeoutException

    # get_selenium_driver() already registers the navigator.webdriver override,
    # so it is not registered a second time here.
    driver = get_selenium_driver()
    try:
        driver.get(page_url)
        try:
            # Poll every 0.5 s, for at most 15 s, until the target element is present.
            WebDriverWait(driver, 15, 0.5).until(
                expected_conditions.presence_of_element_located((By.CLASS_NAME, "item "))
            )
        except TimeoutException as e:
            # Only a timeout is tolerated: still inspect whatever was logged so far.
            print("WebDriverWait.until timeout error: {}".format(e))

        sign_dict = {}
        for log_type in driver.log_types:
            for row_log in driver.get_log(log_type):
                try:
                    message_log = json.loads(row_log['message'])['message']
                except (ValueError, KeyError, TypeError):
                    # Not a JSON-wrapped event, so it cannot be a network event.
                    continue
                if not isinstance(message_log, dict):
                    continue
                if message_log.get('method') != 'Network.requestWillBeSent':
                    continue

                params = message_log.get('params') or {}
                if str(params.get('type', '')).upper() != 'XHR':
                    continue

                request = params.get('request') or {}
                headers = request.get('headers')
                if not headers:
                    continue
                x_sign = headers.get('X-Sign')
                if not x_sign:
                    continue
                x_app_id = headers.get('X-AppID')
                x_ts = headers.get('X-Ts')
                print("success:", x_sign, x_app_id, x_ts)

                req_url = request.get('url')
                if not req_url:
                    # Without a URL there is no key to store the headers under.
                    continue
                # Key = last path segment of the URL with the query string removed.
                key = req_url.split("?")[0].rsplit("/", 1)[-1]
                sign_dict[key] = {"X-AppID": x_app_id, "X-Sign": x_sign, "X-Ts": x_ts}
        return sign_dict
    finally:
        # Always shut the browser down, otherwise every call leaks a Chrome process.
        driver.quit()
def get_unisat_data_by_selenium(page_url):
    """Load a page in Chrome and collect the JSON bodies of its XHR responses.

    The driver logs are scanned for ``Network.responseReceived`` events of
    type XHR; each body is then fetched by ``requestId`` through the
    ``Network.getResponseBody`` CDP command.

    Args:
        page_url: URL of the page to open.

    Returns:
        A dict mapping the last path segment of each response URL (query
        string removed) to the decoded JSON body, or to ``None`` when the
        body is not valid JSON. Responses whose body cannot be fetched are
        skipped. A later response for the same endpoint overwrites an
        earlier one.
    """
    from selenium.common.exceptions import TimeoutException, WebDriverException

    # get_selenium_driver() already registers the navigator.webdriver override,
    # so it is not registered a second time here.
    driver = get_selenium_driver()
    try:
        driver.get(page_url)
        try:
            # Poll every 0.5 s, for at most 15 s, until the target element is present.
            WebDriverWait(driver, 15, 0.5).until(
                expected_conditions.presence_of_element_located((By.CLASS_NAME, "item "))
            )
        except TimeoutException as e:
            # Only a timeout is tolerated: still inspect whatever was logged so far.
            print("WebDriverWait.until timeout error: {}".format(e))

        res_body_dict = {}
        for log_type in driver.log_types:
            for row_log in driver.get_log(log_type):
                try:
                    message_log = json.loads(row_log['message'])['message']
                except (ValueError, KeyError, TypeError):
                    # Not a JSON-wrapped event, so it cannot be a network event.
                    continue
                if not isinstance(message_log, dict):
                    continue
                if message_log.get('method') != 'Network.responseReceived':
                    continue

                params = message_log.get('params') or {}
                if str(params.get('type', '')).upper() != 'XHR':
                    continue
                request_id = params.get('requestId')
                if not request_id:
                    continue
                req_url = (params.get('response') or {}).get('url')
                if not req_url:
                    # Without a URL there is no key to store the body under.
                    continue
                # Key = last path segment of the URL with the query string removed.
                key = req_url.split("?")[0].rsplit("/", 1)[-1]

                try:
                    content = driver.execute_cdp_cmd(
                        'Network.getResponseBody', {'requestId': request_id}
                    )
                except WebDriverException as e:
                    # One unavailable body must not abort the scan of the rest.
                    print("get_unisat_data_by_selenium() getResponseBody error: {}, requestId:{}".format(
                        e, request_id))
                    continue

                body = None
                try:
                    body = json.loads(content["body"])
                except (ValueError, KeyError, TypeError) as e:
                    # A body that is not JSON is recorded as None.
                    print("get_unisat_data_by_selenium() json loads error: {}, content:{}".format(e, content))
                res_body_dict[key] = body
        return res_body_dict
    finally:
        # Always shut the browser down, otherwise every call leaks a Chrome process.
        driver.quit()
if __name__ == '__main__':
    # Demo run: print the signing headers captured while loading a sample page.
    url = "https://unisat.io/brc20?q=bc1pkmnh3nj89uns3yp2mtqqxjns65vy6ca6n5jvp4s8ua8nke69cnjs987vtp"
    signs = get_sign_by_selenium(url)
    print("get_sign_by_selenium(url):", signs)
    # To print the response bodies instead, swap in:
    # print("get_unisat_data_by_selenium(url):", get_unisat_data_by_selenium(url))
Attachment: for more on using Selenium, please refer to the previous articles:
[Test] The use of Selenium (common attributes and methods, element waiting, operating cookies, operating elements, headless mode, obtaining HTML source code)
[Test] Selenium anti-crawling operations
[Test] Modifying Selenium option configuration parameters to optimize performance
[Test] Using Selenium on a Linux (CentOS, Ubuntu) server without a graphical interface
[Test] Operating cookies with Selenium