1. Website analysis
- 1. Open the website and work out which API endpoint (interface) the data comes from
![Insert image description here](https://img-blog.csdnimg.cn/edbda44ac9f24a7d8277e8c771b64834.png)
- 2. Anti-crawling parameters: the server verifies the `referer`/`origin` request headers and requires an `x-api-key` header
![Insert image description here](https://img-blog.csdnimg.cn/f901b4c4e20e4b9fa8f30fe283105c8e.png)
- 3. For a detailed walkthrough of the analysis process, watch Shiyi's ("Eleven Sister") video on Bilibili (Station B), or read the "Time's Longest" illustrated article on Knowledge Planet
![Insert image description here](https://img-blog.csdnimg.cn/c8e7e66e2b8b44ac8f0504f08493b678.png)
2. Final code
from loguru import logger
import requests
import re
# Browser-like headers sent with every request. Per the analysis above, the
# site checks the referer header and requires an X-Api-Key (added at runtime
# once it has been scraped from the docket page).
headers = {
    "authority": "api.regulations.gov",
    "accept": "application/vnd.api+json",
    "accept-language": "zh-CN,zh;q=0.9",
    "referer": "https://www.regulations.gov/",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
}
# Public docket page whose documents should be listed.
doc_url = "https://www.regulations.gov/docket/FDA-2016-D-1399/document"


def extract_api_key(page_html: str) -> str:
    """Return the API key embedded (URL-encoded) in the docket page HTML.

    The page carries a percent-encoded JSON fragment of the form
    ``apiKey":"<key>","api...``; the key is the text between the encoded
    quotes.

    Raises:
        ValueError: if no such fragment is present in *page_html*.
    """
    match = re.search(r"apiKey%22%3A%22(.*?)%22%2C%22api", page_html)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError("apiKey not found in docket page HTML")
    return match.group(1)


def extract_docket_id(url: str) -> str:
    """Return the ``FDA...`` docket id path segment preceding ``/document``.

    Raises:
        ValueError: if *url* does not contain such a segment.
    """
    match = re.search(r"/(FDA.*?)/document", url)
    if match is None:
        raise ValueError(f"no FDA docket id found in URL: {url!r}")
    return match.group(1)


def build_documents_url(docket_id: str) -> str:
    """Return the v4 documents API URL for *docket_id*.

    Requests page 1, sorted by ``commentEndDate`` descending.
    """
    return (
        "https://api.regulations.gov/v4/documents"
        f"?filter[docketId]={docket_id}&page[number]=1&sort=-commentEndDate"
    )


def main(page_url: str = doc_url) -> None:
    """Fetch the docket page, scrape its API key, and log each document.

    Steps: GET the public docket page, pull the API key out of its HTML,
    add it to ``headers`` as ``X-Api-Key``, query the documents API for the
    docket, and log the id and title of every returned item.

    Raises:
        requests.HTTPError: if either HTTP response has an error status.
        ValueError: if the API key or docket id cannot be extracted.
    """
    res = requests.get(page_url, headers=headers, timeout=20)
    logger.info(f"req请求: {res.status_code}")
    # Stop early on an error page rather than failing later in the regex.
    res.raise_for_status()

    api_key = extract_api_key(res.text)
    doc_id = extract_docket_id(page_url)
    headers.update({"X-Api-Key": api_key})

    res = requests.get(build_documents_url(doc_id), headers=headers, timeout=20)
    logger.info(f"req请求: {res.status_code}")
    res.raise_for_status()

    for file_a in res.json()["data"]:
        file_title = file_a["attributes"]["title"]
        logger.info(f">>>file_id is {file_a['id']}, title is {file_title}")


if __name__ == "__main__":
    main()