1. Website analysis
- 1. Website: inspect the page's network traffic to identify which API endpoint this data actually comes from.
- 2. Anti-crawling parameters: the API validates the `referer`/`origin` request headers and requires an `x-api-key` header scraped from the page source.
- 3. For a detailed walkthrough of the analysis process, see Eleven Sister Shiyi's video on Bilibili, or the long-form illustrated article on Knowledge Planet.
2. Final code
"""Fetch document metadata for a regulations.gov docket.

Flow:
1. GET the public docket page and scrape the embedded (URL-encoded) API key
   out of the page's bootstrap JSON.
2. Call the official v4 documents API with that key and log each document's
   id and title.

Fix vs. original: the f-strings were wrap-mangled across two lines
(`f"... {` + newline), which is a SyntaxError on Python < 3.12; each is
rejoined onto one line. Regex matches are also checked before `.group(1)`
so a site-layout change raises a clear error instead of AttributeError.
"""
from loguru import logger
import requests
import re

headers = {
    "authority": "api.regulations.gov",
    "accept": "application/vnd.api+json",
    "accept-language": "zh-CN,zh;q=0.9",
    # referer is checked server-side as an anti-crawling measure; keep it
    # pointing at the public site.
    "referer": "https://www.regulations.gov/",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
}

doc_url = "https://www.regulations.gov/docket/FDA-2016-D-1399/document"
res = requests.get(doc_url, headers=headers, timeout=20)
logger.info(f"req请求: {res.status_code}")

# The API key is embedded, percent-encoded, in the page source as
# ...apiKey%22%3A%22<KEY>%22%2C%22api...
key_match = re.search(r"apiKey%22%3A%22(.*?)%22%2C%22api", res.text)
if key_match is None:
    raise RuntimeError("apiKey not found in page source; site layout may have changed")
api_key = key_match.group(1)

# The docket id is the FDA-... segment of the docket URL.
id_match = re.search(r"/(FDA.*?)/document", doc_url)
if id_match is None:
    raise RuntimeError(f"could not extract docket id from {doc_url}")
doc_id = id_match.group(1)

headers.update({
    "X-Api-Key": api_key})
doc_true_url = f"https://api.regulations.gov/v4/documents?filter[docketId]={doc_id}&page[number]=1&sort=-commentEndDate"
res = requests.get(doc_true_url, headers=headers, timeout=20)
logger.info(f"req请求: {res.status_code}")

# JSON:API response: each entry in `data` carries an id plus attributes.
for file_a in res.json()['data']:
    file_title = file_a['attributes']['title']
    logger.info(f">>>file_id is {file_a['id']}, title is {file_title}")