Crawler for beginners: how to identify the referer/origin request headers and the X-Api-Key anti-crawl check

1. Website analysis

  • 1. Open the target site and work out which API endpoint the data actually comes from (watch the XHR requests in the browser's developer tools).
  • 2. Anti-crawl parameters: the API validates the referer/origin request headers plus an X-Api-Key header.
  • 3. For a detailed walkthrough of the analysis process, see Shiyi's ("Eleven Sister") video on Bilibili or the long illustrated article on Knowledge Planet. A minimal probe of these header checks is sketched below.
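To confirm which headers the API actually checks, you can hit the documents endpoint with and without the suspect headers and compare the status codes. This is a minimal sketch, not the author's analysis steps: the endpoint URL is copied from the final code below, the expected rejection code (e.g. 403) is an assumption, and YOUR_API_KEY is a placeholder for the key extracted from the page source.

import requests

API_URL = "https://api.regulations.gov/v4/documents?filter[docketId]=FDA-2016-D-1399"
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"

# Probe 1: no referer/origin and no X-Api-Key -- if the check is real, expect a rejection (e.g. 403)
r1 = requests.get(API_URL, headers={"user-agent": UA}, timeout=20)
print("no extra headers ->", r1.status_code)

# Probe 2: referer/origin only, still no X-Api-Key
r2 = requests.get(
    API_URL,
    headers={
        "user-agent": UA,
        "referer": "https://www.regulations.gov/",
        "origin": "https://www.regulations.gov",
    },
    timeout=20,
)
print("referer/origin   ->", r2.status_code)

# Probe 3: all suspected anti-crawl headers (YOUR_API_KEY is a placeholder, not a real key)
r3 = requests.get(
    API_URL,
    headers={
        "user-agent": UA,
        "referer": "https://www.regulations.gov/",
        "origin": "https://www.regulations.gov",
        "X-Api-Key": "YOUR_API_KEY",
    },
    timeout=20,
)
print("all headers      ->", r3.status_code)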

2. Final code

# -*- coding: utf-8 -*-
# @Time   : 2023-08-13
# @Author : sy
# @WeChat official account: 逆向OneByOne
# @url    : https://www.regulations.gov/docket/FDA-2016-D-1399/document
# @desc   : Anti-crawl validation of the referer/origin request headers and X-Api-Key
from loguru import logger
import requests
import re


headers = {
    "authority": "api.regulations.gov",
    "accept": "application/vnd.api+json",
    "accept-language": "zh-CN,zh;q=0.9",
    "referer": "https://www.regulations.gov/",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
}

# First request: fetch the docket page and pull the apiKey out of its URL-encoded source
doc_url = "https://www.regulations.gov/docket/FDA-2016-D-1399/document"
res = requests.get(doc_url, headers=headers, timeout=20)
logger.info(f"first request status: {res.status_code}")
api_key = re.search(r"apiKey%22%3A%22(.*?)%22%2C%22api", res.text).group(1)
doc_id = re.search(r"/(FDA.*?)/document", doc_url).group(1)
headers.update({"X-Api-Key": api_key})

# Second request: call the documents API with the extracted X-Api-Key
doc_true_url = f"https://api.regulations.gov/v4/documents?filter[docketId]={doc_id}&page[number]=1&sort=-commentEndDate"
res = requests.get(doc_true_url, headers=headers, timeout=20)
logger.info(f"second request status: {res.status_code}")
for file_a in res.json()['data']:
    file_title = file_a['attributes']['title']
    logger.info(f">>>file_id is {file_a['id']}, title is {file_title}")


Source: blog.csdn.net/weixin_43411585/article/details/132260402