Practicing Python anti-crawling (anti-scraping) countermeasures

Reference: "What to do when your crawler runs into an anti-crawling mechanism — see how I solved it!"

Straight to the code:

import requests
import random
from bs4 import BeautifulSoup

# Call the Shenlong HTTP proxy API and pull the list of proxy records
# out of the JSON payload's "data" field.
url = 'http://api.shenlongip.com/ip?key={your_key}&pattern=json&count=10&need=1011&mr=1&protocol=2&sign={your_sign}'
resp_data = requests.get(url).json()['data']

"""设置IP"""
iplist=[]
for data in resp_data:
    iplist.append( str(data['ip'])+":"+str(data['port']))
# print(iplist)

"""获取ip代理"""
# 随机抽取IP代理,并封装成proxies格式(requests的ip代理规定格式)
def getip():
    proxy= iplist[random.randint(0,len(iplist)-1)]
    proxy = proxy.replace("\n","")
    proxies={
        'http':'http://'+str(proxy),
    }
    return proxies

# Request headers for the example scrape below (fund.eastmoney.com).
# Bug fix: the original dict carried 'Host': 'movie.douban.com' and a Douban
# session cookie, copied from an unrelated example. Sending a Host header for
# the wrong domain misroutes/breaks the request, and sending another site's
# cookie leaks a session to an unrelated server. requests derives the correct
# Host from the URL automatically, so neither entry is needed.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    'accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'upgrade-insecure-requests': '1',
}
# Example URL: Eastmoney open-end fund ranking page.
# NOTE(review): everything after '#' is a URL fragment and is never sent to
# the server; the ranking table is presumably filled in by JavaScript, so a
# plain GET may return a page without data — confirm against a live response.
url = "http://fund.eastmoney.com/data/fundranking.html#tall;c0;r;s1nzf;pn50;ddesc;qsd20220505;qed20230505;qdii;zq;gg;gzbd;gzfs;bbzt;sfbb"
# NOTE(review): verify=False disables TLS certificate verification; tolerable
# for a demo, never for production code handling real traffic.
r = requests.get(url, proxies=getip(), headers=headers, verify=False)
print("响应值:{}".format(r.status_code))
print("----- -----")
print("获取编码:{}".format(r.apparent_encoding))
# Decode the body with the detected (apparent) encoding.
r.encoding = r.apparent_encoding

soup = BeautifulSoup(r.text, 'html.parser')
# Locate the ranking table by its element id.
table = soup.find('table', {'id': 'dbtable'})
if table is None:
    # Bug fix: the original called table.find_all(...) unconditionally and
    # crashed with AttributeError whenever the table was absent (e.g. the
    # request was blocked by the anti-crawling layer).
    print("未找到 id='dbtable' 的表格")
else:
    for row in table.find_all('tr'):
        # Bug fix: the original collected only <th> cells, so every data row
        # printed an empty list; the data cells are <td>.
        cells = row.find_all(['th', 'td'])
        row_values = [cell.text for cell in cells]
        print(row_values)

Fetching the proxy IPs and testing one of them

The API call passes a Postman test

 

You may also like

Origin blog.csdn.net/qq_23938507/article/details/130499545