# Copyright notice: belongs to all newbies. https://blog.csdn.net/weixin_41752475/article/details/89439719
# Proxy list crawled by this script: https://www.xicidaili.com/nn/1
import re
import requests
# Harvest proxy IPs so that our own IP does not get blocked by anti-scraping.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
}

# Listing pages to crawl; both ends are inclusive (pages 1-100).
FIRST_PAGE = 1
LAST_PAGE = 100
LIST_URL = "https://www.xicidaili.com/nn/{}"

# Page fetched through each candidate proxy to decide whether it works.
CHECK_URL = "http://www.baidu.com"

# Result files: one "ip:port" entry per line.
OK_FILE = "ipok.text"
BAD_FILE = "ipnot.text"

# An IPv4 cell immediately followed by a numeric port cell. Matching both
# cells with one pattern keeps every IP paired with its own port; two
# independent searches drift out of step as soon as one of them misses.
_PROXY_CELLS = re.compile(
    r"<td>(\d{1,3}(?:\.\d{1,3}){3})</td>\s*<td>(\d+)</td>"
)


def extract_proxies(html):
    """Return the ``(ip, port)`` string pairs found in a listing page.

    Args:
        html: Raw HTML text of one listing page.

    Returns:
        A list of ``(ip, port)`` tuples of strings, in page order. Empty
        when the page contains no matching table cells.
    """
    return _PROXY_CELLS.findall(html)


def build_proxies(ip, port):
    """Build the ``proxies`` mapping that ``requests`` expects.

    Both schemes are routed through the same plain-HTTP proxy endpoint; an
    ``https://`` proxy URL would instead mean "speak TLS to the proxy".
    """
    address = "http://{}:{}".format(ip, port)
    return {"http": address, "https": address}


def is_usable(ip, port, timeout=3):
    """Return True when CHECK_URL can be fetched through the proxy."""
    try:
        res = requests.get(
            CHECK_URL, proxies=build_proxies(ip, port), timeout=timeout
        )
        # A proxy that answers with an error status is not usable either.
        res.raise_for_status()
    except (requests.RequestException, ValueError):
        # ValueError covers malformed proxy addresses (e.g. an out-of-range
        # port) that surface from URL parsing rather than from the network.
        return False
    return True


def record(path, ip, port):
    """Append one ``ip:port`` line to the file at *path*."""
    with open(path, mode="a") as f:
        f.write("{}:{}\n".format(ip, port))


def main():
    """Crawl every listing page and sort its proxies into the result files."""
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        url = LIST_URL.format(page)
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # One unreachable listing page should not abort the whole crawl.
            print(url, exc)
            continue

        candidates = extract_proxies(response.text)
        print([ip for ip, _ in candidates])
        print([port for _, port in candidates])

        # Check whether each proxy is actually usable.
        for candidate in candidates:
            if is_usable(*candidate):
                print(candidate, "能使用")
                record(OK_FILE, *candidate)
            else:
                record(BAD_FILE, *candidate)
                print(candidate, "不能使用")


if __name__ == "__main__":
    main()