# Proxy pool script: fetch free HTTP proxies, validate them, save the survivors
import subprocess as sp
import requests,json,random,re,os
# Pool of desktop-browser User-Agent strings; one is chosen at random per run
# so repeated fetches do not present an identical client fingerprint.
User_Agent = [
'Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; KTXN)',
'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0'
]
# Request headers shared by every HTTP call in this script.
headers = {
}
# Pick the User-Agent once, at import time; all requests in this run reuse it.
headers['User-Agent'] = random.choice(User_Agent)
# Candidate proxies harvested from the public list, as "host:port" strings.
ip_pools=[]
# Proxies that survived both validation passes (ping + real HTTP request).
new_pools=[]
"""
函数说明:获取IP代理
"""
def pro():
url = "http://proxylist.fatezero.org/proxy.list"
r = requests.get(url,headers=headers)
lists = r.text.split('\n')
for i in lists:
try:
li = json.loads(i,strict=False)
if str(li['anonymity']) == 'high_anonymous' and str(li['type']) == 'http':
ip_port = str(li['host'])+":"+str(li['port'])
ip_pools.append(ip_port)
except:
continue
"""
函数说明:检查代理IP的连通性
Parameters:
ip - 代理的ip地址
lose_time - 匹配丢包数
waste_time - 匹配平均时间
Returns:
average_time - 代理ip平均耗时
"""
def check_ip(ip, lose_time, waste_time):
cmd = "ping -n 3 -w 3 %s"
p = sp.Popen(cmd % ip, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
out = p.stdout.read().decode("gbk")
lose_time = lose_time.findall(out)
if len(lose_time) == 0:
lose = 3
else:
lose = int(lose_time[0])
if lose > 2:
return 1000
else:
average = waste_time.findall(out)
if len(average) == 0:
return 1000
else:
average_time = int(average[0])
return average_time
"""
函数说明:进一步检查代理IP的可用性
Parameters:
ip_port - 代理ip
"""
def check_ip2(ip_port):
num=0
proxy = {
'http':ip_port
}
try:
for i in range(10):
r = requests.get('http://www.baidu.com',headers=headers,proxies=proxy,timeout=5)
if r.status_code != 200:
print("二次验证失败:{}".format(ip_port)+" code:"+r.status_code)
return -1
break
else:
num+=1
if num==10:
return 200
except:
print("二次验证失败:{}".format(ip_port))
return -1
"""
函数说明:初始化正则表达式
Parameters:
无
Returns:
lose_time - 匹配丢包数
waste_time - 匹配平均时间
"""
def initpattern():
lose_time = re.compile(u"丢失 = (\d+)", re.IGNORECASE)
waste_time = re.compile(u"平均 = (\d+)ms", re.IGNORECASE)
return lose_time, waste_time
"""
函数说明:保存代理
"""
def save_proxy():
f = open('ip.html','w')
f.write(str(new_pools))
if __name__ == '__main__':
    # Compile the ping-output regexes once, up front.
    lose_time, waste_time = initpattern()
    # Fill ip_pools with candidate high-anonymity HTTP proxies.
    pro()
    for proxy in ip_pools:
        # First pass: ping the bare host for reachability and latency.
        ip = proxy.split(':')[0]
        average_time = check_ip(ip, lose_time, waste_time)
        if average_time < 200:
            # Second pass: route real HTTP requests through the proxy.
            if check_ip2(proxy) == 200:
                print(proxy + "验证成功")
                new_pools.append(proxy)
        else:
            print("一次验证失败:{}".format(proxy))
    save_proxy()
    print('可用代理保存成功')
    # os.path.join builds a correct path on any OS -- the original
    # hard-coded a Windows backslash separator.
    print("保存地址为:" + os.path.join(os.getcwd(), 'ip.html'))