2.验证并找出可用的代理服务器
以下用python语言编写
从http://www.xicidaili.com/nn/1抓取到代理服务器地址
这个网站声称这些是高匿代理,但实际上并不是:原始 IP 仍然会被转发(forward)给目标服务器。
将代理信息抓取保存到proxy.txt里
#encoding=utf8
"""Scrape proxy server entries from xicidaili.com and save them to proxy.txt.

Each output line has the form "<scheme>\t<ip>\t<port>" (scheme lowercased).
NOTE(review): the column indices below (2=ip, 3=port, 6=scheme) match the
site's table layout at the time of writing -- confirm if the page changes.
"""
import urllib.request

from bs4 import BeautifulSoup

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
HOST_URL = 'http://www.xicidaili.com/nn/1'


def main():
    """Fetch the listing page, parse its proxy table, and write proxy.txt."""
    header = {'User-Agent': USER_AGENT}
    request = urllib.request.Request(HOST_URL, None, header)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(request) as response:
        htmldata = response.read()
    soup = BeautifulSoup(htmldata, "html.parser")
    rows = soup.find_all('tr')
    # rows[0] is the table header row, so skip it.
    with open("proxy.txt", "w") as f:
        for row in rows[1:]:
            tds = row.find_all("td")
            # Columns: tds[2]=ip, tds[3]=port, tds[6]=scheme (HTTP/HTTPS).
            f.write(tds[6].contents[0].lower() + "\t"
                    + tds[2].contents[0] + "\t"
                    + tds[3].contents[0] + "\n")


if __name__ == "__main__":
    main()
读取proxy.txt,并验证,将可以使用的代理信息保存到proxy-tested.txt里
#encoding=utf8 import urllib import urllib.request import socket socket.setdefaulttimeout(3) f = open("proxy.txt") lines = f.readlines() proxys = [] fo = open("proxy-tested.txt","w") for i in range(0,len(lines)): ip = lines[i].strip("\n").split("\t") proxy_host = str(ip[0])+"://"+ip[1]+":"+ip[2] proxy_temp = {str(ip[0]):proxy_host} proxys.append(proxy_temp) index=0; for proxy in proxys: index=index+1 print("index:"+str(index)) try: proxy_support = urllib.request.ProxyHandler(proxy) opener = urllib.request.build_opener(proxy_support) urllib.request.install_opener(opener) response=urllib.request.urlopen("http://ip.chinaz.com/getip.aspx").read() for i in proxy: ip_temp = proxy[i].split(":")[0]+"\t"+proxy[i].split(":")[1][2:]+"\t"+proxy[i].split(":")[-1]+"\n" fo.write(ip_temp) print("success:"+str(proxy)+"result:"+str(response)) except Exception as e: print("error:"+str(proxy)) print(e) continue