要点:(环境Python3.5,额外库bs4)
1.从免费代理ip获取ip和端口号 http://www.xicidaili.com/nn/
2.使用代理
import urllib.request
proxy_support = urllib.request.ProxyHandler({'http': 'ip:port'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
3.urlopen测试ip能否使用 http://ip.chinaz.com/getip.aspx
4.返回包含可以使用的ip和端口组成的字典的list
贴代码:
from urllib.request import urlopen
import re
import requests
from bs4 import BeautifulSoup as bs
from urllib import request
import socket

# init: one timeout constant, actually used (original hard-coded 3 separately)
timeout = 3
socket.setdefaulttimeout(timeout)


def request_to_get(url):
    """Fetch *url* with browser-like headers and return a BeautifulSoup tree.

    The Host/Referer headers target www.xicidaili.com, the free-proxy list
    this script scrapes.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "www.xicidaili.com",
        "Referer": "http://www.xicidaili.com/",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    }
    response = requests.get(url, headers=headers).content
    content = str(response, encoding="utf-8")
    bs_obj = bs(content, "html.parser")
    return bs_obj


def find_ip_port(bs_obj):
    """Extract ip/port pairs from the page's <tr> rows.

    Returns a list of proxy dicts in the format {"http": "http://ip:port"}.
    """
    proxys = []
    for row in bs_obj.findAll('tr')[1:]:  # first row is the table header
        tds = row.findAll("td")
        if len(tds) < 3:
            continue  # skip section/separator rows that have no data cells
        proxy_host = "http://" + tds[1].text + ":" + tds[2].text
        proxys.append({"http": proxy_host})
    return proxys


def return_ok_proxys(proxys):
    """Probe every proxy against the test URL; return the ones that respond."""
    test_url = "http://ip.chinaz.com/getip.aspx"
    alright_proxys = []
    for proxy in proxys:
        try:
            proxy_support = request.ProxyHandler(proxy)
            opener = request.build_opener(proxy_support)
            opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
            # Open through this opener directly instead of install_opener():
            # avoids mutating process-global urllib state per probe.
            opener.open(test_url)
            alright_proxys.append(proxy)
            print(proxy)
            print("is alright")
        except Exception as e:
            # Dead/slow proxy: report it and move on (best-effort scan).
            print(proxy)
            print(e)
    return alright_proxys


def main_fun():
    """Scrape the proxy list, validate each entry, print the working ones."""
    url = "http://www.xicidaili.com/nn/"
    bs_obj = request_to_get(url)
    proxys = find_ip_port(bs_obj)
    alright_proxys = return_ok_proxys(proxys)
    print(alright_proxys)


if __name__ == "__main__":  # guard: do not hit the network on import
    main_fun()
补充多线程版本:
from urllib.request import urlopen import re import requests from bs4 import BeautifulSoup as bs from urllib import request import socket import threading import time #init timeout = 3 socket.setdefaulttimeout(5) test_url = "http://ip.chinaz.com/getip.aspx" #request the xiciURL and get the response def request_to_get(url): hearder = { "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "Accept-Encoding":"gzip, deflate", "Accept-Language":"zh-CN,zh;q=0.9", "Connection":"keep-alive", "Host":"www.xicidaili.com", "Referer":"http://www.xicidaili.com/", "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", } response = requests.get(url,headers=hearder).content content = str(response,encoding = "utf-8") bs_obj = bs(content,"html.parser") return bs_obj #get ip port and return a list format:{"https://":"ip:port"} def find_ip_port(bs_obj): ip_list = [] port_list = [] ips = bs_obj.findAll('tr') for x in range(1,len(ips)): ip = ips[x] tds = ip.findAll("td") ip_list.append(tds[1].text) port_list.append(tds[2].text) proxys = [] for i in range(len(ip_list)): proxy_host = "http://"+ip_list[i]+":"+port_list[i] proxy_temp = {"http":proxy_host} proxys.append(proxy_temp) return proxys #check ip alright def check_ip(alright_proxys,proxy): try: proxy_support = request.ProxyHandler(proxy) opener = request.build_opener(proxy_support) opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')] request.install_opener(opener) response = request.urlopen(test_url).read() content = str(response,encoding = "utf-8") alright_proxys.append(proxy) #print(proxy) #print(content) #print("is alright") except Exception as e: #print(proxy) #print(e) pass #test the proxy and return proxy that can be used def return_ok_proxys(proxys): alright_proxys = [] for i in range(len(proxys)): t = 
threading.Thread(target = check_ip,args = (alright_proxys,proxys[i],)) t.start() time.sleep(5) return alright_proxys #main function def main_function(): url = "http://www.xicidaili.com/nn/" bs_obj = request_to_get(url) proxys = find_ip_port(bs_obj) alright_proxys = return_ok_proxys(proxys) return alright_proxys print(main_function())效果图: