Python crawler: fetching proxy IPs

Key points (environment: Python 3.5; extra library: bs4)

1. Scrape IPs and port numbers from the free proxy list at http://www.xicidaili.com/nn/

2. Use the proxy:

import urllib.request
proxy_support = urllib.request.ProxyHandler({'http': 'ip:port'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
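
Once the opener is installed, any later urllib.request.urlopen call is routed through the proxy, for example (example.com is only a placeholder target):

html = urllib.request.urlopen('http://example.com').read()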

3. Test whether each IP works by calling urlopen against http://ip.chinaz.com/getip.aspx

4. Return a list of dicts made up of the usable IPs and ports (an example of the shape follows)
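
For reference, the returned list has roughly this shape (the addresses below are made-up placeholders):

proxys = [
	{"http": "http://61.135.217.7:80"},
	{"http": "http://118.114.77.47:8080"},
]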

Here's the code:

import requests
from bs4 import BeautifulSoup as bs
from urllib import request
import socket

# set the default socket timeout to 3 seconds
socket.setdefaulttimeout(3)

# request the xici URL and return the page parsed with BeautifulSoup
def request_to_get(url):
	headers = {
		"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
		"Accept-Encoding":"gzip, deflate",
		"Accept-Language":"zh-CN,zh;q=0.9",
		"Connection":"keep-alive",
		"Host":"www.xicidaili.com",
		"Referer":"http://www.xicidaili.com/",
		"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
	}
	response = requests.get(url, headers=headers).content
	content = response.decode("utf-8")
	bs_obj = bs(content, "html.parser")
	return bs_obj

# extract IPs and ports; return a list of dicts shaped {"http": "http://ip:port"}
def find_ip_port(bs_obj):
	ip_list = []
	port_list = []
	ips = bs_obj.findAll('tr')
	for x in range(1,len(ips)):
		ip = ips[x]
		tds = ip.findAll("td")
		ip_list.append(tds[1].text)
		port_list.append(tds[2].text)
	proxys = []
	for i in range(len(ip_list)):
		proxy_host = "http://"+ip_list[i]+":"+port_list[i]
		proxy_temp = {"http":proxy_host}
		proxys.append(proxy_temp)
	return proxys

# test each proxy and return the ones that can be used
def return_ok_proxys(proxys):
	test_url = "http://ip.chinaz.com/getip.aspx"
	alright_proxys = []
	for i in range(len(proxys)):
		try:
			proxy_support = request.ProxyHandler(proxys[i])
			opener = request.build_opener(proxy_support)
			opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
			request.install_opener(opener)
			# fetch the test page through the proxy; a failure raises and skips it
			request.urlopen(test_url).read()
			alright_proxys.append(proxys[i])
			print(proxys[i])
			print("is alright")
		except Exception as e:
			print(proxys[i])
			print(e)
	return alright_proxys

#main function
def main_fun():
	url = "http://www.xicidaili.com/nn/"
	bs_obj = request_to_get(url)
	proxys = find_ip_port(bs_obj)
	alright_proxys = return_ok_proxys(proxys)
	print(alright_proxys)
main_fun()
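
With the returned list in hand, later requests can be routed through one of the working proxies. A minimal sketch using requests (httpbin.org/ip is just a convenient echo service, not part of the original script):

# assuming alright_proxys is the list built by return_ok_proxys()
proxies = alright_proxys[0]  # e.g. {"http": "http://1.2.3.4:8080"}
resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
print(resp.text)  # should report the proxy's address instead of yours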



Supplementary multithreaded version:

import requests
from bs4 import BeautifulSoup as bs
from urllib import request
import socket
import threading

# set the default socket timeout to 5 seconds
socket.setdefaulttimeout(5)
test_url = "http://ip.chinaz.com/getip.aspx"

# request the xici URL and return the page parsed with BeautifulSoup
def request_to_get(url):
	headers = {
		"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
		"Accept-Encoding":"gzip, deflate",
		"Accept-Language":"zh-CN,zh;q=0.9",
		"Connection":"keep-alive",
		"Host":"www.xicidaili.com",
		"Referer":"http://www.xicidaili.com/",
		"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
	}
	response = requests.get(url, headers=headers).content
	content = response.decode("utf-8")
	bs_obj = bs(content, "html.parser")
	return bs_obj

# extract IPs and ports; return a list of dicts shaped {"http": "http://ip:port"}
def find_ip_port(bs_obj):
	ip_list = []
	port_list = []
	ips = bs_obj.findAll('tr')
	for x in range(1,len(ips)):
		ip = ips[x]
		tds = ip.findAll("td")
		ip_list.append(tds[1].text)
		port_list.append(tds[2].text)
	proxys = []
	for i in range(len(ip_list)):
		proxy_host = "http://"+ip_list[i]+":"+port_list[i]
		proxy_temp = {"http":proxy_host}
		proxys.append(proxy_temp)
	return proxys

# check whether a single proxy works; working ones are appended to the shared list
def check_ip(alright_proxys, proxy):
	try:
		proxy_support = request.ProxyHandler(proxy)
		opener = request.build_opener(proxy_support)
		opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
		# open through this opener directly; install_opener() would mutate
		# global state and race between threads
		opener.open(test_url).read()
		alright_proxys.append(proxy)
	except Exception:
		# unreachable or too-slow proxies are silently dropped
		pass

# test the proxies concurrently and return the ones that can be used
def return_ok_proxys(proxys):
	alright_proxys = []
	threads = []
	for proxy in proxys:
		t = threading.Thread(target=check_ip, args=(alright_proxys, proxy))
		t.start()
		threads.append(t)
	# join every checker instead of sleeping a fixed 5 seconds;
	# each thread is bounded by the 5-second socket timeout anyway
	for t in threads:
		t.join()
	return alright_proxys

#main function
def main_function():
	url = "http://www.xicidaili.com/nn/"
	bs_obj = request_to_get(url)
	proxys = find_ip_port(bs_obj)
	alright_proxys = return_ok_proxys(proxys)
	return alright_proxys

print(main_function())
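
As a design note, the same fan-out/fan-in can be written with concurrent.futures from the standard library, which manages thread lifecycle and result collection for you. A sketch under the same assumptions (check_one and return_ok_proxys_pool are illustrative names, reusing request and test_url from above):

from concurrent.futures import ThreadPoolExecutor

def check_one(proxy):
	# return the proxy dict if the test fetch succeeds, else None
	try:
		opener = request.build_opener(request.ProxyHandler(proxy))
		opener.open(test_url).read()
		return proxy
	except Exception:
		return None

def return_ok_proxys_pool(proxys, workers=20):
	# the with-block waits for all workers before returning
	with ThreadPoolExecutor(max_workers=workers) as pool:
		return [p for p in pool.map(check_one, proxys) if p]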

Reposted from blog.csdn.net/mr_guo_lei/article/details/78543679