Python: getting proxy IPs

Using proxy IPs can improve a crawler's efficiency, so I spent a little time writing a Python scraper for proxy IPs. The source is the Xici (西刺) free-proxy site, and a single page is enough.
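For reference, "using" a proxy with requests just means passing a proxies mapping. A minimal sketch, with a placeholder proxy address (httpbin.org/ip echoes back the caller's IP, so it shows whether the proxy is actually in use):

import requests

proxy = {'http': 'http://1.2.3.4:8080'}  # placeholder address, just for illustration
resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
print(resp.text)  # the reported origin should be the proxy's IP, not yours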


Import the modules

import requests
import re

Get the page

def get_html():
    url = 'https://www.xicidaili.com/wt/'  # Xici free proxy list (HTTP proxies)
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}
    # Request headers
    req = requests.get(url, headers=header)
    return req.text
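A slightly more defensive variant of this fetch (my own sketch, not part of the original) would add a timeout and a status check, since free-proxy sites are slow and unreliable:

def get_html_checked():  # hypothetical variant of get_html
    url = 'https://www.xicidaili.com/wt/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}
    req = requests.get(url, headers=header, timeout=10)  # don't hang on a dead server
    req.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
    return req.text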

Get the IPs

def get_ip_and_port(html):
    # The page markup is very regular, so a simple regex is enough;
    # matches alternate between IP addresses and ports.
    ip_list = re.findall(r'<td>(\d+\.\d+\.\d+\.\d+|\d+)</td>', html)  # IP address | port
    return ip_list
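Because the pattern alternates between IP and port matches, ip_list comes back flat, e.g. ['183.146.213.198', '80', ...] (illustrative values). If you prefer explicit (ip, port) pairs over the odd/even trick used in the next step, slicing plus zip does it:

pairs = list(zip(ip_list[0::2], ip_list[1::2]))  # [('183.146.213.198', '80'), ...]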

Format the IPs

def save_proxy_ip(ip_list):
    file_object = open('proxy_ip.txt', 'w')  # new file (proxy_ip.txt) for the formatted IPs
    num = 1
    # Odd entries are IPs (append ':'), even entries are ports (append a newline),
    # giving the standard ip:port format
    for each in ip_list:
        if num % 2 == 0:
            file_object.write(each + '\n')
        else:
            file_object.write(each + ':')
        num += 1

    file_object.close()
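Equivalently, with the zip pairing shown above, the whole function shrinks to a few lines; a sketch of the same behavior using a with block, so the file is closed automatically:

def save_proxy_ip(ip_list):
    with open('proxy_ip.txt', 'w') as f:
        for ip, port in zip(ip_list[0::2], ip_list[1::2]):
            f.write(ip + ':' + port + '\n')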

Test whether the saved IPs work

def test_ip():  # test the saved IPs; working ones are saved to a new file, ok_ip.txt
    ip_file = open('proxy_ip.txt', 'r')  # open all the saved IPs
    new_file = open('ok_ip.txt', 'w')    # new file for the working IPs
    test_url = 'http://httpbin.org/get'  # plain-HTTP test URL
    for ip in ip_file:
        ip = ip.strip()                  # drop the trailing newline
        h_ip = 'http://' + ip
        for _ in range(3):               # try each proxy up to three times
            try:
                requests.get(test_url, proxies={'http': h_ip}, timeout=1)  # one-second timeout
            except requests.RequestException:
                continue
            else:
                new_file.write(ip + '\n')
                break

    ip_file.close()
    new_file.close()

At this point the working IPs are saved in ok_ip.txt.
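From here a crawler can pull a working proxy out of ok_ip.txt for its own requests. A minimal sketch, assuming a placeholder target URL:

import random
import requests

with open('ok_ip.txt') as f:
    ok_ips = [line.strip() for line in f if line.strip()]

proxy = random.choice(ok_ips)  # rotate across the working proxies
resp = requests.get('http://example.com/',  # placeholder target
                    proxies={'http': 'http://' + proxy}, timeout=5)
print(resp.status_code)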

Full code

import requests
import re


def get_html():
    url = 'https://www.xicidaili.com/wt/'  # Xici free proxy list (HTTP proxies)
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}
    # Request headers
    req = requests.get(url, headers=header)
    return req.text


def get_ip_and_port(html):
    # The page markup is very regular, so a simple regex is enough;
    # matches alternate between IP addresses and ports.
    ip_list = re.findall(r'<td>(\d+\.\d+\.\d+\.\d+|\d+)</td>', html)  # IP address | port
    return ip_list


def save_proxy_ip(ip_list):
    file_object = open('proxy_ip.txt', 'w')  # new file (proxy_ip.txt) for the formatted IPs
    num = 1
    # Odd entries are IPs (append ':'), even entries are ports (append a newline),
    # giving the standard ip:port format
    for each in ip_list:
        if num % 2 == 0:
            file_object.write(each + '\n')
        else:
            file_object.write(each + ':')
        num += 1

    file_object.close()


def test_ip():  # test the saved IPs; working ones are saved to a new file, ok_ip.txt
    ip_file = open('proxy_ip.txt', 'r')  # open all the saved IPs
    new_file = open('ok_ip.txt', 'w')    # new file for the working IPs
    test_url = 'http://httpbin.org/get'  # plain-HTTP test URL
    for ip in ip_file:
        ip = ip.strip()                  # drop the trailing newline
        h_ip = 'http://' + ip
        for _ in range(3):               # try each proxy up to three times
            try:
                resp = requests.get(test_url, proxies={'http': h_ip}, timeout=1)  # one-second timeout
                resp.raise_for_status()  # count non-2xx responses as failures too
            except requests.RequestException:
                continue
            else:
                new_file.write(ip + '\n')
                break

    ip_file.close()
    new_file.close()


if __name__ == '__main__':
    html = get_html()
    ip_list = get_ip_and_port(html)
    save_proxy_ip(ip_list)
    test_ip()

Reposted from blog.csdn.net/weixin_43821663/article/details/86554091