Python:爬取免费代理IP并验证其可用性

前言

最近在重新温习Python基础中的正则表达式,感觉正则很强大,不过只学语法有点枯燥,于是想着把正则实际用起来,找点有趣的事情练练手。

00xx01---代理IP

网上有很多免费的代理IP,不过一个一个手动保存太费事,也不现实,还是用Python把它们爬下来吧。

00xx02---正则提取ip

 1 import requests
 2 import re
 3 
 4 #防反爬
 5 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" }
 6 
 7 url = "https://www.xicidaili.com/nn/1"
 8 
 9 response = requests.get(url,headers=headers)
10     # print(response.text)
11 
12 html = response.text
13 #print(html)
14 
15 #re.S忽略换行的干扰
16 ips = re.findall("<td>(\d+\.\d+\.\d+\.\d+)</td>",html,re.S)
17 ports = re.findall(("<td>(\d+)</td>"),html,re.S)
18 print(ips)
19 print(ports)

 

00xx03---拼接IP和端口

 1 import requests
 2 import re
 3 
 4 #防反爬
 5 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" }
 6 
 7 url = "https://www.xicidaili.com/nn/1"
 8 
 9 response = requests.get(url,headers=headers)
10     # print(response.text)
11 
12 html = response.text
13 # print(html)
14 
15 #re.S忽略换行的干扰
16 ips = re.findall("<td>(\d+\.\d+\.\d+\.\d+)</td>",html,re.S)
17 ports = re.findall(("<td>(\d+)</td>"),html,re.S)
18 #print(ips)
19 #print(ports)
20 for ip in zip(ips,ports ):  #提取拼接ip和端口
21     print(ip)

00xx03---验证IP可行性

思路:把爬到的IP和端口设置成代理,再去访问一个网站(比如百度);如果在超时时间内能正常返回,就说明这个代理可用。

 1 import requests
 2 import re
 3 
 4 
 5 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" }
 6 for i in range(1,1000):
 7     #网址
 8     url = "https://www.xicidaili.com/nn/{}".format(i)
 9 
10     response = requests.get(url,headers=headers)
11     # print(response.text)
12 
13     html = response.text
14 
15     #re.S忽略换行的干扰
16     ips = re.findall("<td>(\d+\.\d+\.\d+\.\d+)</td>",html,re.S)
17     ports = re.findall(("<td>(\d+)</td>"),html,re.S)
18     # print(ips)
19     # print(ports)
20     for ip in zip(ips,ports ):  #提取拼接ip和端口
21         proxies = {
22             "http":"http://" + ip[0] + ":" + ip[1],
23             "https":"http://" + ip[0] + ":" + ip[1]
24         }
25         try:
26             res = requests.get("http://www.baidu.com",proxies=proxies,timeout = 3)  #访问网站等待3s没有反应,自动断开
27             print(ip,"能使用")
28             with open("ip.text",mode="a+") as f:
29                 f.write(":".join(ip))  #写入ip.text文本
30                 f.write("\n") #换行
31         except Exception as e:   #捕捉错误异常
32             print(ip,"不能使用")

00xx04---写入文本

 1 import requests
 2 import re
 3 
 4 #防反爬
 5 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" }
 6 
 7 url = "https://www.xicidaili.com/nn/1"
 8 
 9 response = requests.get(url,headers=headers)
10     # print(response.text)
11 
12 html = response.text
13 # print(html)
14 
15 #re.S忽略换行的干扰
16 ips = re.findall("<td>(\d+\.\d+\.\d+\.\d+)</td>",html,re.S)
17 ports = re.findall(("<td>(\d+)</td>"),html,re.S)
18 #print(ips)
19 #print(ports)
20 for ip in zip(ips,ports ):  #提取拼接ip和端口
21     print(ip)
22     proxies = {
23             "http":"http://" + ip[0] + ":" + ip[1],
24             "https":"http://" + ip[0] + ":" + ip[1]
25         }
26     try:
27         res = requests.get("http://www.baidu.com",proxies=proxies,timeout = 3)  #访问网站等待3s没有反应,自动断开
28         print(ip,"能使用")
29         with open("ip.text",mode="a+") as f:
30             f.write(":".join(ip))  #写入ip.text文本
31             f.write("\n") #换行
32     except Exception as e:   #捕捉错误异常
33         print(ip,"不能使用")

只爬了一页,能用的才几个;而网站一共有3000多页,不可能一页一页手动去改网址,所以需要批量爬取。

00xx05---批量爬

 1 #!/usr/bin/env python3
 2 # coding:utf-8
 3 # 2019/11/18 22:38
 4 #lanxing
 5 import requests
 6 import re
 7 
 8 #防反爬
 9 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" }
10 for i in range(1,3000):  #爬3000个网页
11     #网站
12     url = "https://www.xicidaili.com/nn/{}".format(i)
13 
14     response = requests.get(url,headers=headers)
15         # print(response.text)
16 
17     html = response.text
18     # print(html)
19 
20     #re.S忽略换行的干扰
21     ips = re.findall("<td>(\d+\.\d+\.\d+\.\d+)</td>",html,re.S)
22     ports = re.findall(("<td>(\d+)</td>"),html,re.S)
23     #print(ips)
24     #print(ports)
25     for ip in zip(ips,ports ):  #提取拼接ip和端口
26         print(ip)
27         proxies = {
28                 "http":"http://" + ip[0] + ":" + ip[1],
29                 "https":"http://" + ip[0] + ":" + ip[1]
30             }
31         try:
32             res = requests.get("http://www.baidu.com",proxies=proxies,timeout = 3)  #访问网站等待3s没有反应,自动断开
33             print(ip,"能使用")
34             with open("ip.text",mode="a+") as f:
35                 f.write(":".join(ip))  #写入ip.text文本
36                 f.write("\n") #换行
37         except Exception as e:   #捕捉错误异常
38             print(ip,"不能使用")

00xx06---最后

哈哈,感觉爬取的速度太慢了,毕竟是单线程;如果想爬得更快,可以试试用多线程。

以后再补充完善代码吧

猜你喜欢

转载自www.cnblogs.com/lanyincao/p/11886678.html