话不多说，直接贴代码：
import random
import re
import socket
import telnetlib

import chardet
import requests
class GetProxy:
    """Scrape free-proxy listing pages and probe the proxies they advertise.

    Typical use: download a listing page with ``get_pageResource`` and hand
    the HTML to ``get_IpaddrAndPort``, which extracts ``ip:port`` pairs from
    the table cells and returns the ones that accept a TCP connection.
    """

    # Contents of every table cell; attributes on the <td> tag are tolerated.
    _CELL_RE = re.compile(r"<td\b[^>]*>(.*?)</td>", re.IGNORECASE | re.DOTALL)
    # Dotted-quad IPv4 candidate that is not part of a longer digit/dot run.
    _IPV4_RE = re.compile(
        r"(?<![\d.])(?:\d{1,3}\.){3}\d{1,3}(?![\d.])", re.ASCII
    )
    # A cell consisting solely of 1-5 digits is a port candidate.
    _PORT_RE = re.compile(r"\d{1,5}", re.ASCII)

    def __init__(self):
        # Browser-like User-Agent sent with every page request.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.43"}

    def get_pageResource(self, url, timeout=10):
        """Download *url* and return its body decoded to ``str``.

        The page is fetched once; the raw bytes are decoded with the charset
        that ``chardet`` detects, which avoids garbled text on pages that are
        not UTF-8. Undecodable bytes are replaced rather than raising.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded page source.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        raw = requests.get(
            url=url, headers=self.header, timeout=timeout
        ).content
        # chardet reports None when it cannot decide (e.g. an empty body).
        encoding = chardet.detect(raw)["encoding"] or "utf-8"
        try:
            return raw.decode(encoding, errors="replace")
        except LookupError:
            # chardet named a codec this Python build does not provide.
            return raw.decode("utf-8", errors="replace")

    def get_IpaddrAndPort(self, page_content):
        """Extract ``ip:port`` pairs from *page_content* and probe them.

        Args:
            page_content: HTML source of a proxy listing page.

        Returns:
            The ``ip:port`` strings that ``test_Agent`` found reachable.
        """
        return self.test_Agent(self._parse_proxies(page_content))

    @classmethod
    def _parse_proxies(cls, page_content):
        """Return the ``ip:port`` strings found in the ``<td>`` cells.

        Cells are scanned in document order. An IPv4 address is paired with
        the first port-like cell that follows it; an address that is followed
        by another address before any port is dropped, so one malformed row
        cannot shift the pairing of the rows after it.
        """
        proxies = []
        pending_ip = None
        for cell in cls._CELL_RE.findall(page_content or ""):
            cell = cell.strip()
            ip_match = cls._IPV4_RE.search(cell)
            if ip_match and all(
                int(octet) <= 255 for octet in ip_match.group().split(".")
            ):
                pending_ip = ip_match.group()
            elif (
                pending_ip is not None
                and cls._PORT_RE.fullmatch(cell)
                and 0 < int(cell) <= 65535
            ):
                proxies.append(f"{pending_ip}:{cell}")
                pending_ip = None
        return proxies

    def test_Agent(self, ipAddress, timeout=3):
        """Return the entries of *ipAddress* that accept a TCP connection.

        Only a TCP connection is attempted; no request is sent through the
        proxy. The outcome for every entry is printed.

        Args:
            ipAddress: Iterable of ``"host:port"`` strings.
            timeout: Seconds to wait for each connection attempt.

        Returns:
            The reachable entries, in input order.
        """
        proxy = []
        for entry in ipAddress:
            host, _, port = entry.rpartition(":")
            # An entry without a host part is reported as unavailable
            # instead of raising or probing the local machine.
            if host and self._can_connect(host, port, timeout):
                print(f"{entry}可用")
                proxy.append(entry)
            else:
                print(f"{entry}不可用")
        return proxy

    @staticmethod
    def _can_connect(host, port, timeout):
        """Return True if a TCP connection to ``host:port`` can be opened."""
        try:
            # The context manager closes the socket right after the probe.
            with socket.create_connection((host, int(port)), timeout=timeout):
                return True
        except (OSError, OverflowError, ValueError):
            # Refused, timed out, unresolvable host, or an invalid port.
            return False
# Driver: download each proxy-listing page and pass its HTML to the parser.
getPro = GetProxy()
proxy = ['http://ip.yqie.com/ipproxy.htm', 'http://www.ip3366.net/']
for listing_url in proxy:
    page_content = getPro.get_pageResource(listing_url)
    ipProxy = getPro.get_IpaddrAndPort(page_content)