Python爬虫 免费爬取代理并测试可用性

     话不多说，直接贴代码：

import random
import re
import socket
import telnetlib

import chardet
import requests


class GetProxy:
    """Scrape free-proxy listing pages and probe the proxies they advertise.

    Typical flow: download a listing page with ``get_pageResource``, then hand
    the HTML to ``get_IpaddrAndPort``, which extracts ``ip:port`` pairs and
    (by default) probes each one with ``test_Agent``.
    """

    # A single table cell. Attributes on the tag and line breaks inside the
    # cell are tolerated so slightly different table markup still parses.
    _CELL_RE = re.compile(r"<td\b[^>]*>(.*?)</td>", re.IGNORECASE | re.DOTALL)
    # Dotted-quad candidate; the octet range is validated separately.
    _IP_RE = re.compile(r"(?<![\d.])\d{1,3}(?:\.\d{1,3}){3}(?![\d.])")
    # A cell that consists solely of a port number (1-5 digits).
    _PORT_RE = re.compile(r"\d{1,5}")

    def __init__(self):
        self.header={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.43"}

    def get_pageResource(self, url, timeout=10):
        """Download *url* and return its body decoded to ``str``.

        The page is fetched once; the raw bytes are decoded with the encoding
        chardet detects, which avoids mojibake on pages that do not declare
        their charset correctly.

        Args:
            url: Address of the listing page.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded page text. Undecodable bytes are replaced rather than
            raising.

        Raises:
            requests.RequestException: If the request itself fails.
        """
        response = requests.get(url=url, headers=self.header, timeout=timeout)
        raw = response.content
        # chardet reports encoding=None when it cannot decide (e.g. an empty
        # body); fall back to UTF-8 instead of crashing.
        encoding = chardet.detect(raw).get("encoding") or "utf-8"
        try:
            return raw.decode(encoding, errors="replace")
        except LookupError:
            # The detected name is not a codec Python knows about.
            return raw.decode("utf-8", errors="replace")

    @staticmethod
    def _is_valid_ip(candidate):
        """Return True if every octet of the dotted quad is in 0-255."""
        return all(int(octet) <= 255 for octet in candidate.split("."))

    def get_IpaddrAndPort(self, page_content, verify=True):
        """Extract ``ip:port`` pairs from the table cells of *page_content*.

        A pair is recognised as a ``<td>`` containing an IPv4 address that is
        immediately followed by a ``<td>`` containing only a port number.
        Pairing adjacent cells (instead of collecting all IPs and all ports
        and zipping them by index) keeps unrelated numeric cells from
        shifting every later pair.

        Args:
            page_content: HTML text of a proxy listing page.
            verify: When true (the default), the extracted pairs are passed
                to ``test_Agent`` and only the reachable ones are returned.
                When false, the extracted pairs are returned untested.

        Returns:
            A list of ``"ip:port"`` strings, de-duplicated, in page order.
        """
        candidates = []
        pending_ip = None
        for raw_cell in self._CELL_RE.findall(page_content):
            cell = raw_cell.strip()
            ip_match = self._IP_RE.search(cell)
            if ip_match and self._is_valid_ip(ip_match.group()):
                pending_ip = ip_match.group()
                continue
            if (
                pending_ip is not None
                and self._PORT_RE.fullmatch(cell)
                and 0 < int(cell) <= 65535
            ):
                candidates.append(f"{pending_ip}:{int(cell)}")
            # Only the cell directly after an IP may supply its port.
            pending_ip = None

        # dict.fromkeys drops duplicates while keeping first-seen order.
        unique_candidates = list(dict.fromkeys(candidates))
        if not verify:
            return unique_candidates
        return self.test_Agent(unique_candidates)

    @staticmethod
    def _can_connect(host, port, timeout):
        """Return True if a TCP connection to host:port opens within timeout."""
        try:
            # The context manager closes the probe socket immediately.
            with socket.create_connection((host, port), timeout=timeout):
                return True
        except (OSError, ValueError):
            return False

    def test_Agent(self, ipAddress, timeout=3):
        """Probe each ``"ip:port"`` entry and return the reachable ones.

        An entry counts as usable when a TCP connection to it can be opened.
        This only shows that the port accepts connections; it does not send
        any proxy request through it. The verdict for every entry is printed.

        Args:
            ipAddress: Iterable of ``"ip:port"`` strings.
            timeout: Seconds to wait for each connection attempt.

        Returns:
            The entries that accepted a connection, in input order.
            Malformed entries are reported as unusable and skipped.
        """
        proxy = []
        for entry in ipAddress:
            host, _, port_text = entry.rpartition(":")
            try:
                port = int(port_text)
            except ValueError:
                port = 0  # Forces the "unusable" branch below.
            reachable = (
                bool(host)
                and 0 < port <= 65535
                and self._can_connect(host, port, timeout)
            )
            if reachable:
                print(f"{entry}可用")
                proxy.append(entry)
            else:
                print(f"{entry}不可用")
        return proxy


# Driver: scrape every listing site and collect the proxies reported usable.
getPro = GetProxy()
proxy = ['http://ip.yqie.com/ipproxy.htm', 'http://www.ip3366.net/']
ipProxy = []  # Accumulates results across all sites instead of keeping only the last.
for source_url in proxy:
    try:
        page_content = getPro.get_pageResource(source_url)
        found = getPro.get_IpaddrAndPort(page_content)
    except Exception as exc:
        # Top-level boundary: one unreachable or malformed site must not
        # stop the remaining sites from being scraped.
        print(f"{source_url} 抓取失败: {exc}")
        continue
    # get_IpaddrAndPort may return None; treat that as "nothing found".
    ipProxy.extend(found or [])

猜你喜欢

转载自blog.csdn.net/weixin_55109596/article/details/123847419