Scraping proxy IPs from a proxy website

import re
from urllib.parse import urlparse
import requests
from fake_useragent import UserAgent
class ProxySpider:
    def __init__(self, url_init='https://www.xicidaili.com/nn/'):
        '''
        Initialization
        :param url_init: URL of the anonymous-proxy list page
        '''
        self.headers = {'User-Agent': UserAgent().random}  # random User-Agent, chosen once per instance
        self.url_seed = url_init  # seed URL of the proxy list
        self.timeout = 5  # timeout (seconds) when testing whether a proxy is usable
    def myRequest(self):
        '''
        Issue a plain GET request to the seed URL
        :return: the requests.Response
        '''
        return requests.get(self.url_seed, headers=self.headers)

    def save_resource(self, path_file):
        '''
        Save the fetched page to a local file at path_file
        '''
        res = self.myRequest()
        with open(path_file, 'w', encoding='utf-8') as f:
            f.write(res.content.decode('utf-8'))

    def parse(self):
        '''
        Parse the proxy list page: extract IP, port and protocol from each row
        :return: a list of proxy URLs such as 'HTTP://ip:port'
        '''
        content_html = self.myRequest().content.decode('utf-8')
        content_html = content_html.replace('\n', '')  # flatten the page so .*? does not need re.S
        res = re.findall(r'<tr\sclass="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(HTTP|HTTPS)</td>', content_html, re.I)
        return [proxy_parts[2] + '://' + proxy_parts[0] + ':' + proxy_parts[1] for proxy_parts in res]

    def url_parts(self, domain):
        '''
        Split a URL into its components
        :param domain: the proxy URL to split
        :return: a urllib.parse.ParseResult
        '''
        return urlparse(domain)

    def ip_filter_available(self):
        '''
        Filter out one usable proxy
        :return: a proxies dict accepted by requests, or None if no proxy works
        '''
        ips = self.parse()
        for ip in ips:
            pr = self.url_parts(ip)
            scheme = pr.scheme
            netloc = pr.netloc
            ip_new = scheme + '://' + netloc
            # requests selects the proxy by the scheme of the target URL, so register
            # the proxy for both http and https; otherwise an http proxy would be
            # silently bypassed when testing against the https test URL below.
            proxies = {'http': ip_new, 'https': ip_new}
            try:
                res = requests.get('https://www.baidu.com/', proxies=proxies, timeout=self.timeout)
                if res.status_code == 200:
                    return proxies
            except Exception:
                continue
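
To see what the regex in parse() actually extracts, here is a minimal sketch that runs the same pattern against a hypothetical HTML fragment imitating the site's table layout (the fragment and its values are made up for illustration):

import re

sample_html = ('<tr class="odd"><td class="country"></td><td>1.2.3.4</td><td>8080</td>'
               '<td><a href="#">Somewhere</a></td><td>elite</td><td>HTTP</td></tr>')
pattern = r'<tr\sclass="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(HTTP|HTTPS)</td>'
print(re.findall(pattern, sample_html, re.I))  # [('1.2.3.4', '8080', 'HTTP')]
# parse() then joins each tuple into 'HTTP://1.2.3.4:8080'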

Test:

proxy_spider = ProxySpider()
print(proxy_spider.ip_filter_available())
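
Since ip_filter_available() returns either a proxies dict that requests accepts directly or None, the validated proxy can be reused for follow-up requests. A minimal sketch reusing the instance above (the target URL is just a placeholder):

proxies = proxy_spider.ip_filter_available()
if proxies:  # None means no proxy passed the availability check
    # Route a subsequent request through the validated proxy.
    res = requests.get('https://www.baidu.com/', proxies=proxies, timeout=5)
    print(res.status_code)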

Result:
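
Assuming at least one proxy responds within the timeout, the script prints a dict along the lines of {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'} (the address is a placeholder); if no proxy passes the check, it prints None.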

Reposted from blog.csdn.net/ryuhfxz/article/details/86484222