Python crawler: scraping free high-anonymity proxy IPs from Kuaidaili

The code is posted directly below without a step-by-step explanation; just copy, paste, and run it. If it does not run for you, reply "code" and I will send you the source.

import requests
from lxml import etree
import json


class XiciProxiesSpider(object):

    def __init__(self):
        self.num = 1  # page number of the free proxy list to fetch next
        # URL template for Kuaidaili's free high-anonymity list; the page number is filled in on each request
        self.start_url = 'https://www.kuaidaili.com/free/inha/{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
        }

    def get_page_from_url(self, url):
        # download one list page and return the decoded HTML
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_data_from_page(self, page):
        # parse the page into an lxml Element object
        html = etree.HTML(page)
        # grab every table row that holds proxy information
        trs = html.xpath('//tbody//tr')
        # walk the rows and collect the usable proxies, grouped by protocol
        data = {
            'http': [],
            # 'https': []
        }
        for tr in trs:
            try:
                ip = tr.xpath('./td[1]/text()')[0]  # IP address
                port = tr.xpath('./td[2]/text()')[0]  # port
                ip_type = tr.xpath('./td[4]/text()')[0].lower()  # protocol, lower-cased
                # skip rows whose protocol we are not collecting
                if ip_type not in data.keys():
                    continue
                # build the proxy entry in the format requests expects
                item = {ip_type: '{}:{}'.format(ip, port)}
                # keep the proxy only if it actually responds
                if self.validate_ip(item, ip_type):
                    data[ip_type].append(item)
            except Exception as ex:
                print(ex)
                print(etree.tostring(tr))
        print('usable proxies on this page:', data)
        return data

    def validate_ip(self, item, ip_type):
        try:
            # request a known page through the proxy; a 200 response means the proxy works
            test_url = "{}://blog.csdn.net/weixin_43407092/article/details/89743502".format(ip_type)
            response = requests.get(test_url, proxies=item, timeout=2)
            return response.status_code == 200
        except Exception:
            return False

    def save_data(self, data):
        # append this page's proxies to the file, one JSON object per line
        with open('快代理.txt', 'a') as f:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')
        # move on to the next page
        self.num += 1

    def run(self):
        while True:
            # build the URL for the current page and fetch its content
            url = self.start_url.format(self.num)
            page = self.get_page_from_url(url)
            # extract the proxies that passed validation
            data = self.get_data_from_page(page)
            # save them and advance to the next page
            self.save_data(data)



if __name__ == '__main__':
    fps = XiciProxiesSpider()
    fps.run()

After running the script for a while, only a small number of the collected proxies turn out to be usable.
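To reuse the collected proxies in another script, a minimal sketch like the one below can read them back from the file written by save_data (one JSON object per line). The file name 快代理.txt comes from the code above; the helper name load_proxies and the httpbin.org test request are just illustrative assumptions.

import json
import random

import requests


def load_proxies(path='快代理.txt'):
    # each line written by save_data() is one JSON object such as
    # {"http": [{"http": "1.2.3.4:8080"}, ...]}
    proxies = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            proxies.extend(record.get('http', []))
    return proxies


if __name__ == '__main__':
    pool = load_proxies()
    if pool:
        # each entry is already a proxies mapping like {'http': '1.2.3.4:8080'};
        # requests assumes http:// when no scheme is given
        proxy = random.choice(pool)
        response = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
        print(response.text)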

 
