Building Your Own IP Proxy Pool

Use the free Xici (西刺) proxy list to build your own pool of proxy IPs.

Python libraries required
  • requests
  • scrapy (only its Selector is used)
  • pymysql
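
All three can be installed with pip, for example:

pip install requests scrapy pymysql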

Database table

create table proxy_ip (
  no BIGINT AUTO_INCREMENT,
  ip VARCHAR(20) UNIQUE NOT NULL,
  port VARCHAR(10) NOT NULL,
  address VARCHAR(255) DEFAULT '',
  proxy_type VARCHAR(10),
  speed DECIMAL(10, 3) DEFAULT 0,  -- seconds; a plain DECIMAL would truncate to integers
  PRIMARY KEY (no)
) DEFAULT CHARSET = utf8;
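
For reference, a row produced by the crawler below might look like this (the values are purely illustrative; speed is in seconds):

INSERT INTO proxy_ip (ip, port, address, proxy_type, speed)
VALUES ('127.0.0.1', '8080', 'example', 'HTTP', 0.123);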

The code:

import threading
import requests
import time
from scrapy import Selector
import pymysql
import sys

DB_URL = 'localhost'
DB_USER = 'username'
DB_PASSWORD = 'password'
DB_NAME = 'spider_data'
DB_CHARSET = 'utf8'


class MyProxy():

    # A single connection shared by the class (recent pymysql versions
    # require keyword arguments here)
    conn = pymysql.connect(host=DB_URL, user=DB_USER, password=DB_PASSWORD,
                           database=DB_NAME, charset=DB_CHARSET)
    cursor = conn.cursor()

    def __init__(self):
        # Hand this instance to the cleanup thread; letting the thread build
        # its own MyProxy would spawn threads recursively
        DeleteIPThread(self).start()

    def get_ip(self):
        '''
        Pick a random working IP from the database.
        Returns None when no address is usable at all.
        :return: (ip, port, speed, proxy_type) or None
        '''
        sql = '''
            select ip,port,speed,proxy_type from proxy_ip order by rand() limit 1;
        '''
        # execute() returns the number of rows in the result set
        # (the original checked cursor.arraysize, which is always 1)
        count = self.cursor.execute(sql)
        if count > 0:
            # (ip, port, speed, proxy_type)
            res = self.cursor.fetchone()
            if self.judge_ip(res[0], res[1]):
                return res
            # judge_ip already deleted the dead row; try another one
            return self.get_ip()
        # Pool is empty: re-crawl once, then give up if still empty
        self.crawl_ips()
        if self.cursor.execute(sql) > 0:
            return self.get_ip()
        return None

    def crawl_ips(self):
        '''
        Crawl the Xici free proxy list into the database.
        :return: None
        '''
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh,en-US;q=0.9,en;q=0.8,zh-TW;q=0.7,zh-CN;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTZjNDNmNjgzZWY5OWQ4ZWRmNTA5MzU3YWJiOGJlYWMwBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMVBsU3h6aU0xa25KWlZXZE5qZ0tGd21xYkJtc3J0K2w0YlEwdUhlNjFBN009BjsARg%3D%3D--abe7f4154a205b8515bfb204e3fe924006ae1d68",
            "Host": "www.xicidaili.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"
        }
        url = "http://www.xicidaili.com/nn/1"
        response = None
        for i in range(10):
            try:
                response = requests.get(url, headers=headers, timeout=10)
            except requests.exceptions.RequestException:
                # Covers timeouts and connection errors; just try again
                print("Request %d failed, retrying..." % (i + 1))
                continue
            if response.status_code == 200:
                break
        if response is None or response.status_code != 200:
            print("Network too poor, or the address is blocked: all 10 requests failed")
            return
        s = Selector(text=response.text)
        # Skip the header row; every remaining <tr> is one proxy entry
        all_list = s.xpath('//table[@id="ip_list"]/tr')[1:]
        for item in all_list:
            try:
                line = item.xpath('./td')
                ip = line[1].xpath('string(.)').extract_first()
                port = line[2].xpath('string(.)').extract_first()
                address = ''
                if len(line[3].xpath('./a')) > 0:
                    address = str(line[3].xpath('./a/text()').extract_first())
                proxy_type = line[5].xpath('string(.)').extract_first()
                speed = 0.0
                if len(line[6].xpath('./div/@title')) > 0:
                    # The title looks like "0.123秒"; strip the trailing unit
                    speed_str = line[6].xpath('./div/@title').extract_first()
                    speed = float(speed_str[:-1])

                print(ip, port, address, proxy_type, speed)

                # Parameterized query: avoids SQL injection and quoting bugs
                sql = '''
                    INSERT
                    INTO proxy_ip(ip, port, address, proxy_type, speed)
                    VALUES (%s, %s, %s, %s, %s);
                '''
                self.cursor.execute(sql, (ip, port, address, proxy_type, speed))
                self.conn.commit()
            except Exception:
                # Duplicate IPs violate the UNIQUE constraint and land here
                print(sys.exc_info())

    def judge_ip(self, ip, port):
        '''
        Check whether the given proxy is usable.
        :param ip:
        :param port:
        :return: True if the proxy answers, False otherwise
        '''
        http_url = 'https://www.baidu.com/'
        proxy_url = 'http://{0}:{1}'.format(ip, port)

        try:
            # Both schemes must be mapped; with only an 'http' entry,
            # requests would fetch the https test URL directly and every
            # proxy would look valid
            proxy_dict = {
                'http': proxy_url,
                'https': proxy_url
            }
            print("Testing whether the proxy is usable => ", proxy_url)
            response = requests.get(http_url, proxies=proxy_dict, timeout=5)

        except Exception:
            print("Proxy:", proxy_url, "is unusable; deleting it from the database")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print("Proxy => ", proxy_url, "is usable")
                return True
            else:
                self.delete_ip(ip)
                return False

    def delete_ip(self, ip):
        '''
        Delete an unusable IP.
        :param ip:
        :return:
        '''
        sql = '''
            delete from proxy_ip WHERE ip=%s;
        '''
        self.cursor.execute(sql, (ip,))
        self.conn.commit()

class DeleteIPThread(threading.Thread):

    def __init__(self, proxy):
        super().__init__()
        # Reuse the caller's MyProxy: constructing a new one here would
        # start another DeleteIPThread, which would construct another
        # MyProxy, and so on without end
        self.proxy = proxy
        self.daemon = True

    def run(self):
        # A separate connection for this thread's own SELECTs
        conn = pymysql.connect(host=DB_URL, user=DB_USER, password=DB_PASSWORD,
                               database=DB_NAME, charset=DB_CHARSET)
        cursor = conn.cursor()
        sql = "select ip,port from proxy_ip;"
        while True:
            cursor.execute(sql)
            all_list = cursor.fetchall()
            for ip, port in all_list:
                print(ip, port)
                # judge_ip deletes the row itself whenever the proxy fails
                self.proxy.judge_ip(ip, port)
                time.sleep(1)
            time.sleep(20)

if __name__ == '__main__':
    my_proxy = MyProxy()
    my_proxy.crawl_ips()
    # my_proxy.get_ip()

Usage

After creating the object, call crawl_ips() to start crawling proxy IPs. Call get_ip() to pick a random IP from the database and check that it still works; if it does not, the method recurses until a usable one is found. Once every address in the database has been exhausted (none works), a fresh crawl is triggered automatically, and if that still yields nothing get_ip() returns None.

Creating the object also starts a daemon thread that maintains the pool, weeding out addresses that have gone dead.
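
A minimal sketch of consuming the pool from another script, assuming the code above is saved as a module named my_proxy (the module name is hypothetical):

import requests
from my_proxy import MyProxy  # hypothetical module name

pool = MyProxy()              # also starts the daemon cleanup thread
pool.crawl_ips()              # fill the pool from the free proxy list
res = pool.get_ip()           # (ip, port, speed, proxy_type) or None
if res is not None:
    proxy_url = 'http://{0}:{1}'.format(res[0], res[1])
    proxies = {'http': proxy_url, 'https': proxy_url}
    # httpbin echoes the caller's IP, handy for verifying the proxy works
    r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(r.text)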

Reposted from blog.csdn.net/goldlone/article/details/81415959