Python爬虫——建立IP代理池

在使用Python爬虫时，经常遇见具有反爬机制的网站。我们可以通过伪装headers来爬取，但是网站还是可以获取你的ip，从而禁掉你的ip来阻止爬取信息。
在requests库的请求方法中，我们可以通过proxies参数来伪装我们的ip。网上有一些提供免费代理ip的网站，可以把这些ip爬取下来，经检测可用后建立ip代理池。

ip代理网站：
（https://www.xicidaili.com/nt/）
（https://www.kuaidaili.com/free/intr/）

推荐一种常用的伪装请求头（User-Agent）的方法

from fake_useragent import UserAgent
# Build request headers that carry a randomly chosen browser User-Agent.
ua = UserAgent()
headers = {"User-Agent": ua.random}

接下来进入正题

爬取ip（IPPool.py）

import requests
from lxml import etree
from fake_useragent import UserAgent
# Disguise the crawler: send a randomly chosen browser User-Agent.
ua = UserAgent()
headers = {"User-Agent": ua.random}
def get_ip():
    """Scrape the first page of the xicidaili proxy list.

    Returns:
        list: one string per usable table row, formatted as
        ``<protocol>://<ip>:<port>`` with the protocol lower-cased.
        Rows with a missing IP, port or protocol cell are skipped.

    Raises:
        requests.RequestException: if the page cannot be fetched or the
            request times out.
    """
    # Proxies are short-lived, so only the first page is scraped.
    url = 'https://www.xicidaili.com/nt/'
    # A timeout keeps the call from hanging forever on an unresponsive site.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    tree = etree.HTML(response.text)
    # Guard in case the parser yields no root element (e.g. an empty body).
    if tree is None:
        return []

    ip_list = []
    # NOTE(review): only rows whose class is exactly "odd" are selected;
    # confirm the site does not mark alternate rows differently.
    for row in tree.xpath('//tr[@class="odd"]'):
        # Columns 2, 3 and 6 hold the IP, the port and the protocol.
        host = row.xpath('./td[2]/text()')
        port = row.xpath('./td[3]/text()')
        protocol = row.xpath('./td[6]/text()')
        # An empty cell produces an empty list; skip the row instead of
        # failing the whole scrape with an IndexError.
        if not (host and port and protocol):
            continue
        # Strip stray whitespace so the assembled proxy URL stays valid.
        ip_list.append(
            protocol[0].strip().lower() + '://' + host[0].strip() + ':' + port[0].strip()
        )
    return ip_list
if __name__ == "__main__":
    # Script entry point: scrape once and show the collected proxy URLs.
    ip_list = get_ip()
    print(ip_list)

测试ip

测试方法一（from multiprocessing.dummy import Pool）

import requests
from multiprocessing.dummy import Pool
#获取爬取到的ip列表
from IPPool import get_ip
# Candidate proxies, scraped when this module is loaded.
test_list = get_ip()
# Shared list collecting the proxies that pass the probe.
ip_list = []
# Endpoint requested through each proxy.
url = "http://icanhazip.com"
# Fixed desktop Firefox User-Agent sent with every probe.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0",
}
def ip_test(ip):
    """Probe a single proxy and record it in the global ``ip_list`` if it works.

    Args:
        ip: proxy URL of the form ``<scheme>://<host>:<port>``.

    Returns:
        None. On success ``ip`` is appended to ``ip_list``; the verdict is
        printed either way.
    """
    # requests picks the proxy by the scheme of the *target* URL. Registering
    # the proxy under a single key lets the request bypass it (and succeed
    # over a direct connection) whenever that key differs from the test URL's
    # scheme, so register it for both schemes.
    proxies = {'http': ip, 'https': ip}
    try:
        response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        # An error status (e.g. 403/502 from the proxy) means it is not usable.
        response.raise_for_status()
    except Exception:
        # Best-effort probe of untrusted scraped data: any failure marks the
        # proxy as unusable, but Ctrl-C / SystemExit are no longer swallowed
        # as they were by the bare ``except:``.
        print(ip + "不可用")
    else:
        ip_list.append(ip)
        print(ip + "可用")
if __name__ == '__main__':
    # Probe the candidates with a pool of 4 workers.
    pool = Pool(4)
    try:
        pool.map(ip_test, test_list)
    finally:
        # Shut the pool down and wait for its workers even if a probe raised.
        pool.close()
        pool.join()
    print(ip_list)
    print("总共爬取%s个ip，可用ip为：%s，不可用ip为：%s"%(len(test_list),len(ip_list),len(test_list)-len(ip_list)))

测试结果：
在这里插入图片描述

测试方法二（Threading多线程队列）

import threading
import requests
import queue
from fake_useragent import UserAgent

#获取爬取到的ip列表
from IPPool import get_ip
# Candidate proxies, scraped when this module is loaded.
test_list = get_ip()
# Shared list collecting the proxies that pass the probe.
ip_pool = []
# Random browser User-Agent sent with every probe request.
ua = UserAgent()
headers = {"User-Agent": ua.random}

# Page fetched through each proxy; http://icanhazip.com/ is an alternative.
url = "https://www.csdn.net/"

def test_ip(queue_list):
    while True:
        if queue_list.empty():
            break
        else:
            ip = queue_list.get()
            if ip.split(":")[0] == 'http':
                proxies = {
                    'http' : ip
                }
            else:
                proxies = {
                    'https': ip
                }
            try:
                response = requests.get(url=url, headers=headers, proxies=proxies,timeout=3)
                if response.status_code == 200:
                    print("【%s】测试%s,测试结果【可用】" % (threading.current_thread().name, proxies))
                    ip_pool.append(ip)
            except:
                print("【%s】测试%s,测试结果【不可用】" % (threading.current_thread().name, proxies))

if __name__ == "__main__":
    # Put every scraped proxy into one queue shared by the workers.
    queue_list = queue.Queue()
    for candidate in test_list:
        queue_list.put(candidate)

    # Start five named worker threads, then wait for all of them to finish.
    out_thread = []
    for item in range(5):
        worker = threading.Thread(target=test_ip, args=(queue_list,), name="进程%s" % item)
        out_thread.append(worker)
    for worker in out_thread:
        worker.start()
    for worker in out_thread:
        worker.join()

    # Report the proxies that passed and the overall counts.
    print('测试完成')
    print(ip_pool)
    total = len(test_list)
    usable = len(ip_pool)
    print("总共爬取%s个ip，可用ip为：%s，不可用ip为：%s"%(total, usable, total - usable))

结果：
在这里插入图片描述
测试网址不需要那么复杂，www.baidu.com一类的都可以，有一位博主推荐了一个测试网站：http://icanhazip.com/

在测试时遇到了一个坑，没有太注意协议是http还是https，统一用了http，然后发现每一个ip都可以用，当然这是不可能的，经过修改后，测试成功的ip大概在二十五个左右。

https://www.kuaidaili.com/free/intr/ 这个网址的ip爬取代码也写了（目前只取了ip，还没有拼接端口和协议），但是这个网址一页的ip有点少，所以就没有测试。

IPPool2.py

import requests
from lxml import etree
from fake_useragent import UserAgent
# Disguise the crawler: send a randomly chosen browser User-Agent.
ua = UserAgent()
headers = {"User-Agent": ua.random}

def get_ip():
    """Scrape the IP column from the first page of the kuaidaili free list.

    Returns:
        list: the text of the first cell of every table row that has one.
        Only the bare IP is collected; port and protocol are not added.

    Raises:
        requests.RequestException: if the page cannot be fetched or the
            request times out.
    """
    url = 'https://www.kuaidaili.com/free/intr/'
    # A timeout keeps the call from hanging forever on an unresponsive site.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    tree = etree.HTML(response.text)
    # Guard in case the parser yields no root element (e.g. an empty body).
    if tree is None:
        return []

    ip_list = []
    for row in tree.xpath('//*[@id="list"]/table/tbody/tr'):
        cell = row.xpath('./td[1]/text()')
        # An empty cell produces an empty list; skip the row instead of
        # failing the whole scrape with an IndexError.
        if not cell:
            continue
        ip_list.append(cell[0].strip())
    return ip_list
if __name__ == "__main__":
    # Run the scraper once; uncomment the print below to inspect the result.
    ip_list = get_ip()
    # print(ip_list)

王里木目心

发布了27 篇原创文章 · 获赞 9 · 访问量 1938

私信关注