python: Multi-threaded scraping of high-anonymity proxy IPs from Xici and Kuaidaili

I originally set out to scrape some data, but my IP kept getting banned after too many requests, so I built a dedicated tool that scrapes high-anonymity proxy IPs from Xici and Kuaidaili.

It runs under Python 3.5 and needs the requests library; it also uses BeautifulSoup (bs4) with the lxml parser for the HTML parsing.

While writing it I looked at what other people had posted online, but most of those scrapers are single-threaded, which wastes a lot of time, and the ready-made proxy-pool projects I found were still beyond what I can follow at my current level, so I put together this proxy pool that runs from a single .py file.

As for handling old IPs, the code below does not actually run that part. If you want to enable it, uncomment those lines, but you must first create a text file in the same folder whose name matches path_old (the configuration below defaults to "ip_old.txt"). I had planned an automatic check that creates the file when it is missing, but in the end I never added it.
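A minimal sketch of that missing auto-creation step, assuming the default path_old = "ip_old.txt" from the configuration section below; the helper name ensure_file is my own:

import os

def ensure_file(path):
    # Create an empty file if it does not exist yet, so read_txt() never fails on a missing file.
    if not os.path.exists(path):
        with open(path, "w", encoding="utf-8"):
            pass

ensure_file("ip_old.txt")  # call this before read_txt(path_old)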

One thing to pay close attention to: do not scrape more than about 10 pages in one go. I did not add any rate limiting to the requests, and both Xici and Kuaidaili will decide you are a crawler and ban your IP for 24 hours if you hit them too fast... I have already tried it. A simple throttle is sketched below.
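A minimal way to slow the page fetches down, assuming you are willing to trade speed for safety; polite_get is a hypothetical helper you could call in place of the plain requests.get inside call_net, and the delay range is an arbitrary guess:

import time
import random
import requests

def polite_get(url, headers, timeout=6):
    # Sleep 1-3 seconds before every page request so the free-proxy sites
    # are less likely to flag the scraper as abusive.
    time.sleep(random.uniform(1, 3))
    return requests.get(url=url, headers=headers, timeout=timeout)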

Here is the code:

# -*- coding: UTF-8 -*-
import threading, requests, datetime
from bs4 import BeautifulSoup
import random
import queue

# ------------------------------ IP-checking worker threads --------------------------
class Mythread(threading.Thread):

    def __init__(self, ip, path, url, type="new_ip"):
        super(Mythread, self).__init__()
        self.ip = ip
        self.path = path
        self.url = url
        self.type = type

    def run(self):
        if self.type == "new_ip":
            if semaphore.acquire():
                target1 = check_ip(self.ip, self.url)
                if target1:
                    write(self.ip, self.path)
                    all_IP.add(self.ip)
                    print("This IP works:", self.ip)
                semaphore.release()
        else:
            if semaphore.acquire():
                target2 = check_ip(self.ip, self.url)
                if target2:
                    all_IP.add(self.ip)
                    print("This old IP still works:", self.ip)
                semaphore.release()


# ------------------------------------ Elapsed-time calculation -----------------------------------------

def cost(start, end):
    seconds = (end - start).seconds
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    cost_time = ("%s:%s:%s" % (h, m, s))
    return cost_time


# ----------------- Pick a random User-Agent header --------------------

def getheaders():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {"User-Agent": UserAgent}
    return headers


# -------------------- Concurrent IP validation -------------------------------

def check_ip(ip, url):
    header = getheaders()
    proxies = {"http": "http://" + ip, "https": "http://" + ip}
    print("Testing IP:", ip)  # live progress output; comment out if the flood of messages is unwanted
    try:
        status = requests.get(url=url, proxies=proxies, headers=header, timeout=5).status_code  # an IP counts as usable if it responds within 5 seconds
        if status == 200:
            return True
        else:
            return False
    except requests.RequestException:
        return False


# --------------------- Clear-file helper -----------------------------

def clearing_txt(path):
    with open(path, 'w', encoding="utf-8") as f:  # opening in "w" mode already empties the file
        f.truncate()


# -------------------- Read helper (returns a list of lines) -------------------

def read_txt(path):
    txt = []
    with open(path, "r", encoding="utf-8") as h:
        for line in h.readlines():
            txt.append(line.strip())
    return txt


# -------------------- Append-to-file helper ----------------------------

def write(ip, path):
    with open(path, "a", encoding="utf8") as f:
        f.writelines(ip)
        f.write("\n")


# -------------------------- Start the threads and block until they finish -------------------------
def join(threads):
    for i in threads:
        i.start()
    for i in threads:
        i.join()


# ------------------------- Scrape the proxy sites for IPs ---------------------------------

def call_net(num, pagenum):
    scrapy_url = {1: 'https://www.kuaidaili.com/free/inha/',  # Kuaidaili
                  2: 'http://www.xicidaili.com/nn/'  # Xici
                  }
    get_url = scrapy_url[num] + str(pagenum)  # build the page URL
    header = getheaders()  # random request headers
    html = requests.get(url=get_url, headers=header, timeout=6).text  # fetch the page source
    soup = BeautifulSoup(html, 'lxml')  # parse the HTML
    if num == 2:
        rows_odd = soup.find_all("tr", class_="odd")  # Xici lists IPs under two different <tr> classes, so both are handled
        rows_plain = soup.find_all("tr", class_="")
        for i in rows_odd:
            t = i.find_all('td')
            ip_1 = t[1].text + ':' + t[2].text
            q.put(ip_1)  # push onto the IP queue
        for h in rows_plain:
            x = h.find_all('td')
            if not x:
                continue
            ip_2 = x[1].text + ':' + x[2].text
            q.put(ip_2)  # push onto the IP queue
    else:
        all_ip = soup.find_all(attrs={"data-title": "IP"})  # on Kuaidaili the IP and port sit under separate data-title attributes
        all_port = soup.find_all(attrs={"data-title": "PORT"})
        for i in range(len(all_ip)):
            ip_3 = all_ip[i].text + ":" + all_port[i].text
            q.put(ip_3)  # push onto the IP queue

# --------------------------- Main routine: collect and verify IPs ------------------------------
def get_ip(url, path, path_old, page):
    clearing_txt(path)  # empty the output file
    threads = []  # scraper threads
    threads_ip = []  # new-IP test threads
    threads_old_ip = []  # old-IP test threads
    start_time = datetime.datetime.now()  # record the start time
    for num in range(2):
        for page_num in range(page):
            net_threads = threading.Thread(target=call_net, args=(num + 1, page_num + 1))  # one scraper thread per site/page
            threads.append(net_threads)
    print("Scraping high-anonymity proxies from Xici and Kuaidaili")
    join(threads)
    # print("Re-testing previously saved IPs")
    # for i in old_IP_list:
    #     threads_old_ip.append(Mythread(i, path, url, type="old"))       # one test thread per old IP
    # join(threads_old_ip)
    # print("%s old IPs are still usable" % len(all_IP))
    while not q.empty():  # drain the queue and create one test thread per new IP
        i = q.get()
        threads_ip.append(Mythread(i, path, url))
    join(threads_ip)
    print("Scraping finished")
    end_time = datetime.datetime.now()  # record the end time
    cost_time = cost(start_time, end_time)
    clearing_txt(path_old)
    for all_ip in all_IP:  # write every verified IP to the old-IP file
        write(all_ip, path_old)
    new_ip = read_txt(path)
    old_ip = read_txt(path_old)
    print("Time taken: %s  New IPs scraped: %s  IPs saved: %s" % (cost_time, len(new_ip), len(old_ip)))


'''
1. Scrape proxy IPs from Xici and Kuaidaili.
2. Optionally re-verify the IPs saved from previous runs.
3. Use each IP to access the target site given by url; an IP counts as usable if it returns 200 within the 5-second timeout.
4. path_old is where the accumulated pool of old IPs is kept.
5. Everything is finally saved to the specified files.
'''


# -------------------------- Main configuration ------------------------------------
if __name__ == '__main__':
    semaphore = threading.Semaphore(15)  # size of the thread pool, default 15
    q = queue.Queue()  # queue that holds the scraped IPs
    page = 2  # number of pages to scrape from Xici and Kuaidaili; do not set this too high
    path = "ip.txt"  # file name for newly scraped IPs
    path_old = "ip_old.txt"  # file name for the accumulated old IPs
    url = "https://www.baidu.com/"  # URL used to verify that an IP works
    # old_IP_list = read_txt(path_old)  # load previously saved IPs
    all_IP = set()  # set of all verified IPs
    get_ip(url, path, path_old, page)
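Once the run finishes, using one of the saved proxies for your own scraping is straightforward. A minimal sketch, assuming the ip.txt produced above and the same proxies-dict format used in check_ip; fetch_with_proxy and target_url are my own names:

import random
import requests

def fetch_with_proxy(target_url, ip_file="ip.txt"):
    # Pick a random verified IP from the pool and route the request through it.
    with open(ip_file, "r", encoding="utf-8") as f:
        ips = [line.strip() for line in f if line.strip()]
    ip = random.choice(ips)  # assumes ip.txt is non-empty
    proxies = {"http": "http://" + ip, "https": "http://" + ip}
    return requests.get(target_url, proxies=proxies, timeout=5)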


Reposted from www.cnblogs.com/chenansen/p/10007162.html