SKlearn - ROC and AUC - 代码天地

ROC、AUC 的理论知识请参考我的博客分类模型评估

本文旨在总结其在 SKlearn 中的用法

基础用法

先看源码

def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
              drop_intermediate=True):
    """Compute Receiver operating characteristic (ROC)

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    pos_label : int or str, default=None
        The label of the positive class, i.e. which of the values in
        ``y_true`` counts as "positive". For example, if the labels are
        [1, 2] and 2 is the positive class, pass ``pos_label=2``.
        When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
        ``pos_label`` is set to 1, otherwise an error will be raised.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    drop_intermediate : boolean, optional (default=True)
        Whether to drop some suboptimal thresholds which would not appear
        on a plotted ROC curve. This is useful in order to create lighter
        ROC curves.

        .. versionadded:: 0.17
           parameter *drop_intermediate*.

    Returns
    -------
    fpr : array, shape = [>2]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i].

    tpr : array, shape = [>2]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i].

    thresholds : array, shape = [n_thresholds]
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `max(y_score) + 1`.
    """

然后看一个最普通的示例，包括 ROC 的计算、AUC 的计算、ROC 曲线绘制

import numpy as np
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt


################################### ROC and AUC ###################################
# Toy data: four samples labelled 1 or 2, with one score per sample.
y = np.asarray([1, 1, 2, 2])
scores = np.asarray([0.1, 0.4, 0.35, 0.8])

# ---------------------------- ROC ----------------------------
# pos_label names the label that is treated as the positive class (here: 2).
roc_points = roc_curve(y, scores, pos_label=2)
fpr, tpr, thresholds = roc_points
for series in roc_points:
    print(series)
# Expected output, in order:
#   fpr        -> [0.  0.  0.5 0.5 1. ]
#   tpr        -> [0.  0.5 0.5 1.  1. ]
#   thresholds -> [1.8  0.8  0.4  0.35 0.1 ]
# NOTE(review): the leading 1.8 is max(scores) + 1 as described in the quoted
# docstring; newer scikit-learn releases appear to report inf there instead.

# How to read the three arrays (a sample is predicted positive when its
# score >= threshold):
#   threshold 0.1  : every negative and every positive is predicted positive;
#                    the threshold is too low.
#   threshold 0.35 : half of the negatives and all of the positives are
#                    predicted positive; too many false accepts (for
#                    face-recognition cash withdrawal, strangers could
#                    empty your account).
#   threshold 0.4  : half of the negatives and half of the positives are
#                    predicted positive; a poor operating point.
#   threshold 0.8  : no negatives and half of the positives are predicted
#                    positive; nobody else can withdraw your money, but you
#                    may not be able to either.
#   threshold 1.8  : nothing is predicted positive; the threshold is too high.

# ---------------------------- AUC ----------------------------
# Area under the ROC curve, once from the curve points and once directly
# from the labels and scores.
area_from_curve = auc(fpr, tpr)
area_direct = roc_auc_score(y, scores)
print(area_from_curve)  # 0.75
print(area_direct)      # 0.75

# ------------------------- ROC plot --------------------------
plt.plot(fpr, tpr)
plt.show()

输出

EER 选择模型阈值

ROC 用于优化模型

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import roc_auc_score,roc_curve
import matplotlib.pyplot as plt
import numpy as np

iris = load_iris()
# Turn the three iris classes into a binary problem:
# original classes 0 and 1 become 0, original class 2 becomes 1.
binary_target = np.where(iris.target == 2, 1, 0)
# The split is not seeded, so every run (and every printed value below)
# differs; the numbers quoted in the comments come from one example run.
x_train, x_test, y_train, y_test = train_test_split(iris.data, binary_target, test_size=0.3)

# multi_class is omitted on purpose: the problem is binary, and the parameter
# is deprecated in recent scikit-learn releases.
model = LogisticRegression(solver='newton-cg')
model.fit(x_train, y_train)
y_pre = model.predict_proba(x_test)
print('predict_proba is', y_pre)

# Column 1 holds the probability of class 1: values near 0 mean class 0,
# values near 1 mean class 1. This is the score handed to roc_curve.
y_0 = y_pre[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_0)
print(thresholds)
# e.g. [1.98964087e+00 9.89640873e-01 6.03375665e-01 5.68953989e-01
#       4.81061404e-01 2.40418592e-01 2.24425917e-01 3.43507028e-06]

# Stored under a name that does not shadow sklearn.metrics.auc.
auc_value = roc_auc_score(y_test, y_0)
print('AUC is', auc_value)

# ---- KS statistic: the largest gap between TPR and FPR over all thresholds.
# np.argmax returns the first maximum, matching a strict ">" scan.
ks_curve = tpr - fpr
best_idx = int(np.argmax(ks_curve))
KS_max = ks_curve[best_idx]
best_thr = thresholds[best_idx]
print('最大KS为：', KS_max)          # e.g. 1.0
print('最佳阈值为：', best_thr)      # e.g. 0.6998150731799142

# ---- Plot the ROC curve, its individual points, and the anti-diagonal
# TPR = 1 - FPR (red), on which the false-positive rate equals the
# false-negative rate.
plt.figure()
plt.plot(fpr, tpr)
plt.plot(fpr, tpr, 'o')
plt.plot([0, 1], [1, 0], 'r')
# Plain text title: wrapping it in $...$ would render it as mathtext and
# swallow the space between the two words.
plt.title('ROC curve')
plt.show()

输出

显然第 3 个点(圆圈内) 离红色直线 y = 1 - x（即 FPR = 1 - TPR 的反对角线）最近

one vs rest 多分类 ROC

每个二分类都有一个 ROC

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

# 导入鸢尾花数据集
# Load the iris data set.
iris = datasets.load_iris()
X = iris.data  # X.shape==(150, 4)
y = iris.target  # y.shape==(150, )

# One-hot (binarized) labels: one column per class.
y = label_binarize(y, classes=[0, 1, 2])  # shape==(150, 3)
n_classes = y.shape[1]  # n_classes==3

# Append noise features to make the problem harder.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape  # n_samples==150, n_features==4
# 4 original columns + 200 * 4 noise columns.
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]  # shape==(150, 804)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

# Learn to separate each class from all the others (one-vs-rest); the
# per-class decision_function values are used as ROC scores.
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# ROC curve and AUC for every class, each treated as its own binary problem.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])


plt.figure()
lw = 2
color = ['r', 'g', 'b']
# Iterate over n_classes rather than a hard-coded 3, and name the class in
# each legend entry so the three curves can be told apart.
for i, line_color in zip(range(n_classes), color):
    plt.plot(fpr[i], tpr[i], color=line_color, lw=lw,
             label='ROC curve of class %d (area = %0.2f)' % (i, roc_auc[i]))
# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

输出

多分类 - 宏 ROC 微 ROC

宏 ROC 和微 ROC 好像有点绕，很容易搞反；参考资料（以及 sklearn 官方示例）中的命名与下面的定义是一致的：代码里 "micro" 对应微 ROC，"macro" 对应宏 ROC。本人做如下解释来区分这两个概念

宏 ROC：先让每个二分类独自计算，再算总的

微 ROC：先把每个二分类综合(加)起来，再算总的

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from scipy.interpolate import lagrange, interp1d


iris = datasets.load_iris()
X = iris.data  # X.shape==(150, 4)
y = iris.target  # y.shape==(150, )

# One-hot (binarized) labels: one column per class.
y = label_binarize(y, classes=[0, 1, 2])  # shape==(150, 3)
n_classes = y.shape[1]  # n_classes==3

# Append noise features to make the problem harder.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape  # n_samples==150, n_features==4
# 4 original columns + 200 * 4 noise columns.
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]  # shape==(150, 804)

# Shuffle and split into training and test halves.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# ROC curve and AUC for every class, each treated as its own binary problem.
fpr = dict()        # false positive rate
tpr = dict()        # true positive rate
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# ------------------ micro-average ROC curve and AUC ------------------
# Micro: pool first, then compute. Every (sample, class) pair is flattened
# into one big binary problem and a single ROC curve is computed over it.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# ------------------ macro-average ROC curve and AUC ------------------
# Macro: compute per class first, then average the per-class curves.
# Collect every FPR value that occurs in any per-class curve.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
print(all_fpr.shape)        # e.g. (42,)

# Interpolate each per-class curve onto the common FPR grid and sum them.
# np.interp is used because scipy.interp (an alias of it) has been removed
# from SciPy; scipy.interpolate.interp1d(fpr[i], tpr[i])(all_fpr) would be
# an alternative way to do the same linear interpolation.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Divide by the number of classes to get the average curve, then its AUC.
mean_tpr /= n_classes
print(mean_tpr)

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# ------------------------ plot all ROC curves ------------------------
plt.figure()
lw = 2
plt.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]), color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]), color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw, label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

输出

参考资料：

https://blog.csdn.net/hfutdog/article/details/88079934

SKlearn - ROC and AUC

基础用法

EER 选择模型阈值

one vs rest 多分类 ROC

多分类 - 宏 ROC 微 ROC

猜你喜欢