利用ROC和AUC度量分类器性能

关于ROC及AUC的介绍，可以参考：https://blog.csdn.net/qq_41080850/article/details/86560645

本文主要讨论用Python3实现ROC的绘制。另：以下代码均运行于Jupyter Notebook中。

代码实现：

# Load the rocks-vs-mines data set.
# pandas is imported here because this cell uses `pd` and no other import in
# the file provides it.
import pandas as pd

# The CSV is read without a header row. `read_csv(prefix=...)` was deprecated
# in pandas 1.4 and removed in 2.0, so the 'V' prefix is applied afterwards
# with add_prefix; the columns are still named 'V0', 'V1', ... as before.
data = pd.read_csv('rock_mine.csv', header=None).add_prefix('V')
data.head()

data表结构如下所示：

# 导入要用到的相关包
import numpy as np
import random
from sklearn import datasets, linear_model
from sklearn.metrics import roc_curve, auc
import pylab as pl


# Confusion-matrix helper:
def confusionMatrix(predicted, actual, threshold):
    """Count the confusion-matrix cells for scores cut at ``threshold``.

    A sample is an actual positive when its ``actual`` value is greater than
    0.5. It is a predicted positive when its ``predicted`` score is greater
    than or equal to ``threshold``. The same rule is applied to both actual
    classes, so a score exactly equal to the threshold is always treated as a
    positive prediction (the ``score >= threshold`` convention that
    scikit-learn's ``roc_curve`` documents for its thresholds).

    Args:
        predicted: Sequence of real-valued scores, one per sample.
        actual: Sequence of true labels aligned with ``predicted``.
        threshold: Cut-off applied to ``predicted``.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences have
        different lengths (kept as a sentinel for backward compatibility).
    """
    if len(predicted) != len(actual):
        return -1

    tp = fn = fp = tn = 0.0
    for score, truth in zip(predicted, actual):
        actual_positive = truth > 0.5
        # Single decision rule for both classes; the previous code used `>`
        # for actual positives but `<` for actual negatives, so ties at the
        # threshold were classified inconsistently.
        predicted_positive = score >= threshold
        if actual_positive:
            if predicted_positive:
                tp += 1.0  # true positive
            else:
                fn += 1.0  # false negative
        else:
            if predicted_positive:
                fp += 1.0  # false positive
            else:
                tn += 1.0  # true negative
    return [tp, fn, fp, tn]


# Separate the feature columns (everything except the last column) from the
# label column 'V60', encoding mines ('M') as 1 and rocks ('R') as 0.
label_codes = {'M': 1, 'R': 0}
xList = list(data.iloc[:, :-1].values)
labels = data['V60'].map(label_codes).tolist()


# Hold-out split: every third row (index 0, 3, 6, ...) goes to the test set
# (1/3) and the remaining rows form the training set (2/3). Strictly speaking,
# this "test" set plays the role of a validation set.
indices = range(len(xList))
xListTest = xList[0::3]
labelsTest = labels[0::3]
xListTrain = [row for i, row in enumerate(xList) if i % 3]
labelsTrain = [lab for i, lab in enumerate(labels) if i % 3]


# Convert both splits to NumPy arrays, the input format used for the
# scikit-learn linear model below.
xTrain, yTrain = np.array(xListTrain), np.array(labelsTrain)
xTest, yTest = np.array(xListTest), np.array(labelsTest)


# Fit a linear regression model on the training split. fit() returns the
# estimator itself, so construction and fitting are chained.
rocksVMinesModel = linear_model.LinearRegression().fit(xTrain, yTrain)

# Score the training samples with the fitted model.
trainingPredictions = rocksVMinesModel.predict(xTrain)


# Confusion matrix on the training set at a decision threshold of 0.5.
# confusionMatrix returns the counts in the order [tp, fn, fp, tn].
confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, 0.5)
tp_train, fn_train, fp_train, tn_train = confusionMatTrain

# Print the four training-set counts.
print("tp_train = ", tp_train, "\tfn_train = ", fn_train, "\n",
      "fp_train = ", fp_train, "\ttn_train = ", tn_train, '\n', sep="")
# Printed result:
# tp_train = 68.0	fn_train = 6.0
# fp_train = 7.0	tn_train = 57.0


# Score the held-out test samples with the fitted model.
testPredictions = rocksVMinesModel.predict(xTest)

# Confusion matrix on the test set at a decision threshold of 0.5.
# confusionMatrix returns the counts in the order [tp, fn, fp, tn].
conMatTest = confusionMatrix(testPredictions, yTest, 0.5)
tp_test, fn_test, fp_test, tn_test = conMatTest

# Print the four test-set counts.
print("tp_test = ", tp_test, "\tfn_test = ", fn_test, "\n",
      "fp_test = ", fp_test, "\ttn_test = ", tn_test, '\n', sep="")
# Printed result:
# tp_test = 28.0	fn_test = 9.0
# fp_test = 9.0	tn_test = 24.0

# In-sample ROC: compute the curve points on the training set and the area
# under that curve.
roc_points_train = roc_curve(yTrain, trainingPredictions)
fpr_train, tpr_train, thresholds_train = roc_points_train
roc_auc_train = auc(fpr_train, tpr_train)
print('AUC for in-sample ROC curve: %f' % roc_auc_train)

pl.clf()  # clear the current figure
train_curve_label = 'ROC curve (area = %0.2f)' % roc_auc_train
pl.plot(fpr_train, tpr_train, label=train_curve_label)
pl.plot([0, 1], [0, 1], 'k--')  # diagonal: ROC of a random-guess classifier
pl.title('In sample ROC rocks versus mines')
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.legend(loc="best")

# Out-of-sample ROC: compute the curve points on the test set and the area
# under that curve.
roc_points_test = roc_curve(yTest, testPredictions)
fpr_test, tpr_test, thresholds_test = roc_points_test
roc_auc_test = auc(fpr_test, tpr_test)
print('AUC for out-of-sample ROC curve: %f' % roc_auc_test)

pl.clf()  # clear the current figure
test_curve_label = 'ROC curve (area = %0.2f)' % roc_auc_test
pl.plot(fpr_test, tpr_test, label=test_curve_label)
pl.plot([0, 1], [0, 1], 'k--')  # diagonal: ROC of a random-guess classifier
pl.title('Out-of-sample ROC rocks versus mines')
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.legend(loc="best")

参考：《Python机器学习——预测分析核心算法》Michael Bowles著

利用ROC和AUC度量分类器性能

猜你喜欢