绘制ROC曲线

绘制ROC曲线首选肯定是查阅官方文档:sklearn.metrics.roc_curve(y_true, y_score, pos_label=None, sample_weight=None, drop_intermediate=True)
但是所谓 y_score 到底指的是什么?查找网上相关经验时,发现几乎所有文章用的都是同一个例子,并没有解决我的疑惑。于是我多次尝试该值的计算方法,发现 probas_ = model.predict_proba(X[test]) 可以得到每个样本属于各个类别的概率(其中第二列即为样本为正例的概率)。经测试,该函数对多个模型都适用。虽然官方文档指出 decision_function() 适用于某些分类器,但是目前我所看到的只有 SVC 分类器使用该函数计算 y_score。因此遇到画 ROC 曲线时,可以先尝试使用 predict_proba 函数。

接下来 fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])。注意传入该函数的 y_score 必须是一维数组(即只取概率矩阵中的一列),否则会报错(数组类型错误)。该函数返回的就是我们想要的 ROC 曲线的横、纵坐标数组(fpr、tpr)以及对应的阈值数组。

另外 roc_auc = auc(fpr, tpr) 计算该model的auc值。

接下来附上完整代码:

def work1():
    """Plot ROC curves of four classifiers on the orange churn data set.

    Reads the feature table from ``orange_train.xls`` and the labels from
    ``orange_large_train_churn.txt`` (both relative to the working
    directory), keeps the first 9999 samples, splits them 70/30 into
    train and test sets, fits logistic regression, Gaussian naive Bayes,
    a decision tree and k-nearest-neighbours, and draws one ROC curve per
    model (with its AUC in the legend) in a single matplotlib figure.

    Returns:
        None. The function prints diagnostics and shows a figure.
    """
    # Imports are kept function-local, as in the original snippet.
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn import neighbors
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import auc, roc_curve  # ROC points and area under curve
    # ``sklearn.cross_validation`` was removed; ``train_test_split`` lives in
    # ``sklearn.model_selection``.
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier

    # Column names of the features used for training (one feature per column).
    feature_names = ['Var11', 'Var21', 'Var22', 'Var28',
                     'Var39', 'Var46', 'Var47', 'Var50']
    # Only the first 9999 rows of both files are used.
    n_samples = 9999

    # Load the features. ``DataFrame.ix`` no longer exists, and its label-based
    # slice ``0:10000`` was end-inclusive (10001 rows), which did not match the
    # 9999 labels below; a positional ``iloc`` slice keeps X and y aligned.
    data = pd.read_excel("orange_train.xls")
    xdata = data[feature_names].iloc[:n_samples]
    # The label file has a single column and no header row.
    ydata = pd.read_csv("orange_large_train_churn.txt", header=None)
    y = np.array(ydata[0].tolist()[:n_samples])
    X = np.array(xdata)
    print(X.shape)
    print(y.shape)

    # Shuffle and split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state=0)

    # (legend name, estimator, line colour) for every model to compare.
    classifiers = [
        ("LR", LogisticRegression(random_state=12), 'darkorange'),
        ("NB", GaussianNB(), 'r'),
        ("TREE", DecisionTreeClassifier(random_state=0), 'y'),
        ("KNN", neighbors.KNeighborsClassifier(), 'b'),
    ]

    for name, model, color in classifiers:
        # predict_proba returns one column per class; column 1 is used as the
        # positive-class score (assumes a binary label - TODO confirm).
        proba = model.fit(X_train, y_train).predict_proba(X_test)
        # roc_curve needs a 1-D score array, hence the single column.
        fpr, tpr, thresholds = roc_curve(y_test, proba[:, 1])
        roc_auc = auc(fpr, tpr)
        if name == "LR":
            # Diagnostic dump kept from the original (logistic regression only).
            print(y_test, proba)
            print(fpr, tpr, thresholds)
        # False positive rate on the x axis, true positive rate on the y axis.
        plt.plot(fpr, tpr, color=color,
                 label='%s ROC curve (area = %0.2f)' % (name, roc_auc))

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

猜你喜欢

转载自www.cnblogs.com/kjkj/p/9234986.html