from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score\
    ,fbeta_score,classification_report,confusion_matrix,precision_recall_curve,roc_auc_score\
    ,roc_curve
from sklearn.datasets import  load_iris
from sklearn.multiclass import  OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
import numpy as np


def test_accuracy_score():
    """Print the accuracy of a fixed binary prediction in two forms.

    ``accuracy_score`` is evaluated on the same hard-coded labels twice:
    once with ``normalize=True`` (fraction of correct predictions) and once
    with ``normalize=False`` (number of correct predictions).
    """
    actual = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    predicted = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0]
    # (printed label, normalize flag) pairs, reported in this order.
    variants = (
        ('accuracy_score<比例>:', True),
        ('accuracy_score<数字>:', False),
    )
    for caption, as_fraction in variants:
        print(caption, accuracy_score(actual, predicted, normalize=as_fraction))


test_accuracy_score()

精确率，召回率，f1-score

def test_precision_score():
    """Print accuracy and precision for a fixed binary prediction.

    Precision is the share of samples predicted as positive that really are
    positive.
    """
    labels_true = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    labels_pred = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]

    # Accuracy is shown first for comparison with precision.
    acc = accuracy_score(labels_true, labels_pred, normalize=True)
    print('accuracy_score:', acc)

    prec = precision_score(labels_true, labels_pred)
    print('precision_score:', prec)


test_precision_score()

def test_recall_score():
    """Print accuracy, precision and recall for a fixed binary prediction.

    Recall is the share of truly positive samples that were predicted as
    positive.
    """
    truth = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    guess = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]

    print('accuracy_score:', accuracy_score(truth, guess, normalize=True))
    # Precision first, then recall, each called with default arguments.
    for caption, metric in (
        ('precision_score:', precision_score),
        ('recall_score:', recall_score),
    ):
        print(caption, metric(truth, guess))


test_recall_score()

def test_f1_score():
    """Print accuracy, precision, recall and F1 for a fixed binary prediction.

    F1 is the harmonic mean of precision and recall.
    """
    truth = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    guess = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]

    print('accuracy_score:', accuracy_score(truth, guess, normalize=True))
    # The remaining metrics all take just (y_true, y_pred) here.
    default_metrics = (
        ('precision_score:', precision_score),
        ('recall_score:', recall_score),
        ('f1_score:', f1_score),
    )
    for caption, metric in default_metrics:
        print(caption, metric(truth, guess))


test_f1_score()


def test_fbeta_score():
    """Print F-beta for several beta values next to the metrics it interpolates.

    As beta approaches 0 the F-beta score approaches precision, beta = 1
    gives the F1 score, and as beta grows large it approaches recall.
    Accuracy, precision, recall and F1 are printed first for comparison.
    """
    truth = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    guess = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]

    print('accuracy_score:', accuracy_score(truth, guess, normalize=True))
    for caption, metric in (
        ('precision_score:', precision_score),
        ('recall_score:', recall_score),
        ('f1_score:', f1_score),
    ):
        print(caption, metric(truth, guess))

    # Sweep beta from "almost precision" to "almost recall".
    beta_cases = (
        ('fbeta score(beta=0.001):', 0.001),
        ('fbeta score(beta=1):', 1),
        ('fbeta score(beta=10):', 10),
        ('fbeta score(beta=10000):', 10000),
    )
    for caption, beta in beta_cases:
        print(caption, fbeta_score(truth, guess, beta=beta))


test_fbeta_score()

def test_classification_report():
    """Print the text report from ``classification_report`` for a fixed prediction.

    The two classes are displayed under the names ``class_0`` and ``class_1``.
    """
    truth = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    guess = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
    class_names = ['class_0', 'class_1']
    report = classification_report(truth, guess, target_names=class_names)
    print('Classification Report:\n', report)


test_classification_report()


def test_confusion_matrix():
    """Print the confusion matrix of a fixed binary prediction.

    scikit-learn places the true labels on the rows and the predicted labels
    on the columns, both in the order given by ``labels``. With
    ``labels=[0, 1]`` the printed matrix is therefore laid out as::

                      predicted 0   predicted 1
        actual 0          TN            FP
        actual 1          FN            TP
    """
    truth = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    guess = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
    label_order = [0, 1]
    matrix = confusion_matrix(truth, guess, labels=label_order)
    print('Confusion Matrix:\n', matrix)


test_confusion_matrix()

绘制P-R曲线
precision_recall_curve 的返回值：一个元组，元组内的元素为：
1.P-R曲线的查准率序列，序列的第i个元素是正类的阈值为thresholds[i]时的查准率；该序列不保证单调，最后一个元素固定为1
2.P-R曲线的查全率序列，该序列是递减序列，序列的第i个元素是正类的阈值为thresholds[i]时的查全率，最后一个元素固定为0
3.P-R曲线的阈值序列，该序列是递增序列，给出判定为正例时probas_pred的阈值

OneVsRestClassifier:多分类器
使用过程中要指明使用的二分类器是什么
在one-vs-all策略中，假设有n个类别，那么就建立n个二分类器，每个分类器针对其中一个类别和剩余类别进行分类
进行预测时，利用这n个二分类器分别进行分类，得到数据属于各个类别的概率，选择其中概率最大的一个类别作为最终的预测结果

%matplotlib inline
def test_precision_recall_curve():
    """Plot one precision-recall curve per iris class.

    The iris targets are binarized into one indicator column per class, the
    data is split 50/50 into training and test halves, and a one-vs-rest
    linear SVC is fitted on the training half. For every class the curve is
    computed from the decision-function scores on the test half and drawn on
    a shared axes (recall on x, precision on y), then the figure is shown.
    """
    iris = load_iris()
    X = iris.data
    # One indicator column per class, so each class is scored as its own
    # binary problem.
    y = label_binarize(iris.target, classes=[0, 1, 2])
    n_classes = y.shape[1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    clf = OneVsRestClassifier(
        SVC(kernel='linear', probability=True, random_state=0))
    # A single fit is enough: fit() returns the estimator, so the scoring
    # call can be chained onto it.
    y_score = clf.fit(X_train, y_train).decision_function(X_test)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    precision = dict()
    recall = dict()
    for i in range(n_classes):
        # The third return value (the thresholds) is not needed for the plot.
        precision[i], recall[i], _ = precision_recall_curve(
            y_test[:, i], y_score[:, i])
        ax.plot(recall[i], precision[i], label='target=%s' % i)

    ax.set_xlabel('Recall Score')
    ax.set_ylabel('Precision Score')
    ax.set_title('P-R')
    ax.legend(loc='best')
    # Leave a little headroom above 1.0 so curves touching the border stay visible.
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 1.1)
    ax.grid()
    plt.show()


test_precision_recall_curve()

precision和recall双高当然是最好的
但实际运用中往往precision 和 recall 成反比关系
比如只检索出一条且相关，则precision为100%而recall则很低
而实际运用中则根据需要调整指标，比如如果是做搜索，那就是保证召回率的情况下提升精确率
如果做疾病监测，反垃圾，则是保证精确率的条件下，提升召回

为了找precision和recall的平衡点，往往会通过绘制precision-recall curve
PRC纵轴是precision，而横轴是recall，所以PRC曲线越往右上凸越好（双高）
而F1值则是结合precision和recall的综合评估：2PR/(P+R)

通常还会用ROC曲线衡量分类器效果，ROC纵轴是true positive rate ，横轴是false positive rate
通常tpr越高，fpr越低，分类器效果越好，所以ROC曲线越往左上凸越好。

测试 roc_curve 的用法，并绘制ROC曲线

| | 预测 yes | 预测 no | 合计 |
|---|---|---|---|
| 实际 yes | TP | FN | P |
| 实际 no | FP | TN | N |
| 合计 | P′ | N′ | P+N |

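准确率：$\mathrm{accuracy}=\frac{TP+TN}{P+N}$
查准率：$\mathrm{precision}=\frac{TP}{TP+FP}$
查全率：$\mathrm{recall}=\frac{TP}{TP+FN}=\frac{TP}{P}$
$F_1=\frac{2\cdot\mathrm{precision}\cdot\mathrm{recall}}{\mathrm{precision}+\mathrm{recall}}$
$F_\beta=\frac{(1+\beta^2)\cdot\mathrm{precision}\cdot\mathrm{recall}}{\beta^2\cdot\mathrm{precision}+\mathrm{recall}}$
$\beta=0$ 时：为 precision score
$\beta=1$ 时：f1-score
$\beta\to\infty$ 时：为 recall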



ROC曲线：
纵坐标：真正率（TPR，也称敏感度或召回率）
TPR=TP/(TP+FN)（被正确预测为正的正样本数/正样本实际数）
横坐标：假正率（FPR）
FPR=FP/(FP+TN)（被错误预测为正的负样本数/负样本实际数）

def test_roc_curve():
    """Plot one ROC curve per iris class.

    The iris targets are binarized into one indicator column per class, the
    data is split 50/50 into training and test halves, and a one-vs-rest
    linear SVC is fitted on the training half. For every class the curve is
    computed from the decision-function scores on the test half and drawn on
    a shared axes (false positive rate on x, true positive rate on y), then
    the figure is shown.
    """
    iris = load_iris()
    X = iris.data
    # Binarize the labels: one indicator column per class.
    y = label_binarize(iris.target, classes=[0, 1, 2])
    n_classes = y.shape[1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    clf = OneVsRestClassifier(
        SVC(kernel='linear', probability=True, random_state=0))
    # A single fit is enough: fit() returns the estimator, so the scoring
    # call can be chained onto it.
    y_score = clf.fit(X_train, y_train).decision_function(X_test)

    # Compute and draw the ROC curve of each class.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    fpr = dict()
    tpr = dict()
    for i in range(n_classes):
        # The third return value (the thresholds) is not needed for the plot.
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        ax.plot(fpr[i], tpr[i], label='target=%s' % i)

    ax.set_xlabel('fpr')
    ax.set_ylabel('tpr')
    ax.set_title('roc score')
    ax.legend(loc='best')
    # Leave a little headroom above 1.0 so curves touching the border stay visible.
    ax.set_xlim(0, 1.1)
    ax.set_ylim(0, 1.1)
    ax.grid()
    plt.show()


test_roc_curve()

机器学习——性能度量_分类

精确率，召回率，f1-score

猜你喜欢