[python] Custom ROC plotting and a data-driven optimal threshold

Plotting the ROC curve and the optimal threshold

The optimal threshold here is the value that maximizes the Youden index (sensitivity + specificity - 1).

For binary classification, pass in the true labels (data_true: {0,1}) and the predicted values (data_pred: continuous scores):

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics


def stat_tfpr(data_true, data_pred, thr, tol=6):
    """Count the confusion-matrix cells at threshold thr and return (TPR, FPR)."""
    TP, FP, TN, FN = 0, 0, 0, 0
    for i, j in zip(data_true, data_pred):
        if j >= thr:  # pred = 1 (positive)
            if i == 1:  # real positive
                TP += 1
            else:  # real negative
                FP += 1
        else:  # pred = 0 (negative)
            if i == 0:  # real negative
                TN += 1
            else:  # real positive
                FN += 1
    # True positive rate: TPR = TP/(TP+FN)
    # False positive rate: FPR = FP/(FP+TN)
    tpr = round(float(TP) / float(TP + FN), tol)
    fpr = round(float(FP) / float(FP + TN), tol)
    return tpr, fpr


def my_range(minx, maxx, n, tol=6):
    """Yield n evenly spaced thresholds in (minx, maxx], rounded to tol decimals."""
    span = maxx - minx
    step = round(float(span) / n, tol)
    x = minx
    for i in range(n):
        x += step
        yield round(x, tol)
        

def my_tpr_fpr(data_true, data_pred, n, tol=6):
    """Sweep n thresholds between min(data_pred) and max(data_pred)."""
    max_prd = max(data_pred)
    min_prd = min(data_pred)
    tpr_lst, fpr_lst, thr_lst = [], [], []
    for i in my_range(min_prd, max_prd, n, tol):
        tpr, fpr = stat_tfpr(data_true, data_pred, i, tol)
        tpr_lst.append(tpr)
        fpr_lst.append(fpr)
        thr_lst.append(i)
    return tpr_lst, fpr_lst, thr_lst


def all_max_idx(lst):
    """Return every index at which lst attains its maximum value."""
    mx = max(lst)
    idx_lst = []
    for i, j in enumerate(lst):
        if j == mx:
            idx_lst.append(i)
    return idx_lst


def find_optimal_cutoff_lst(TPR, FPR, threshold):
    # Youden index y = TPR - FPR; np.argmax(y) would return only the first
    # maximizer, so collect every index that attains the maximum instead.
    yd_lst = [round(i, 6) for i in np.array(TPR) - np.array(FPR)]
    mx_idxs = all_max_idx(yd_lst)
    lst = []
    for Youden_index in mx_idxs:
        optimal_threshold = threshold[Youden_index]
        point = [FPR[Youden_index], TPR[Youden_index]]
        lst.append([optimal_threshold, point])
    return lst


def roc_thr_plot(fpr, tpr, thrs):
    """Plot the ROC curve and mark every Youden-optimal threshold point."""
    roc_auc = metrics.auc(fpr, tpr)
    plt.figure(figsize=(6, 6))
    plt.title('Validation ROC')
    plt.plot(fpr, tpr, label='Val AUC = %0.3f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1])  # chance diagonal

    optimal_lst = find_optimal_cutoff_lst(TPR=tpr, FPR=fpr, threshold=thrs)
    for i, optimal_v in enumerate(optimal_lst):
        optimal_th, optimal_point = optimal_v
        plt.plot(optimal_point[0], optimal_point[1], marker='o', color='r')
        plt.text(optimal_point[0] + 0.02 * (i + 1), optimal_point[1] - 0.08 * (i + 1),
                 'Threshold:{optimal_th:.2f} [{a:.4f}, {b:.4f}]'
                 ''.format(optimal_th=optimal_th, a=optimal_point[0], b=optimal_point[1]))

    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
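
The driver below expects true labels my_true_data and continuous prediction scores my_pred_data, which are not defined in the post; any pair of arrays in that format works, e.g. this synthetic stand-in (purely illustrative):

# Illustrative stand-in data only: {0,1} labels plus continuous scores.
rng = np.random.RandomState(42)
my_true_data = rng.randint(0, 2, size=300)
my_pred_data = np.clip(0.3 * my_true_data + 0.7 * rng.rand(300), 0, 1)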

# From the true labels my_true_data and predicted values my_pred_data, compute
# TPR/FPR and the threshold list. If the predictions only differ beyond ~6
# decimal places, scale them up (e.g. by 1000); n is the number of threshold steps.
tlst, flst, thr_lst = my_tpr_fpr(my_true_data, my_pred_data*1000, n=2000)
# Plot from the computed values:
roc_thr_plot(np.array(flst), np.array(tlst), thr_lst)

Example plot: each red point in the figure is an optimal point, and the label beside it gives the threshold value, abscissa, and ordinate. All of these thresholds attain the same optimal Youden index.
[figure: ROC curve with the Youden-optimal threshold points marked in red]
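
As a cross-check, sklearn's roc_curve yields an equivalent Youden-optimal threshold (on the unscaled scores) directly; a minimal sketch, noting that np.argmax returns only the first maximizer, whereas find_optimal_cutoff_lst above collects all ties:

from sklearn.metrics import roc_curve

fpr_sk, tpr_sk, thr_sk = roc_curve(my_true_data, my_pred_data)
youden_idx = np.argmax(tpr_sk - fpr_sk)  # first maximizer only
print('optimal threshold:', thr_sk[youden_idx],
      'point:', (fpr_sk[youden_idx], tpr_sk[youden_idx]))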

Plotting ROC with sklearn.metrics

Plot the ROC curve using metrics.roc_curve:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
import matplotlib.pyplot as plt

def my_roc_plt(model, testx, testy):
    pred_decs_y = model.decision_function(testx)  # signed distance to the hyperplane
    fpr, tpr, threshold = metrics.roc_curve(testy, pred_decs_y)
    roc_auc = metrics.auc(fpr, tpr)

    plt.figure(figsize=(3, 3))
    plt.plot(fpr, tpr, label='AUC=%0.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend()
    plt.show()

def evaluate_model_report(model, testx, testy):
    predy = model.predict(testx)
    print(predy)
    accuracy_score = metrics.accuracy_score(testy, predy)
    recall_score = metrics.recall_score(testy, predy)
    # Note: passing hard 0/1 predictions gives the AUC of a single operating
    # point; the score-based ROC/AUC is plotted by my_roc_plt below.
    roc_score = metrics.roc_auc_score(testy, predy)
    print('accuracy\trecall\troc\n{:.4f}\t{:.4f}\t{:.4f}'
          ''.format(accuracy_score, recall_score, roc_score), end='')

    # class_report = metrics.classification_report(testy, predy)

    conf_mat = metrics.confusion_matrix(testy, predy)
    conf_df = pd.DataFrame(conf_mat, index=['True:0', 'True:1'], columns=['pred:0', 'pred:1'])

    my_roc_plt(model, testx, testy)
    return conf_df
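
The usage below assumes a DataFrame df with feature columns plus a 'label' column; if none is at hand, a synthetic stand-in (purely illustrative) can be built with make_classification:

from sklearn.datasets import make_classification

# Toy data in the layout the snippet expects: features + a 'label' column.
Xs, ys = make_classification(n_samples=500, n_features=10, random_state=1)
df = pd.DataFrame(Xs, columns=['f%d' % i for i in range(10)])
df['label'] = ys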

# Split the dataset:
X = df.drop(columns=['label'])  # or X = df.drop('label', axis=1)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit a LinearSVC model
clf = LinearSVC(loss='hinge').fit(X_train, y_train)
score = clf.score(X_train, y_train)
# Plot the ROC and report the metrics
evaluate_model_report(clf, X_test, y_test)

Example plot output:
[figure: ROC curve for the LinearSVC model]

Logistic regression example

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def lst_idxs(lst, itm, nround=6):
    """Return every index where lst matches itm (compared at nround decimals)."""
    midxs = []
    for i, m in enumerate(lst):
        if round(m, nround) == round(itm, nround):
            midxs.append(i)
    return midxs


def choose_optidx(fpr, tpr):
    """
    Among points with FPR < 0.2 (specificity > 80%), pick the one with the
    largest Youden index, breaking ties by the smallest FPR.
    """
    idxs = []
    tpr_ilst, fpr_ilst = [], []
    for i, a in enumerate(fpr):
        if float(a) < 0.2:  # specificity > 80%
            idxs.append(i)
            tpr_ilst.append(tpr[i])
            fpr_ilst.append(fpr[i])

    # All candidates attaining the maximum Youden index
    yd_arr = np.array(tpr_ilst) - np.array(fpr_ilst)
    max_yd = round(max(yd_arr), 6)
    maxyd_idxs = lst_idxs(yd_arr, max_yd, nround=6)

    # Highest specificity (smallest FPR) among the tied candidates;
    # map the filtered index back to the full array before comparing.
    midx = idxs[maxyd_idxs[0]]
    min_fpr = fpr[midx]
    for k in maxyd_idxs:
        idx = idxs[k]
        if fpr[idx] < min_fpr:
            min_fpr = fpr[idx]
            midx = idx

    return midx
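
A quick way to exercise choose_optidx is on the output of sklearn's roc_curve; the data here is synthetic and purely illustrative (note the function assumes at least one point with FPR < 0.2 exists):

from sklearn.metrics import roc_curve

rng = np.random.RandomState(0)
y_true = rng.randint(0, 2, 200)
y_score = np.clip(0.4 * y_true + 0.6 * rng.rand(200), 0, 1)  # noisy scores

fpr, tpr, thresholds = roc_curve(y_true, y_score)
opt_idx = choose_optidx(fpr, tpr)
print('threshold=%.4f  FPR=%.4f  TPR=%.4f'
      % (thresholds[opt_idx], fpr[opt_idx], tpr[opt_idx]))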


def lr_train_test_result(train_df, ntest_df, label, outdir='./', 
                         penalty='l2', solver='liblinear', max_iter=100, C=100, 
                         use_opt=False, scaler=False, usepca=False, use_tsne=False, n_components=3):
    
    X = train_df.drop(label, axis=1)
    y = train_df[label]
    
    X_ntest = ntest_df.drop(label, axis=1)
    y_ntest = ntest_df[label]
    
    if scaler:
        # Standardize the test data with the training mean and std
        # (training statistics only, computed before transforming X)
        mu, sd = X.mean(), X.std()
        X = (X - mu) / sd
        X_ntest = (X_ntest - mu) / sd
        
    if usepca:
        # Reduce the features to n_components principal components
        pca = PCA(n_components=n_components)
        columns = ['pca_' + str(i + 1) for i in range(n_components)]

        # Fit PCA on the training features
        X_pca = pca.fit_transform(X)
        X = pd.DataFrame(X_pca, columns=columns)

        # Project the new samples with the fitted PCA
        X_ntest_pca = pca.transform(X_ntest)
        X_ntest = pd.DataFrame(X_ntest_pca, columns=columns)
        
    if use_tsne:
        # Reduce dimensionality with t-SNE. Note: t-SNE has no transform()
        # for unseen samples, so the test set gets its own embedding, which
        # is not in the same space as the training embedding.
        tsne = TSNE(n_components=n_components, perplexity=10, random_state=42)
        X_tsne = tsne.fit_transform(X)
        columns = ['tsne_' + str(i + 1) for i in range(n_components)]
        X = pd.DataFrame(X_tsne, columns=columns)

        X_ntest_tsne = tsne.fit_transform(X_ntest)
        X_ntest = pd.DataFrame(X_ntest_tsne, columns=columns)
        
    best_params = {
        'penalty': penalty,
        'solver': solver,
        'max_iter': max_iter,
        'C': C,
    }
    
    # DataFrames to hold the predicted probability scores
    traindata_predscore_df = train_df.copy(deep=True)
    ntestdata_predscore_df = ntest_df.copy(deep=True)
    
    # Train and predict with the chosen parameters:
    clf_a = LogisticRegression(**best_params)

    # Fit the classifier
    clf_a.fit(X, y)

    # Coefficients (slopes):
    coefs = clf_a.coef_
    # Intercept:
    inter = clf_a.intercept_
    
    # -----------------
    # Full training set: AUC, accuracy, sensitivity, specificity
    # -----------------
    all_score_lst = []
    y_allproba = clf_a.predict_proba(X)[:, 1]
    # AUC on the training set
    all_score_lst.append(roc_auc_score(y, y_allproba))
    # Accuracy on the training set
    all_score_lst.append(clf_a.score(X, y))

    # Find the optimal threshold:
    all_fpr, all_tpr, all_thresholds = roc_curve(y, y_allproba)
    if use_opt:
        all_optimal_idx = choose_optidx(all_fpr, all_tpr)
        all_opt_thr = all_thresholds[all_optimal_idx]
    else:
        all_opt_thr = 0.5

    # Binarize at the chosen threshold (clf_a.predict would use 0.5)
    y_allpred = np.where(y_allproba >= all_opt_thr, 1, 0)
    alltn, allfp, allfn, alltp = confusion_matrix(y, y_allpred).ravel()
    # Sensitivity
    all_sen = alltp / (alltp + allfn)
    all_score_lst.append(all_sen)
    # Specificity
    all_spe = alltn / (alltn + allfp)
    all_score_lst.append(all_spe)
    traindata_predscore_df['Merge:pred'] = y_allproba
    
    # -----------------
    # Evaluate on the new test data
    # -----------------
    an_score_lst = []
    y_anproba = clf_a.predict_proba(X_ntest)[:, 1]
    # AUC on the test set
    an_score_lst.append(roc_auc_score(y_ntest, y_anproba))
    # Accuracy on the test set
    an_score_lst.append(clf_a.score(X_ntest, y_ntest))

    # Binarize using the threshold chosen on the training set
    y_anpred = np.where(y_anproba >= all_opt_thr, 1, 0)
    antn, anfp, anfn, antp = confusion_matrix(y_ntest, y_anpred).ravel()
    # Sensitivity
    an_sen = antp / (antp + anfn)
    an_score_lst.append(an_sen)
    # Specificity
    an_spe = antn / (antn + anfp)
    an_score_lst.append(an_spe)
    ntestdata_predscore_df['ntest:pred'] = y_anproba
    
    # Write the prediction scores to disk
    traindata_predscore_df.to_csv(outdir + '/train_data.predscore.txt', sep='\t')
    ntestdata_predscore_df.to_csv(outdir + '/ntest_data.predscore.txt', sep='\t')

    # ROC curves for train and test at the chosen threshold
    train_label = 'train_thr:{thr:.6f}\nspe.={spe:.4f},sen.={sen:.4f}'.format(thr=all_opt_thr, spe=all_spe, sen=all_sen)
    ntest_label = 'ntest_thr:{thr:.6f}\nspe.={spe:.4f},sen.={sen:.4f}'.format(thr=all_opt_thr, spe=an_spe, sen=an_sen)
    my_rocplots_opt({train_label: (y, y_allproba, 1 - all_spe, all_sen),
                     ntest_label: (y_ntest, y_anproba, 1 - an_spe, an_sen)})

    return all_score_lst, an_score_lst, all_opt_thr, coefs, inter

# Input: df_train_x and df_ntest_x are DataFrames; the first column holds the
# sample names, the middle columns the features, and the last column the target (label).
lr_train_test_result(df_train_x, df_ntest_x, label, outdir=outdir,
                     penalty='l2', solver='liblinear', max_iter=999, C=999,   # or e.g. solver='newton-cg'
                     use_opt=True, scaler=False, usepca=False, use_tsne=False, n_components=3)
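
lr_train_test_result calls a plotting helper my_rocplots_opt that the post never defines; the sketch below is one plausible implementation consistent with the call site (each value is (y_true, y_score, optimal_fpr, optimal_tpr)) and must be defined before running the function:

# Hypothetical sketch of the undefined helper my_rocplots_opt.
def my_rocplots_opt(curves):
    """curves: {label: (y_true, y_score, opt_fpr, opt_tpr)} -> overlaid ROC plots."""
    plt.figure(figsize=(5, 5))
    for lab, (y_true, y_score, opt_fpr, opt_tpr) in curves.items():
        fpr, tpr, _ = roc_curve(y_true, y_score)
        plt.plot(fpr, tpr, label='%s (AUC=%.3f)' % (lab, auc(fpr, tpr)))
        plt.plot(opt_fpr, opt_tpr, marker='o', color='r')  # chosen operating point
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right', fontsize=8)
    plt.show()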

k-fold cross-validation ROC

Plot the ROC curve for each fold together with the mean ROC

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

def perform_kfold_cv(X, y, n_splits=5):
    # Stratified k-fold split of the dataset
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    optimal_thresholds = []

    fig, ax = plt.subplots()

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = LogisticRegression()
        model.fit(X_train, y_train)

        fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
        roc_auc = auc(fpr, tpr)
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        aucs.append(roc_auc)

        # Threshold with the best Youden index on this fold
        optimal_threshold_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_threshold_idx]
        optimal_thresholds.append(optimal_threshold)

        ax.plot(fpr, tpr, lw=1, alpha=0.3)

        # Mark the (FPR, TPR) point of the optimal threshold
        optimal_fpr = fpr[optimal_threshold_idx]
        optimal_tpr = tpr[optimal_threshold_idx]
        ax.scatter(optimal_fpr, optimal_tpr, color='red', marker='o')

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    ax.plot(mean_fpr, mean_tpr, color='b', label=f'Mean ROC (AUC = {mean_auc:.2f})', lw=2)
    ax.plot([0, 1], [0, 1], color='r', linestyle='--', lw=2, label='Random Guess')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic')
    ax.legend(loc="lower right")

    plt.show()

    print("Optimal Thresholds:", optimal_thresholds)

# # Example usage
# X = ...  # feature matrix
# y = ...  # target vector

# perform_kfold_cv(X, y, n_splits=5)

Example usage and the resulting plot:

# df = ...  # DataFrame whose `target` column holds the labels
X = df.drop(columns=[target])
y = df[target]
perform_kfold_cv(X, y, n_splits=5)  # 5-fold cross-validation

[figure: per-fold ROC curves with the mean ROC]

Evaluation criteria for binary classification models


Confusion matrix:

actual \ predicted     predicted positive (1)   predicted negative (0)   total
actual positive (1)    TP                       FN                       TP + FN
actual negative (0)    FP                       TN                       FP + TN

Model evaluation metrics (a short computation sketch follows the list):

  • AUC value: the area under the ROC curve.

  • TPR (true positive rate): TPR = TP / (TP + FN), the proportion of all actual positives that are predicted positive. Also called sensitivity (Sen.) or recall.

  • FPR (false positive rate): FPR = FP / (FP + TN), the proportion of all actual negatives that are predicted positive.

  • Specificity: Spe. = TN / (TN + FP) = 1 - FPR, the proportion of all actual negatives that are (correctly) predicted negative.

  • Precision: precision = TP / (TP + FP), the proportion of true positives among the predicted positives.

  • F1 score: F1 = 2·TP / (2·TP + FP + FN), a combined metric that works well for imbalanced classes.

  • Youden index: Youden = Sen. + Spe. - 1 = TPR - FPR, a single index combining sensitivity and specificity.

  • KS value: KS = max(TPR - FPR).
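A minimal sketch (assuming arrays y_true of {0,1} labels and y_score of probabilities) that computes the metrics above from the confusion matrix at a given threshold, plus the KS statistic over all thresholds:

import numpy as np
from sklearn.metrics import confusion_matrix, roc_curve

def binary_metrics(y_true, y_score, thr=0.5):
    y_pred = (np.asarray(y_score) >= thr).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sen = tp / (tp + fn)               # TPR = sensitivity = recall
    spe = tn / (tn + fp)               # specificity = 1 - FPR
    precision = tp / (tp + fp)
    f1 = 2 * tp / (2 * tp + fp + fn)
    fpr, tpr, _ = roc_curve(y_true, y_score)
    ks = max(tpr - fpr)                # KS over all thresholds
    return {'sen': sen, 'spe': spe, 'precision': precision,
            'f1': f1, 'youden': sen + spe - 1, 'ks': ks}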

[figures: ROC curve, KS curve]


References:
https://blog.csdn.net/qq_42433311/article/details/124124893
https://blog.csdn.net/weixin_43543177/article/details/107565947

Original post: blog.csdn.net/sinat_32872729/article/details/129296895