Directorio de artículos
Trazar la curva ROC y el umbral óptimo
El valor de umbral óptimo aquí es el valor óptimo obtenido al evaluar utilizando el índice de Youden (sensibilidad + especificidad - 1).
Datos categóricos, ingrese el valor categórico verdadero (data_true: {0,1}) y el valor predicho (data_pred: {valor continuo})
def stat_tfpr(data_true, data_pred, thr, tol=6):
    """Compute the true-positive rate and false-positive rate at a threshold.

    Args:
        data_true: iterable of true binary labels in {0, 1}.
        data_pred: iterable of continuous prediction scores.
        thr: decision threshold; a score >= thr is predicted positive.
        tol: number of decimal places to round the returned rates to.

    Returns:
        (tpr, fpr), each rounded to `tol` decimals.  Returns 0.0 for a rate
        whose denominator is zero (i.e. one class is absent from data_true),
        instead of raising ZeroDivisionError.
    """
    TP, FP, TN, FN = 0, 0, 0, 0
    for i, j in zip(data_true, data_pred):
        if j >= thr:  # predicted positive
            if i == 1:    # real positive
                TP += 1
            else:         # real negative
                FP += 1
        else:         # predicted negative
            if i == 0:    # real negative
                TN += 1
            else:         # real positive
                FN += 1
    # TPR = TP/(TP+FN);  FPR = FP/(FP+TN)
    tpr = round(TP / (TP + FN), tol) if (TP + FN) else 0.0
    fpr = round(FP / (FP + TN), tol) if (FP + TN) else 0.0
    return tpr, fpr
def my_range(minx, maxx, n, tol=6):
    """Yield n evenly spaced threshold values over (minx, maxx].

    Yields round(minx + k*step, tol) for k = 1..n, where
    step = round((maxx - minx) / n, tol).  Each value is computed directly
    from k rather than by repeatedly adding the step, so floating-point
    rounding error does not accumulate over many iterations.
    """
    step = round(float(maxx - minx) / n, tol)
    for k in range(1, n + 1):
        yield round(minx + step * k, tol)
def my_tpr_fpr(data_true, data_pred, n, tol=6):
    """Sweep n thresholds across the prediction range and collect ROC points.

    Args:
        data_true: iterable of true binary labels in {0, 1}.
        data_pred: iterable of continuous prediction scores.
        n: number of threshold steps to split the score range into.
        tol: rounding precision forwarded to my_range/stat_tfpr.

    Returns:
        (tpr_lst, fpr_lst, thr_lst) — parallel lists, one entry per threshold.
    """
    max_prd = max(data_pred)
    min_prd = min(data_pred)
    # (removed: unused locals `step` and `result_lst` from the original)
    tpr_lst, fpr_lst, thr_lst = [], [], []
    for thr in my_range(min_prd, max_prd, n, tol):
        tpr, fpr = stat_tfpr(data_true, data_pred, thr, tol)
        tpr_lst.append(tpr)
        fpr_lst.append(fpr)
        thr_lst.append(thr)
    return tpr_lst, fpr_lst, thr_lst
def all_max_idx(lst):
    """Return the indices of every occurrence of the maximum value in lst."""
    peak = max(lst)
    return [pos for pos, val in enumerate(lst) if val == peak]
def find_optimal_cutoff_lst(TPR, FPR, threshold):
    """Return every optimal cutoff by the Youden index (TPR - FPR).

    np.argmax would report only the first maximum; instead, one
    [threshold, [fpr, tpr]] entry is returned for each index that attains
    the maximum Youden value (rounded to 6 decimals before comparison).
    """
    youden_vals = [round(v, 6) for v in np.array(TPR) - np.array(FPR)]
    return [[threshold[k], [FPR[k], TPR[k]]] for k in all_max_idx(youden_vals)]
def roc_thr_plot(fpr, tpr, thrs):
    """Plot the ROC curve and mark every optimal (max-Youden) threshold point."""
    auc_val = metrics.auc(fpr, tpr)
    plt.figure(figsize=(6, 6))
    plt.title('Validation ROC')
    plt.plot(fpr, tpr, label='Val AUC = %0.3f' % auc_val)
    plt.legend(loc='lower right')
    # diagonal reference line (random classifier)
    plt.plot([0, 1], [0, 1])
    optimal_entries = find_optimal_cutoff_lst(TPR=tpr, FPR=fpr, threshold=thrs)
    for k, (opt_thr, opt_pt) in enumerate(optimal_entries):
        plt.plot(opt_pt[0], opt_pt[1], marker='o', color='r')
        # stagger each annotation so overlapping labels stay readable
        plt.text(opt_pt[0]+0.02*(k+1), opt_pt[1]-0.08*(k+1),
                 'Threshold:{optimal_th:.2f} [{a:.4f}, {b:.4f}]'
                 ''.format(optimal_th=opt_thr, a=opt_pt[0], b=opt_pt[1]))
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# Get TPR/FPR lists and thresholds from the true labels (my_true_data) and
# the predictions (my_pred_data).  If predictions differ only beyond ~6
# decimal places, scale them up (here by 1000); n is the number of
# threshold steps the score range is split into.
tlst, flst, thr_lst = my_tpr_fpr(my_true_data, my_pred_data*1000, n=2000)
# Plot the ROC curve from the collected data:
roc_thr_plot(np.array(flst), np.array(tlst), thr_lst)
Ejemplo de dibujo: el punto rojo en la figura es el punto óptimo, y la esquina inferior correspondiente muestra el valor de umbral, la abscisa y la ordenada. Estos umbrales arrojaron el mismo índice de Youden óptimo.
Trazar ROC usando sklearn.metrics
metrics.roc_curve
Dibuje la curva ROC usando
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
import matplotlib.pyplot as plt
def my_roc_plt(model, testx, testy):
    """Draw a ROC curve for `model` on the test set using its decision scores."""
    scores = model.decision_function(testx)
    fpr, tpr, threshold = metrics.roc_curve(testy, scores)
    auc_val = metrics.auc(fpr, tpr)
    plt.figure(figsize=(3, 3))
    plt.plot(fpr, tpr, label='AUC=%0.2f' % auc_val)
    # chance diagonal for reference
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend()
    plt.show()
def evaluate_model_report(model, testx, testy):
    """Report accuracy/recall/AUC for `model` on the test set, plot its ROC
    curve, and return the confusion matrix as a labelled DataFrame.

    Args:
        model: fitted classifier with predict() and decision_function().
        testx: test feature matrix.
        testy: true binary labels for testx.

    Returns:
        pandas DataFrame of the 2x2 confusion matrix with named rows/columns.
    """
    predy = model.predict(testx)
    # (removed: leftover debug `print(predy)` and commented-out code)
    acc = metrics.accuracy_score(testy, predy)
    rec = metrics.recall_score(testy, predy)
    # NOTE(review): roc_auc_score is computed on hard 0/1 predictions here,
    # not on continuous scores — confirm this is intended.
    roc = metrics.roc_auc_score(testy, predy)
    print('accuracy\trecall\troc\n{:.4f}\t{:.4f}\t{:.4f}'
          ''.format(acc, rec, roc), end='')
    conf_mat = metrics.confusion_matrix(testy, predy)
    conf_df = pd.DataFrame(conf_mat, index=['True:0', 'True:1'], columns=['pred:0', 'pred:1'])
    my_roc_plt(model, testx, testy)
    return conf_df
# Split the dataset:
X = df.drop(columns=['label'])  # or: X = df.drop('label', axis=1)
y = df['label']
# Fixed: the file imports `train_test_split` directly (not `model_selection`),
# so the call must be unqualified.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# Fit a LinearSVC model
clf = LinearSVC(loss='hinge').fit(X_train, y_train)
score = clf.score(X_train, y_train)
# Plot ROC and report metrics (fixed: `clf_1` was undefined; the model is `clf`)
evaluate_model_report(clf, X_test, y_test)
Ejemplo de resultado de la trama:
Ejemplo de regresión logística
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from scipy.stats import zscore
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
def lst_idxs(lst, itm, nround=6):
    """Indices of elements equal to `itm` when both are rounded to `nround` decimals."""
    target = round(itm, nround)
    return [pos for pos, val in enumerate(lst) if round(val, nround) == target]
def choose_optidx(fpr, tpr):
    """Pick the index of the optimal ROC operating point.

    Among points with FPR < 0.2 (i.e. specificity > 80%), maximize the
    Youden index (TPR - FPR, rounded to 6 decimals); ties are broken by the
    smallest FPR (highest specificity).  If no point satisfies the FPR
    constraint, fall back to the global Youden maximum instead of crashing
    on an empty candidate list.

    Fixes an indexing bug in the original: the first max-Youden candidate
    was looked up with its position in the *filtered* candidate list rather
    than its position in the full fpr/tpr arrays, and that filtered position
    could be returned directly as the answer.

    Args:
        fpr: sequence of false-positive rates (e.g. from roc_curve).
        tpr: sequence of true-positive rates, parallel to fpr.

    Returns:
        int index into fpr/tpr of the chosen operating point.
    """
    cand = [i for i, f in enumerate(fpr) if float(f) < 0.2]
    if not cand:
        # no point meets the specificity constraint; use plain max Youden
        return int(np.argmax(np.asarray(tpr) - np.asarray(fpr)))
    yd = {i: round(float(tpr[i]) - float(fpr[i]), 6) for i in cand}
    max_yd = max(yd.values())
    best = [i for i in cand if yd[i] == max_yd]
    # among equally good Youden values prefer the smallest FPR
    return min(best, key=lambda i: fpr[i])
def lr_train_test_result(train_df, ntest_df, label, outdir='./',
                         penalty='l2', solver='liblinear', max_iter=100, C=100,
                         use_opt=False, scaler=False, usepca=False, use_tsne=False, n_components=3):
    """Fit a logistic regression on train_df and evaluate it on ntest_df.

    Computes AUC, accuracy, sensitivity and specificity on both sets,
    optionally choosing the decision threshold on the training ROC curve,
    writes per-sample prediction scores to text files under `outdir`, and
    plots both ROC curves via my_rocplots_opt (defined elsewhere in the file).

    Args:
        train_df, ntest_df: DataFrames of features plus a target column.
        label: name of the target column.
        outdir: output directory for the per-sample score files.
        penalty, solver, max_iter, C: LogisticRegression hyperparameters.
        use_opt: if True, pick the threshold via choose_optidx on the
            training ROC; otherwise use the default 0.5.
        scaler: if True, z-score the features.
        usepca / use_tsne: optional dimensionality reduction to n_components.
        n_components: output dimensionality for PCA / t-SNE.

    Returns:
        (train_scores, test_scores, threshold, coefficients, intercept),
        where each scores list is [auc, acc, sensitivity, specificity].
    """
    X = train_df.drop(label, axis=1)
    y = train_df[label]
    X_ntest = ntest_df.drop(label, axis=1)
    y_ntest = ntest_df[label]
    if scaler:
        X = X.apply(zscore)
        # Standardize the test data with the training statistics.
        # NOTE(review): X has already been z-scored at this point, so
        # X.mean()/X.std() are ~0/1, not the raw training statistics —
        # verify this is the intended behavior.
        X_ntest = (X_ntest - X.mean()) / X.std()
    if usepca:
        # Reduce the features to n_components principal components
        pca = PCA(n_components=n_components)
        columns = ['pca_'+str(i+1) for i in range(n_components)]
        X_pca = pca.fit_transform(X)
        X = pd.DataFrame(X_pca, columns=columns)
        # project the new samples with the PCA fitted on the training set
        X_ntest_pca = pca.transform(X_ntest)
        X_ntest = pd.DataFrame(X_ntest_pca, columns=columns)
    if use_tsne:
        # t-SNE dimensionality reduction
        # NOTE(review): TSNE is not imported in this snippet, and t-SNE has
        # no transform(); fit_transform on the test set re-embeds it
        # independently of the training embedding — confirm this is intended.
        tsne = TSNE(n_components=n_components, perplexity=10, random_state=42)
        X_tsne = tsne.fit_transform(X)
        columns = ['pca_'+str(i+1) for i in range(n_components)]
        X = pd.DataFrame(X_tsne, columns=columns)
        X_ntest_tsne = tsne.fit_transform(X_ntest)
        X_ntest = pd.DataFrame(X_ntest_tsne, columns=columns)
    best_params = {
        'penalty': penalty,
        'solver': solver,
        'max_iter': max_iter,
        'C': C,
    }
    # Copies that will carry the per-sample predicted probabilities
    traindata_predscore_df = train_df.copy(deep=True)
    ntestdata_predscore_df = ntest_df.copy(deep=True)
    # Train the classifier with the chosen parameters
    clf_a = LogisticRegression(**best_params)
    clf_a.fit(X, y)
    # coefficients (slopes)
    coefs = clf_a.coef_
    # intercept
    inter = clf_a.intercept_
    # -----------------
    # Training set: accuracy, confusion matrix, AUC, sensitivity, specificity
    # -----------------
    all_score_lst = []
    y_allproba = clf_a.predict_proba(X)[:,1]
    # AUC
    all_score_lst.append(roc_auc_score(y, y_allproba))
    # accuracy
    all_score_lst.append(clf_a.score(X, y))
    # Find the decision threshold on the training ROC curve:
    all_fpr, all_tpr, all_thresholds = roc_curve(y, y_allproba)
    if use_opt:
        all_optimal_idx = choose_optidx(all_fpr, all_tpr)
        all_opt_thr = all_thresholds[all_optimal_idx]
    else:
        all_opt_thr = 0.5
    y_allpred = np.where(y_allproba >= all_opt_thr, 1, 0)
    alltn, allfp, allfn, alltp = confusion_matrix(y, y_allpred).ravel()
    # sensitivity
    all_sen = alltp / (alltp + allfn)
    all_score_lst.append(all_sen)
    # specificity
    all_spe = alltn / (alltn + allfp)
    all_score_lst.append(all_spe)
    traindata_predscore_df['Merge:pred'] = y_allproba
    # -----------------
    # Independent test set
    # -----------------
    an_score_lst = []
    y_anproba = clf_a.predict_proba(X_ntest)[:,1]
    # AUC
    an_score_lst.append(roc_auc_score(y_ntest, y_anproba))
    # accuracy
    an_score_lst.append(clf_a.score(X_ntest, y_ntest))
    # classify with the threshold chosen on the training set
    y_anpred = np.where(y_anproba >= all_opt_thr, 1, 0)
    antn, anfp, anfn, antp = confusion_matrix(y_ntest, y_anpred).ravel()
    # sensitivity
    an_sen = antp / (antp + anfn)
    an_score_lst.append(an_sen)
    # specificity
    an_spe = antn / (antn + anfp)
    an_score_lst.append(an_spe)
    ntestdata_predscore_df['ntest:pred'] = y_anproba
    # Write the per-sample score files
    traindata_predscore_df.to_csv(outdir+'/train_data.predscore.txt', sep='\t')
    ntestdata_predscore_df.to_csv(outdir+'/ntest_data.predscore.txt', sep='\t')
    # ROC curves for train and test at the chosen threshold
    train_label = 'train_thr:{thr:.6f}\nspe.={spe:.4f},sen.={sen:.4f}'.format(thr=all_opt_thr, spe=all_spe, sen=all_sen)
    ntest_label = 'ntest_thr:{thr:.6f}\nspe.={spe:.4f},sen.={sen:.4f}'.format(thr=all_opt_thr, spe=an_spe, sen=an_sen)
    my_rocplots_opt({
        train_label: (y, y_allproba, 1-all_spe, all_sen),
        ntest_label: (y_ntest, y_anproba, 1-an_spe, an_sen)})
    return all_score_lst, an_score_lst, all_opt_thr, coefs, inter
# Input: df_train_x and df_ntest_x are DataFrames whose first column is the
# sample name, middle columns are the features, and the last column is the
# target `label` column.
lr_train_test_result(df_train_x, df_ntest_x, label, outdir=outdir,
                     penalty='l2', solver='liblinear', max_iter=999, C=999,  # alternative solvers: newton-cg, liblinear
                     use_opt=True, scaler=False, usepca=False, use_tsne=False, n_components=3)
ROC de validación cruzada k-fold
Dibuje la curva ROC y la ROC media para cada pliegue
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
def perform_kfold_cv(X, y, n_splits=5):
    """Run stratified k-fold CV with logistic regression and plot ROC curves.

    For each fold: plots the fold's ROC curve, marks the operating point at
    the maximum Youden index (TPR - FPR), and records the corresponding
    threshold.  Finally plots the mean ROC over a common FPR grid and prints
    the per-fold optimal thresholds.

    Args:
        X: feature DataFrame (rows selected positionally via .iloc).
        y: target sequence indexable by integer position.
        n_splits: number of stratified folds.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    optimal_thresholds = []
    fig, ax = plt.subplots()
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = LogisticRegression()
        model.fit(X_train, y_train)
        fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
        roc_auc = auc(fpr, tpr)
        # interpolate this fold's curve onto the common FPR grid for averaging
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        aucs.append(roc_auc)
        # threshold at the maximum Youden index (TPR - FPR)
        optimal_threshold_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_threshold_idx]
        optimal_thresholds.append(optimal_threshold)
        ax.plot(fpr, tpr, lw=1, alpha=0.3)
        # mark the optimal point on this fold's curve
        optimal_fpr = fpr[optimal_threshold_idx]
        optimal_tpr = tpr[optimal_threshold_idx]
        ax.scatter(optimal_fpr, optimal_tpr, color='red', marker='o')
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    # Fixed: the f-string was split across a line break inside the braces,
    # which is a syntax error for a single-quoted string literal.
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=f'Mean ROC (AUC = {mean_auc:.2f})', lw=2)
    ax.plot([0, 1], [0, 1], color='r', linestyle='--', lw=2, label='Random Guess')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic')
    ax.legend(loc="lower right")
    plt.show()
    print("Optimal Thresholds:", optimal_thresholds)
# # 示例使用
# X = ... # 特征矩阵
# y = ... # 目标向量
# perform_kfold_cv(X, y, n_splits=5)
Ejemplo de uso y trazado del resultado:
# df = ...  # a DataFrame; `target` is the name of the target-value column
X = df.drop(columns=[target], axis=1)
y = df[target]
perform_kfold_cv(X, y, n_splits=5)  # 5-fold cross-validation
Criterios de evaluación del modelo para problemas de clasificación binaria
matriz de confusión:
real\predicción | predicción positiva (1) | predicción negativa (0) | total |
---|---|---|---|
positivo real (1) | TP | FN | TP + FN |
negativo real (0) | FP | TN | FP + TN |
Métricas de evaluación del modelo:
-
Valor AUC: el área bajo la curva ROC.
-
TPR (Tasa de verdaderos positivos): $TPR = TP / (TP + FN)$, la proporción de todas las muestras positivas verdaderas que se predicen como positivas. Es decir, la sensibilidad (Sensitivity, Sen.) o exhaustividad (recall).
-
FPR (Tasa de falsos positivos): $FPR = FP / (FP + TN)$, la proporción de todas las muestras negativas verdaderas que se predicen (erróneamente) como positivas.
-
Especificidad (Specificity): $Spe. = TN / (TN + FP) = 1 - FPR$, la proporción de todas las muestras negativas verdaderas que se predicen correctamente como negativas.
-
Precisión (Precision): $precisión = TP / (TP + FP)$, la proporción de verdaderos positivos entre todas las muestras predichas como positivas.
-
Puntuación F1: $F1 = 2 \cdot TP / (2 \cdot TP + FP + FN)$, la media armónica de precisión y sensibilidad; es una métrica eficaz para clases desequilibradas.
-
Índice de Youden: $Youden = Sen. + Spe. - 1 = TPR - FPR$, un índice que combina la sensibilidad y la especificidad.
-
Valor KS: $KS = \max(TPR - FPR)$.
curva ROC, curva KS
Referencia:
https://blog.csdn.net/qq_42433311/article/details/124124893
https://blog.csdn.net/weixin_43543177/article/details/107565947