記事ディレクトリ
関連する API
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
交差検証モデル
- ホールドアウト検証
- k分割交差検証(k-fold cross-validation、KFCV)
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# Load the digits dataset and pull out its feature matrix and target vector.
digits = datasets.load_digits()
digits_features, digits_target = digits.data, digits.target
# Notebook-style inspection: the container type and the keys it exposes.
type(digits), digits.keys()
(sklearn.utils.Bunch,
dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR']))
# Load the iris dataset and pull out its feature matrix and target vector.
iris = datasets.load_iris()
iris_features, iris_target = iris.data, iris.target

# Preprocessing step: standardize the features.
standardizer = StandardScaler()
# Estimator step: logistic regression.
logit = LogisticRegression()
# Chain standardization and logistic regression into a single pipeline.
pipeline = make_pipeline(standardizer, logit)
# K-fold splitter: 10 shuffled folds with a fixed seed.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
# Run k-fold cross-validation of the pipeline on the digits data.
cv_results = cross_val_score(
    pipeline,
    digits_features,
    digits_target,
    cv=kf,
    scoring="accuracy",
    n_jobs=-1,
)
cv_results.mean()
# 0.9693916821849783
from sklearn.model_selection import train_test_split

# Hold out 10% of the digits data as a test set.
features_train, features_test, target_train, target_test = train_test_split(
    digits_features,
    digits_target,
    test_size=0.1,
    random_state=1,
)
features_train.shape, features_test.shape, target_train.shape, target_test.shape
# ((1617, 64), (180, 64), (1617,), (180,))

# Fit the scaler on the training split only.
standardizer = StandardScaler()
standardizer.fit(features_train)  # StandardScaler()
# Apply the fitted standardization to both the training and the test split.
features_train_std = standardizer.transform(features_train)
features_test_std = standardizer.transform(features_test)

# Rebuild the pipeline around the new scaler and display it.
pipeline = make_pipeline(standardizer, logit)
pipeline
Pipeline(steps=[('standardscaler', StandardScaler()),
('logisticregression', LogisticRegression())])
# K-fold cross-validation of the pipeline on the full digits data.
cv_results = cross_val_score(
    pipeline,
    digits_features,
    digits_target,
    cv=kf,
    scoring="accuracy",
    n_jobs=-1,
)
cv_results.mean()
# 0.9693916821849783
ベースライン回帰モデルを作成する
# `datasets.load_boston()` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so the data is loaded from the original source instead, following the
# recipe given in scikit-learn's deprecation notice.
# NOTE: unlike load_boston(), this needs network access.
import numpy as np
import pandas as pd
from sklearn.utils import Bunch

boston_url = "http://lib.stat.cmu.edu/datasets/boston"
boston_raw = pd.read_csv(boston_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two physical lines of the raw file, so even rows are
# stacked next to the first two columns of odd rows to form the features;
# the third column of the odd rows is the target.
boston_features = np.hstack(
    [boston_raw.values[::2, :], boston_raw.values[1::2, :2]]
)
boston_target = boston_raw.values[1::2, 2]
# Keep a `boston` object exposing .data / .target like the old loader did.
boston = Bunch(data=boston_features, target=boston_target)

# Hold out 10% of the samples as a test set.
features_train, features_test, target_train, target_test = train_test_split(
    boston_features,
    boston_target,
    test_size=0.1,
    random_state=1,
)
features_train.shape, features_test.shape, target_train.shape, target_test.shape
# ((455, 13), (51, 13), (455,), (51,))
from sklearn.dummy import DummyRegressor

# Baseline regressor using the "mean" strategy.
dummy = DummyRegressor(strategy="mean")
# Train the baseline regression model.
dummy.fit(features_train, target_train)
# DummyRegressor()
dummy.score(features_test, target_test)
# -0.00044055572703283197

# Train our own model for comparison.
from sklearn.linear_model import LinearRegression

# Fit a simple linear regression model.
ols = LinearRegression()
ols.fit(features_train, target_train)
# Compute the R-squared score on the test split.
ols.score(features_test, target_test)
# 0.7786386580298952

# Create a DummyRegressor that predicts 20 for every sample.
clf = DummyRegressor(strategy='constant', constant=20)
clf.fit(features_train, target_train)
# Compute the model's score.
clf.score(features_test, target_test)
# -0.05955373175370782
ベースライン分類モデルを作成する
方法:
- uniform:各クラスを一様な確率でランダムに予測する
- stratified:予測されるクラスの割合がトレーニングセットのクラスの割合と同じになるように予測する
from sklearn.dummy import DummyClassifier

# Hold out 10% of the iris data as a test set.
features_train, features_test, target_train, target_test = train_test_split(
    iris_features,
    iris_target,
    test_size=0.1,
    random_state=0,
)

# Baseline classifier using the "uniform" strategy with a fixed seed.
dummy = DummyClassifier(strategy='uniform', random_state=1)
dummy.fit(features_train, target_train)
# DummyClassifier(random_state=1, strategy='uniform')
dummy.score(features_test, target_test)
# 0.3333333333333333

from sklearn.ensemble import RandomForestClassifier

# A real classifier to compare against the baseline score.
clf = RandomForestClassifier()
clf.fit(features_train, target_train)
# RandomForestClassifier()
clf.score(features_test, target_test)
# 0.9333333333333333
バイナリ分類器を評価する
交差検証には cross_val_score 関数を使用し、その scoring パラメータで性能評価指標を指定します。
精度、適合率、再現率、F1 スコアなど、さまざまな指標から選択できます。
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

# Synthetic two-class problem: 10,000 samples, 3 informative features.
X, y = make_classification(
    n_samples=10000,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    random_state=1,
)
logit = LogisticRegression()
# Cross-validate with accuracy as the scoring metric.
cross_val_score(logit, X, y, scoring='accuracy')
# array([0.9555, 0.95 , 0.9585, 0.9555, 0.956 ])
二値分類器のしきい値を評価する
from sklearn.metrics import roc_curve, roc_auc_score

# Synthetic two-class problem: 10,000 samples, 10 features (3 informative).
features, target = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=3,
    n_classes=2,
    random_state=3,
)
# Hold out 10% of the samples as a test set.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=1,
)
logit = LogisticRegression()
logit.fit(features_train, target_train)  # LogisticRegression()
# Get the predicted class probabilities for the test split.
target_probabilities = logit.predict_proba(features_test)
target_probabilities
array([[0.86891533, 0.13108467],
[0.46315541, 0.53684459],
[0.03395355, 0.96604645],
...,
[0.46720208, 0.53279792],
[0.67447765, 0.32552235],
[0.16098342, 0.83901658]])
# Keep only the second probability column (index 1).
target_probabilities = target_probabilities[:, 1]
# Compute the false positive rates, true positive rates and thresholds.
fp, tp, threshold = roc_curve(target_test, target_probabilities)

import matplotlib.pyplot as plt

plt.title('ROC')
plt.plot(fp, tp)
# Dashed diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], ls='--')
# Light-grey guide lines along the left edge and the top edge.
plt.plot([0, 0], [1, 0], c='.7')
plt.plot([1, 1], c='.7')
plt.ylabel('TP Rate')
plt.xlabel('FP Rate')
plt.show()

# Inspect the predicted probabilities of the first test sample.
logit.predict_proba(features_test)[0:1]
# array([[0.86891533, 0.13108467]])
# Inspect the class labels known to the model.
logit.classes_
# array([0, 1])
多クラス分類器の評価
# NOTE(review): n_classes=2 makes this a binary problem; set n_classes=3 to
# exercise a genuinely multi-class case. The recorded outputs below belong to
# n_classes=2 and would need to be regenerated.
features, target = make_classification(
    n_samples=10000,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    random_state=1,
)
logit = LogisticRegression()
# Cross-validate with accuracy.
cross_val_score(logit, features, target, scoring='accuracy')
# array([0.9555, 0.95 , 0.9585, 0.9555, 0.956 ])
# Cross-validate with the macro-averaged F1 score.
cross_val_score(logit, features, target, scoring='f1_macro')
# array([0.9554991 , 0.9499998 , 0.95849874, 0.95549812, 0.9559989 ])
分類器の性能の可視化
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

# Class names used to label the confusion matrix.
class_names = iris.target_names
# Hold out 10% of the iris data as a test set.
features_train, features_test, target_train, target_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.1,
    random_state=1,
)
clf = LogisticRegression()
clf.fit(features_train, target_train)
# LogisticRegression()
# Predict the classes of the held-out samples.
target_pred = clf.predict(features_test)
target_pred
# array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1])
# Build the confusion matrix from true and predicted labels.
matrix = confusion_matrix(target_test, target_pred)
matrix
array([[5, 0, 0],
[0, 6, 0],
[0, 0, 4]])
# Label both axes of the confusion matrix with the class names.
df = pd.DataFrame(matrix, index=class_names, columns=class_names)
# Annotated heatmap without a colour bar (cbar is documented as a boolean).
sns.heatmap(df, annot=True, cbar=False, cmap='Blues')
# Title typo fixed: "Confuse Matrix" -> "Confusion Matrix".
plt.title('Confusion Matrix')
plt.tight_layout()
plt.ylabel('True Class ')
plt.xlabel('Predicted Class ')
plt.show()
回帰モデルを評価する
平均二乗誤差(MSE)を使用する
MSE: 平均二乗誤差
from sklearn.datasets import make_regression

# Synthetic regression problem: 100 samples, 3 informative features, noisy target.
features, target = make_regression(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_targets=1,
    noise=50,
    coef=False,
    random_state=1,
)
ols = LinearRegression()
# Cross-validate with negated mean squared error as the score.
cross_val_score(ols, features, target, scoring='neg_mean_squared_error')
# array([-1974.65337976, -2004.54137625, -3935.19355723, -1060.04361386, -1598.74104702])
# Cross-validate with R-squared as the score.
cross_val_score(ols, features, target, scoring='r2')
# array([0.8622399 , 0.85838075, 0.74723548, 0.91354743, 0.84469331])
クラスタリング モデルを評価する
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Synthetic blobs: 100 samples, 10 features, 2 centers; labels are discarded.
features, _ = make_blobs(
    n_samples=100,
    n_features=10,
    centers=2,
    cluster_std=0.5,
    shuffle=True,
    random_state=1,
)
# Fit k-means with 2 clusters and a fixed seed.
model = KMeans(n_clusters=2, random_state=1).fit(features)
# KMeans(n_clusters=2, random_state=1)
# Get the cluster label assigned to each sample.
target_pred = model.labels_
target_pred
array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0], dtype=int32)
# Evaluate the clustering with the silhouette score.
silhouette_score(X=features, labels=target_pred)
# 0.8934850469838235
カスタム評価指標を作成する
from sklearn.metrics import make_scorer, r2_score
from sklearn.linear_model import Ridge

# Synthetic regression problem: 100 samples, 3 features.
features, target = make_regression(
    n_samples=100,
    n_features=3,
    random_state=1,
)
# Hold out 10% of the samples as a test set.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=1,
)
def custom_metric(target_test, target_predicted):
    """Return the R-squared score of the predictions.

    Args:
        target_test: True target values.
        target_predicted: Predicted target values.

    Returns:
        The value of ``r2_score(target_test, target_predicted)``.
    """
    return r2_score(target_test, target_predicted)
# Wrap the custom metric as a scorer; higher values mean a better model.
score = make_scorer(custom_metric, greater_is_better=True)
score
# make_scorer(custom_metric)

# Fit a ridge regression on the training split.
clf = Ridge()
model = clf.fit(features_train, target_train)
# Ridge()
# Predict the held-out targets; the recorded predictions follow as a string.
target_pred = model.predict(features_test)
'''
array([ 188.32324198, 4.23162169, 160.13772594, -40.40610041,
-143.18667984, 119.09559067, -21.36984852, -108.72175738,
-44.12601854, -43.98261236])
'''
# R-squared of the predictions against the true test targets.
r2_score(target_test, target_pred)
# 0.9997906102882058
2023-03-25