（五）sklearn交叉验证：评估估算器的性能

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

# Load the bundled iris dataset, then echo the shapes of its feature matrix
# and its target vector (a notebook displays the value of the last expression).
iris = datasets.load_iris()
iris.data.shape, iris.target.shape

((150, 4), (150,))

1、划分40%的数据作为测试数据（其余60%作为训练数据）

# Hold out 40% of the samples as the test split (test_size=0.4); the fixed
# random_state keeps the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)

# Echo the shapes of the training features and the training labels.
(X_train.shape, y_train.shape)

((90, 4), (90,))

# Echo the shapes of the held-out test features and test labels.
X_test.shape, y_test.shape

((60, 4), (60,))

2、交叉验证介绍
如果只划分训练集和测试集，在测试集上仍然存在过拟合的风险，因为可以反复调整参数，直到估算器在测试集上表现最佳。
这样，关于测试集的知识就会“泄漏”到模型中，评估指标也就不再能反映泛化性
能。为了解决这个问题，可以再留出数据集的一部分作为所谓的“验证集”：
先在训练集上进行训练，然后在验证集上进行评估；当实验看起来成功时，
再在测试集上完成最终评估。但是，把可用数据划分为三份，会大
大减少可用于学习模型的样本数量。这个问题的解决方案是一个称为交叉
验证（cross-validation，简称CV）的过程。仍应保留测试集以进行最终评估，但在进行CV时
不再需要单独的验证集。

from sklearn.model_selection import cross_val_score

# Score a linear-kernel SVM on the full iris data with 5-fold
# cross-validation; the echoed array holds one score per fold.
clf = svm.SVC(C=1, kernel='linear')
scores = cross_val_score(
    clf,
    iris.data,
    iris.target,
    cv=5,
)
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

# scores.std() is the standard deviation (not the variance) of the fold scores;
# the report shows the mean score with a band of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.98 (+/- 0.03)

# The samples are balanced across the target classes, so accuracy and the
# F1 score come out almost equal.
from sklearn import metrics
# scoring='f1_macro': F1 is the harmonic mean of precision and recall; 'macro'
# computes F1 for each class and then averages them with equal class weights.
scores = cross_val_score(
    clf, iris.data, iris.target, cv=5, scoring='f1_macro')
scores

array([0.96658312, 1.        , 0.96658312, 0.96658312, 1.        ])

# Use a different cross-validation strategy by passing a cross-validation
# iterator as `cv`: here 3 random splits, each holding out 30% for testing.
from sklearn.model_selection import ShuffleSplit
n_samples = iris.data.shape[0]  # number of samples; not used again in this cell
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
cross_val_score(clf, iris.data, iris.target, cv=cv)

array([0.97777778, 0.97777778, 1.        ])

# Add a data preprocessing step to the train/test evaluation.
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0, test_size=0.4)

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the test features.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Train on the scaled training data and echo the score on the scaled test data.
clf = svm.SVC(C=1)
clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)

0.9333333333333333

# Same idea through a pipeline: the scaler and the classifier are chained into
# a single estimator, which is scored with the `cv` splitter already in scope.
from sklearn.pipeline import make_pipeline

scaling_step = preprocessing.StandardScaler()
svc_step = svm.SVC(C=1)
clf = make_pipeline(scaling_step, svc_step)
cross_val_score(clf, iris.data, iris.target, cv=cv)

array([0.97777778, 0.93333333, 0.95555556])

3、cross_validate函数和多个度量评估

from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

# Evaluate two macro-averaged metrics in one 5-fold cross-validation run,
# without recording scores on the training folds.
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(
    clf,
    iris.data,
    iris.target,
    cv=5,
    scoring=scoring,
    return_train_score=False,
)

# The result is dict-like; echo its keys in sorted order.
sorted(scores)

['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']

# Echo the macro-averaged recall values stored under the test-score key.
scores['test_recall_macro']

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

# Obtain predictions through 10-fold cross-validation, then echo the accuracy
# of those predictions against the true iris labels.
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(clf, iris.data, iris.target, cv=10)
metrics.accuracy_score(iris.target, predicted)

0.9733333333333334

4、交叉验证迭代器

# (1) K-fold cross-validation
import numpy as np
from sklearn.model_selection import KFold

X = list("abcd")
kf = KFold(n_splits=2)
# Each split yields index arrays (positions of the samples), not the samples.
for train, test in kf.split(X):
    fold_line = "%s %s" % (train, test)
    print(fold_line)

[2 3] [0 1]
[0 1] [2 3]

# (2) Repeated K-fold cross-validation
import numpy as np
from sklearn.model_selection import RepeatedKFold

# Four 2-feature samples: the pair of rows [1, 2], [3, 4] repeated twice.
X = np.array([[1, 2], [3, 4]] * 2)
random_state = 12883823
rkf = RepeatedKFold(
    n_splits=2,
    n_repeats=2,
    random_state=random_state,
)
# 2 folds x 2 repeats: the loop prints four train/test index pairs.
for train, test in rkf.split(X):
    fold_line = "%s %s" % (train, test)
    print(fold_line)

[2 3] [0 1]
[0 1] [2 3]
[0 2] [1 3]
[1 3] [0 2]

# (3) Leave-one-out

from sklearn.model_selection import LeaveOneOut

X = list(range(1, 5))
loo = LeaveOneOut()
# Every split holds out exactly one sample index as the test set.
for train, test in loo.split(X):
    fold_line = "%s %s" % (train, test)
    print(fold_line)

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]

# (4) Leave-P-Out (LPO): the test sets of different splits overlap

from sklearn.model_selection import LeavePOut

X = np.ones(4)
lpo = LeavePOut(2)
# With p=2 each split holds out a pair of sample indices as the test set.
for train, test in lpo.split(X):
    fold_line = "%s %s" % (train, test)
    print(fold_line)

[2 3] [0 1]
[1 3] [0 2]
[1 2] [0 3]
[0 3] [1 2]
[0 2] [1 3]
[0 1] [2 3]

# Random-permutation cross-validation, also known as shuffle-and-split.
# ShuffleSplit is a good alternative to KFold cross-validation: it gives finer
# control over the number of splits and the size of the test set.
from sklearn.model_selection import ShuffleSplit

X = np.arange(5)
ss = ShuffleSplit(n_splits=3, random_state=0, test_size=0.25)
for train_index, test_index in ss.split(X):
    split_line = "%s %s" % (train_index, test_index)
    print(split_line)

[1 3 4] [2 0]
[1 4 3] [0 2]
[4 0 2] [1 3]

# Stratification based on class labels.
# Some classification problems show a large imbalance in the distribution of
# the target classes: for example, there may be several times more negative
# samples than positive ones. Stratified sampling is recommended in that case.
from sklearn.model_selection import StratifiedKFold

X = np.ones(10)
# Four samples of class 0 followed by six samples of class 1.
y = [0] * 4 + [1] * 6
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    fold_line = "%s %s" % (train, test)
    print(fold_line)

[2 3 6 7 8 9] [0 1 4 5]
[0 1 3 4 5 8 9] [2 6 7]
[0 1 2 4 5 6 7] [3 8 9]

（五）sklearn交叉验证：评估估算器的性能

猜你喜欢