import numpy as np
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
class stratified_cross_val_score(BaseEstimator, TransformerMixin):
    """Run stratified k-fold cross-validation and keep the per-fold scores.

    The folds are generated from the ``X``/``y`` passed to :meth:`fit`
    (``y`` is the stratification target), while the model is trained and
    evaluated on the ``data``/``labels`` given to the constructor, selected
    by row position.  Each fold is scored with the weighted F1 score.
    """

    def __init__(self, model, data, labels, random_state=0, cv=5, shuffle=False):
        """
        :param model: estimator to evaluate; a fresh clone is fitted per fold
        :param data: full feature set (no label column); rows must be in the
            same order as the ``X`` later passed to :meth:`fit`
        :param labels: full label set aligned row-by-row with ``data``
        :param random_state: seed for fold shuffling; only used when
            ``shuffle`` is True
        :param cv: number of folds
        :param shuffle: whether to shuffle samples before splitting; the
            default (False) keeps the deterministic, unshuffled folds
        """
        self.model = model
        self.data = data
        self.labels = labels
        self.random_state = random_state
        self.cv = cv
        self.shuffle = shuffle
        self.score = []  # per-fold weighted F1 scores of the latest fit
        self.i = 0  # number of folds evaluated by the latest fit

    @staticmethod
    def _take_rows(container, positions):
        """Select rows by integer position from a pandas object or an array."""
        indexer = getattr(container, "iloc", None)
        return container[positions] if indexer is None else indexer[positions]

    def fit(self, X, y):
        """Evaluate the model on every stratified fold.

        :param X: dataset used only to generate the fold indices
            (originally documented as the full dataset including the
            clustering result)
        :param y: stratification target used to balance the folds
            (originally documented as the clustering result)
        :return: self, with ``score`` holding one weighted F1 per fold
        """
        # scikit-learn rejects a seed when shuffle is False, so only forward
        # random_state when it can actually have an effect.
        skfolds = StratifiedKFold(
            n_splits=self.cv,
            shuffle=self.shuffle,
            random_state=self.random_state if self.shuffle else None,
        )

        fold_scores = []
        for train_index, test_index in skfolds.split(X, y):
            clone_model = clone(self.model)

            # split() yields positional indices, so select rows by position
            # rather than by index label.
            X_train = self._take_rows(self.data, train_index)
            y_train = self._take_rows(self.labels, train_index)
            X_test = self._take_rows(self.data, test_index)
            y_test = self._take_rows(self.labels, test_index)

            clone_model.fit(X_train, y_train)
            y_pred = clone_model.predict(X_test)
            fold_scores.append(f1_score(y_test, y_pred, average="weighted"))

        # Replace (not extend) previous results so that a repeated fit
        # reflects the latest run only.
        self.score = fold_scores
        self.i = len(fold_scores)
        return self

    def transform(self, X, y=None):
        """Return the evaluator itself; no data transformation is performed."""
        return self

    def mean(self):
        """Return the mean of the per-fold cross-validation scores."""
        return np.array(self.score).mean()

    def std(self):
        """Return the (population) standard deviation of the per-fold scores."""
        return np.array(self.score).std()
from sklearn.linear_model import SGDClassifier

# tol=None disables the loss-based stopping criterion, so training is bounded
# by max_iter only.  It replaces the former ``tol=-np.infty`` workaround:
# ``np.infty`` no longer exists in NumPy 2.x and a negative tol is rejected
# by current scikit-learn.
clf_model = SGDClassifier(max_iter=5, tol=None, random_state=42)
clf_cross_val = stratified_cross_val_score(clf_model, data, labels, cv=5, random_state=42)

# Folds are stratified on the "km_clustering_label" column of data2
# (presumably the k-means cluster assignment -- confirm against where data2
# is built), while the model itself is trained on data/labels.
clf_cross_val.fit(data2, data2["km_clustering_label"])

clf_cross_val.score
# Recorded output of the original run:
# [0.751211138417513,
#  0.6227780418250951,
#  0.12935004693798663,
#  0.536341797966456,
#  0.09408178282350468]

clf_cross_val.mean()
# Recorded output of the original run: 0.42675256159411107

clf_cross_val.std()
# Recorded output of the original run: 0.26639341601261735