Article directory
Data reading and deep copy
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# Read the practice dataset and keep a pristine copy so later sections can
# reload a fresh DataFrame without re-reading the CSV.
train = pd.read_csv("./练习数据/ch03_practice_1.csv")
train
train_saved = train.copy()


def load_data():
    """Return a fresh copy of the originally loaded training DataFrame."""
    train = train_saved.copy()
    return train


train = load_data()
characteristic change
normalized(-1,1)
# 1.1 Feature-transformation practice — standardization (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform is equivalent to fit() followed by transform().
train_x = scaler.fit_transform(train)
train_x
Normalize(0,1)
# Feature-transformation practice — min-max normalization to [0, 1]
train = load_data()
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Bug fix: the original assigned scaler.fit()'s return value (the scaler
# itself) to train_x before immediately overwriting it; fit_transform is
# the intended single step.
train_x = scaler.fit_transform(train)
train_x
nonlinear change
logarithmic transformation
# 1.2 Feature-transformation practice — nonlinear transforms
x = np.array([1.0, 10.0, 100.0, 1000.0, 10000.0])
# Plain logarithm.
x1 = np.log(x)
# log(1 + x): numerically safe near zero.
x2 = np.log1p(x)
# Logarithm of the absolute value, with the original sign restored.
x3 = np.sign(x) * np.log(np.abs(x))
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize the raw values and each log transform side by side.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is the modern equivalent.
plt.subplot(1, 4, 1)
sns.histplot(x, kde=True)
plt.subplot(1, 4, 2)
sns.histplot(x1, kde=True)
plt.subplot(1, 4, 3)
sns.histplot(x2, kde=True)
plt.subplot(1, 4, 4)
sns.histplot(x3, kde=True)
plt.show()
box-cox change
# Box-Cox transformation (defined only for strictly positive data)
train = load_data()
# Keep only the columns whose values are all positive.
cols = [c for c in train.columns.tolist() if (train[c] > 0.0).all()]
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method="box-cox")
pt.fit(train[cols])
train_x = pt.transform(train[cols])
# sns.distplot was removed in seaborn 0.14; histplot is the replacement.
plt.subplot(1, 4, 1)
sns.histplot(train[cols], kde=True)
plt.subplot(1, 4, 2)
sns.histplot(train_x, kde=True)
plt.show()
yeo-johnson transform
# Yeo-Johnson transformation (also handles zero and negative values)
train = load_data()
pt = PowerTransformer(method="yeo-johnson")
# `cols` comes from the preceding Box-Cox section.
pt.fit(train[cols])
train_x = pt.transform(train[cols])
# sns.distplot was removed in seaborn 0.14; histplot is the replacement.
plt.subplot(1, 4, 1)
sns.histplot(train[cols], kde=True)
plt.subplot(1, 4, 2)
sns.histplot(train_x, kde=True)
plt.show()
feature encoding
FeatureHasher
# 1.3 Feature encoding — FeatureHasher practice
train = load_data()
from sklearn.feature_extraction import FeatureHasher

col = ["product", "year", "month", "day"]
for c in col:
    # Hash each categorical column into 4 numeric features.
    fh = FeatureHasher(n_features=4, input_type="string")
    hash_train = fh.transform(train[[c]].astype(str).values)
    hash_train = pd.DataFrame(
        hash_train.todense(), columns=[f"{c}_{i}" for i in range(4)]
    )
    train = pd.concat([train, hash_train], axis=1)
train
frequency encoding
# Feature encoding — frequency-encoding practice: replace each category with
# its occurrence count.
train = load_data()
for c in col:
    freq = train[c].value_counts()
    train[c] = train[c].map(freq)
train
target encoding
# Feature encoding — target-encoding practice
train = pd.read_csv("./练习数据/ch03_practice_2.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold

# Out-of-fold target encoding to avoid leaking each row's own target.
# `col` comes from the FeatureHasher section above.
for c in col:
    data_tmp = pd.DataFrame({c: train_x[c], "target": train_y})
    target_mean = data_tmp.groupby(c)["target"].mean()
    # np.repeat builds an ndarray of the requested size filled with NaN.
    tmp = np.repeat(np.nan, train_x.shape[0])
    kf = KFold(n_splits=4, shuffle=True, random_state=0)
    for idx_1, idx_2 in kf.split(train_x):
        # Means computed on the other folds only.
        target_mean = data_tmp.iloc[idx_1].groupby(c)["target"].mean()
        tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean)
    train_x[c] = tmp
train_x
wide form, long form
# 1.4 Wide and long table formats
df_time = pd.read_csv("./练习数据/ch03_practice_3.csv", index_col=0)
df_time.index = pd.to_datetime(df_time.index)
df_time
df_wide = df_time
# stack() moves the column labels into the innermost row index -> long format.
df_long = df_wide.stack().reset_index(1)
df_long.columns = ["id", "value"]
df_long
# pivot() reshapes back to wide format, one column per distinct id value.
df_wide = df_long.pivot(index=None, columns="id", values="value")
df_wide
feature structure
Construct features from missing values
# 2.2 Features built from missing values: a per-row NaN count and a binary
# indicator for a missing "year".
train["nan_count"] = train.isnull().sum(axis=1)
train["year_nan"] = train["year"].isnull().astype(int)
train
Sliding window feature construction
# 2.4 Sliding-window feature construction
df_time = pd.read_csv("./练习数据/ch03_practice_3.csv", index_col=0)
df_time.index = pd.to_datetime(df_time.index)
df_time_A = df_time[["A"]]
df_time_A_saved = df_time_A.copy()


def load_data():
    """Return a fresh copy of the saved single-column time series."""
    df_time_A = df_time_A_saved.copy()
    return df_time_A


x = load_data()
# Lag features: values 1 and 7 steps in the past.
x_lag1 = x.shift(1)
x_lag7 = x.shift(7)
x["lag1"] = x_lag1
x["lag7"] = x_lag7
x
x = load_data()
# Bug fix: the original then re-assigned `x = df_time_A`, which discarded the
# fresh copy and mutated the shared df_time_A slice in place.
# Rolling aggregates; shift(1) keeps every window strictly in the past.
x_avg3 = x.shift(1).rolling(window=3).mean()
x_max7 = x.shift(1).rolling(window=7).max()
# Mean of the values 7, 14 and 21 steps back.
x_e7_avg = (x.shift(7) + x.shift(14) + x.shift(21)) / 3.0
x["avg3"] = x_avg3
x["max7"] = x_max7
x["e7_avg"] = x_e7_avg
x
Unsupervised Feature Construction
# 2.5 Unsupervised feature construction
# pip install umap-learn
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform is equivalent to fit() followed by transform().
df = scaler.fit_transform(df_time)
UMAP algorithm
The UMAP algorithm takes a dataset with more than two features and produces a low-dimensional embedding for exploring it. In the resulting UMAP scatter plot, similar samples tend to cluster together. Its purpose is to display, in a low-dimensional view, the clusters that exist in the high-dimensional space and the relationships between sample points.
import umap

um = umap.UMAP()
um.fit(df)
# Bug fix: the original passed a nested list ([["um_1", "um_2"]]) for
# `columns`, which creates a MultiIndex; a flat list gives plain column names.
df_um = pd.DataFrame(
    um.transform(df), columns=["um_1", "um_2"], index=df_time.index
)
df_um
Kmeans clustering
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
kmeans.fit(df_time)
df_clusters = kmeans.predict(df_time)
# transform() gives each sample's distance to every cluster centre.
df_distances = kmeans.transform(df_time)
df_distances = pd.DataFrame(
    df_distances,
    # Derive the column names from n_clusters instead of hard-coding three,
    # so changing n_clusters cannot desynchronize the labels.
    columns=[f"distance_{i + 1}" for i in range(kmeans.n_clusters)],
    index=df_time.index,
)
df = pd.concat([df_time, df_distances], axis=1)
df
feature importance
# 2.7 Feature-importance output
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)
clf.fit(train_x, train_y)
# feature_importances_ is an ndarray with one importance value per feature.
fi = clf.feature_importances_
# Feature indices sorted by decreasing importance.
idx = np.argsort(fi)[::-1]
top_features, top_importances = train_x.columns.values[idx][:5], fi[idx][:5]
print("random forest importance")
print(top_features, top_importances)
xgboost
import xgboost as xgb

dtrain = xgb.DMatrix(train_x, label=train_y)
# "silent" was removed in XGBoost 1.0; "verbosity": 0 is the replacement.
params = {"objective": "binary:logistic", "verbosity": 0}
num_round = 20
model = xgb.train(params, dtrain, num_round)
# Per-feature total gain, sorted in descending order.
fscore = model.get_score(importance_type="total_gain")
fscore = sorted(fscore.items(), key=lambda tpl: tpl[1], reverse=True)
print("xgboost importance")
print(fscore[:5])
keep searching
# 2.7 Feature importance via greedy forward search
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(train_x, train_y)
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
# Build the training and validation folds (only the first split is used).
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import xgboost as xgb
from sklearn.metrics import log_loss


def evaluate(features):
    """Train a small xgboost model on `features`; return validation logloss."""
    dtrain = xgb.DMatrix(tr_x[features], label=tr_y)
    dvalid = xgb.DMatrix(va_x[features], label=va_y)
    # "silent" was removed in XGBoost 1.0; "verbosity": 0 replaces it.
    params = {"objective": "binary:logistic", "verbosity": 0}
    num_round = 10
    early_stopping_rounds = 3
    watchlist = [(dtrain, "train"), (dvalid, "eval")]
    model = xgb.train(
        params,
        dtrain,
        num_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=0,
    )
    va_pred = model.predict(dvalid)
    score = log_loss(va_y, va_pred)  # lower logloss means a better model
    return score


best_score = 9999.0
# Randomly permute the column names to fix the greedy search order.
candidates = np.random.RandomState(0).permutation(train_x.columns)
selected = set([])
for feature in candidates:
    # Keep the candidate feature only if it improves the validation score.
    fs = list(selected) + [feature]
    score = evaluate(fs)
    if score < best_score:
        selected.add(feature)
        best_score = score
        print(f"selected:{feature}")
        print(f"score:{score}")
print(f"selected features: {selected}")
Model tuning (Xgboost)
Bayesian optimization
# 3.4 xgboost tuning
# Bayesian optimization practice
class Model:
    """Thin wrapper around an xgboost binary classifier used during tuning."""

    def __init__(self, params=None):
        # None (not a mutable default) so each instance gets its own dict.
        self.model = None
        if params is None:
            self.params = {}
        else:
            self.params = params

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y), monitoring (va_x, va_y)."""
        # "silent" was removed in XGBoost 1.0; "verbosity": 0 replaces it.
        params = {"objective": "binary:logistic", "verbosity": 0}
        params.update(self.params)
        num_round = 10
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, "train"), (dvalid, "eval")]
        self.model = xgb.train(params, dtrain, num_round, evals=watchlist)

    def predict(self, x):
        """Return predicted probabilities for x."""
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        return pred
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import log_loss


def score(params):
    """Hyperopt objective: train with `params`, return the validation logloss."""
    # quniform returns floats; max_depth must be an int.
    params["max_depth"] = int(params["max_depth"])
    model = Model(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    print(f"params: {params}, logloss: {score:.4f}")
    history.append((params, score))
    return {"loss": score, "status": STATUS_OK}


space = {
    "min_child_weight": hp.quniform("min_child_weight", 1, 5, 1),
    "max_depth": hp.quniform("max_depth", 3, 9, 1),
    "gamma": hp.quniform("gamma", 0, 0.4, 0.1),
}
max_evals = 10
trials = Trials()
history = []
fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
# Sort the trial history by logloss and report the best parameters found.
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f"best params:{best[0]}, score:{best[1]:.4f}")
Xgboost parameter space
# 3.4.2 Xgboost parameter space
# Reasonable starting-point parameters.
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eta": 0.1,
    "gamma": 0.0,
    "alpha": 0.0,
    "lambda": 1.0,
    "min_child_weight": 1,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 0,
}
# Hyperopt search space over the most influential parameters.
param_space = {
    "min_child_weight": hp.loguniform("min_child_weight", np.log(0.1), np.log(10)),
    "max_depth": hp.quniform("max_depth", 3, 9, 1),
    "subsample": hp.quniform("subsample", 0.6, 0.95, 0.05),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.6, 0.95, 0.05),
    "gamma": hp.loguniform("gamma", np.log(1e-8), np.log(1.0)),
    # Tune these too when computational resources allow:
    # 'alpha' : hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
    # 'lambda' : hp.loguniform('lambda', np.log(1e-6), np.log(10.0))
}
Neural Network Tuning
# 3.5 Neural-network tuning
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=0)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import os

# Silence TensorFlow's C++ and Python loggers.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from hyperopt import hp
from keras.callbacks import EarlyStopping
# Fix: the keras.layers.core and keras.layers.advanced_activations submodules
# were removed in modern Keras — all of these layers live under keras.layers.
from keras.layers import BatchNormalization, Dense, Dropout, PReLU, ReLU
from keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import SGD, Adam
# Default hyperparameters for the MLP.
base_param = {
    "input_dropout": 0.0,
    "hidden_layers": 3,
    "hidden_units": 96,
    "hidden_activation": "relu",
    "hidden_dropout": 0.2,
    "batch_norm": "before_act",
    "optimizer": {"type": "adam", "lr": 0.001},
    "batch_size": 64,
}
# Hyperopt search space for the MLP.
param_space = {
    "input_dropout": hp.quniform("input_dropout", 0, 0.2, 0.05),
    "hidden_layers": hp.quniform("hidden_layers", 2, 4, 1),
    "hidden_units": hp.quniform("hidden_units", 32, 256, 32),
    "hidden_activation": hp.choice("hidden_activation", ["prelu", "relu"]),
    "hidden_dropout": hp.quniform("hidden_dropout", 0, 0.3, 0.05),
    "batch_norm": hp.choice("batch_norm", ["before_act", "no"]),
    # Optimizer type and learning rate are tuned jointly.
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "type": "adam",
                "lr": hp.loguniform("adam_lr", np.log(0.00001), np.log(0.01)),
            },
            {
                "type": "sgd",
                "lr": hp.loguniform("sgd_lr", np.log(0.00001), np.log(0.01)),
            },
        ],
    ),
    "batch_size": hp.quniform("batch_size", 32, 128, 32),
}
class MLP:
    """Keras multilayer perceptron whose architecture is driven by `params`."""

    def __init__(self, params):
        self.params = params
        self.scaler = None
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Build and train the network; early-stops on validation loss."""
        # Unpack hyperparameters (hyperopt hands back floats, hence int casts).
        input_dropout = self.params["input_dropout"]
        hidden_layers = int(self.params["hidden_layers"])
        hidden_units = int(self.params["hidden_units"])
        hidden_activation = self.params["hidden_activation"]
        hidden_dropout = self.params["hidden_dropout"]
        batch_norm = self.params["batch_norm"]
        optimizer_type = self.params["optimizer"]["type"]
        optimizer_lr = self.params["optimizer"]["lr"]
        batch_size = int(self.params["batch_size"])
        # Standardize features — neural nets are scale-sensitive.
        self.scaler = StandardScaler()
        tr_x = self.scaler.fit_transform(tr_x)
        va_x = self.scaler.transform(va_x)
        self.model = Sequential()
        # Input layer.
        self.model.add(Dropout(input_dropout, input_shape=(tr_x.shape[1],)))
        # Hidden layers.
        for i in range(hidden_layers):
            self.model.add(Dense(hidden_units))
            if batch_norm == "before_act":
                self.model.add(BatchNormalization())
            if hidden_activation == "prelu":
                self.model.add(PReLU())
            elif hidden_activation == "relu":
                self.model.add(ReLU())
            else:
                raise NotImplementedError
            self.model.add(Dropout(hidden_dropout))
        # Output layer.
        self.model.add(Dense(1, activation="sigmoid"))
        # Optimizer. NOTE(review): `lr`/`decay` are legacy Keras arguments;
        # newer Keras expects `learning_rate` — confirm the installed version.
        if optimizer_type == "sgd":
            optimizer = SGD(lr=optimizer_lr, decay=1e-6, momentum=0.9, nesterov=True)
        elif optimizer_type == "adam":
            optimizer = Adam(lr=optimizer_lr, beta_1=0.9, beta_2=0.999, decay=0.0)
        else:
            raise NotImplementedError
        # Objective function and evaluation metric.
        self.model.compile(
            loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]
        )
        nb_epoch = 200
        patience = 20
        early_stopping = EarlyStopping(patience=patience, restore_best_weights=True)
        history = self.model.fit(
            tr_x,
            tr_y,
            epochs=nb_epoch,
            batch_size=batch_size,
            verbose=1,
            validation_data=(va_x, va_y),
            callbacks=[early_stopping],
        )

    def predict(self, x):
        """Return flattened predicted probabilities for x."""
        x = self.scaler.transform(x)
        y_pred = self.model.predict(x)
        y_pred = y_pred.flatten()
        return y_pred
def score(params):
    """Hyperopt objective: fit an MLP with `params`, return validation logloss."""
    model = MLP(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    print(f"params: {params}, logloss: {score:.4f}")
    history.append((params, score))
    return {"loss": score, "status": STATUS_OK}


max_evals = 10
trials = Trials()
history = []
fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
# Sort the trial history by logloss and report the best parameters found.
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f"best params:{best[0]}, score:{best[1]:.4f}")
linear model
# 3.6 Linear-model practice
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

# Linear models benefit from standardized features.
scaler = StandardScaler()
tr_x = scaler.fit_transform(tr_x)
va_x = scaler.transform(va_x)
model = LogisticRegression()
model.fit(tr_x, tr_y)
va_pred = model.predict_proba(va_x)
score = log_loss(va_y, va_pred)
print(f"logloss: {score:.4f}")
custom evaluation function
# 3.7 Custom objective and evaluation functions — data preparation
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import xgboost as xgb
from sklearn.metrics import log_loss

dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
# Custom objective function and evaluation metric for xgboost.
def logregobj(preds, dtrain):
    """Gradient and hessian of the logistic loss w.r.t. the raw margin preds."""
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # sigmoid: margin -> probability
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess


def evalerror(preds, dtrain):
    """Error rate, thresholding the raw margin predictions at 0.0."""
    labels = dtrain.get_label()
    return "custom-error", float(sum(labels != (preds > 0.0))) / len(labels)
# Train with the custom objective. Its predictions are raw margins, so the
# sigmoid must be applied before computing logloss.
# "silent" was removed in XGBoost 1.0; "verbosity": 0 is the replacement.
params = {"verbosity": 0, "random_state": 0}
num_round = 50
watchlist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)
pred_val = bst.predict(dvalid)
pred = 1.0 / (1.0 + np.exp(-pred_val))
logloss = log_loss(va_y, pred)
print(logloss)
# Reference run with the built-in objective for comparison.
params = {"verbosity": 0, "random_state": 0, "objective": "binary:logistic"}
bst = xgb.train(params, dtrain, num_round, watchlist)
pred = bst.predict(dvalid)
logloss = log_loss(va_y, pred)
print(logloss)
# xgboost with an MAE-like objective — data preparation
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
# MAE-like ("Fair") loss as an xgboost objective.
def fair(preds, dtrain):
    """Gradient and hessian of the Fair loss, a smooth MAE approximation.

    With x = preds - labels and smoothing constant c, the gradient is
    c*x/(|x|+c) and the hessian c^2/(|x|+c)^2 — both bounded, unlike MAE's.
    """
    x = preds - dtrain.get_label()
    c = 1.0
    den = abs(x) + c
    grad = c * x / den
    hess = c * c / den ** 2
    return grad, hess
import numpy as np
import xgboost as xgb

# "silent" was removed in XGBoost 1.0; "verbosity": 0 is the replacement.
param = {"max_depth": 3, "eta": 1, "verbosity": 0}
watchlist = [(dvalid, "eval"), (dtrain, "train")]
num_round = 15
# Train with the custom Fair objective defined above.
bst = xgb.train(param, dtrain, num_round, watchlist, fair)
model fusion
Stacking
# 3.8.1 Stacking practice — build the base models (imports and TF log setup)
import numpy as np
import pandas as pd
import xgboost as xgb
from keras.layers import Dense, Dropout
from keras.models import Sequential
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import os
# Quiet TensorFlow's C++ logging before the library is imported.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
class Model1Xgb:
    """First-level stacking model: gradient-boosted trees (xgboost)."""

    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y), monitoring logloss on (va_x, va_y)."""
        # "silent" was removed in XGBoost 1.0; "verbosity": 0 replaces it.
        params = {
            "objective": "binary:logistic",
            "verbosity": 0,
            "random_state": 0,
            "eval_metric": "logloss",
        }
        num_round = 10
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, "train"), (dvalid, "eval")]
        self.model = xgb.train(params, dtrain, num_round, evals=watchlist)

    def predict(self, x):
        """Return predicted probabilities for x."""
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        return pred
class Model1NN:
    """First-level stacking model: a small Keras feed-forward network."""

    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Standardize the inputs and train a 2-hidden-layer MLP."""
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        batch_size = 128
        epochs = 10
        tr_x = self.scaler.transform(tr_x)
        va_x = self.scaler.transform(va_x)
        model = Sequential()
        model.add(Dense(256, activation="relu", input_shape=(tr_x.shape[1],)))
        model.add(Dropout(0.2))
        model.add(Dense(256, activation="relu"))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(loss="binary_crossentropy", optimizer="adam")
        history = model.fit(
            tr_x,
            tr_y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(va_x, va_y),
        )
        self.model = model

    def predict(self, x):
        """Return flattened predicted probabilities for x."""
        x = self.scaler.transform(x)
        pred = self.model.predict(x).reshape(-1)
        return pred
class Model2Linear:
    """Second-level (meta) model: logistic regression on level-1 predictions."""

    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train on (tr_x, tr_y); va_x/va_y are unused but kept so the
        interface matches the level-1 models (predict_cv calls all of them
        the same way)."""
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        self.model = LogisticRegression(solver="lbfgs")
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the positive-class probability for each row of x."""
        x = self.scaler.transform(x)
        pred = self.model.predict_proba(x)[:, 1]
        return pred
def predict_cv(model, train_x, train_y, test_x):
    """Out-of-fold train predictions plus fold-averaged test predictions.

    Trains `model` on each of 4 KFold splits. Returns (pred_train, preds_test)
    where pred_train is ordered like train_x (each row predicted by the model
    that did not see it) and preds_test is the mean of the per-fold test
    predictions.
    """
    preds = []
    preds_test = []
    va_idxes = []
    kf = KFold(n_splits=4, shuffle=True, random_state=0)
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
    # Restore the original row order of the out-of-fold predictions.
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]
    preds_test = np.mean(preds_test, axis=0)
    return pred_train, preds_test
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# Level 1: out-of-fold predictions from both base models.
model_1a = Model1Xgb()
pred_train_1a, pred_test_1a = predict_cv(model_1a, train_x, train_y, test_x)
model_1b = Model1NN()
pred_train_1b, pred_test_1b = predict_cv(model_1b, train_x, train_y, test_x)
# NOTE(review): log_loss's `eps` argument is deprecated/removed in recent
# scikit-learn — confirm the installed version.
print(f"logloss: {log_loss(train_y, pred_train_1a, eps=1e-7):.4f}")
print(f"logloss: {log_loss(train_y, pred_train_1b, eps=1e-7):.4f}")
# Level 2: the out-of-fold predictions become features for the meta model.
train_x_2 = pd.DataFrame({"pred_1a": pred_train_1a, "pred_1b": pred_train_1b})
test_x_2 = pd.DataFrame({"pred_1a": pred_test_1a, "pred_1b": pred_test_1b})
model_2 = Model2Linear()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2)
print(f"logloss: {log_loss(train_y, pred_train_2, eps=1e-7):.4f}")
Hold-out
# 3.8.2 Hold-out stacking practice
kf = KFold(n_splits=4, shuffle=True, random_state=0)
tr_idx, va_index = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_index]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_index]
# Level 1: train both base models on the training part, predict the hold-out.
model_1a = Model1Xgb()
model_1a.fit(tr_x, tr_y, va_x, va_y)
va_pred_1a = model_1a.predict(va_x)
test_pred_1a = model_1a.predict(test_x)
model_1b = Model1NN()
model_1b.fit(tr_x, tr_y, va_x, va_y)
va_pred_1b = model_1b.predict(va_x)
test_pred_1b = model_1b.predict(test_x)
# NOTE(review): log_loss's `eps` argument is deprecated in recent scikit-learn.
print(f"logloss: {log_loss(va_y, va_pred_1a, eps=1e-7):.4f}")
print(f"logloss: {log_loss(va_y, va_pred_1b, eps=1e-7):.4f}")
# Level 2: the meta model trains only on the hold-out predictions.
va_x_2 = pd.DataFrame({"pred_1a": va_pred_1a, "pred_1b": va_pred_1b})
test_x_2 = pd.DataFrame({"pred_1a": test_pred_1a, "pred_1b": test_pred_1b})
model2 = Model2Linear()
model2.fit(va_x_2, va_y, None, None)
pred_test_2 = model2.predict(test_x_2)
Cross-validation
Stratified-fold
# 4.1 Cross-validation: StratifiedKFold keeps class ratios in every fold
import numpy as np
import pandas as pd

train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x, train_y):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
Group-fold
# Cross-validation with groups ('user_id' example): all rows of one user must
# stay in the same fold.
train_x["user_id"] = np.arange(0, len(train_x)) // 4
train_x
# GroupKFold practice
from sklearn.model_selection import GroupKFold, KFold

user_id = train_x["user_id"]
unique_user_ids = user_id.unique()
kf = GroupKFold(n_splits=4)
for tr_idx, va_idx in kf.split(train_x, train_y, user_id):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
# GroupKFold cannot shuffle or take a random seed, so the improved version
# below splits the unique user ids with a seeded KFold instead.
kf = KFold(n_splits=4, shuffle=True, random_state=0)
for tr_group_idx, va_group_idx in kf.split(unique_user_ids):
    tr_groups, va_groups = unique_user_ids[tr_group_idx], unique_user_ids[va_group_idx]
    is_tr = user_id.isin(tr_groups)
    is_va = user_id.isin(va_groups)
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]
# 4.2 Cross-validation for time series
import numpy as np
import pandas as pd
# NOTE(review): these absolute Windows paths differ from the relative
# "./练习数据/" paths used elsewhere in this file — confirm which is intended.
train = pd.read_csv("D:\\ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("D:\\ch03_practice_4_test.csv")
# Assign each quarter of the rows to a period 0..3; the test set is period 4.
train_x["period"] = np.arange(0, len(train_x)) // (len(train_x) // 4)
train_x["period"] = np.clip(train_x["period"], 0, 3)
test_x["period"] = 4
train_x
direct division
# Plain split without cross-validation: train on periods 0-2, validate on 3.
period = train_x["period"]
is_tr = period < 3
is_va = period == 3
tr_x, va_x = train_x[is_tr], train_x[is_va]
tr_y, va_y = train_y[is_tr], train_y[is_va]
periodic cross-validation
# Expanding-window CV over all historical data, by period: train on every
# period strictly before the validation period.
va_period_list = [1, 2, 3]
for va_period in va_period_list:
    is_tr = train_x["period"] < va_period
    is_va = train_x["period"] == va_period
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]
Time-Ordered Cross-Validation
# Expanding-window CV over all historical data, by row order.
from sklearn.model_selection import TimeSeriesSplit

tss = TimeSeriesSplit(n_splits=4)
for tr_idx, va_idx in tss.split(train_x):
    print(tr_idx.min(), tr_idx.max(), va_idx.min(), va_idx.max())
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
Window sliding isometric cross-validation
# Fixed-length sliding-window cross-validation (sktime)
from sktime.forecasting.model_selection import SlidingWindowSplitter

cv = SlidingWindowSplitter(window_length=12, fh=list(range(0, 8)), step_length=3)
for train_idx, test_idx in cv.split(train_x):
    print(train_idx, test_idx)
    tr_x, va_x = train_x.iloc[train_idx], train_x.iloc[test_idx]
    tr_y, va_y = train_y.iloc[train_idx], train_y.iloc[test_idx]
Weakly sequenced cross-validation
# Weakly ordered data: plain leave-one-period-out cross-validation.
va_period_list = [0, 1, 2, 3]
for va_period in va_period_list:
    is_tr = train_x["period"] != va_period
    is_va = train_x["period"] == va_period
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]
Optimizing Thresholds for Classification Problems
Binary classification optimization
# 4.3 Threshold-optimization practice
from scipy.optimize import minimize
from sklearn.metrics import f1_score

# Synthetic labels and (noisy) predicted probabilities.
# NOTE(review): RandomState() is unseeded, so results differ run to run.
rand = np.random.RandomState()
train_y_prob = np.linspace(0, 1.0, 10000)
train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob)
train_pred_prob = np.clip(
    train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0
)
# F1 at the default 0.5 threshold as a baseline.
init_threshold = 0.5
init_score = f1_score(train_y, train_pred_prob >= init_threshold)
print(init_threshold, init_score)


def f1_opt(x):
    """Negative F1 at threshold x (scipy minimizes, so negate to maximize)."""
    return -f1_score(train_y, train_pred_prob >= x)


# Nelder-Mead search for the threshold that maximizes F1.
result = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")
best_threshold = result["x"].item()
best_score = f1_score(train_y, train_pred_prob >= best_threshold)
print(best_threshold, best_score)
Out-Of-Fold
# Out-of-fold threshold optimization
from scipy.optimize import minimize
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

# NOTE(review): RandomState() is unseeded, so results differ run to run.
rand = np.random.RandomState()
train_y_prob = np.linspace(0, 1.0, 10000)
train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob)
train_pred_prob = np.clip(
    train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0
)
thresholds = []
scores_tr = []
scores_va = []
kf = KFold(n_splits=4, random_state=0, shuffle=True)
for i, (tr_idx, va_idx) in enumerate(kf.split(train_pred_prob)):
    tr_pred_prob, va_pred_prob = train_pred_prob[tr_idx], train_pred_prob[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    def f1_opt(x):
        """Negative F1 on the training fold at threshold x."""
        return -f1_score(tr_y, tr_pred_prob >= x)

    # Optimize the threshold on the training fold, score it on the validation fold.
    result = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")
    threshold = result["x"].item()
    score_tr = f1_score(tr_y, tr_pred_prob >= threshold)
    score_va = f1_score(va_y, va_pred_prob >= threshold)
    print(threshold, score_tr, score_va)
    thresholds.append(threshold)
    scores_tr.append(score_tr)
    scores_va.append(score_va)
# Average the per-fold thresholds for use on the test set.
threshold_test = np.mean(thresholds)
print(threshold_test)