Task 3 (Revised)

Model building
Use sklearn's train_test_split to split the data 7:3 into a training set and a test set, with random seed 2018:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_del, label, test_size=0.3, random_state=2018)
Check the sizes of the resulting training and test sets:
[X_train.shape, y_train.shape, X_test.shape, y_test.shape]
[(3133, 12), (3133,), (1343, 12), (1343,)]
Model tuning
Build and train the seven models used in this task: four ensemble models (XGBoost, LightGBM, GBDT, random forest) and three non-ensemble models (logistic regression, SVM, decision tree). Each of the seven is tuned with grid search, using 5-fold cross-validation throughout.
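The snippets below assume imports along these lines (a sketch; module paths follow the scikit-learn 0.20-era API that the printed estimator reprs correspond to, plus the xgboost and lightgbm packages):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, roc_auc_score)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier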
Logistic regression
Selected parameters:

C : float, default: 1.0

Inverse of regularization strength: smaller values of C mean stronger regularization. C must be a positive float.

class_weight : dict or 'balanced', default: None

Weights associated with each class. If not given, all classes get weight one. The 'balanced' mode uses the values of y to automatically set weights inversely proportional to the class frequencies in the input data, as n_samples / (n_classes * np.bincount(y)) (written out in the sketch after this list).

solver : str, {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default: 'liblinear'

Algorithm to use in the optimization. For small datasets 'liblinear' is a good choice, while 'sag' and 'saga' are faster on large datasets. For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' handle the multinomial loss; 'liblinear' is limited to one-versus-rest schemes.
'newton-cg', 'lbfgs' and 'sag' only support the l2 penalty, while 'liblinear' and 'saga' can also use the l1 penalty.
Note that fast convergence of 'sag' and 'saga' is only guaranteed on features with approximately the same scale; the data can be preprocessed with a scaler from sklearn.preprocessing.

max_iter : int, default: 100

Maximum number of iterations for the solver to converge.
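The n_samples / (n_classes * np.bincount(y)) formula above, written out for this two-class problem (a sketch; y_train comes from the split at the top of this section):

import numpy as np
n_samples, n_classes = len(y_train), 2
balanced_weights = n_samples / (n_classes * np.bincount(y_train))  # weight for class 0, class 1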
Grid-search C, solver, and class_weight, then check the best estimator and score:
param_grid = {
    'C': np.arange(0.01, 0.1, 0.01),
    'solver': ['liblinear', 'lbfgs'],
    'class_weight': ['balanced', None]
}
log_grid = GridSearchCV(LogisticRegression(random_state=2018, max_iter=1000),
                        param_grid, cv=5)
log_grid.fit(X_train, y_train)
log_grid.best_estimator_, log_grid.best_score_
(LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2018, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False), 0.7874241940631982)
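Besides best_estimator_, GridSearchCV also exposes best_params_ and cv_results_, which are easier to read than the full estimator repr (a small example; the commented values follow from the run above):

log_grid.best_params_   # e.g. {'C': 0.02, 'class_weight': None, 'solver': 'lbfgs'}
pd.DataFrame(log_grid.cv_results_)[['params', 'mean_test_score']]  # score for every candidate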
SVM

C: float, optional (default=1.0)

Penalty parameter C of the objective function; it trades off the width of the classification margin against misclassified samples.

kernel : string, optional (default='rbf')

Available choices are 'rbf', 'linear', 'poly', and 'sigmoid'.

gamma : float, optional (default='auto')

Kernel coefficient for 'poly', 'rbf' and 'sigmoid'; the default 'auto' uses gamma = 1 / n_features.
param_grid = {
    'C': np.arange(0.1, 5.2, 0.5),
    'gamma': ['auto', 0.01, 0.5],
}

svc_grid = GridSearchCV(SVC(random_state=2018, probability=True), param_grid, cv=5)
svc_grid.fit(X_train, y_train)
svc_grid.best_estimator_, svc_grid.best_score_
(SVC(C=2.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=2018, shrinking=True,
  tol=0.001, verbose=False), 0.7871050111714012)
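As noted for 'sag'/'saga' above, and equally relevant to the RBF kernel, SVMs are sensitive to feature scale. If data_del is not already standardized (the preprocessing is not shown here, so this is an assumption), the search can be wrapped in a pipeline:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

svc_pipe = Pipeline([('scale', StandardScaler()),
                     ('svc', SVC(random_state=2018, probability=True))])
param_grid = {'svc__C': np.arange(0.1, 5.2, 0.5),   # step names prefix the parameters
              'svc__gamma': ['auto', 0.01, 0.5]}
GridSearchCV(svc_pipe, param_grid, cv=5).fit(X_train, y_train)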
Decision tree

max_depth : int or None, optional (default=None)

The maximum depth of the tree. If None, nodes are expanded until all leaves are pure or contain fewer than min_samples_split samples.

max_features : int, float, string or None, optional (default=None)

The number of features to consider when looking for the best split; 'auto' uses max_features=sqrt(n_features).

class_weight : dict, list of dicts, 'balanced' or None, default=None

Weights associated with each class. If not given, all classes get weight one. The 'balanced' mode uses the values of y to automatically set weights inversely proportional to the class frequencies in the input data, as n_samples / (n_classes * np.bincount(y)).
First tune max_depth and min_samples_split:
param_grid = {
    'max_depth': range(2, 8, 1),
    'min_samples_split': range(2, 11, 1)
}
tree_grid = GridSearchCV(DecisionTreeClassifier(random_state=2018), param_grid, cv=5)
tree_grid.fit(X_train, y_train)
tree_grid.best_estimator_, tree_grid.best_score_
(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=2018,
            splitter='best'), 0.7749760612831152)
With max_depth=4 and min_samples_split=2 fixed, tune min_samples_leaf and max_features:
param_grid = {
    'min_samples_leaf': range(26, 35, 2),
    'max_features': range(2, 10, 1)
}
tree_grid = GridSearchCV(DecisionTreeClassifier(random_state=2018, max_depth=4,
                                                min_samples_split=2),
                         param_grid, cv=5)
tree_grid.fit(X_train, y_train)
tree_grid.best_estimator_, tree_grid.best_score_
(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=7, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=30,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=2018, splitter='best'),
 0.7794446217682732)
Random forest
Coarse search over n_estimators:
param = {'n_estimators': list(range(10, 1001, 50))}
forest_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=2018),
                           param_grid=param, cv=5)
forest_grid.fit(X_train, y_train)
forest_grid.best_estimator_, forest_grid.best_score_
(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=860, n_jobs=1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False), 0.7883817427385892)
Refine n_estimators in a narrower window around the coarse optimum:
forest_grid = forest_grid.best_estimator_

param = {
    'n_estimators': list(range(forest_grid.n_estimators - 40,
                               forest_grid.n_estimators + 50, 10))
}
forest_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=2018),
                           param_grid=param, cv=5)
forest_grid.fit(X_train, y_train)
forest_grid.best_estimator_, forest_grid.best_score_
(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=860, n_jobs=1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False), 0.7883817427385892)
With n_estimators settled at 860, tune max_depth and min_samples_split:
forest_grid = forest_grid.best_estimator_

param = {
    'max_depth': range(3, 15, 2),
    'min_samples_split': range(2, 53, 10)
}
forest_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=2018,
                                                            n_estimators=forest_grid.n_estimators),
                           param_grid=param, cv=5)
forest_grid.fit(X_train, y_train)
forest_grid.best_estimator_, forest_grid.best_score_
(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=30,
            min_weight_fraction_leaf=0.0, n_estimators=860, n_jobs=1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False), 0.7918927545483562)
Finally, tune min_samples_leaf:
param = {'min_samples_leaf': range(1, 10, 2)}
forest_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=2018, max_features='auto',
                                                            max_depth=9,
                                                            n_estimators=860,
                                                            min_samples_split=30),
                           param_grid=param, cv=5)
forest_grid.fit(X_train, y_train)
forest_grid.best_estimator_, forest_grid.best_score_
(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=30,
            min_weight_fraction_leaf=0.0, n_estimators=860, n_jobs=1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False), 0.7918927545483562)
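As a cheap sanity check on the grid-search result, a random forest can also report an out-of-bag score, which needs no cross-validation (a sketch using the tuned parameters from above):

rf = RandomForestClassifier(random_state=2018, n_estimators=860, max_depth=9,
                            min_samples_split=30, min_samples_leaf=1, oob_score=True)
rf.fit(X_train, y_train)
rf.oob_score_   # accuracy on out-of-bag samples; expect a value near the CV score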
GBDT

n_estimators : integer, optional (default=100)

The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting, so a large number usually gives better performance.

learning_rate : float, optional (default=0.1)

learning_rate shrinks the contribution of each tree; there is a trade-off between learning_rate and n_estimators (see the sketch at the end of this subsection).
First tune n_estimators and learning_rate:
param_grid = {
    'n_estimators': range(80, 150, 10),
    'learning_rate': [0.02, 0.01, 0.04],
}
gbdt = GridSearchCV(GradientBoostingClassifier(random_state=2018), param_grid, cv=5)
gbdt.fit(X_train, y_train)
gbdt.best_estimator_, gbdt.best_score_
(GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.02, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=110,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False), 0.7864666453878072)
With n_estimators and learning_rate fixed, tune the tree structure:
gbdt = gbdt.best_estimator_

param_grid = {
    'max_depth': range(3, 12, 2),
    'min_samples_split': range(20, 41, 5)
}
gbdt = GridSearchCV(GradientBoostingClassifier(random_state=2018,
                                               n_estimators=gbdt.n_estimators,
                                               learning_rate=gbdt.learning_rate),
                    param_grid, cv=5)
gbdt.fit(X_train, y_train)
gbdt.best_estimator_, gbdt.best_score_
(GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.02, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=35,
              min_weight_fraction_leaf=0.0, n_estimators=110,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False), 0.7874241940631982)
Then tune min_samples_leaf (the default of 1 turns out to remain best):
gbdt = gbdt.best_estimator_

param_grid = {
    'min_samples_leaf': range(1, 10, 2)
}
gbdt = GridSearchCV(GradientBoostingClassifier(random_state=2018,
                                               n_estimators=gbdt.n_estimators,
                                               learning_rate=gbdt.learning_rate,
                                               max_depth=gbdt.max_depth,
                                               min_samples_split=gbdt.min_samples_split),
                    param_grid, cv=5)
gbdt.fit(X_train, y_train)
gbdt.best_estimator_, gbdt.best_score_
(GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.02, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=35,
              min_weight_fraction_leaf=0.0, n_estimators=110,
              presort='auto', random_state=2018, subsample=1.0, verbose=0,
              warm_start=False), 0.7874241940631982)
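Given the learning_rate/n_estimators trade-off noted earlier, a common final step (not part of the original run, so treat it as a hedged sketch) is to shrink learning_rate, scale n_estimators up proportionally, and confirm the cross-validated score holds:

from sklearn.model_selection import cross_val_score

gbdt_slow = GradientBoostingClassifier(random_state=2018,
                                       learning_rate=0.01,    # 0.02 halved
                                       n_estimators=220,      # 110 doubled
                                       max_depth=5, min_samples_split=35)
cross_val_score(gbdt_slow, X_train, y_train, cv=5).mean()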
XGBoost
First tune n_estimators and learning_rate:
param_grid = {
    'n_estimators': range(70, 150, 10),
    'learning_rate': [0.02, 0.1, 0.2],
}
xgb = GridSearchCV(XGBClassifier(random_state=2018), param_grid, cv=5)
xgb.fit(X_train, y_train)
xgb.best_estimator_, xgb.best_score_
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=90,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=2018, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1), 0.7867858282796042)
xgb = xgb.best_estimator_

Note that min_samples_split is a scikit-learn tree parameter, not an XGBoost one; the wrapper merely stores it as an unused keyword, which is why the score below is identical to the previous stage. XGBoost's closest equivalent is min_child_weight (a corrected sketch follows the output):
param_grid = {
    'max_depth': range(1, 4, 1),
    'min_samples_split': range(1, 22, 5)
}
xgb = GridSearchCV(XGBClassifier(random_state=2018,
                                 n_estimators=xgb.n_estimators,
                                 learning_rate=xgb.learning_rate),
                   param_grid, cv=5)
xgb.fit(X_train, y_train)
xgb.best_estimator_, xgb.best_score_
(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=3, min_child_weight=1, min_samples_split=1, missing=None,
       n_estimators=90, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=2018, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1), 0.7867858282796042)
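A corrected second stage would search XGBoost's own regularization parameters instead; min_child_weight and max_depth are standard XGBClassifier parameters, but the ranges below are illustrative assumptions:

param_grid = {
    'max_depth': range(1, 4, 1),
    'min_child_weight': range(1, 6, 1)   # minimum sum of instance weight needed in a child
}
xgb2 = GridSearchCV(XGBClassifier(random_state=2018, n_estimators=90, learning_rate=0.2),
                    param_grid, cv=5)
xgb2.fit(X_train, y_train)
xgb2.best_estimator_, xgb2.best_score_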
LightGBM
First tune n_estimators and learning_rate:
param_grid = {
    'n_estimators': range(70, 150, 10),
    'learning_rate': [0.02, 0.1, 0.2],
}
lgbm = GridSearchCV(LGBMClassifier(random_state=2018), param_grid, cv=5)
lgbm.fit(X_train, y_train)
lgbm.best_estimator_, lgbm.best_score_
(LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.02, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=90, n_jobs=-1, num_leaves=31, objective=None,
        random_state=2018, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
lgbm = lgbm.best_estimator_

As with XGBoost, min_samples_split is not a LightGBM parameter (the analogue is min_child_samples; a corrected sketch follows the output), so only max_depth actually varies in this search:
param_grid = {
    'max_depth': range(1, 10, 2),
    'min_samples_split': range(10, 31, 5)
}
lgbm = GridSearchCV(LGBMClassifier(random_state=2018,
                                   n_estimators=lgbm.n_estimators,
                                   learning_rate=lgbm.learning_rate),
                    param_grid, cv=5)
lgbm.fit(X_train, y_train)
lgbm.best_estimator_, lgbm.best_score_
(LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.02, max_depth=5,
        min_child_samples=20, min_child_weight=0.001, min_samples_split=10,
        min_split_gain=0.0, n_estimators=90, n_jobs=-1, num_leaves=31,
        objective=None, random_state=2018, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0), 0.7832748164698372)
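The corresponding fix for LightGBM is min_child_samples (the minimum number of samples in a leaf); again, the range below is an illustrative assumption:

param_grid = {
    'max_depth': range(1, 10, 2),
    'min_child_samples': range(10, 31, 5)
}
lgbm2 = GridSearchCV(LGBMClassifier(random_state=2018, n_estimators=90, learning_rate=0.02),
                     param_grid, cv=5)
lgbm2.fit(X_train, y_train)
lgbm2.best_estimator_, lgbm2.best_score_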
Model evaluation
Evaluate the seven tuned models.
models = {'Random Forest': forest_grid.best_estimator_,
          'GBDT': gbdt.best_estimator_,
          'XGBoost': xgb.best_estimator_,
          'LightGBM': lgbm.best_estimator_,
          'Logistic Regression': log_grid.best_estimator_,
          'SVM': svc_grid.best_estimator_,
          'Decision Tree': tree_grid.best_estimator_}

assessments = {
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1-score': [],
    'AUC': []
}
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, label=label)
    plt.plot([0, 1], [0, 1], 'k--')   # diagonal reference line for a random classifier
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.tight_layout()
for name, model in models.items():
    test_pre = model.predict(X_test)
    train_pre = model.predict(X_train)
    test_proba = model.predict_proba(X_test)[:, 1]
    train_proba = model.predict_proba(X_train)[:, 1]

    # sklearn metric signatures are (y_true, y_pred)
    acc_test = accuracy_score(y_test, test_pre) * 100
    acc_train = accuracy_score(y_train, train_pre) * 100
    accuracy = 'train: %.2f%%; test: %.2f%%' % (acc_train, acc_test)
    assessments['Accuracy'].append(accuracy)

    pre_test = precision_score(y_test, test_pre) * 100
    pre_train = precision_score(y_train, train_pre) * 100
    precision = 'train: %.2f%%; test: %.2f%%' % (pre_train, pre_test)
    assessments['Precision'].append(precision)

    rec_test = recall_score(y_test, test_pre) * 100
    rec_train = recall_score(y_train, train_pre) * 100
    recall = 'train: %.2f%%; test: %.2f%%' % (rec_train, rec_test)
    assessments['Recall'].append(recall)

    f1_test = f1_score(y_test, test_pre) * 100
    f1_train = f1_score(y_train, train_pre) * 100
    f1 = 'train: %.2f%%; test: %.2f%%' % (f1_train, f1_test)
    assessments['F1-score'].append(f1)

    # per-model ROC curves on both splits
    fig = plt.figure(figsize=(8, 6))
    fpr, tpr, thresholds = roc_curve(y_test, test_proba)
    plot_roc_curve(fpr, tpr, label='test')
    fpr, tpr, thresholds = roc_curve(y_train, train_proba)
    plot_roc_curve(fpr, tpr, label='train')
    plt.title(name)

    auc_test = roc_auc_score(y_test, test_proba) * 100
    auc_train = roc_auc_score(y_train, train_proba) * 100
    auc = 'train: %.2f%%; test: %.2f%%' % (auc_train, auc_test)
    assessments['AUC'].append(auc)

# ROC curves of all models on the test set
fig = plt.figure(figsize=(8, 6))
for name, model in models.items():
    proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, proba)
    plot_roc_curve(fpr, tpr, label=name)

# ROC curves of all models on the training set
fig = plt.figure(figsize=(8, 6))
for name, model in models.items():
    proba = model.predict_proba(X_train)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_train, proba)
    plot_roc_curve(fpr, tpr, label=name)
ass_df = pd.DataFrame(assessments, index=models.keys())
ass_df
