内容:
1.构建基础的回归模型
2.交叉验证
3.尝试多种模型
4.调参技术
# Load the feature-engineered data and shrink its memory footprint.
sample_feature = reduce_mem_usage(pd.read_csv('data_for_tree.csv'))
# Keep only the continuous columns: drop the target and the categorical
# id-like columns.  (The original notes elided further excluded columns
# here — extend this list as needed.)
continuous_feature_names = [x for x in sample_feature.columns
                            if x not in ['price', 'brand', 'model']]
# Linear models handle only continuous variables and cannot cope with
# missing values, so drop NaN rows and map the '-' placeholder to 0 first.
sample_feature = sample_feature.dropna().replace('-', 0).reset_index(drop=True)
train = sample_feature[continuous_feature_names + ['price']]
train_X = train[continuous_feature_names]
train_y = train['price']
- 线性模型
from sklearn.linear_model import LinearRegression

# NOTE: ``LinearRegression(normalize=True)`` was deprecated in scikit-learn
# 0.24 and removed in 1.2 — passing it now raises a TypeError.  If feature
# scaling is wanted, apply StandardScaler explicitly (e.g. in a Pipeline).
model = LinearRegression()
model = model.fit(train_X, train_y)
# Inspect the fit: intercept, then coefficients sorted largest-to-smallest
# (bare expressions — notebook-style output).
'intercept:' + str(model.intercept_)
sorted(dict(zip(continuous_feature_names, model.coef_)).items(),
       key=lambda x: x[1], reverse=True)
效果不是很好,可以画图看一下数据分布,做一下变换(log、指数变换等),将数据近似为正态分布情形(因为很多模型也是基于类似的假设)
eg. train_y_ln = np.log(train_y + 1)
- 交叉验证
训练集:训练模型
验证集:评估选择模型,调节超参数
测试集:应用模型
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer

# 5-fold cross-validation; the scorer reports raw MAE on the
# log-transformed target, so lower scores are better here.
mae_scorer = make_scorer(mean_absolute_error)
scores = cross_val_score(model, X=train_X, y=train_y_ln,
                         verbose=1, cv=5, scoring=mae_scorer)
划分各个集合时也要考虑测试集的分布情况,尽量使验证集和测试集分布一致,以及注意样本各个类别的数据比例(分层采样)、数据样本的时序关系等
- 判断过拟合—绘制学习曲线和验证曲线
from sklearn.model_selection import learning_curve, validation_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot training and cross-validation score curves for *estimator*.

    A widening gap between the two curves indicates overfitting; two low,
    converged curves indicate underfitting.

    Parameters
    ----------
    estimator : fitted-or-unfitted sklearn estimator to evaluate.
    title : str, figure title.
    X, y : training features and target.
    ylim : optional (low, high) tuple for the y-axis.
    cv : cross-validation strategy passed through to ``learning_curve``.
    n_jobs : parallelism for ``learning_curve``.
    train_sizes : fractions (or counts) of the training set to evaluate.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can ``.show()`` / save.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training example')
    plt.ylabel('score')
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Mean +/- one standard deviation across the CV folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()  # background grid
    # Shaded bands show the score spread over the folds.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r',
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt


# Example: learning curve of a plain linear model on the first 1000 rows.
plot_learning_curve(LinearRegression(), 'Liner_model',
                    train_X[:1000], train_y_ln[:1000])
3 进一步尝试多种模型
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
各种模型的使用大同小异,无非是.fit(),.predict(),有些模型构建还要考虑初始超参数的设定等……
- 调参技术
贝叶斯调参
from bayes_opt import BayesianOptimization
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """Objective for Bayesian optimisation of a LightGBM regressor.

    Returns ``1 - mean_cv_MAE`` so that *maximising* this objective
    *minimises* the cross-validated MAE.  Integer-valued hyper-parameters
    are cast from the float values the optimiser proposes.
    """
    regressor = LGBMRegressor(
        objective='regression_l1',
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        subsample=subsample,
        min_child_samples=int(min_child_samples),
    )
    cv_mae = cross_val_score(
        regressor,
        X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    ).mean()
    return 1 - cv_mae
# Search bounds for each LightGBM hyper-parameter explored by the optimiser.
search_space = {
    'num_leaves': (2, 100),
    'max_depth': (2, 100),
    'subsample': (0.1, 1),
    'min_child_samples': (2, 100),
}
rf_bo = BayesianOptimization(rf_cv, search_space)
rf_bo.maximize()
# Best cross-validated MAE found (undoes the ``1 - MAE`` transform in rf_cv;
# bare expression — notebook-style output).
1 - rf_bo.max['target']
直播分享:
关注点还是应该多放在特征工程上,调参实际上带来的效果可能并不是很理想;最好把特征工程与实际建模相分离。