模型融合stacking

日萌社

人工智能AI:Keras PyTorch MXNet TensorFlow PaddlePaddle 深度学习实战(不定时更新)



In [1]:

from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:

#没有用bagging和boosting
#stacking    先用几个不同的模型做预测  输出预测值  然后将这几个模型输出的预测值作为特征来训练一个新的模型

获取数据

In [3]:

# Load the engineered (one-hot ready) training and test feature tables.
data=pd.read_csv("data/onehot_feature.csv")
data_test = pd.read_csv("./data/onehot_feature_test.csv")
# Summarize dtypes and non-null counts (34 columns, ~150k rows).
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150518 entries, 0 to 150517
Data columns (total 34 columns):
Unnamed: 0    150518 non-null int64
时间            150518 non-null int64
小区名           150518 non-null int64
小区房屋出租数量      150518 non-null float64
楼层            150518 non-null int64
总楼层           150518 non-null float64
房屋面积          150518 non-null float64
房屋朝向          150518 non-null object
居住状态          150518 non-null float64
卧室数量          150518 non-null int64
厅的数量          150518 non-null int64
卫的数量          150518 non-null int64
出租方式          150518 non-null float64
区             150518 non-null float64
位置            150518 non-null float64
地铁线路          150518 non-null float64
地铁站点          150518 non-null float64
距离            150518 non-null float64
装修情况          150518 non-null float64
月租金           150518 non-null float64
log_rent      150518 non-null float64
新朝向           150518 non-null object
房+卫+厅         150518 non-null int64
房/总           150518 non-null float64
卫/总           150518 non-null float64
厅/总           150518 non-null float64
卧室面积          150518 non-null float64
楼层比           150518 non-null float64
户型            150518 non-null int64
有地铁           150518 non-null int64
小区线路数         150518 non-null int64
位置线路数         150518 non-null int64
新小区名          150518 non-null int64
小区条数大于100     150518 non-null int64
dtypes: float64(18), int64(14), object(2)
memory usage: 39.0+ MB

In [4]:

# 将离散特征转换成字符串类型
# Cast the discrete (categorical) columns to strings so that
# DictVectorizer later one-hot encodes them instead of treating
# them as numeric values.
categorical_cols = ['时间', '新小区名', '居住状态', '出租方式', '区',
                    '位置', '地铁线路', '地铁站点', '装修情况', '户型']
for name in categorical_cols:
    data[name] = data[name].astype(str)

In [5]:

# Select the modelling features and the log1p-transformed rent target.
feature_names=['小区房屋出租数量','新小区名', '楼层', '总楼层', '房屋面积','居住状态', '卧室数量',
       '卫的数量',  '位置',  '地铁站点', '距离', '装修情况', 
       '新朝向', '房+卫+厅', '房/总', '卫/总', '厅/总', '卧室面积', '楼层比', '户型','有地铁','小区线路数','位置线路数','小区条数大于100',]
x = data[feature_names]
y = data['log_rent']
X_TEST = data_test[feature_names]

In [6]:

# 2.分割数据集
# 2. Split: hold out 25% of the rows for evaluation; fixed seed for reproducibility.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.25, random_state=12)

In [7]:

# 1.特征转换
# One-hot encode the string columns (numeric columns pass through) by
# round-tripping each frame through a list of per-row record dicts.
# Fit on the training split only; the other splits are transformed with
# the same vocabulary.
vec = DictVectorizer(sparse=True)
x_train = vec.fit_transform(train_x.to_dict(orient='records'))
x_test = vec.transform(test_x.to_dict(orient='records'))
X_TEST = vec.transform(X_TEST.to_dict(orient="records"))

In [8]:

# Sanity-check the vectorized matrix shapes (train / held-out / submission).
print(x_train.shape, x_test.shape, X_TEST.shape)
(112888, 826) (37630, 826) (46000, 826)

In [9]:

# 2.降维
# 2. Dimensionality reduction: keep enough components to explain 98% of
# the variance of the densified one-hot matrix.  Fit on train only.
reducer = PCA(0.98)
pca_x_train = reducer.fit_transform(x_train.toarray())
pca_x_test = reducer.transform(x_test.toarray())
PCA_X_TEST = reducer.transform(X_TEST.toarray())

In [10]:

# Shapes after PCA (98% of variance retained).
print(pca_x_train.shape, pca_x_test.shape, PCA_X_TEST.shape)
(112888, 361) (37630, 361) (46000, 361)

In [68]:

def rmse(y_true, y_pred):
    """Root-mean-squared error on the original rent scale.

    Both arguments are log1p-transformed rents (the `log_rent` target);
    they are mapped back to actual rents before the error is computed.

    Parameters:
        y_true: array-like of log1p(actual rent) values.
        y_pred: array-like of log1p(predicted rent) values.

    Returns:
        float: RMSE between the back-transformed rents.
    """
    # np.expm1 is the numerically stable form of np.exp(x) - 1.
    diff = np.expm1(y_true) - np.expm1(y_pred)
    # Equivalent to sqrt(mean_squared_error(...)) without needing sklearn.
    return np.sqrt(np.mean(np.square(diff)))

构建子模型

构建岭回归模型

In [69]:

%%time
# 1. Grid-search the ridge regularization strength alpha (5-fold CV).
ridge = Ridge(normalize=True)
params = {
    "alpha": [0.005, 0.01, 1, 5, 10, 20, 50]
}
model1 = GridSearchCV(ridge, param_grid=params, cv=5, n_jobs=-1)
model1.fit(pca_x_train, train_y)
# Last-line expression: best parameters found by the search.
model1.best_params_
#{'alpha': 50, 'fit_intercept': True}
#{'alpha': 50, 'fit_intercept': True}
CPU times: user 1.78 s, sys: 705 ms, total: 2.48 s
Wall time: 21.5 s

In [70]:

# 利用搜索出的最优参数构建模型
# Refit ridge on the full training split using the searched best alpha.
ridge = Ridge(alpha=50, normalize=True)
ridge.fit(pca_x_train, train_y)

Out[70]:

Ridge(alpha=50, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=None, solver='auto', tol=0.001)

In [71]:

# Evaluate ridge on both splits; rmse() reports error in actual rent units.
y_pred_test=ridge.predict(pca_x_test)
y_pred_train=ridge.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 6.342657781238426
测试集rmse: 6.493947602276618

构建lasso回归

In [72]:

%%time
# 1.参数搜索
lasso = Lasso(normalize=True)
params = {
    "alpha": [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    "fit_intercept": [True, False]
}
model2 = GridSearchCV(lasso, param_grid=params, cv=5, n_jobs=-1)
model2.fit(pca_x_train, train_y)
print(model2.best_params_)
#{'alpha': 0.001, 'fit_intercept': True}
{'alpha': 0.001, 'fit_intercept': True}
CPU times: user 1.68 s, sys: 551 ms, total: 2.23 s
Wall time: 49.6 s

In [73]:

# 利用搜索出的最优参数构建模型
# Refit lasso on the full training split using the searched best alpha.
lasso=Lasso(alpha=0.001, normalize=True)
lasso.fit(pca_x_train,train_y)

Out[73]:

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [74]:

%%time
# Evaluate lasso on both splits.
y_pred_test=lasso.predict(pca_x_test)
y_pred_train=lasso.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 6.385065714494761
测试集rmse: 6.53676743372339
CPU times: user 393 ms, sys: 47.4 ms, total: 440 ms
Wall time: 87.1 ms

构建随机森林

In [75]:

%%time
# 1.参数搜索
rf = RandomForestRegressor(max_features='sqrt')  # 设置max_features='sqrt',不然太耗时间
params = {
    "n_estimators": [200],  # [200,500,700],
    "max_depth": [50],  # [40, 50, 60]
    "min_samples_split": [20, 50, 100],
    "min_samples_leaf": [10, 20, 30]
}
model3 = GridSearchCV(rf, param_grid=params, cv=5, n_jobs=-1, verbose=2)
model3.fit(pca_x_train, train_y)
print(model3.best_params_)
# {'max_depth': 50,
#  'min_samples_leaf': 10,
#  'min_samples_split': 20,
#  'n_estimators': 200}
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 55.7min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 81.1min finished
{'max_depth': 50, 'min_samples_leaf': 10, 'min_samples_split': 20, 'n_estimators': 200}
CPU times: user 10min 4s, sys: 8.96 s, total: 10min 13s
Wall time: 1h 31min 30s

In [76]:

%%time
# 利用搜索出的最优参数构建模型
rf=RandomForestRegressor(n_estimators=200,
                         max_features=0.8,
                         max_depth=50,
                         min_samples_split=20,
                         min_samples_leaf=10,
                         n_jobs=-1)
rf.fit(pca_x_train,train_y)
CPU times: user 3h 34min 3s, sys: 1min 29s, total: 3h 35min 32s
Wall time: 33min 4s

In [77]:

%%time
# Evaluate the random forest on both splits.
y_pred_test=rf.predict(pca_x_test)
y_pred_train=rf.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 2.133144119124377
测试集rmse: 2.7950254213867094
CPU times: user 24.4 s, sys: 465 ms, total: 24.9 s
Wall time: 4.53 s

构建决策树

In [78]:

%%time
tree=DecisionTreeRegressor()
params={
    "max_depth":[60],  # [40,50,60,70],
    "min_samples_split":[5],  # [5,10,20,30,40,50]
    "min_samples_leaf":[5], # [2,3,5,7,9,11]
}
model4=GridSearchCV(tree,param_grid=params,cv=5,n_jobs=-1)
model4.fit(pca_x_train,train_y)
print(model4.best_params_)
# {'max_depth': 60, 'min_samples_leaf': 2, 'min_samples_split': 5}
{'max_depth': 60, 'min_samples_leaf': 5, 'min_samples_split': 5}
CPU times: user 1min 34s, sys: 2.06 s, total: 1min 36s
Wall time: 3min 26s

In [79]:

%%time
from sklearn.tree import DecisionTreeRegressor  # redundant: already imported at the top of the notebook
# Refit the decision tree.
# NOTE(review): min_samples_leaf=2 appears to come from a wider earlier
# search (see the comment in the search cell); the reduced grid run here
# reported 5 — confirm which value was intended.
tree=DecisionTreeRegressor(max_depth=60,min_samples_leaf=2,min_samples_split=5)
tree.fit(pca_x_train,train_y)
CPU times: user 1min 36s, sys: 1.48 s, total: 1min 38s
Wall time: 1min 40s

In [80]:

%%time
# Evaluate the decision tree on both splits.  The large train/test gap
# (0.81 vs 2.67) shows the tree overfits heavily.
y_pred_test=tree.predict(pca_x_test)
y_pred_train=tree.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 0.805142479875888
测试集rmse: 2.6702036461919856
CPU times: user 254 ms, sys: 123 ms, total: 377 ms
Wall time: 380 ms

In [81]:

import matplotlib.pyplot as plt

# Predicted vs. actual (log) rent for the decision tree: points on the
# diagonal would be perfect predictions.
fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
ax.scatter(test_y, y_pred_test)
ax.set_xlabel("真实值")
ax.set_ylabel("预测值")
plt.show()

构建支持向量机

In [ ]:

# %%time
# # 1.参数搜索----数据量大 svm太耗时,调参几乎不可能
# svr=SVR()
# params={
#     "gamma":[0.001,0.01,0.1,0.5,1,5],
#     "C":[0.001,0.1,0.5,1,5] 
# }
# model5=GridSearchCV(svr,param_grid=params,cv=5,n_jobs=-1,verbose=10)
# # verbose:日志冗长度,int:冗长度,0:不输出训练过程,1:偶尔输出,>1:对每个子模型都输出。
# model5.fit(pca_x_train,train_y)
# model5.best_params_

In [ ]:

# %%time
# # 随意选一组参数   --- 耗时太长 放弃该模型
# svr=SVR(gamma=0.1,C=0.5)
# svr.fit(pca_x_train,train_y)
# y_pred=svr.predict(pca_x_test)
# print(rmse(test_y,y_pred))

构建xgboost模型

In [82]:

%%time
import xgboost as xgb
xgbr = xgb.XGBRegressor(objective='reg:linear', learning_rate=0.1, gamma=0.05, max_depth=45,
                 min_child_weight=0.5, subsample=0.6, reg_alpha=0.5, reg_lambda=0.8, colsample_bytree=0.5, n_jobs=-1)
xgbr.fit(pca_x_train, train_y)
y_pred = xgbr.predict(pca_x_test)
print(rmse(test_y,y_pred))
/Users/sherwin/anaconda3/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version
  if getattr(data, 'base', None) is not None and \
[12:23:28] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
2.1601162492127104
CPU times: user 28min 30s, sys: 24.2 s, total: 28min 54s
Wall time: 29min 29s

In [83]:

%%time
y_pred_test=xgbr.predict(pca_x_test)
y_pred_train=xgbr.predict(pca_x_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred_test))
训练集rmse: 0.9609658477710833
测试集rmse: 2.1601162492127104
CPU times: user 10 s, sys: 427 ms, total: 10.4 s
Wall time: 10.6 s

In [84]:

import matplotlib.pyplot as plt

# Predicted vs. actual (log) rent for the xgboost model.
fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
ax.scatter(test_y, y_pred_test)
ax.set_xlabel("真实值")
ax.set_ylabel("预测值")
plt.show()

Stacking融合

构建Stacking模型需要的数据

In [86]:

%%time
# 获取每个子模型的预测结果作为特征
# 训练特征
train_features=[]
train_features.append(ridge.predict(pca_x_train))  # 将每个模型预测值保存起来
train_features.append(lasso.predict(pca_x_train))
# train_features.append(svr.predict(pca_x_train))  # 这个太慢了  不要了
train_features.append(rf.predict(pca_x_train))
train_features.append(tree.predict(pca_x_train))
train_features.append(xgbr.predict(pca_x_train))
# 测试特征
test_features=[]
test_features.append(ridge.predict(pca_x_test))
test_features.append(lasso.predict(pca_x_test))
# test_features.append(svr.predict(pca_x_test))
test_features.append(rf.predict(pca_x_test))
test_features.append(tree.predict(pca_x_test))
test_features.append(xgbr.predict(pca_x_test))
# 提交结果特征
TEST_FEATURES=[]
TEST_FEATURES.append(ridge.predict(PCA_X_TEST))
TEST_FEATURES.append(lasso.predict(PCA_X_TEST))
# TEST_FEATURES.append(svr.predict(PCA_X_TEST))
TEST_FEATURES.append(rf.predict(PCA_X_TEST))
TEST_FEATURES.append(tree.predict(PCA_X_TEST))
TEST_FEATURES.append(xgbr.predict(PCA_X_TEST))
CPU times: user 42.1 s, sys: 1.49 s, total: 43.6 s
Wall time: 20.3 s

In [87]:

# Peek at the level-2 training features (one prediction vector per base model).
train_features

Out[87]:

[array([2.04715431, 2.05232901, 2.04572967, ..., 2.04659472, 2.04508413,
        2.05562638]),
 array([2.05200758, 2.05200758, 2.05200758, ..., 2.05200758, 2.05200758,
        2.05200758]),
 array([1.67325566, 1.94499122, 1.85460452, ..., 1.92275812, 1.76267895,
        2.22438597]),
 array([1.59023952, 1.84714777, 1.85130219, ..., 1.96150612, 1.77317884,
        2.23207518]),
 array([1.6343094, 1.9145248, 1.8356705, ..., 1.9381661, 1.7626299,
        2.2465973], dtype=float32)]

In [88]:

# Peek at the level-2 test features.
test_features

Out[88]:

[array([2.04925512, 2.04865288, 2.04878586, ..., 2.07295592, 2.05666692,
        2.0560697 ]),
 array([2.05200758, 2.05200758, 2.05200758, ..., 2.05200758, 2.05200758,
        2.05200758]),
 array([1.93842148, 1.71689679, 1.71233925, ..., 3.7684956 , 2.1988801 ,
        2.15518207]),
 array([1.93762954, 1.71991266, 1.59023952, ..., 3.92681962, 2.1296814 ,
        2.08786427]),
 array([1.9394264, 1.6995616, 1.8815998, ..., 3.7348156, 2.2026072,
        2.1582646], dtype=float32)]

In [89]:

# np.vstack:按垂直方向(行顺序)堆叠数组构成一个新的数组
mx_train=np.vstack(train_features).T
mx_test=np.vstack(test_features).T
MX_TEST=np.vstack(TEST_FEATURES).T
MX_TEST.shape

Out[89]:

(46000, 5)

Stacking模型训练

In [90]:

%%time
stack_model=Ridge(fit_intercept=False)
params={
    "alpha":np.logspace(-2,3,20)
}
model=GridSearchCV(stack_model,param_grid=params,cv=5,n_jobs=-1)
model.fit(mx_train,train_y)
print(model.best_params_)
{'alpha': 0.06158482110660264}
CPU times: user 580 ms, sys: 439 ms, total: 1.02 s
Wall time: 3.47 s

In [91]:

%%time
stack_model=Ridge(alpha=0.379269,fit_intercept=False)
stack_model.fit(mx_train,train_y)
y_pred=stack_model.predict(mx_test)
y_pred_train=stack_model.predict(mx_train)
print("训练集rmse:",rmse(train_y,y_pred_train))
print("测试集rmse:",rmse(test_y,y_pred))
训练集rmse: 0.7337935133190991
测试集rmse: 2.3272631885188044
CPU times: user 30.8 ms, sys: 9.28 ms, total: 40.1 ms
Wall time: 13.2 ms

In [92]:

# Blend weights learned for the [ridge, lasso, rf, tree, xgb] prediction columns.
stack_model.coef_

Out[92]:

array([-0.1330147 ,  0.13235901, -0.15773228,  0.6991465 ,  0.45928745])

提交结果输出

In [96]:

# Predict the submission set with the stacked model and map the
# log1p-scale predictions back to actual rents (expm1 is the stable
# inverse of log1p).
Y_PRED_TEST = stack_model.predict(MX_TEST)
Y_PRED_TEST = np.expm1(Y_PRED_TEST)
print(Y_PRED_TEST)
# Assemble the submission frame with a 1-based id column.
# (Removed the dead `data = range(...)` line: it was never used and it
# clobbered the `data` DataFrame loaded at the top of the notebook.)
Y_PRED = pd.DataFrame(data=Y_PRED_TEST, columns=["月租金"])
Y_PRED["id"] = range(1, Y_PRED.shape[0] + 1)
Y_PRED.head()
[6.2493489  5.12626054 8.64297508 ... 3.59608672 1.05481017 4.8740706 ]

Out[96]:

  月租金 id
0 6.249349 1
1 5.126261 2
2 8.642975 3
3 8.885262 4
4 4.482541 5

In [97]:

# Persist the submission.
# NOTE(review): default to_csv also writes the DataFrame index as an extra
# unnamed first column alongside 月租金 and id — probably want index=False.
Y_PRED.to_csv("./data/Y_PRED_STACK.csv")

模型保存

In [98]:

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23; import joblib directly (it is installed as a scikit-learn
# dependency).
import joblib

# Persist the fitted meta-model.
# NOTE(review): the ".kpl" extension looks like a typo for ".pkl"; kept
# as-is so the path matches the recorded output and any downstream loader.
joblib.dump(stack_model, "./data/stack_model.kpl")

Out[98]:

['./data/stack_model.kpl']
发布了372 篇原创文章 · 获赞 121 · 访问量 20万+

猜你喜欢

转载自blog.csdn.net/zimiao552147572/article/details/104897536