最近在看阿里天池的这个工业蒸汽预测比赛项目,顺便把看到的整理下来,以备不时之需啊!
1.导入包和数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# 导入数据
train_data = pd.read_table('比赛/工业蒸汽量预测/dataset/zhengqi_train.txt',encoding='utf-8')
test_data = pd.read_table('比赛/工业蒸汽量预测/dataset/zhengqi_test.txt',encoding='utf-8')
2.数据探索EDA
2.1画箱型图探索数据
fig = plt.figure(figsize=(16, 5)) # 指定绘图对象宽度和高度
sns.boxplot(train_data['V0'],orient="v", width=0.5)
# 画箱式图
column = train_data.columns.tolist()[:39] # 列表头
fig = plt.figure(figsize=(20, 40)) # 指定绘图对象宽度和高度
for i in range(38):
plt.subplot(13, 3, i + 1) # 13行3列子图
sns.boxplot(train_data[column[i]], orient="v", width=0.5) # 箱式图
plt.ylabel(column[i], fontsize=8)
plt.show()
2.1 查看数据分布图
查看特征变量‘V0’的数据分布直方图,并绘制Q-Q图查看数据是否近似于正态分布
plt.figure(figsize=(10,5))
ax=plt.subplot(1,2,1)
sns.distplot(train_data['V0'],fit=stats.norm)
ax=plt.subplot(1,2,2)
res = stats.probplot(train_data['V0'], plot=plt)
查看查看所有数据的直方图和Q-Q图,查看训练集的数据是否近似于正态分布
train_cols = 6
train_rows = len(train_data.columns)
plt.figure(figsize=(4*train_cols,4*train_rows))
i=0
for col in train_data.columns:
i+=1
ax=plt.subplot(train_rows,train_cols,i)
sns.distplot(train_data[col],fit=stats.norm)
i+=1
ax=plt.subplot(train_rows,train_cols,i)
res = stats.probplot(train_data[col], plot=plt)
plt.show()
由上面的数据分布图信息可以看出,很多特征变量(如’V1’,‘V9’,‘V24’,'V28’等)的数据分布不是正态的,数据并不跟随对角线,后续可以使用数据变换对数据进行转换。
对比同一特征变量‘V0’下,训练集数据和测试集数据的分布情况,查看数据分布是否一致
ax = sns.kdeplot(train_data['V0'], color="Red", shade=True)
ax = sns.kdeplot(test_data['V0'], color="Blue", shade=True)
ax.set_xlabel('V0')
ax.set_ylabel("Frequency")
ax = ax.legend(["train","test"])
查看所有特征变量下,训练集数据和测试集数据的分布情况,分析并寻找出数据分布不一致的特征变量。
dist_cols = 6
dist_rows = len(test_data.columns)
plt.figure(figsize=(4*dist_cols,4*dist_rows))
i=1
for col in test_data.columns:
ax=plt.subplot(dist_rows,dist_cols,i)
ax = sns.kdeplot(train_data[col], color="Red", shade=True)
ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
ax.set_xlabel(col)
ax.set_ylabel("Frequency")
ax = ax.legend(["train","test"])
i+=1
plt.show()
查看特征’V5’, ‘V17’, ‘V28’, ‘V22’, ‘V11’, 'V9’数据的数据分布
drop_col = 6
drop_row = 1
plt.figure(figsize=(5*drop_col,5*drop_row))
i=1
for col in ["V5","V9","V11","V17","V22","V28"]:
ax =plt.subplot(drop_row,drop_col,i)
ax = sns.kdeplot(train_data[col], color="Red", shade=True)
ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
ax.set_xlabel(col)
ax.set_ylabel("Frequency")
ax = ax.legend(["train","test"])
i+=1
plt.show()
由上图的数据分布可以看到特征’V5’,‘V9’,‘V11’,‘V17’,‘V22’,‘V28’ 训练集数据与测试集数据分布不一致,会导致模型泛化能力差,采用删除此类特征方法
drop_columns = ['V5','V9','V11','V17','V22','V28']
2.3 合并训练集和测试集数据,并可视化训练集和测试集数据特征分布图
2.3.1 可视化线性回归关系
查看特征变量‘V0’与’target’变量的线性回归关系
fcols = 2
frows = 1
plt.figure(figsize=(8,4))
ax=plt.subplot(1,2,1)
sns.regplot(x='V0', y='target', data=train_data, ax=ax,
scatter_kws={
'marker':'.','s':3,'alpha':0.3},
line_kws={
'color':'k'});
plt.xlabel('V0')
plt.ylabel('target')
ax=plt.subplot(1,2,2)
sns.distplot(train_data['V0'].dropna())
plt.xlabel('V0')
查看所有特征变量与’target’变量的线性回归关系
fcols = 6
frows = len(test_data.columns)
plt.figure(figsize=(5*fcols,4*frows))
i=0
for col in test_data.columns:
i+=1
ax=plt.subplot(frows,fcols,i)
sns.regplot(x=col, y='target', data=train_data, ax=ax,
scatter_kws={
'marker':'.','s':3,'alpha':0.3},
line_kws={
'color':'k'});
plt.xlabel(col)
plt.ylabel('target')
i+=1
ax=plt.subplot(frows,fcols,i)
sns.distplot(train_data[col].dropna())
plt.xlabel(col)
2.4 查看特征变量的相关性
data_train1 = train_data.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
train_corr = data_train1.corr()
train_corr
# 画出相关性热力图
ax = plt.subplots(figsize=(28, 18))#调整画布大小
ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True)#画热力图 annot=True 显示系数
# 找出相关程度
data_train1 = train_data.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
plt.figure(figsize=(20, 16)) # 指定绘图对象宽度和高度
colnm = data_train1.columns.tolist() # 列表头
mcorr = data_train1[colnm].corr(method="spearman") # 相关系数矩阵,即给出了任意两个变量之间的相关系数
mask = np.zeros_like(mcorr, dtype=np.bool) # 构造与mcorr同维数矩阵 为bool型
mask[np.triu_indices_from(mask)] = True # 角分线右侧为True
cmap = sns.diverging_palette(220, 10, as_cmap=True) # 返回matplotlib colormap对象
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f') # 热力图(看两两相似度)
上图为所有特征变量和target变量两两之间的相关系数,由此可以看出各个特征变量V0-V37之间的相关性以及特征变量V0-V37与target的相关性。
2.4.1 查找出特征变量和target变量相关系数大于0.5的特征变量
#寻找K个最相关的特征信息
k = 10 # number of variables for heatmap
cols = train_corr.nlargest(k, 'target')['target'].index
cm = np.corrcoef(train_data[cols].values.T)
hm = plt.subplots(figsize=(10, 10))#调整画布大小
#hm = sns.heatmap(cm, cbar=True, annot=True, square=True)
#g = sns.heatmap(train_data[cols].corr(),annot=True,square=True,cmap="RdYlGn")
hm = sns.heatmap(train_data[cols].corr(),annot=True,square=True)
plt.show()
threshold = 0.5
corrmat = train_data.corr()
top_corr_features = corrmat.index[abs(corrmat["target"])>threshold]
plt.figure(figsize=(10,10))
g = sns.heatmap(train_data[top_corr_features].corr(),annot=True,cmap="RdYlGn")
drop_columns.clear()
drop_columns = ['V5','V9','V11','V17','V22','V28']
# Threshold for removing correlated variables
threshold = 0.5
# Absolute value correlation matrix
corr_matrix = data_train1.corr().abs()
drop_col=corr_matrix[corr_matrix["target"]<threshold].index
#data_all.drop(drop_col, axis=1, inplace=True)
由于’V14’, ‘V21’, ‘V25’, ‘V26’, ‘V32’, ‘V33’, 'V34’特征的相关系数值小于0.5,故认为这些特征与最终的预测target值不相关,删除这些特征变量;
#merge train_set and test_set
train_x = train_data.drop(['target'], axis=1)
#data_all=pd.concat([train_data,test_data],axis=0,ignore_index=True)
data_all = pd.concat([train_x,test_data])
data_all.drop(drop_columns,axis=1,inplace=True)
#View data
data_all.head()
# normalise numeric columns
cols_numeric=list(data_all.columns)
def scale_minmax(col):
return (col-col.min())/(col.max()-col.min())
data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
data_all[cols_numeric].describe()
3.特征工程
3.1 相关性分析
mcorr=mcorr.abs()
numerical_corr=mcorr[mcorr['target']>0.1]['target']
print(numerical_corr.sort_values(ascending=False))
index0 = numerical_corr.sort_values(ascending=False).index
print(train_data_scaler[index0].corr('spearman'))
3.2 相关性初筛
features_corr = numerical_corr.sort_values(ascending=False).reset_index()
features_corr.columns = ['features_and_target', 'corr']
features_corr_select = features_corr[features_corr['corr']>0.3] # 筛选出大于相关性大于0.3的特征
print(features_corr_select)
select_features = [col for col in features_corr_select['features_and_target'] if col not in ['target']]
new_train_data_corr_select = train_data_scaler[select_features+['target']]
new_test_data_corr_select = test_data_scaler[select_features]
3.3 多重共线性分析
from statsmodels.stats.outliers_influence import variance_inflation_factor #多重共线性方差膨胀因子
#多重共线性
new_numerical=['V0', 'V2', 'V3', 'V4', 'V5', 'V6', 'V10','V11',
'V13', 'V15', 'V16', 'V18', 'V19', 'V20', 'V22','V24','V30', 'V31', 'V37']
X=np.matrix(train_data_scaler[new_numerical])
VIF_list=[variance_inflation_factor(X, i) for i in range(X.shape[1])]
VIF_list
3.4 PCA去除多重共线性 降维
from sklearn.decomposition import PCA #主成分分析法
#PCA方法降维
#保持90%的信息
pca = PCA(n_components=0.9)
new_train_pca_90 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_90 = pca.transform(test_data_scaler)
new_train_pca_90 = pd.DataFrame(new_train_pca_90)
new_test_pca_90 = pd.DataFrame(new_test_pca_90)
new_train_pca_90['target'] = train_data_scaler['target']
new_train_pca_90.describe()
#PCA方法降维
#保留16个主成分
pca = PCA(n_components=0.95)
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_16 = pca.transform(test_data_scaler)
new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_test_pca_16 = pd.DataFrame(new_test_pca_16)
new_train_pca_16['target'] = train_data_scaler['target']
new_train_pca_16.describe()
4.模型训练
from sklearn.linear_model import LinearRegression #线性回归
from sklearn.neighbors import KNeighborsRegressor #K近邻回归
from sklearn.tree import DecisionTreeRegressor #决策树回归
from sklearn.ensemble import RandomForestRegressor #随机森林回归
from sklearn.svm import SVR #支持向量回归
import lightgbm as lgb #lightGbm模型
from sklearn.model_selection import train_test_split # 切分数据
from sklearn.metrics import mean_squared_error #评价指标
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
4.1 切分训练数据和线下验证数据
#采用 pca 保留16维特征的数据
new_train_pca_16 = new_train_pca_16.fillna(0)
train = new_train_pca_16[new_test_pca_16.columns]
target = new_train_pca_16['target']
# 切分数据 训练数据80% 验证数据20%
train_data,test_data,train_target,test_target=train_test_split(train,target,test_size=0.2,random_state=0)
4.2 定义绘制模型学习曲线函数
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
print(train_scores_mean)
print(test_scores_mean)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
return plt
4.3 多元线性回归模型
clf = LinearRegression()
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("LinearRegression: ", score)
LinearRegression: 0.2716996586179345
4.3.1 绘制线性回归模型学习曲线
X = train_data.values
y = train_target.values
# 图一
title = r"LinearRegression"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = LinearRegression() #建模
plot_learning_curve(estimator, title, X, y, ylim=(0.5, 0.8), cv=cv, n_jobs=1)
4.4 决策树回归
clf = DecisionTreeRegressor()
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("DecisionTreeRegressor: ", score)
DecisionTreeRegressor: 0.5916015121107268
4.4.1 绘制决策树学习曲线
X = train_data.values
y = train_target.values
# 决策树回归
title = r"DecisionTreeRegressor"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = DecisionTreeRegressor() #建模
plot_learning_curve(estimator, title, X, y, ylim=(0.1, 1.3), cv=cv, n_jobs=1)
4.5 随机森林回归
clf = RandomForestRegressor(n_estimators=200) # 200棵树模型
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor: ", score)
RandomForestRegressor: 0.2512592901935121
4.5.1 绘制随机森林回归学习曲线
X = train_data.values
y = train_target.values
# K近邻回归
title = r"RandomForestRegressor"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = RandomForestRegressor(n_estimators=200) #建模
plot_learning_curve(estimator, title, X, y, ylim=(0.4, 1.0), cv=cv, n_jobs=1)
4.6 lgb模型回归
# lgb回归模型
clf = lgb.LGBMRegressor(
learning_rate=0.01,
max_depth=-1,
n_estimators=5000,
boosting_type='gbdt',
random_state=2019,
objective='regression',
)
# 训练模型
clf.fit(
X=train_data, y=train_target,
eval_metric='MSE',
verbose=50
)
score = mean_squared_error(test_target, clf.predict(test_data))
print("lightGbm: ", score)
4.6.1 绘制lgb回归学习曲线
X = train_data.values
y = train_target.values
# K近邻回归
title = r"LGBMRegressor"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = lgb.LGBMRegressor(
learning_rate=0.01,
max_depth=-1,
n_estimators=100,
boosting_type='gbdt',
random_state=2019,
objective='regression'
) #建模
plot_learning_curve(estimator, title, X, y, ylim=(0.4, 1.0), cv=cv, n_jobs=1)
5.模型验证
参考天池官方源码模型验证部分
6.特征优化
6.1 定义特征构造方法,构造特征
epsilon=1e-5
#组交叉特征,可以自行定义,如增加: x*x/y, log(x)/y 等等
func_dict = {
'add': lambda x,y: x+y,
'mins': lambda x,y: x-y,
'div': lambda x,y: x/(y+epsilon),
'multi': lambda x,y: x*y
}
6.2 定义特征构造函数
def auto_features_make(train_data,test_data,func_dict,col_list):
train_data, test_data = train_data.copy(), test_data.copy()
for col_i in col_list:
for col_j in col_list:
for func_name, func in func_dict.items():
for data in [train_data,test_data]:
func_features = func(data[col_i],data[col_j])
col_func_features = '-'.join([col_i,func_name,col_j])
data[col_func_features] = func_features
return train_data,test_data
6.3 对训练集和测试集数据进行特征构造
train_data2, test_data2 = auto_features_make(train_data,test_data,func_dict,col_list=test_data.columns)
from sklearn.decomposition import PCA #主成分分析法
#PCA方法降维
pca = PCA(n_components=500)
train_data2_pca = pca.fit_transform(train_data2.iloc[:,0:-1])
test_data2_pca = pca.transform(test_data2)
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
train_data2_pca['target'] = train_data2['target']
X_train2 = train_data2[test_data2.columns].values
y_train = train_data2['target']
6.4 使用lightgbm模型对新构造的特征进行模型训练和评估
# ls_validation i
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np
# 5折交叉验证
Folds=5
kf = KFold(n_splits=Folds, random_state=2019, shuffle=True)
# 记录训练和预测MSE
MSE_DICT = {
'train_mse':[],
'test_mse':[]
}
# 线下训练预测
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
# lgb树模型
lgb_reg = lgb.LGBMRegressor(
learning_rate=0.01,
max_depth=-1,
n_estimators=5000,
boosting_type='gbdt',
random_state=2019,
objective='regression',
)
# 切分训练集和预测集
X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
y_train_KFold, y_test_KFold = y_train[train_index], y_train[test_index]
# 训练模型
lgb_reg.fit(
X=X_train_KFold,y=y_train_KFold,
eval_set=[(X_train_KFold, y_train_KFold),(X_test_KFold, y_test_KFold)],
eval_names=['Train','Test'],
early_stopping_rounds=100,
eval_metric='MSE',
verbose=50
)
# 训练集预测 测试集预测
y_train_KFold_predict = lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
y_test_KFold_predict = lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_)
print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
print('------\n', '训练MSE\n', train_mse, '\n------')
test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
print('------\n', '预测MSE\n', test_mse, '\n------\n')
MSE_DICT['train_mse'].append(train_mse)
MSE_DICT['test_mse'].append(test_mse)
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')
7.模型融合
参考:天池新书