1.1 导入相应的包和数据
%matplotlib inline
#在jupyter里面需要加入此命令显示图
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RANSACRegressor, LinearRegression, TheilSenRegressor
from sklearn.linear_model import Ridge,Lasso,ElasticNet,BayesianRidge
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
from sklearn.svm import SVR
try:
    # Current scikit-learn keeps the splitting / grid-search helpers here.
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    # Fallback for old scikit-learn releases that predate model_selection.
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
# Load the cement data set from the CSV file one directory up.
data = pd.read_csv('../cement_data.csv')
# Report the number of records (the write-up expects 1030 rows).
print(data.shape[0])
# Preview the first five records.
data.head(n=5)
数据展示如下:
重新为列标签命名:
# Short snake_case headers; they are assigned positionally, so the list
# must contain exactly one name per existing column.
column_names = [
    'cement_component',
    'furnace_slag',
    'flay_ash',
    'water_component',
    'superplasticizer',
    'coarse_aggregate',
    'fine_aggregate',
    'age',
    'concrete_strength',
]
data.columns = column_names
data.head()
1.2 特征探索
先用可视化方法查看各个变量分别和concrete_strength的关系,结果可看到很多的自变量都存在大量0值,忽略0值看到cement_component、superplasticizer和concrete strength呈正相关关系,Flay_Ash、water component、coarse aggregate、fine aggregate和concrete strength呈负相关关系,age和concrete strength没有明显的关系,而且年份呈现离散趋势。
# One scatter panel per predictor (every column except the last one),
# each plotted against the target on a 3x3 grid.
plt.figure(figsize=(15, 10.5))
predictors = list(data.columns)[:-1]
for plot_count, feature in enumerate(predictors, start=1):
    plt.subplot(3, 3, plot_count)
    plt.scatter(data[feature], data['concrete_strength'])
    # "cement_component" -> "Cement Component" for the axis label.
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')
plt.show()
接下来对年份(age)进行分段,查看每个年份段下各个特征跟因变量之间的pearson相关系数。从上面的age散点图可以看到age的取值大致集中在100以下、100-300、300以上;这里把100以下的部分再以30为界细分,共划分为age<=30、30<age<=100、100<age<=300、age>300四个区间,并加入age_level列。
# Label every record with the age band it falls into. The bands are
# disjoint, so the order in which they are written does not matter.
age = data['age']
age_bands = [
    (age <= 30, '<30'),
    ((age > 30) & (age <= 100), '30<age<100'),
    ((age > 100) & (age <= 300), '100<age<300'),
    (age > 300, 'age>300'),
]
for in_band, label in age_bands:
    data.loc[in_band, 'age_level'] = label
data.head(20)
对比未对年份分组的pearson系数和对年份分组的pearson系数。未对年份分组的pearson系数表中显示,cement_component、superplasticizer、furnace_slag和concrete_strength呈现正相关关系,water_component、coarse_aggregate、fine_aggregate、flay_ash和concrete_strength呈现负相关关系。
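对年份分组的pearson系数表中显示,在100<age<300和age>300两个分组中,flay_ash和superplasticizer与其他变量的相关系数均为NaN(无法计算);在100<age<300分组中,cement_component(0.558)、coarse_aggregate(0.481)和concrete_strength呈现正相关关系,fine_aggregate(-0.575)和concrete_strength呈现负相关关系。(原文此句在"在100"之后被截断,以上内容根据下方输出的分组相关系数表补全。)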
# Pearson correlations over the whole data set. Only numeric columns are
# correlated: the string-valued 'age_level' column cannot take part, and
# recent pandas versions raise on it instead of silently dropping it.
all_correlations = data.select_dtypes(include='number').corr(method='pearson')
print(all_correlations)
print('---------------------------------------------------------------------------------------')
# Columns used for the per-band comparison ('age' itself is replaced by
# its band label).
column = ['cement_component', 'furnace_slag', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate', 'fine_aggregate', 'age_level', 'concrete_strength']
# Pearson correlations computed separately inside each age band.
correlations = data[column].groupby('age_level').corr(method='pearson')
print(correlations)
cement_component furnace_slag flay_ash water_component superplasticizer \
cement_component 1.000 -0.275 -0.397 -0.082 0.092
furnace_slag -0.275 1.000 -0.324 0.107 0.043
flay_ash -0.397 -0.324 1.000 -0.257 0.378
water_component -0.082 0.107 -0.257 1.000 -0.658
superplasticizer 0.092 0.043 0.378 -0.658 1.000
coarse_aggregate -0.109 -0.284 -0.010 -0.182 -0.266
fine_aggregate -0.223 -0.282 0.079 -0.451 0.223
age 0.082 -0.044 -0.154 0.278 -0.193
concrete_strength 0.498 0.135 -0.106 -0.290 0.366
coarse_aggregate fine_aggregate age concrete_strength
cement_component -0.109 -0.223 0.082 0.498
furnace_slag -0.284 -0.282 -0.044 0.135
flay_ash -0.010 0.079 -0.154 -0.106
water_component -0.182 -0.451 0.278 -0.290
superplasticizer -0.266 0.223 -0.193 0.366
coarse_aggregate 1.000 -0.178 -0.003 -0.165
fine_aggregate -0.178 1.000 -0.156 -0.167
age -0.003 -0.156 1.000 0.329
concrete_strength -0.165 -0.167 0.329 1.000
--------------------------------------------------------------------------------------------
cement_component coarse_aggregate concrete_strength \
age_level
100<age<300 cement_component 1.000 0.544 0.558
coarse_aggregate 0.544 1.000 0.481
concrete_strength 0.558 0.481 1.000
fine_aggregate -0.509 0.033 -0.575
flay_ash NaN NaN NaN
furnace_slag -0.595 -0.390 -0.034
superplasticizer NaN NaN NaN
water_component -0.204 -0.782 -0.063
30<age<100 cement_component 1.000 -0.468 0.565
coarse_aggregate -0.468 1.000 -0.282
concrete_strength 0.565 -0.282 1.000
fine_aggregate -0.213 -0.254 -0.179
flay_ash -0.491 0.450 -0.216
furnace_slag 0.019 -0.306 0.443
superplasticizer 0.337 -0.178 0.617
water_component -0.057 -0.105 -0.472
<30 cement_component 1.000 -0.057 0.534
coarse_aggregate -0.057 1.000 -0.227
concrete_strength 0.534 -0.227 1.000
fine_aggregate -0.178 -0.199 -0.203
flay_ash -0.369 -0.127 -0.092
furnace_slag -0.323 -0.274 0.138
superplasticizer 0.070 -0.316 0.383
water_component -0.157 -0.185 -0.325
age>300 cement_component 1.000 -0.378 0.095
coarse_aggregate -0.378 1.000 -0.319
concrete_strength 0.095 -0.319 1.000
fine_aggregate -0.462 0.683 -0.560
flay_ash NaN NaN NaN
furnace_slag -0.569 -0.081 0.342
superplasticizer NaN NaN NaN
water_component 0.378 -0.794 0.631
fine_aggregate flay_ash furnace_slag superplasticizer \
age_level
100<age<300 cement_component -0.509 NaN -0.595 NaN
coarse_aggregate 0.033 NaN -0.390 NaN
concrete_strength -0.575 NaN -0.034 NaN
fine_aggregate 1.000 NaN -0.324 NaN
flay_ash NaN NaN NaN NaN
furnace_slag -0.324 NaN 1.000 NaN
superplasticizer NaN NaN NaN NaN
water_component -0.558 NaN 0.498 NaN
30<age<100 cement_component -0.213 -0.491 0.019 0.337
coarse_aggregate -0.254 0.450 -0.306 -0.178
concrete_strength -0.179 -0.216 0.443 0.617
fine_aggregate 1.000 0.122 -0.309 0.285
flay_ash 0.122 1.000 -0.547 0.102
furnace_slag -0.309 -0.547 1.000 0.067
superplasticizer 0.285 0.102 0.067 1.000
water_component -0.445 -0.323 0.065 -0.793
<30 cement_component -0.178 -0.369 -0.323 0.070
coarse_aggregate -0.199 -0.127 -0.274 -0.316
concrete_strength -0.203 -0.092 0.138 0.383
fine_aggregate 1.000 0.011 -0.288 0.151
flay_ash 0.011 1.000 -0.298 0.412
furnace_slag -0.288 -0.298 1.000 0.029
superplasticizer 0.151 0.412 0.029 1.000
water_component -0.374 -0.163 0.123 -0.584
age>300 cement_component -0.462 NaN -0.569 NaN
coarse_aggregate 0.683 NaN -0.081 NaN
concrete_strength -0.560 NaN 0.342 NaN
fine_aggregate 1.000 NaN -0.419 NaN
flay_ash NaN NaN NaN NaN
furnace_slag -0.419 NaN 1.000 NaN
superplasticizer NaN NaN NaN NaN
water_component -0.943 NaN 0.364 NaN
water_component
age_level
100<age<300 cement_component -0.204
coarse_aggregate -0.782
concrete_strength -0.063
fine_aggregate -0.558
flay_ash NaN
furnace_slag 0.498
superplasticizer NaN
water_component 1.000
30<age<100 cement_component -0.057
coarse_aggregate -0.105
concrete_strength -0.472
fine_aggregate -0.445
flay_ash -0.323
furnace_slag 0.065
superplasticizer -0.793
water_component 1.000
<30 cement_component -0.157
coarse_aggregate -0.185
concrete_strength -0.325
fine_aggregate -0.374
flay_ash -0.163
furnace_slag 0.123
superplasticizer -0.584
water_component 1.000
age>300 cement_component 0.378
coarse_aggregate -0.794
concrete_strength 0.631
fine_aggregate -0.943
flay_ash NaN
furnace_slag 0.364
superplasticizer NaN
water_component 1.000
接下来查看所有变量之间的相关关系图
# Keep the rows that contain at least one non-zero value.
data_ = data[(data != 0).any(axis=1)]
# A regression pair plot needs numeric inputs, so leave out text columns
# such as 'age_level'.
numeric_columns = list(data_.select_dtypes(include='number').columns)
seaborn.pairplot(data_, vars=numeric_columns, kind='reg')
plt.show()
1.3 回归分析
建立split_train_test()函数划分数据
def split_train_test(data, feature, train_index=0.7, random_state=None):
    """Split *data* into train/test inputs and 'concrete_strength' targets.

    Args:
        data: DataFrame holding the feature column(s) and a
            'concrete_strength' column.
        feature: either a list of column names (multi-feature case) or a
            single column name.
        train_index: fraction of the rows used for training; the
            remainder becomes the test set.
        random_state: optional seed forwarded to ``train_test_split`` so
            a split can be reproduced. ``None`` keeps the split random.

    Returns:
        ``(x_train, y_train, x_test, y_test)``. For a list of features
        these are NumPy arrays taken from the frame; for a single column
        name they are lists of one-element lists (one row per record).
    """
    train, test = train_test_split(
        data, test_size=1 - train_index, random_state=random_state)
    target = 'concrete_strength'
    if isinstance(feature, list):
        # `.values` works on old and new pandas alike (`as_matrix` was
        # removed in pandas 1.0).
        x_train = train[feature].values
        y_train = train[target].values
        x_test = test[feature].values
        y_test = test[target].values
    else:
        # Wrap each scalar in its own row so the estimators receive
        # two-dimensional input.
        x_train = [[value] for value in train[feature]]
        y_train = [[value] for value in train[target]]
        x_test = [[value] for value in test[feature]]
        y_test = [[value] for value in test[target]]
    return x_train, y_train, x_test, y_test
由单变量线性回归可视化可知(括号内为各特征在测试集上的R²得分),cement_component(0.227)、superplasticizer(0.0129)和concrete_strength呈现正相关线性趋势,flay_ash(0.0237)、water_component(0.0727)、coarse_aggregate(0.0130)和concrete_strength呈现负相关线性趋势。
# Fit a separate one-feature linear regression for each predictor and
# draw the fitted line over the held-out points.
plt.figure(figsize=(15, 7))
single_features = ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
for plot_count, feature in enumerate(single_features, start=1):
    # Keep only rows where both the target and this feature are non-zero.
    pair = data[['concrete_strength', feature]]
    data_tr = pair[(pair != 0).all(axis=1)]
    x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)

    # Train on the training part, predict the held-out part.
    regr = LinearRegression()
    regr.fit(x_train, y_train)
    y_pred = regr.predict(x_test)

    # Black dots: observed test values; blue line: model prediction.
    plt.subplot(2, 3, plot_count)
    plt.scatter(x_test, y_test, color='black')
    plt.plot(x_test, y_pred, color='blue', linewidth=3)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

    # R2 of the fit on the held-out data.
    print(feature, r2_score(y_test, y_pred))
plt.show()
cement_component 0.22709501673033738
flay_ash 0.02372873998753655
water_component 0.07274737892115468
superplasticizer 0.01293229609021429
coarse_aggregate 0.012992870179391658
1.4 多变量回归分析
1.4.1 线性回归
# Multi-feature linear regression on the five selected predictors.
features = ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
# NOTE(review): a row is dropped when ANY column of `data` is zero, not
# only the columns listed in `features` -- confirm this is intended.
nonzero_rows = (data != 0).all(axis=1)
data_tr = data[nonzero_rows]
x_train, y_train, x_test, y_test = split_train_test(data_tr, features)

# Fit on the training part and predict the held-out part.
regr = LinearRegression()
regr.fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Black dots: observed test values; blue line: predictions, both in
# test-set order.
plt.scatter(range(len(y_test)), y_test, color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s' % str(features))
print('R2 score: %f' % r2_score(y_test, y_pred))
print('Intercept: %f' % regr.intercept_)
print('Coefficients: %s' % str(regr.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.114955
Intercept: 56.893169
Coefficients: [ 0.0502359 -0.03243765 -0.12711574 0.42090465 -0.0092923 ]
1.4.2 Ridge回归
# Candidate regularisation strengths: 0.1, 0.2, ... up to (not including) 5.
alphas = np.arange(0.1, 5, 0.1)
model = Ridge()
# Cross-validated search for the best alpha; the best model is refitted
# on the full training data and used for the prediction.
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
y_pred = cv.fit(x_train, y_train).predict(x_test)
best_model = cv.best_estimator_

plt.scatter(range(len(y_test)), y_test, color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s' % str(features))
print('R2 score: %f' % r2_score(y_test, y_pred))
# Report the parameters of the tuned Ridge model (not those of the
# LinearRegression object fitted earlier).
print('Intercept: %f' % best_model.intercept_)
print('Coefficients: %s' % str(best_model.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.115025
Intercept: 56.893169
Coefficients: [ 0.0502359 -0.03243765 -0.12711574 0.42090465 -0.0092923 ]
1.4.3 Lasso回归
model = Lasso()
# Cross-validated search over the shared `alphas` grid; the best model is
# refitted on the full training data and used for the prediction.
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
y_pred = cv.fit(x_train, y_train).predict(x_test)
best_model = cv.best_estimator_

plt.scatter(range(len(y_test)), y_test, color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s' % str(features))
print('R2 score: %f' % r2_score(y_test, y_pred))
# Report the parameters of the tuned Lasso model (not those of the
# LinearRegression object fitted earlier).
print('Intercept: %f' % best_model.intercept_)
print('Coefficients: %s' % str(best_model.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.129458
Intercept: 56.893169
Coefficients: [ 0.0502359 -0.03243765 -0.12711574 0.42090465 -0.0092923 ]
1.4.4 ElasticNet回归
model = ElasticNet()
# Cross-validated search over the shared `alphas` grid; the best model is
# refitted on the full training data and used for the prediction.
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
y_pred = cv.fit(x_train, y_train).predict(x_test)
best_model = cv.best_estimator_

plt.scatter(range(len(y_test)), y_test, color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s' % str(features))
print('R2 score: %f' % r2_score(y_test, y_pred))
# Report the parameters of the tuned ElasticNet model (not those of the
# LinearRegression object fitted earlier).
print('Intercept: %f' % best_model.intercept_)
print('Coefficients: %s' % str(best_model.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.126087
Intercept: 56.893169
Coefficients: [ 0.0502359 -0.03243765 -0.12711574 0.42090465 -0.0092923 ]
1.4.5 GradientBoostingRegressor单变量回归
# Fit a separate one-feature gradient-boosting model for each predictor.
# Loop-local names are used on purpose so that the multi-feature
# x_train / y_train / x_test / y_test, data_tr and regr created earlier
# are not overwritten by these single-feature experiments.
plt.figure(figsize=(15, 7))
gbr_features = ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
for plot_count, feature in enumerate(gbr_features, start=1):
    # Keep only rows where both the target and this feature are non-zero.
    feature_data = data[['concrete_strength', feature]]
    feature_data = feature_data[(feature_data != 0).all(axis=1)]
    uni_x_train, uni_y_train, uni_x_test, uni_y_test = split_train_test(feature_data, feature)
    # split_train_test wraps each target in its own row for a single
    # feature; flatten to the 1-D targets the regressor expects.
    uni_y_train = [row[0] for row in uni_y_train]
    uni_y_test = [row[0] for row in uni_y_test]

    # Create and train the gradient boosting regressor.
    gbr = GradientBoostingRegressor()
    gbr.fit(uni_x_train, uni_y_train)
    uni_y_pred = gbr.predict(uni_x_test)

    # Plot outputs. The prediction curve is not a straight line, so the
    # points are ordered by x before being joined.
    plt.subplot(2, 3, plot_count)
    xs = [row[0] for row in uni_x_test]
    plt.scatter(xs, uni_y_test, color='black')
    order = sorted(range(len(xs)), key=xs.__getitem__)
    plt.plot([xs[i] for i in order], [uni_y_pred[i] for i in order],
             color='blue', linewidth=3)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

    # R2 of the fit on the held-out data.
    print(feature, r2_score(uni_y_test, uni_y_pred))
plt.show()
cement_component 0.29991407621280963
flay_ash 0.07501932678751821
water_component 0.33285235447360906
superplasticizer 0.1270301345197723
coarse_aggregate 0.2679084164701997
1.4.6 GradientBoostingRegressor多变量回归
# Rebuild the multi-feature split here: the single-feature experiments
# run before this cell reuse the names x_train / y_train / x_test /
# y_test, so relying on their current values could train on one feature.
data_tr = data[(data != 0).all(axis=1)]
x_train, y_train, x_test, y_test = split_train_test(data_tr, features)

model = GradientBoostingRegressor()
y_pred = model.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test, color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s' % str(features))
print('R2 score: %f' % r2_score(y_test, y_pred))
# A boosted tree ensemble has no intercept or linear coefficients; its
# per-feature importances are the comparable summary.
print('Feature importances: %s' % str(model.feature_importances_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: -0.089525
Intercept: 81.404002
Coefficients: [ 0.0523122 -0.00354028 -0.16425187 0.1049935 -0.03001721]
1.4.7 SVR回归
# Rebuild the multi-feature split here: the single-feature experiments
# earlier in the notebook reuse the names x_train / y_train / x_test /
# y_test, so relying on their current values could train on one feature
# while `features` is reported below.
data_tr = data[(data != 0).all(axis=1)]
x_train, y_train, x_test, y_test = split_train_test(data_tr, features)

# Support vector regression with a linear kernel.
model = SVR(kernel='linear')
y_pred = model.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test, color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s' % str(features))
print('R2 score: %f' % r2_score(y_test, y_pred))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.029033
1.5 回归预测
通过cement component预测concrete strength得到当cement component=213.5时,concrete strength=37.198606.
# Predict concrete strength for one new cement_component value.
feature = 'cement_component'
cc_new_data = 213.5
# Keep only rows where both the target and this feature are non-zero.
data_tr = data[['concrete_strength', feature]]
data_tr = data_tr[(data_tr != 0).all(axis=1)]
x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)
regr = GradientBoostingRegressor()
# Train the model using the training sets
regr.fit(x_train, y_train)
# predict() expects a 2-D (n_samples, n_features) input, so the single
# value is wrapped as one row with one column.
cs_pred = regr.predict([[cc_new_data]])
# predict() returns an array; report its only element.
print('Predicted value of concrete strength: %f' % cs_pred[0])
Predicted value of concrete strength: 37.198606
通过water_component预测concrete strength得到当water_component=200时,concrete strength=33.020739.
# Predict concrete strength for one new water_component value.
feature = 'water_component'
wc_new_data = 200
# Keep only rows where both the target and this feature are non-zero.
data_tr = data[['concrete_strength', feature]]
data_tr = data_tr[(data_tr != 0).all(axis=1)]
x_train, y_train, x_test, y_test = split_train_test(data_tr, feature)
regr = GradientBoostingRegressor()
# Train the model using the training sets
regr.fit(x_train, y_train)
# predict() expects a 2-D (n_samples, n_features) input, so the single
# value is wrapped as one row with one column.
cs_pred = regr.predict([[wc_new_data]])
# predict() returns an array; report its only element.
print('Predicted value of concrete strength: %f' % cs_pred[0])
Predicted value of concrete strength: 33.020739