[Machine Learning] A Regression Case Study: Data Processing, Modeling, and Hyperparameter Tuning

# -*- coding: utf-8 -*-
"""回归问题案例.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1l8xlYKSd8nljVVEEriZyoc0oivqMDWR0
"""

# Import the required packages
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
from pandas import set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

# Load the data
filename = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv(filename, names=names, delim_whitespace=True) # the file is whitespace-delimited
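
# Note: the UCI URL above may no longer be reachable (the Boston housing
# dataset has been withdrawn from several repositories). A hedged fallback,
# assuming the OpenML mirror still hosts it (data_id=531 is my assumption):
# from sklearn.datasets import fetch_openml
# data = fetch_openml(data_id=531, as_frame=True).frame
# data.columns = names  # align column names with the list above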

data.shape

data.head()

data.describe()

# Understand the data: column types
print(data.dtypes)

# Summary statistics, displayed to one decimal place
set_option('display.precision', 1)
print(data.describe())

# Pairwise Pearson correlations between the features
set_option('display.precision', 2)
print(data.corr(method='pearson'))
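
# A small helper (my addition, not in the original post) that makes the
# correlation table actionable: list the feature pairs whose absolute
# Pearson correlation exceeds a cutoff (0.7 is just an illustrative choice).
corr = data.corr(method='pearson')
threshold = 0.7
for i in range(len(names)):
  for j in range(i + 1, len(names)):
    if abs(corr.iloc[i, j]) > threshold:
      print('%s vs %s: %.2f' % (names[i], names[j], corr.iloc[i, j]))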

# Visualize the data
# Univariate plots: one histogram per feature
data.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1, layout=(3,5), bins=100)
plt.show()



# Density plots: a smoother view of each feature's distribution
data.plot(kind='density', subplots=True, layout=(4,4), sharex=False, fontsize=1)
plt.show()

# Box plots
data.plot(kind='box', subplots=True, layout=(4,4), sharex=False, sharey=False, fontsize=8)
plt.show()

"""### 多重数据图表

看不同数据特征之间的相互影响关系。
"""

# Scatter matrix
scatter_matrix(data)
plt.show()

# Correlation matrix plot

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(data.corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
ticks = np.arange(0,14,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

"""数据集中的数据结构较复杂,需要考虑对数据进行转换,以提高模型的准确度。

- 特征选择来减少大部分相关性高的特征
- 标准化数据来降低不同数据度量单位带来的影响
- 正态化数据以降低不同的数据分布结构,提高算法的准确度
"""

# Split the dataset into training and test sets
array = data.values
X = array[:, 0:13]
y = array[:, 13]
test_size = 0.2 # hold out 20% as a test set
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

X_train.shape

X_test.shape

y_train.shape, y_test.shape

"""### 评估算法

从直观上看,只能得出由于部分数据的线性分布,线性回归算法和弹性网络回归算法对解决问题可能有效。且由于数据的离散化,用决策树算法或者SVM算法可能能生成高准确度的模型。

**但是我们仍然不清楚到底哪个算法能生成准确度最高的模型。**需要设计评估框架来帮助我们选择。这里用10折交叉验证来分离数据,用均方误差来比较算法准确度。其中,均方误差越趋近于0,算法的准确度就越高。
"""

num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'
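
# scikit-learn negates the MSE so that "greater is better" holds for every
# scorer; flip the sign back when reporting. A quick sanity check (my
# addition) with plain linear regression:
check = cross_val_score(LinearRegression(), X_train, y_train,
                        cv=KFold(n_splits=num_folds, shuffle=True, random_state=seed),
                        scoring=scoring)
print('LR cross-validated MSE: %.3f' % -check.mean())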

# First evaluate the algorithms on the untouched raw data to establish a baseline
models = {}
models['LR'] = LinearRegression()
models['Lasso'] = Lasso()
models['EN'] = ElasticNet()
models['KNN'] = KNeighborsRegressor()
models['CART'] = DecisionTreeRegressor()
models['SVM'] = SVR()

# Evaluate the algorithms directly to get a baseline
results = []
for key in models:
  kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
  cv_result = cross_val_score(models[key], X_train, y_train, cv=kfold, scoring=scoring)
  results.append(cv_result)
  print("%s: %f (%f)" % (key, cv_result.mean(), cv_result.std()))

# Compare the algorithms with a box plot
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(models.keys())
plt.show()

"""### 分析

线性算法的分布比较类似,K近邻算法结果分布比较紧凑。如何从箱线图上看出数据分布的优劣呢?

### 正态化数据

将数据变成中值为0,方差为1的数据。用`Pipeline`来正态化数据和对模型进行评估,可以防止数据泄露。
"""

# Evaluate the algorithms on standardized data
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
pipelines['ScalerLASSO'] = Pipeline([('Scaler', StandardScaler()), ('Lasso', Lasso())])
pipelines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])

pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])

results = []
for key in pipelines:
  kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
  cv_result = cross_val_score(pipelines[key], X_train, y_train, cv=kfold, scoring=scoring)
  results.append(cv_result)
  print("%s: %f (%f)" % (key, cv_result.mean(), cv_result.std()))

"""### 分析

正则化以后,KNN算法的MSE最优。
"""

# Compare the scaled algorithms with a box plot
fig = plt.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(pipelines.keys())
plt.show()

type(results)

results[0]

"""### 开始调参

通过上面的分析我们看到KNN算法具有最好的结果,能不能更好呢?这就进入到调参的过程了,即前面学过的网格搜索和随机搜索等方法。
"""

# Grid search for the best KNN n_neighbors
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# Parameter grid to search
param_grid = {'n_neighbors': [1,3,5,7,9,11,13,15,17,19,21]}
model = KNeighborsRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=y_train)

print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

cv_results = zip(grid_result.cv_results_['mean_test_score'], grid_result.cv_results_['std_test_score'], grid_result.cv_results_['params'])

for mean, std, param in cv_results:
  print('%f (%f) with %r' % (mean, std, param))
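
# A possible follow-up (my suggestion, not the original author's): KNN has
# more than one knob; searching distance weighting and the Minkowski power p
# alongside n_neighbors sometimes helps.
param_grid = {'n_neighbors': [1, 3, 5, 7, 9],
              'weights': ['uniform', 'distance'],
              'p': [1, 2]}  # p=1 Manhattan, p=2 Euclidean
grid = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=param_grid,
                    scoring=scoring, cv=KFold(n_splits=num_folds, shuffle=True, random_state=seed))
grid_result = grid.fit(X=rescaledX, y=y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))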

"""### 集成算法

除了使用调参方法以外,还可以用集成算法提升准确率。现在对线性回归,K近邻以及回归树进行集成看看效果。
"""

ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])
ensembles['ScaledAB-KNN'] = Pipeline([('Scaler', StandardScaler()), ('ABKNN', AdaBoostRegressor(estimator=KNeighborsRegressor(n_neighbors=3)))])  # use base_estimator= on scikit-learn < 1.2
ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()), ('ABLR', AdaBoostRegressor(LinearRegression()))])
ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor())])
ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesRegressor())])
ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])

results = []
for key in ensembles:
  kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
  cv_result = cross_val_score(ensembles[key], X_train, y_train, cv=kfold, scoring=scoring)
  results.append(cv_result)
  print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

# Compare the ensembles with a box plot
fig = plt.figure()
fig.suptitle('Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(ensembles.keys())
plt.show()

"""### 集成算法调参

集成算法有一个参数`n_estimators`, 这个可以调整,看看是否可以得到有提升的结果。
"""

# Tune the GBM with a grid search
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# Parameter grid to search
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
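
# A hedged extension (my suggestion): in gradient boosting, n_estimators
# interacts strongly with learning_rate and max_depth, so a joint search is
# often worth the extra compute.
param_grid = {'n_estimators': [100, 300, 500],
              'learning_rate': [0.05, 0.1, 0.2],
              'max_depth': [2, 3, 4]}
grid = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=param_grid,
                    scoring=scoring, cv=KFold(n_splits=num_folds, shuffle=True, random_state=seed))
grid_result = grid.fit(X=rescaledX, y=y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))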

# Tune the extra trees (ET) with a grid search
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

# Parameter grid to search
param_grid = {'n_estimators': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}
model = ExtraTreesRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

"""### 显示最后结果最好的是ET模型。"""

# Train the final model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
etr = ExtraTreesRegressor(n_estimators=30)
etr.fit(X=rescaledX, y=y_train)

# Evaluate the model on the test set
rescaledX_test = scaler.transform(X_test)
predictions = etr.predict(rescaledX_test)

type(rescaledX_test)

rescaledX_test

print(mean_squared_error(y_test, predictions)) # final mean squared error: a decent result
# Output:
# 14.653818518518516
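
# Wrap-up sketch (my addition, not from the original post): persisting the
# scaler and model together as one Pipeline guarantees new data receives the
# same preprocessing. Assumes joblib is installed; the filename is made up.
import joblib

final_pipeline = Pipeline([('Scaler', StandardScaler()),
                           ('ETR', ExtraTreesRegressor(n_estimators=30))])
final_pipeline.fit(X_train, y_train)
joblib.dump(final_pipeline, 'boston_etr.joblib')

# Later: reload and predict on raw, unscaled inputs.
reloaded = joblib.load('boston_etr.joblib')
print(mean_squared_error(y_test, reloaded.predict(X_test)))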

END.
