欠拟合与过拟合

使用线性回归模型在训练样本上进行拟合

from  sklearn.linear_model import  LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# Training data: one feature per sample (X_train) and the matching targets (y_train).
X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

# Build a linear regression model with default settings and fit it in one step.
regressor = LinearRegression().fit(X_train, y_train)

# Sample 100 evenly spaced x values on [0, 26]; the upper bound sits just
# past the plotted x-range of [0, 25].
xx = np.linspace(0, 26, 100)
print(xx.shape)
#out[]:(100,)
# Turn the 1-D grid into a single-feature column, the same 2-D layout as X_train.
xx = xx[:, np.newaxis]
print(xx.shape)
#out[]:(100, 1)
# Predict the regression line over the sampled grid.
yy = regressor.predict(xx)

# Draw the training points together with the fitted line.
plt.scatter(X_train, y_train)
(line_deg1,) = plt.plot(xx, yy, label="Degree=1")

plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles=[line_deg1])
plt.show()

# R-squared of the linear model on the training samples.
r2_linear = regressor.score(X_train, y_train)
print(r2_linear)
#out[]:0.9100015964240102

这里写图片描述

使用2次多项式回归模型在比萨训练样本上进行拟合

# Fit a degree-2 polynomial regression model (alongside the linear baseline) on the training samples
from  sklearn.linear_model import  LinearRegression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

# Training data: one feature per sample (X_train) and the matching targets (y_train).
X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

# Expand the single feature into degree-2 polynomial features with PolynomialFeatures.
poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(X_train)

# Fit one default linear regression on the raw feature (baseline) and a second
# one on the degree-2 features.
regressor = LinearRegression().fit(X_train, y_train)
regressor_poly2 = LinearRegression().fit(X_train_poly2, y_train)

# Sample 100 evenly spaced x values on [0, 26]; the upper bound sits just
# past the plotted x-range of [0, 25].
xx = np.linspace(0, 26, 100)
print(xx.shape)
#out[]:(100,)
# Turn the 1-D grid into a single-feature column, the same 2-D layout as X_train.
xx = xx[:, np.newaxis]
print(xx.shape)
#out[]:(100, 1)
# Apply the already-fitted degree-2 expansion to the grid.
xx_poly2 = poly2.transform(xx)

# Predict both regression curves over the sampled grid.
yy = regressor.predict(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)

# Draw the training points together with both fitted curves.
plt.scatter(X_train, y_train)
(line_deg1,) = plt.plot(xx, yy, label="Degree=1")
(line_deg2,) = plt.plot(xx, yy_poly2, label='Degree=2')

plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles=[line_deg1, line_deg2])
plt.show()

# R-squared of each model on the training samples: linear first, then degree 2.
r2_linear = regressor.score(X_train, y_train)
print(r2_linear)
#out[]:0.9100015964240102
r2_poly2 = regressor_poly2.score(X_train_poly2, y_train)
print(r2_poly2)
#out[]:0.9816421639597427

在升高了特征维度之后，2次多项式回归模型在训练样本上的性能表现更加突出。对训练数据的拟合程度也增加了许多。
这里写图片描述

使用4次多项式回归模型进行拟合

# Fit a degree-4 polynomial regression model (alongside the linear and degree-2 models) on the training samples
from  sklearn.linear_model import  LinearRegression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

# Training data: one feature per sample (X_train) and the matching targets (y_train).
X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

# Expand the single feature into degree-2 polynomial features.
poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(X_train)

# Expand the single feature into degree-4 polynomial features.
poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)

# Fit three default linear regressions: on the raw feature, on the degree-2
# features and on the degree-4 features.
regressor = LinearRegression().fit(X_train, y_train)
regressor_poly2 = LinearRegression().fit(X_train_poly2, y_train)
regressor_poly4 = LinearRegression().fit(X_train_poly4, y_train)

# Sample 100 evenly spaced x values on [0, 26]; the upper bound sits just
# past the plotted x-range of [0, 25].
xx = np.linspace(0, 26, 100)
print(xx.shape)
#out[]:(100,)
# Turn the 1-D grid into a single-feature column, the same 2-D layout as X_train.
xx = xx[:, np.newaxis]
print(xx.shape)
#out[]:(100, 1)
# Apply the already-fitted polynomial expansions to the grid.
xx_poly2 = poly2.transform(xx)
xx_poly4 = poly4.transform(xx)

# Predict all three regression curves over the sampled grid.
yy = regressor.predict(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)
yy_poly4 = regressor_poly4.predict(xx_poly4)

# Draw the training points, then one curve per model, collecting the line
# handles for the legend in plotting order.
plt.scatter(X_train, y_train)
curve_handles = []
for curve_label, curve_values in (
    ("Degree=1", yy),
    ('Degree=2', yy_poly2),
    ('Degree=4', yy_poly4),
):
    (curve_handle,) = plt.plot(xx, curve_values, label=curve_label)
    curve_handles.append(curve_handle)

plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles=curve_handles)
plt.show()

# R-squared of each model on the training samples, printed in the order
# linear, degree 2, degree 4.
for fitted_model, train_features in (
    (regressor, X_train),
    (regressor_poly2, X_train_poly2),
    (regressor_poly4, X_train_poly4),
):
    print(fitted_model.score(train_features, y_train))
#out[]:0.9100015964240102
#out[]:0.9816421639597427
#out[]:1.0

4次多项式几乎完全拟合了所有的训练数据点，对应的R-squared值也为1.0。但是，如果这时觉得已经找到了完美的模型，那么显然是高兴过早了。
这里写图片描述

评价3种回归模型在测试数据集上的性能表现

# Test samples: one feature per sample (X_test) and the matching targets (y_test).
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]

# Map the test feature through the already-fitted polynomial transformers.
X_test_poly2 = poly2.transform(X_test)
X_test_poly4 = poly4.transform(X_test)

# Test-set R-squared of the linear model.
print(regressor.score(X_test, y_test))
#out[]:0.809726797707665

# Test-set R-squared of the degree-2 polynomial model.
print(regressor_poly2.score(X_test_poly2, y_test))
#out[]:0.8675443656345073

# Test-set R-squared of the degree-4 polynomial model.
print(regressor_poly4.score(X_test_poly4, y_test))
#out[]:0.8095880795766807

当模型复杂度很低（$Degree = 1$）时，模型不仅没有对训练集上的数据有良好的拟合状态，而且在测试集上也表现平平，这种情况叫做欠拟合（Underfitting）；但是，当我们一味追求很高的模型复杂度（$Degree = 4$），尽管模型几乎完全拟合了所有的训练数据，但是模型也变得非常波动，几乎丧失了对未知数据的预测能力，这种情况叫做过拟合（Overfitting）。这两种情况都是缺乏模型泛化力的表现。这就要求在增加模型复杂度、提高在可观测数据上的性能表现的同时，又需要兼顾模型泛化力，防止发生过拟合的情况。为了平衡这两难的选择，我们通常采用两种模型正则化的方法。

$L_1$ 范数正则化

正则化（Regularization）的目的在于提高模型在未知测试数据上的泛化力，避免参数过拟合。

Lasso模型在4次多项式特征上的拟合表现

from sklearn.linear_model import Lasso
# Fit a Lasso model with default settings on the degree-4 polynomial training features.
lasso_poly4 = Lasso().fit(X_train_poly4, y_train)
# R-squared of the Lasso model on the degree-4 test features.
lasso_test_r2 = lasso_poly4.score(X_test_poly4, y_test)
print(lasso_test_r2)
#out[]:0.8388926873604382
# Print the fitted Lasso coefficients.
print(lasso_poly4.coef_)
#out[]:[ 0.00000000e+00  0.00000000e+00  1.17900534e-01  5.42646770e-05 -2.23027128e-04]

相比于普通4次多项式回归模型在测试集上的表现，默认配置的Lasso模型性能提高了大约3%
Lasso模型拟合后的参数列表中，常数项与1次特征的参数均为0.0（3次与4次特征的参数也非常接近0），使得特征更加稀疏。

$L_2$ 范数正则化

Ridge模型在4次多项式特征上的拟合表现

# Print the coefficients of the plain (unregularized) degree-4 polynomial regression model.
plain_coef = regressor_poly4.coef_
print(plain_coef)
#out[]:[[ 0.00000000e+00 -2.51739583e+01  3.68906250e+00 -2.12760417e-01 4.29687500e-03]]
# Sum of the squared coefficients, showing how far apart the unregularized parameters are.
print(np.sum(np.square(plain_coef)))
#out[]:647.3826457369564
from sklearn.linear_model import Ridge
# Fit a Ridge model with default settings on the same degree-4 training features.
ridge_poly4 = Ridge().fit(X_train_poly4, y_train)
# R-squared of the Ridge model on the degree-4 test features.
print(ridge_poly4.score(X_test_poly4, y_test))
#out[]:0.8374201759366577
# Print the fitted Ridge coefficients.
ridge_coef = ridge_poly4.coef_
print(ridge_coef)
#out[]:[[ 0.         -0.00492536  0.12439632 -0.00046471 -0.00021205]]
# Sum of the squared Ridge coefficients.
print(np.sum(np.square(ridge_coef)))
#out[]:0.015498965203571016

相比于普通4次多项式回归模型在测试集上的表现，默认配置的Ridge模型性能提高了近3%
Ridge模型拟合后的参数之间差异非常小。

机器学习17-模型正则化

欠拟合与过拟合

使用线性回归模型在训练样本上进行拟合

使用2次多项式回归模型在比萨训练样本上进行拟合

使用4次多项式回归模型进行拟合

评价3种回归模型在测试数据集上的性能表现

$L_1$ 范数正则化

Lasso模型在4次多项式特征上的拟合表现

$L_2$ 范数正则化

Ridge模型在4次多项式特征上的拟合表现

猜你喜欢

机器学习17-模型正则化

欠拟合与过拟合

使用线性回归模型在训练样本上进行拟合

使用2次多项式回归模型在比萨训练样本上进行拟合

使用4次多项式回归模型进行拟合

评价3种回归模型在测试数据集上的性能表现

$L_1$ 范数正则化

Lasso模型在4次多项式特征上的拟合表现

$L_2$ 范数正则化

Ridge模型在4次多项式特征上的拟合表现

猜你喜欢

$L_1$ 范数正则化

$L_2$ 范数正则化