对比利用多项式定义进行多项式拟合:笔记代码:多项式拟合和极值点连接
线性回归
sklearn.linear_model.LinearRegression()
--> return:线性回归器
线性回归器.fit(输入样本,输出标签) # 训练数据
线性回归器.predict(输入样本) # 预测数据
--> return:预测输出标签
岭回归
# 岭回归 (削弱异常值对拟合的影响,正则强度越大,削弱的越厉害,降低对异常数据的依赖)
loss = J(k, b) + 正则函数(样本权重)*正则强度(或惩罚系数) # 正则项:可以防止过拟合
sklearn.linear_model.Ridge(正则强度,
fit_intercept=是否修正截距,
max_iter=最大迭代次数)
--> return:岭回归器
岭回归器.fit() # 训练数据
岭回归器.predict() # 预测数据
欠拟合:无论是训练数据还是测试数据,模型给出的预测值和真实值都存在较大的误差。
过拟合:模型对于训练数据具有较高的精度,但是对测试数据则表现极差。模型过于特殊,不够泛化(不够一般,即普适性不强)
欠拟合 <--- 模型复杂度 ---> 过拟合
多项式回归
sklearn.preprocessing.PolynomialFeatures(最高次数)
sklearn.pipeline.make_pipeline(多项式特征扩展器, 线性回归器) # 管线函数 pipeline模块
--> return:P:Pipeline
# 或者:
sklearn.pipeline.Pipeline([('扩展器自定义名', 多项式特征扩展器),
('回归器自定义名', 线性回归器)])
x-->多项式特征扩展器 -- x x^2 x^3 ... --> 线性回归器 ---> k1,k2,k3...
# 注pipeline.make_pipeline is a shorthand for the Pipeline constructor,区别在于make_pipeline不需要自定义名称
1. 管线函数make_pipeline(.., ..)
pl.make_pipeline(sp.PolynomialFeatures(7),lm.LinearRegression())
output:
Pipeline(memory=None,
steps=[('polynomialfeatures', PolynomialFeatures(degree=7, include_bias=True, interaction_only=False)),
('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))
]
)
# help(...)
def make_pipeline(*steps, **kwargs):
"""Construct a Pipeline from the given estimators.
This is a shorthand for the Pipeline constructor; it does not require, and
does not permit, naming the estimators. Instead, their names will be set
to the lowercase of their types automatically.
Parameters
----------
*steps : list of estimators, # list
memory :
2. 管线函数Pipeline([()], ..)
pl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)), # (自定义名,)
('RegressionModel',lm.LinearRegression())]) # Pipeline的参数是tuple组成的list,[]一定不能少
# sp.PolynomialFeatures(7)的输出作为lm.LinearRegression()的输入
# ==
pl.make_pipeline(sp.PolynomialFeatures(7),lm.LinearRegression())
# help(...Pipeline)
Parameters
----------
steps : list
List of (name, transform) tuples (implementing fit/transform) that are
chained, in the order in which they are chained, with the last object
an estimator.
回归模型性能评估
r2_score
# 拟合模型性能评估R^2
est_error = sklearn.metrics.r2_score(train_y, pred_train_y)
# R^2:
# 越接近1,表明方程的变量对y的解释能力越强,这个模型对数据拟合的也较好
# 越接近0,表明模型拟合的越差
# 经验值:>0.4, 拟合效果好
# R^2缺点:数据集的样本越大,R²越大,因此,不同数据集的模型结果比较会有一定的误差
code
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 16:56:50 2018
@author: Administrator
"""
import pickle # 硬盘存储模块
import numpy as np
import sklearn.linear_model as lm # 线性模型模块
import sklearn.metrics as sm # 模型评估模块
import matplotlib.pyplot as plt
import sklearn.pipeline as spl # 管线函数
import sklearn.preprocessing as sp
# Training data containers.
train_x, train_y = [], []
# Read 'single.txt': each comma-separated row holds the feature columns
# followed by the target value in the last column.
with open('single.txt', 'r') as f:
    for line in f:
        row = [float(field) for field in line.split(',')]
        train_x.append(row[:-1])   # all but the last column -> features
        train_y.append(row[-1])    # last column -> target
# fit()/predict() require array (or array_like) inputs.
train_x = np.array(train_x)
train_y = np.array(train_y)
#print(train_x.shape, train_y.shape)
# --- Model: plain linear regression --------------------------------------
# lm.LinearRegression() -> linear regressor.
# regressor.fit(X, y) trains the model (the learned slope/intercept are
# stored on the model object rather than returned).
# regressor.predict(X) -> predicted labels.
model_ln = lm.LinearRegression()        # build the linear regressor
model_ln.fit(train_x, train_y)          # train; coefficients live on the model
pred_y_ln = model_ln.predict(train_x)   # predictions on the training inputs
# --- Model: ridge regression ---------------------------------------------
# Ridge regression dampens the influence of outliers on the fit:
#   loss = J(k, b) + regulariser(weights) * regularisation strength
# The penalty term also guards against overfitting; the stronger the
# regularisation, the less the fit depends on anomalous samples.
# lm.Ridge(alpha, fit_intercept=..., max_iter=...) -> ridge regressor.
model_rd = lm.Ridge(alpha=150, fit_intercept=True, max_iter=10000)  # build the ridge regressor
model_rd.fit(train_x, train_y)          # train; coefficients live on the model
pred_y_rd = model_rd.predict(train_x)   # predictions on the training inputs
# --- Model: polynomial regression ----------------------------------------
# sp.PolynomialFeatures(degree) -> polynomial feature expander.
# spl.make_pipeline(expander, regressor) chains them: the input x is
# expanded to x, x^2, x^3, ... and fed into the linear regressor, which
# learns one coefficient per expanded feature (k1, k2, k3, ...).
# Equivalent named-step form:
#   spl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),
#                 ('RegressionModel', lm.LinearRegression())])
model_poly = spl.make_pipeline(sp.PolynomialFeatures(7),
                               lm.LinearRegression())
model_poly.fit(train_x, train_y)            # train on the training set
pred_train_y = model_poly.predict(train_x)  # predictions on the training set
# --- Fit quality: R^2 (coefficient of determination) ----------------------
# Close to 1: the features explain y well and the model fits the data.
# Close to 0: the model fits poorly.  Rule of thumb: > 0.4 is a good fit.
# Caveat: R^2 tends to grow with sample size, so comparisons between
# models trained on data sets of different sizes carry some error.
est_error = sm.r2_score(train_y, pred_train_y)  # R-square of the fit
print(est_error)
# --- Use the model on a dense test grid ----------------------------------
# Build 1001 evenly spaced inputs spanning the *input* range of the
# training data, then predict with the polynomial model.
# BUG FIX: the upper bound was train_y.max(), which mixed the target range
# into the input axis; the grid must span train_x.min() .. train_x.max().
test_x = np.linspace(train_x.min(), train_x.max(), 1001)  # shape (1001,)
# sklearn estimators convert input to an at-least-2-D array, so reshape
# the 1-D grid into a single-column matrix: (1001,) -> (1001, 1).
test_x = test_x.reshape((test_x.shape[0], -1))
pred_test_y = model_poly.predict(test_x)
# --- Model evaluation helpers (left disabled) ----------------------------
#print(sm.mean_absolute_error(y, pred_y_ln)) # mean absolute error
#print(sm.mean_squared_error(y, pred_y_ln)) # mean squared error
#print(sm.median_absolute_error(y, pred_y_ln)) # median absolute error
#print(sm.r2_score(y, pred_y_ln)) # recommended for LR models: coefficient of determination
#
# Persist the trained models to disk as .pkl files so pickle can reload
# them later without retraining.
with open('linear.pkl', 'wb') as f:
    pickle.dump(model_ln, f)    # dump() writes to a file object; dumps() returns bytes
with open('ridge.pkl', 'wb') as f:
    pickle.dump(model_rd, f)
with open('polynomial.pkl', 'wb') as f:
    pickle.dump(model_poly, f)
# pickle.dump(obj, file, protocol=None, *, fix_imports=True) writes the
# pickled representation of obj to an open file object, whereas
# pickle.dumps(obj, protocol=None, *, fix_imports=True) returns it as a
# bytes object.

# --- Visualisation -------------------------------------------------------
plt.figure('Regressions', facecolor='lightgray')
plt.title('Regressions', fontsize=20)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.tick_params(labelsize=10)
plt.grid(linestyle=':')
# Scatter plot of the raw training samples.
plt.scatter(train_x, train_y, label='Sample', color='black', linewidth=1, alpha=0.8)
# The training points are not ordered along x, so sort them by x before
# drawing the linear and ridge fits as connected lines.
# np.argsort returns the indices that would sort the array.
sorted_indices = np.argsort(train_x[:, 0])
plt.plot(train_x[sorted_indices], pred_y_ln[sorted_indices], 'o-',
         label='LinearRegression', color='g', linewidth=1, alpha=1)
plt.plot(train_x[sorted_indices], pred_y_rd[sorted_indices], 'o-',
         label='RidgeRegression', color='b', linewidth=1, alpha=1)
# The polynomial curve is drawn on the dense, already-ordered test grid.
plt.plot(test_x, pred_test_y, label='PolynomialRegression', color='r', linewidth=2, alpha=1)
plt.legend(fontsize=8, loc='upper left')
plt.show()
# --- Reference excerpts --------------------------------------------------
# The two string blocks below quote sklearn docstrings (r2_score,
# normalize, PolynomialFeatures.transform/fit) and a dir() listing of
# sklearn.pipeline verbatim; they are no-op string literals kept purely
# as study notes and have no runtime effect.
'''
def r2_score(y_true, y_pred, sample_weight=None,
multioutput="uniform_average"):
"""R^2 (coefficient of determination) regression score function.
Best possible score is 1.0 and it can be negative (because the
model can be arbitrarily worse). A constant model that always
predicts the expected value of y, disregarding the input features,
would get a R^2 score of 0.0.
'''
'''
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
"""Scale input vectors individually to unit norm (vector length).
Read more in the :ref:`User Guide <preprocessing_normalization>`.
Parameters
----------
X : {array-like, sparse matrix}, shape [n_samples, n_features]
The data to normalize, element by element.
scipy.sparse matrices should be in CSR format to avoid an
un-necessary copy.
def transform(self, X):
"""Transform data to polynomial features
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to transform, row by row.
def fit(self, X, y=None):
"""
Compute number of output features.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data.
dir(sklearn.pipeline):
['Bunch', 'FeatureUnion', 'Memory', 'Parallel', 'Pipeline', 'TransformerMixin',
'_BaseComposition', '__all__', '__builtins__', '__cached__', '__doc__', '__file__',
'__loader__', '__name__', '__package__', '__spec__', '_fit_one_transformer', '_fit_transform_one',
'_name_estimators', '_transform_one', 'check_memory', 'clone', 'defaultdict', 'delayed',
'if_delegate_has_method', 'make_pipeline', 'make_union', 'np', 'six', 'sparse']
'''