Machine learning with the sklearn module (LinearRegression linear model, Ridge regression model, and polynomial regression built on LinearRegression)

For comparison with polynomial fitting done directly from the polynomial definition, see the notes/code: polynomial fitting and connecting extreme points


Linear Regression

sklearn.linear_model.LinearRegression()
    --> return: linear regressor
                linear regressor.fit(input samples, output labels)     # train on the data
                linear regressor.predict(input samples)                # predict
                    --> return: predicted output labels
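
A minimal fit/predict sketch (the toy x/y values are made up for illustration):

import numpy as np
import sklearn.linear_model as lm

x = np.array([[1.0], [2.0], [3.0], [4.0]])     # input samples, shape (n_samples, n_features)
y = np.array([2.1, 3.9, 6.2, 8.0])             # output labels, shape (n_samples,)

model = lm.LinearRegression()
model.fit(x, y)                                # learns k and b, stored as coef_ and intercept_
print(model.coef_, model.intercept_)           # fitted slope(s) and intercept
print(model.predict([[5.0]]))                  # predicted label for a new sample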

Ridge Regression

# Ridge regression (weakens the influence of outliers on the fit: the larger the regularization strength, the stronger the damping and the lower the dependence on anomalous data)
loss = J(k, b) + regularization function(model weights) * regularization strength (penalty coefficient)   # the regularization term helps prevent overfitting
sklearn.linear_model.Ridge(regularization strength,
                           fit_intercept=whether to fit the intercept,
                           max_iter=maximum number of iterations)
    --> return: ridge regressor
                ridge regressor.fit()          # train on the data
                ridge regressor.predict()      # predict
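
A minimal sketch of the damping effect (toy data; the last y value is a deliberate, made-up outlier):

import numpy as np
import sklearn.linear_model as lm

x = np.array([[1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([2.0, 4.1, 5.9, 8.0, 30.0])        # outlier at x = 5

for model in (lm.LinearRegression(), lm.Ridge(alpha=150, max_iter=10000)):
    model.fit(x, y)
    print(type(model).__name__, model.coef_, model.intercept_)
# the Ridge slope is shrunk toward 0, so the fit leans less on the outlier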

Underfitting: the model's predictions show large errors against the true values on both the training data and the test data.
Overfitting: the model achieves high accuracy on the training data but performs very poorly on the test data; it is too specialized and does not generalize (i.e. it transfers poorly to unseen data).
underfitting <--- model complexity ---> overfitting   (see the sketch below)
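
A hedged sketch of the trade-off: as polynomial degree grows, train R^2 keeps climbing while test R^2 eventually collapses (the synthetic data and degree values are made up for illustration):

import numpy as np
import sklearn.pipeline as spl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.metrics as sm

rng = np.random.RandomState(0)
x = np.sort(rng.uniform(0, 6, 40))[:, np.newaxis]   # 40 random points on [0, 6]
y = np.sin(x).ravel() + rng.normal(0, 0.2, 40)      # noisy sine wave
x_train, y_train = x[::2], y[::2]                   # even-indexed samples: training set
x_test, y_test = x[1::2], y[1::2]                   # odd-indexed samples: test set

for degree in (1, 5, 15):
    model = spl.make_pipeline(sp.PolynomialFeatures(degree), lm.LinearRegression())
    model.fit(x_train, y_train)
    print(degree,
          sm.r2_score(y_train, model.predict(x_train)),   # train score rises with degree
          sm.r2_score(y_test, model.predict(x_test)))     # test score drops once the model overfits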

Polynomial Regression

sklearn.preprocessing.PolynomialFeatures(highest degree)

sklearn.pipeline.make_pipeline(polynomial feature expander, linear regressor)         # pipeline helper, pipeline module
    --> return: Pipeline

# Or:
sklearn.pipeline.Pipeline([('custom expander name', polynomial feature expander), 
                           ('custom regressor name', linear regressor)])

x --> polynomial feature expander -- x x^2 x^3 ... --> linear regressor ---> k1, k2, k3 ...
# Note: pipeline.make_pipeline is a shorthand for the Pipeline constructor; the difference is that make_pipeline does not require custom step names
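
What the expander emits can be checked directly (degree 3 and the sample values below are arbitrary):

import numpy as np
import sklearn.preprocessing as sp

x = np.array([[2.0], [3.0]])
print(sp.PolynomialFeatures(3).fit_transform(x))
# [[ 1.  2.  4.  8.]
#  [ 1.  3.  9. 27.]]     columns are 1, x, x^2, x^3 (the bias column is included by default)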
1. Pipeline helper make_pipeline(.., ..)
pl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression())
output:
Pipeline(memory=None,
         steps=[('polynomialfeatures', PolynomialFeatures(degree=7, include_bias=True, interaction_only=False)), 
                ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))
                ]
         )
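
To read the fitted k1, k2, k3 ... back out of the pipeline afterwards, a hedged sketch using the auto-generated step name (the quadratic toy data is made up):

import numpy as np
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm

x = np.linspace(0, 1, 20)[:, np.newaxis]
y = 1 + 2 * x.ravel() + 3 * x.ravel() ** 2           # y = 1 + 2x + 3x^2

model = pl.make_pipeline(sp.PolynomialFeatures(2), lm.LinearRegression())
model.fit(x, y)
reg = model.named_steps['linearregression']          # lowercase auto name from make_pipeline
print(reg.coef_, reg.intercept_)                     # weights for [1, x, x^2]; the bias column's weight sits in intercept_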

# help(...)
def make_pipeline(*steps, **kwargs):
    """Construct a Pipeline from the given estimators.
    This is a shorthand for the Pipeline constructor; it does not require, and
    does not permit, naming the estimators. Instead, their names will be set
    to the lowercase of their types automatically.

    Parameters
    ----------
    *steps : list of estimators,   # list

    memory :
2. Pipeline constructor Pipeline([()], ..)
pl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),    # (custom name, estimator)
              ('RegressionModel', lm.LinearRegression())])  # Pipeline takes a list of tuples; the [] must not be omitted
              # the output of sp.PolynomialFeatures(7) becomes the input of lm.LinearRegression()
# == 
pl.make_pipeline(sp.PolynomialFeatures(7),lm.LinearRegression())

# help(...Pipeline)
    Parameters
    ----------
    steps : list
        List of (name, transform) tuples (implementing fit/transform) that are
        chained, in the order in which they are chained, with the last object
        an estimator.

Regression Model Performance Evaluation

r2_score

# Evaluating fit quality with R^2
est_error = sklearn.metrics.r2_score(train_y, pred_train_y)
# R^2:
#    the closer to 1, the better the model's variables explain y and the better the model fits the data
#    the closer to 0, the worse the fit
#    rule of thumb: > 0.4 indicates a reasonably good fit
# Drawback of R^2: the more samples in the dataset, the larger R^2 tends to be, so comparing model results across different datasets carries some error
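
For reference, R^2 computed by hand matches sklearn's definition (a sketch; sm.r2_score remains the canonical call):

import numpy as np

def r2(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    ss_res = np.sum((y_true - y_pred) ** 2)           # residual sum of squares
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)    # total sum of squares
    return 1.0 - ss_res / ss_tot

print(r2([3, -0.5, 2, 7], [2.5, 0.0, 2, 8]))          # ~0.9486, same as sklearn.metrics.r2_score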

code

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 16:56:50 2018

@author: Administrator
"""

import pickle                              # object persistence to disk
import numpy as np
import sklearn.linear_model as lm          # linear models
import sklearn.metrics as sm               # model evaluation
import matplotlib.pyplot as plt
import sklearn.pipeline as spl             # pipeline helpers
import sklearn.preprocessing as sp

# training data
train_x, train_y = [], []
# read the data file
with open('single.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        train_x.append(data[:-1])
        train_y.append(data[-1])
train_x = np.array(train_x)                 # training data must be an array or array_like
train_y = np.array(train_y)                 # training data must be an array or array_like
#print(train_x.shape, train_y.shape)

'''
Build the model
sklearn.linear_model.LinearRegression()
    --> return: linear regressor
                linear regressor.fit(input samples, output labels)     # train on the data
                linear regressor.predict(input samples)                # predict
                    --> return: predicted output labels
'''
model_ln = lm.LinearRegression()               # build the linear regressor
model_ln.fit(train_x, train_y)                 # train; k and b are stored in the model rather than returned
pred_y_ln = model_ln.predict(train_x)

'''
Ridge regression (weakens the influence of outliers on the fit: the larger the regularization strength, the stronger the damping and the lower the dependence on anomalous data)
loss = J(k, b) + regularization function(model weights) * regularization strength (penalty coefficient)   # the regularization term helps prevent overfitting
sklearn.linear_model.Ridge(regularization strength,
                           fit_intercept=whether to fit the intercept,
                           max_iter=maximum number of iterations)
    --> return: ridge regressor
                ridge regressor.fit()          # train on the data
                ridge regressor.predict()      # predict
'''
model_rd = lm.Ridge(150, fit_intercept=True, max_iter=10000)     # build the ridge regressor
model_rd.fit(train_x, train_y)                                   # train; k and b are stored in the model rather than returned
pred_y_rd = model_rd.predict(train_x)

'''
Polynomial regression
sklearn.preprocessing.PolynomialFeatures(highest degree)
    --> return: polynomial feature expander
sklearn.pipeline.make_pipeline(polynomial feature expander, linear regressor)       # pipeline helper, pipeline module (revisit later???)
    --> return: Pipeline
x --> polynomial feature expander -- x x^2 x^3 ... --> linear regressor ---> k1, k2, k3 ...
'''
# Build the model and train on the training data
model_poly = spl.make_pipeline(sp.PolynomialFeatures(7), lm.LinearRegression())  # polynomial feature expander + linear regressor
#model_poly = spl.Pipeline([('PolyFeatures', sp.PolynomialFeatures(7)),
#                          ('RegressionModel', lm.LinearRegression())])
model_poly.fit(train_x, train_y)                                                # fit on the training set
pred_train_y = model_poly.predict(train_x)                                      # predictions on the training set
# Evaluating fit quality with R^2
est_error = sm.r2_score(train_y, pred_train_y)                                  # R-square: coefficient of determination R^2
# R^2:
#    the closer to 1, the better the model's variables explain y and the better the model fits the data
#    the closer to 0, the worse the fit
#    rule of thumb: > 0.4 indicates a reasonably good fit
# Drawback of R^2: the more samples in the dataset, the larger R^2 tends to be, so comparing model results across different datasets carries some error
print(est_error)

# Use the model to predict on test data: evaluate the polynomial model on a dense x grid
#test_x = np.linspace(train_x.min(), train_x.max(), 1001)[:, np.newaxis]  # np.newaxis adds a column --> 2-dim
test_x = np.linspace(train_x.min(), train_x.max(), 1001)         # .shape == (1001,)
# 1-D array --> 2-D array (simply adds a column dimension)
test_x = test_x.reshape((test_x.shape[0], -1))                   # (1001, 1): rows = test_x.shape[0]; -1 lets numpy infer one column
pred_test_y = model_poly.predict(test_x)
'''
By default, the input is converted to an at least 2D numpy array
'''

# Model evaluation
#print(sm.mean_absolute_error(train_y, pred_y_ln))        # mean absolute error
#print(sm.mean_squared_error(train_y, pred_y_ln))         # mean squared error
#print(sm.median_absolute_error(train_y, pred_y_ln))      # median absolute error
#print(sm.r2_score(train_y, pred_y_ln))                   # recommended for LR models: coefficient of determination
#
# Persist the models to disk in pkl format so pickle can load them back
with open('linear.pkl', 'wb') as f:                 # pickle.dump() vs pickle.dumps(): see the docstring below
    pickle.dump(model_ln, f)
with open('ridge.pkl', 'wb') as f:
    pickle.dump(model_rd, f)
with open('polynomial.pkl', 'wb') as f:
    pickle.dump(model_poly, f)
'''
# Difference between pickle.dump(..) and pickle.dumps(..):
dump(obj, file, protocol=None, *, fix_imports=True)
    Write a pickled representation of obj to the open file object file.
dumps(obj, protocol=None, *, fix_imports=True)
    Return the pickled representation of the object as a bytes object.
'''
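
# A minimal sketch of loading a persisted model back (the filenames match those written above):
with open('polynomial.pkl', 'rb') as f:              # 'rb': pickle files are binary
    restored_poly = pickle.load(f)                   # counterpart of pickle.dump
#print(restored_poly.predict(test_x))                # the restored pipeline predicts like model_poly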


'''
Visualization
'''
plt.figure('Regressions', facecolor='lightgray')
plt.title('Regressions', fontsize=20)
plt.xlabel('x', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.tick_params(labelsize=10)
plt.grid(linestyle=':')

# scatter plot of the input samples
plt.scatter(train_x, train_y, label='Sample', color='black', linewidth=1, alpha=0.8)
# Plot the linear and ridge regression fits (the training points are not sorted by x, so sort by x before drawing the curves)
sorted_indices = train_x.T[0].argsort()       # train_x is unordered; get the indices that sort it
'''
argsort(a, axis=-1, kind='quicksort', order=None)
    Returns the indices that would sort an array.
'''
plt.plot(train_x[sorted_indices], pred_y_ln[sorted_indices], 'o-', label='LinearRegression', color='g',linewidth=1,alpha=1)
plt.plot(train_x[sorted_indices], pred_y_rd[sorted_indices], 'o-', label='RidgeRegression', color='b',linewidth=1,alpha=1)
# plot the polynomial regression fit
plt.plot(test_x, pred_test_y, label='PolynomialRegression', color='r',linewidth=2,alpha=1)

plt.legend(fontsize=8, loc='upper left')
plt.show()




'''
def r2_score(y_true, y_pred, sample_weight=None,
             multioutput="uniform_average"):
    """R^2 (coefficient of determination) regression score function.

    Best possible score is 1.0 and it can be negative (because the
    model can be arbitrarily worse). A constant model that always
    predicts the expected value of y, disregarding the input features,
    would get a R^2 score of 0.0.
'''

'''
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
    """Scale input vectors individually to unit norm (vector length).
    Read more in the :ref:`User Guide <preprocessing_normalization>`.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

def transform(self, X):
        """Transform data to polynomial features
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to transform, row by row.

def fit(self, X, y=None):
        """
        Compute number of output features.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data.

dir(sklearn.pipeline):         
    ['Bunch', 'FeatureUnion', 'Memory', 'Parallel', 'Pipeline', 'TransformerMixin', 
    '_BaseComposition', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', 
    '__loader__', '__name__', '__package__', '__spec__', '_fit_one_transformer', '_fit_transform_one', 
    '_name_estimators', '_transform_one', 'check_memory', 'clone', 'defaultdict', 'delayed', 
    'if_delegate_has_method', 'make_pipeline', 'make_union', 'np', 'six', 'sparse']
'''


Reposted from blog.csdn.net/weixin_40040404/article/details/81319121