五：方差和偏差

一：__init__.py(主函数)

import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import function as f

# Unpack the six flattened arrays produced by f.load_data().
X, y, Xval, yval, Xtest, ytest = f.load_data()

# Show the training data as a 2-D scatter plot.
df = pd.DataFrame({'water_level': X, 'flow': y})  # two-column table for seaborn
# x and y are passed by keyword: recent seaborn releases make them keyword-only,
# so the positional form lmplot('water_level', 'flow', ...) raises a TypeError there.
sns.lmplot(x='water_level', y='flow', data=df, fit_reg=False, height=7)
# plt.show()

# Turn each 1-D feature vector into a column and prepend a column of ones (the bias term x0).
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]

# Assume theta = 1 and look at the resulting cost and gradient
# theta = np.ones(X.shape[1])
# print(f.cost(theta, X, y))
# print(f.gradient(theta, X, y))
# print(f.regularized_gradient(theta, X, y))

# Optimize theta
# theta = np.ones(X.shape[0])
# final_theta = f.linear_regression_np(X, y, l = 0).get('x')

# Plot the fitted line
# b = final_theta[0] # intercept
# m = final_theta[1] # slope
# plt.scatter(X[:,1], y, label="Training data")
# plt.plot(X[:, 1], X[:, 1]*m + b, label="Prediction")
# plt.legend(loc=2)
# plt.show()

training_cost, cv_cost = [], [] # lists collecting the training and cross-validation costs
# 1. Fit the model on subsets of the training set
# 2. No regularization is used when computing the training and cross-validation costs
# 3. Remember to compute the training cost on the same training subset
# TIP: use append to add new elements to a list
## Compute the training cost and the cross-validation cost
# step1: get the number of samples and iterate over the subset sizes
# m = X.shape[0]
# for i in range(1,m+1):
#     # step2: compute the costs for the current subset
#     res = f.linear_regression_np(X[:i, : ], y[:i], l = 0)
#     tc = f.regularized_cost(res.x, X[:i, :], y[:i], l=0)
#     cv = f.regularized_cost(res.x, Xval, yval, l=0)
#     # step3: store the results in the lists defined above
#     training_cost.append(tc)
#     cv_cost.append(cv)
#
# plt.plot(np.arange(1, m+1), training_cost, label='training cost')
# plt.plot(np.arange(1, m+1), cv_cost, label='cv cost')
# plt.legend(loc=1)
# plt.show() # this model is poor: it under-fits

## Build polynomial features to counter the under-fitting
# Reload the raw arrays: X, Xval and Xtest were replaced earlier by copies with a bias column.
X, y, Xval, yval, Xtest, ytest = f.load_data()
print(f.poly_features(X, power = 3))
# Prepare the data for polynomial regression:
# 1: expand the feature up to the 8th power (or whatever degree is needed)
# 2: normalize the expanded features
# 3: do not forget to add the bias term
X_poly, Xval_poly, Xtest_poly = f.perpare_poly_data(X, Xval, Xtest, power = 8)
print(X_poly[:3, :])

## Next, plot the learning curve, then look for the best lambda and plot its curve

# NOTE(review): the original comment here read "lambda = 0", but the call passes l=1 -- confirm which is intended
f.plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)
plt.show()

## Choose lambda
# Fit once per candidate on the training set, then score every fit with the
# unregularized cost on both the training and the cross-validation set.
l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
fitted_thetas = [f.linear_regression_np(X_poly, y, lam).x for lam in l_candidate]
training_cost = [f.cost(theta_hat, X_poly, y) for theta_hat in fitted_thetas]
cv_cost = [f.cost(theta_hat, Xval_poly, yval) for theta_hat in fitted_thetas]

# Cost curves as a function of lambda
plt.plot(l_candidate, training_cost, label='training')
plt.plot(l_candidate, cv_cost, label='cross validation')
plt.xlabel('lambda')
plt.ylabel('cost')
plt.legend(loc=2)
plt.show()

# Test set
# Print the candidate with the lowest cross-validation cost, then the test cost
# obtained with each candidate lambda.
best_index = np.argmin(cv_cost)
print("显示：", l_candidate[best_index])
for lam in l_candidate:
    fitted = f.linear_regression_np(X_poly, y, lam)
    test_cost = f.cost(fitted.x, Xtest_poly, ytest)
    print('test cost(l={}) = {}'.format(lam, test_cost))

二:function.py

import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data set
def load_data():
    """Read the exercise data from a MATLAB ``.mat`` file.

    Returns:
        A ``map`` iterator that yields the flattened (1-D) arrays stored under
        the keys ``X``, ``y``, ``Xval``, ``yval``, ``Xtest`` and ``ytest``, in
        that order. Being an iterator, it can be consumed only once.
    """
    mat = sio.loadmat(r'******')
    keys = ('X', 'y', 'Xval', 'yval', 'Xtest', 'ytest')
    # The list is built eagerly, so a missing key fails here rather than during iteration.
    return map(np.ravel, [mat[key] for key in keys])

# Cost function
def cost(theta, X, y):
    """Return the unregularized squared-error cost of a linear model.

    Args:
        theta: parameter vector.
        X: data matrix with one row per sample.
        y: target values.

    Returns:
        The sum of squared residuals ``X @ theta - y`` divided by twice the
        number of rows of ``X``.
    """
    n_samples = X.shape[0]
    residual = X @ theta - y
    return (residual.T @ residual) / (2 * n_samples)

# Gradient
def gradient(theta, X, y):
    """Return the gradient of the unregularized squared-error cost.

    Args:
        theta: parameter vector.
        X: data matrix with one row per sample.
        y: target values.

    Returns:
        ``X.T @ (X @ theta - y)`` divided by the number of rows of ``X``.
    """
    residual = X @ theta - y
    return (X.T @ residual) / X.shape[0]

# Regularized gradient
def regularized_gradient(theta, X, y, l = 1):
    """Return the gradient of the cost plus the regularization term.

    Args:
        theta: parameter vector; its first entry is left unpenalized.
        X: data matrix with one row per sample.
        y: target values.
        l: regularization strength.

    Returns:
        ``gradient(theta, X, y)`` plus ``(l / m) * theta`` with the first
        component of that penalty set to zero, where ``m`` is the number of
        rows of ``X``.
    """
    # Work on a copy so the caller's theta is not modified.
    penalized = theta.copy()
    penalized[0] = 0  # the penalty starts at index 1
    scale = l / X.shape[0]
    return gradient(theta, X, y) + scale * penalized

def regularized_cost(theta, X, y, l = 1):
    """Return the squared-error cost plus the regularization penalty.

    Args:
        theta: parameter vector; its first entry is excluded from the penalty.
        X: data matrix with one row per sample.
        y: target values.
        l: regularization strength.

    Returns:
        ``cost(theta, X, y)`` plus ``l / (2 * m)`` times the sum of squares of
        ``theta[1:]``, where ``m`` is the number of rows of ``X``.
    """
    n_samples = X.shape[0]
    squared_weights = np.power(theta[1:], 2).sum()
    penalty = (l / (2 * n_samples)) * squared_weights
    return cost(theta, X, y) + penalty

# Fit the parameters
def linear_regression_np(X, y, l = 1):
    """Fit regularized linear regression with ``scipy.optimize.minimize``.

    Args:
        X: data matrix with one row per sample; theta gets one entry per column.
        y: target values.
        l: regularization strength forwarded to the cost and gradient.

    Returns:
        The result object returned by ``opt.minimize``; the fitted parameters
        are available as its ``x`` attribute.
    """
    # Start the search from a vector of ones.
    initial_theta = np.ones(X.shape[1])
    solver_settings = dict(
        args=(X, y, l),
        method='TNC',
        jac=regularized_gradient,
        options={'disp': True},
    )
    return opt.minimize(regularized_cost, initial_theta, **solver_settings)

# Build the polynomial design matrices
def perpare_poly_data(*args, power):
    """Expand, normalize and bias-augment every array passed in.

    Args:
        *args: arrays of raw feature values, each processed independently.
        power: highest polynomial degree to generate (keyword-only).

    Returns:
        A list with one 2-D array per input: a leading column of ones followed
        by the normalized polynomial features.
    """
    # NOTE(review): each input is normalized with its own mean/std rather than
    # with the statistics of the first (training) array -- confirm this is intended.
    design_matrices = []
    for raw in args:
        expanded = poly_features(raw, power=power)      # feature mapping
        scaled = normalize_feature(expanded).values     # normalization
        bias = np.ones(scaled.shape[0])                 # bias term
        design_matrices.append(np.insert(scaled, 0, bias, axis=1))
    return design_matrices


def poly_features(x, power, as_ndarray = False):
    """Map *x* to its powers 1 through *power*.

    Args:
        x: values to expand.
        power: highest exponent to generate.
        as_ndarray: return a NumPy array instead of a DataFrame when true.

    Returns:
        A DataFrame with columns ``f1`` ... ``f<power>`` where column ``f<i>``
        holds ``x`` raised to the power ``i``, or its ``.values`` array when
        ``as_ndarray`` is true.
    """
    columns = {}
    for degree in range(1, power + 1):
        columns['f{}'.format(degree)] = np.power(x, degree)
    frame = pd.DataFrame(columns)

    if as_ndarray:
        return frame.values
    return frame

# Normalization
def normalize_feature(df):
    """Standardize each column of *df* independently.

    Every column has its own mean subtracted and is then divided by its own
    standard deviation (as computed by pandas' ``std``). A new DataFrame is
    returned; *df* is not modified.
    """
    def zscore(column):
        centered = column - column.mean()
        return centered / column.std()

    return df.apply(zscore)

# Plot the learning curve
def plot_learning_curve(X, y, Xval, yval, l = 0):
    """Plot training and cross-validation cost against training-set size.

    For every subset size ``i`` from 1 to the number of rows of ``X``, the
    model is fitted on the first ``i`` training samples with regularization
    strength ``l``; the unregularized cost is then evaluated on that same
    subset and on the whole cross-validation set.

    Args:
        X: training data matrix with one row per sample.
        y: training targets.
        Xval: cross-validation data matrix.
        yval: cross-validation targets.
        l: regularization strength used for fitting only.

    Returns:
        None. The two curves and a legend are drawn on the current matplotlib
        axes; the caller is responsible for calling ``plt.show()``.
    """
    training_cost, cv_cost = [], []
    m = X.shape[0]
    for i in range(1, m + 1):
        # Fit on the first i samples only.
        res = linear_regression_np(X[:i, :], y[:i], l=l)
        # Score without the regularization term so the curves stay comparable.
        training_cost.append(cost(res.x, X[:i, :], y[:i]))
        cv_cost.append(cost(res.x, Xval, yval))

    subset_sizes = np.arange(1, m + 1)
    plt.plot(subset_sizes, training_cost, label='training cost')
    plt.plot(subset_sizes, cv_cost, label='cv cost')
    # Fixed legend position (1 = upper right). The previous code passed the
    # regularization strength ``l`` as ``loc``, which moved the legend with
    # lambda and is not a valid location code for most values.
    plt.legend(loc=1)

Chris_hx

发布了40 篇原创文章 · 获赞 4 · 访问量 5174

私信关注

一：__init__.py(主函数)

二:function.py

猜你喜欢

一：__init__.py(主函数)