06. Linear Regression


Theoretical Derivation

Typically, given a set of feature data and their labels $(x_1, y_1), (x_2, y_2), ..., (x_n, y_n)$, when the feature $x_i$ is used to predict $y_i$, the task is conventionally called regression if $y_i$ is continuous, and classification if $y_i$ is discrete (for example, predicting a house price is regression, while predicting spam/not-spam is classification).

$X \in R^{n \times m}: \{x_1, x_2, ..., x_n\}$

$Y \in R^{n \times 1}: \{y_1, y_2, ..., y_n\}$

where $X$ is the feature data, $n$ samples each with attribute dimension $m$, and $Y$ holds the corresponding labels.

We want to find $w \in R^{m \times 1}$ and $b \in R^{1 \times 1}$ such that $Xw + b = Y$.

Folding $b$ into $w$ gives $Y = Xw^*$, where $X = \{1, x_1, x_2, ..., x_n\} \in R^{n \times (m+1)}$ (a column of ones prepended to the original features) and $w^* \in R^{(m+1) \times 1}$.

If $X$ were square and invertible, then $w^* = X^{-1}Y$ would recover $w^*$ directly; in general $X$ is $n \times (m+1)$ and not square, so the pseudo-inverse $w^* = X^{+}Y$ is used instead (hence the method's name below).

This holds because $y_1 = b + x_{11}w_1 + x_{12}w_2 + ... + x_{1m}w_m = 1 \cdot w_0 + x_{11}w_1 + x_{12}w_2 + ... + x_{1m}w_m$

In matrix form: $y_1 = [1, x_{11}, x_{12}, ..., x_{1m}] \cdot [w_0, w_1, w_2, ..., w_m]^T$

Therefore $Y = Xw^*$, with
$Y = [y_1, y_2, ..., y_n]^T$, $X$ the $n \times (m+1)$ matrix whose $i$-th row is $[1, x_{i1}, x_{i2}, ..., x_{im}]$, and $w^* = [b, w]^T$.
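The pseudo-inverse solution is exactly the least-squares minimizer. A short standard derivation (my addition, not spelled out in the original post): minimize $J(w^*) = \|Xw^* - Y\|^2$; setting the gradient $\nabla_{w^*} J = 2X^T(Xw^* - Y)$ to zero gives the normal equations $X^TX w^* = X^TY$, hence

$w^* = (X^TX)^{-1}X^TY = X^{+}Y$

where the last equality holds when $X$ has full column rank; `np.linalg.pinv` computes $X^{+}$ even when it does not.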


Inverse (Pseudo-inverse) Method


from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import scipy

def plotData(X,y):
    plt.scatter(X[...,0],y)
    plt.show()

# Closed-form (direct) solution
def matrixSolver(X,y):
    X = np.hstack((np.ones((len(X),1)),X))  # prepend a column of ones for the bias term
    # w = np.dot(np.linalg.inv(X),y) # plain inverse (only valid for a square, invertible X)
    w = np.dot(np.linalg.pinv(X), y)  # pseudo-inverse
    return w


if __name__=="__main__":
    X, y = make_regression(n_samples=200, n_features=1, bias=2, noise=4)
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
    # plotData(X,y)

    # Closed-form solution
    w = matrixSolver(X_train,y_train)
    # print(w)

    # Plot the test points and the fitted line
    plt.scatter(X_test,y_test,s=30,c='red',marker='o',alpha=0.5,label='test data')
    plt.plot(X_test,np.dot(np.hstack((np.ones((len(X_test),1)),X_test)),w),c="blue")

    plt.show()
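Equivalently, NumPy's least-squares routine solves the same minimization without forming a pseudo-inverse explicitly. A minimal sketch under the same setup (the name lstsqSolver is mine, not from the original post):

def lstsqSolver(X, y):
    # Prepend a column of ones for the bias term, as in matrixSolver
    X = np.hstack((np.ones((len(X), 1)), X))
    # np.linalg.lstsq minimizes ||Xw - y||^2 directly and handles
    # rank-deficient X, matching the pseudo-inverse solution
    w, *_ = np.linalg.lstsq(X, y, rcond=None)
    return w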

Gradient Descent

For a detailed derivation, see:
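In brief (this summary is mine; it matches the update rule implemented in `LinearRegressionSelf2.__fit` below): for the mean-squared-error loss

$J(w, b) = \frac{1}{2N}\sum_{i=1}^{N}(x_i w + b - y_i)^2$

the gradients are $\nabla_w J = \frac{1}{N}X^T(Xw + b - Y)$ and $\nabla_b J = \frac{1}{N}\sum_i (x_i w + b - y_i)$, and each step updates $w \leftarrow w - \eta\,\nabla_w J$ and $b \leftarrow b - \eta\,\nabla_b J$ with learning rate $\eta$.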

Example

"""
Author:wucng
Time:  20200114
Summary: 线性回归对boston数据预测
源代码: https://github.com/wucng/MLAndDL
参考:https://cuijiahua.com/blog/2017/11/ml_3_decision_tree_2.html
"""

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import scipy,pickle,os,time
import pandas as pd

# 1. Load the dataset (with preprocessing)
def loadData(dataPath: str) -> tuple:
    with open(dataPath,"r") as fp:
        lines = fp.readlines()
        dataset=[]
        i = 0
        while i<len(lines):
            line = lines[i]
            i += 1
            if line[0].isdigit(): # record starts with a digit; each record spans two lines
                data1=list(map(float,line.strip().split(" ")))
                line = lines[i]
                i += 1
                data2 = list(map(float, line.strip().split(" ")))
                data1.extend(data2)
                dataset.append(data1)
            else:
                continue

        dataset = np.asarray(dataset)

        # Split into features and labels
        X,y = dataset[...,:-1],dataset[...,-1]

        # Min-max normalization
        X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))

        # Equivalent using sklearn (requires: from sklearn.preprocessing import MinMaxScaler)
        # X = MinMaxScaler().fit_transform(X)

    return (X,y)

class LinearRegressionSelf(object):
    """求逆(伪逆)法"""
    def __init__(self,save_file="model.npy"):
        self.save_file = save_file

    def __fit(self,X,y):
        # Closed-form solution on one mini-batch
        X = np.hstack((np.ones((len(X), 1)), X))  # prepend a column of ones for the bias term
        # w = np.dot(np.linalg.inv(X),y) # plain inverse (only valid for a square, invertible X)
        w = np.dot(np.linalg.pinv(X), y)  # pseudo-inverse

        return w

    def fit(self,X,y,batch_size=32,epochs=20):
        if not os.path.exists(self.save_file):
            length = len(y)
            m = len(y)//batch_size
            last_w = []
            for epoch in range(epochs):
                w = []
                # Shuffle the data
                index = np.arange(0, length)
                np.random.seed(epoch)
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i*batch_size
                    end = min((i+1)*batch_size,length)
                    w.append(self.__fit(new_X[start:end],new_y[start:end]))

                last_w.append(np.mean(w,0))  # average the per-batch closed-form solutions

            # save parameter
            np.save(self.save_file,np.mean(last_w,0))

        self.w = np.load(self.save_file)

    def predict(self,X):
        X = np.hstack((np.ones((len(X), 1)), X))
        return np.dot(X,self.w)

    def error(self,y_true,y_pred):
        # R^2 score (https://www.jianshu.com/p/3a98f33113ac):
        # larger is better, 1 is optimal; can be negative when the model fits very poorly
        return 1-np.sum((y_pred-y_true)**2)/np.sum((y_true-np.mean(y_true))**2)

class LinearRegressionSelf2(object):
    """梯度下降"""
    def __init__(self,save_file="model.ckpt"):
        self.save_file = save_file

    def __fit(self,X,y,w,b,lr=1e-3):
        # One gradient-descent step on a mini-batch under the MSE loss
        diff = np.dot(X, w) + b - y
        w-=lr*(1/len(y))*(np.dot(np.transpose(X), diff))  # w -= lr * dJ/dw
        b-=lr*np.mean(diff)                               # b -= lr * dJ/db

        return w,b

    def fit(self,X,y,batch_size=32,epochs=50000,lr=5e-4):
        if not os.path.exists(self.save_file):
            length = len(y)
            m = len(y)//batch_size
            w = np.random.random((len(X[0]),1)) # random initial value
            b = np.random.random((1,1)) # random initial value

            for epoch in range(epochs):
                # Shuffle the data
                index = np.arange(0, length)
                np.random.seed(epoch)
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i*batch_size
                    end = min((i+1)*batch_size,length)
                    w,b = self.__fit(new_X[start:end],new_y[start:end],w,b,lr)

                # print(w,b)

            # save parameter
            pickle.dump({"w":w,"b":b},open(self.save_file,"wb"))

        data = pickle.load(open(self.save_file,"rb"))
        self.w = data["w"]
        self.b = data["b"]

    def predict(self,X):
        return np.dot(X,self.w)+self.b

    def error(self,y_true,y_pred):
        # R^2 score (https://www.jianshu.com/p/3a98f33113ac):
        # larger is better, 1 is optimal; can be negative when the model fits very poorly
        return 1-np.sum((y_pred-y_true)**2)/np.sum((y_true-np.mean(y_true))**2)

if __name__=="__main__":
    dataPath = "../../dataset/boston.txt"
    X, y = loadData(dataPath)
    if len(y.shape)==1:y=y[...,None]
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

    start = time.time()
    clf = LinearRegressionSelf()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print("error:",clf.error(y_test,y_pred))
    error = np.sum((y_pred - y_test) ** 2) / len(y_test)
    print("cost time:%.6f(s) error:%.3f"%(time.time()-start,error))
    """
    error: 0.7131946712017807
    cost time:0.000985(s) error:32.785
    """
    # Using sklearn's LinearRegression
    start = time.time()
    clf = LinearRegression()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    error = 1-np.sum((y_pred-y_test)**2)/np.sum((y_test-np.mean(y_test))**2)
    print("error:",error)
    error = np.sum((y_pred - y_test) ** 2) / len(y_test)
    print("cost time:%.6f(s) error:%.3f" % (time.time() - start, error))
    """
    error: 0.7215519718844166
    cost time:0.001995(s) error:31.830
    """

Reprinted from blog.csdn.net/wc781708249/article/details/103960513