Linear regression(线性回归)是机器学习中一种常用的预测算法。它根据训练样本的分布,提出一个假设函数 $h_\theta(x)$,并通过最小化预测误差来拟合参数。
Linear regression algorithm
step 1: 构造假设函数(hypothesis function):$h_\theta(x) = \theta_0 + \theta_1 x$
Note: 这里的 $\theta_0, \theta_1$ 是待学习的参数,$\theta_0$ 对应偏置项(bias)。
step 2: 损失函数(cost function):$J(\theta_0, \theta_1) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$,其中 $m$ 为训练样本数。
step 3: 梯度下降(gradient descent)
对 $J(\theta_0, \theta_1)$ 求偏导,沿负梯度方向迭代更新参数:$\theta_j := \theta_j - \alpha \frac{\partial}{\partial \theta_j} J(\theta_0, \theta_1)$,其中 $\alpha$ 为学习率。
Note: 这里的 θ 要同步更新(simultaneous update)。例如:
正确做法(先算出所有临时值,再统一赋值):
$temp0 := \theta_0 - \alpha \frac{\partial}{\partial \theta_0} J(\theta_0, \theta_1)$
$temp1 := \theta_1 - \alpha \frac{\partial}{\partial \theta_1} J(\theta_0, \theta_1)$
$\theta_0 := temp0$,$\theta_1 := temp1$
错误示例(先更新了 $\theta_0$,导致 $\theta_1$ 的梯度用到了新的 $\theta_0$):
$temp0 := \theta_0 - \alpha \frac{\partial}{\partial \theta_0} J(\theta_0, \theta_1)$;$\theta_0 := temp0$
$temp1 := \theta_1 - \alpha \frac{\partial}{\partial \theta_1} J(\theta_0, \theta_1)$;$\theta_1 := temp1$
在这里推荐用矩阵计算,运算速度快而且可以避免这一问题:
$\theta := \theta - \frac{\alpha}{m} X^T (X\theta - y)$,即可得到所有同时更新的 θ 值。θ 是参数向量(本例为 $[\theta_0, \theta_1]$)。
重复迭代多次即可得到拟合参数 $\theta$。
实例:训练样本数据https://pan.baidu.com/s/1nuM66mt
1.训练样本
2.拟合曲线
多说一点,
(1)在
(2)要对训练样本特征做归一化处理,这样梯度下降会比较快。而且对于量纲之间差距比较大的特征,如果不做归一化处理,梯度下降有可能收敛在局部最优而不是全局最优。
对于本例程
可以看出当
本例程中
对原始数据的80%作为训练样本,20%作为测试用例。源代码如下
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import spline
class LinearRegression:
    """Univariate linear regression fitted by batch gradient descent.

    Feature matrices are expected to carry a leading bias column of ones,
    so ``theta`` has one entry per column of ``data_X``.
    """

    def sampleDivde(self, data_X, data_Y):
        """Shuffle the samples and split them 80/20 into train/test sets.

        Args:
            data_X: (m, 2) feature matrix (bias column + one feature).
            data_Y: (m,) target vector.

        Returns:
            Tuple ``(train_X, train_Y, test_X, test_Y)``.
        """
        # Glue X and Y into one matrix so a single shuffle keeps rows aligned.
        combineMatix = np.append(data_X.T, data_Y).reshape(data_X.shape[1] + 1, data_X.shape[0]).T
        np.random.shuffle(combineMatix)
        m = int(data_X.shape[0] * 0.8)  # first 80% of shuffled rows -> training
        train_X = combineMatix[:m, :2]
        train_Y = combineMatix[:m, 2]
        test_X = combineMatix[m:, :2]
        test_Y = combineMatix[m:, 2]
        return train_X, train_Y, test_X, test_Y

    def featureScale(self, data_X):
        """Standardize each column to zero mean and unit standard deviation."""
        u = np.mean(data_X, 0)
        sigma = np.std(data_X, 0)
        return (data_X - u) / sigma

    def costFunc(self, theta, data_X, data_Y):
        """Return the MSE cost J(theta) = ||X.theta - y||^2 / (2m)."""
        m = data_X.shape[0]
        residual = np.dot(data_X, theta) - data_Y
        return np.dot(residual, residual) / (2 * m)

    def gradientDescent(self, theta, data_X, data_Y, alpha, num_iters):
        """Run ``num_iters`` batch gradient-descent steps with learning rate ``alpha``.

        Returns the fitted parameter vector; the ``theta`` argument is not mutated.
        """
        m = data_X.shape[0]
        for _ in range(num_iters):
            # Vectorized simultaneous update: theta -= (alpha/m) * X^T (X.theta - y)
            theta = theta - np.dot(data_X.T, np.dot(data_X, theta) - data_Y) * alpha / m
        return theta

    def error(self, theta, test_X, Y_label):
        """Return the mean signed prediction error on the test set as a scalar.

        Bug fix: the original divided by ``Y_label.shape`` (a tuple), which
        yielded a 1-element array; divide by the sample count instead.
        """
        predictions = np.dot(test_X, theta)
        return np.sum(predictions - Y_label) / Y_label.shape[0]

    def learningRateCheck(self, theta, data_X, data_Y, alpha):
        """Plot cost vs. cumulative iteration count to eyeball whether ``alpha`` converges."""
        # scipy.interpolate.spline was removed in SciPy 1.3; use its documented
        # replacement make_interp_spline (imported locally to avoid touching
        # the module-level imports).
        from scipy.interpolate import make_interp_spline
        m = data_Y.shape[0]
        j_X = []
        j_Y = []
        for i in range(0, 100, 5):
            j_X.append(i)
            # NOTE: theta carries over between outer iterations, so the cost is
            # sampled along one continuing descent (0+5+10+... total steps).
            for j in range(i):
                theta = theta - np.dot(data_X.T, np.dot(data_X, theta) - data_Y) * alpha / m
            j_Y.append(self.costFunc(theta, data_X, data_Y))
        xNew = np.linspace(min(j_X), max(j_X), 300)
        ySmooth = make_interp_spline(j_X, j_Y)(xNew)
        plt.plot(xNew, ySmooth, color='red')
        plt.show()
if __name__ == '__main__':
    path = 'Housedata.txt'
    x = []
    y = []
    # Use a context manager so the file is always closed; read-only mode is
    # sufficient (the original opened with 'r+' and never closed the handle).
    with open(path, 'r') as fr:
        for line in fr:
            items = line.strip().split(',')
            x.append(float(items[0]))
            y.append(float(items[1]))
    data_Y = np.array(y)
    model = LinearRegression()
    biasX = np.ones(len(y))
    # Raw design matrix (for plotting the x-axis) and feature-scaled one (for training).
    data_X = np.append(biasX, np.array(x)).reshape(2, len(y)).T
    norm_X = np.append(biasX, model.featureScale(np.array(x))).reshape(2, len(y)).T
    theta = np.zeros(norm_X.shape[1])
    train_X, train_Y, test_X, test_Y = model.sampleDivde(norm_X, data_Y)
    # Bug fix: train on the 80% training split instead of the full data set —
    # otherwise the train/test split is meaningless and the test error is biased.
    theta_Results = model.gradientDescent(theta, train_X, train_Y, 0.1, 1500)
    X = data_X[:, 1]
    Y = np.dot(norm_X, theta_Results)
    plt.plot(X, Y, color='red')
    plt.scatter(x, y, s=20, marker="x")
    plt.title('Test data')
    plt.ylabel('Profit in $10,000s')
    plt.xlabel('Population of City in 10,000s')
    error = model.error(theta_Results, test_X, test_Y)
    print(error)
    plt.show()
    model.learningRateCheck(theta, norm_X, data_Y, 0.1)