(一) 岭回归简介:
线性回归最主要的问题是对异常值敏感。在真实世界的数据收集过程中,经常会遇到错误的度量结果。而线性回归使用的普通最小二乘法,其目标是使平方误差最小化。这时,由于异常值误差的绝对值很大,因此会破坏整个模型。
如何解决呢?
我们在目标函数中引入带系数的正则化(惩罚)项,以抑制异常值对模型的影响。这个方法称为岭回归。
(具体原理待完善,读者可参考其他文献)
(二) 岭回归实现原理(代码参考《机器学习实战》):
from numpy import *

def loadDataSet(fileName):
    """Load a tab-separated data file of floats.

    Every column except the last is a feature; the last column is the
    target value.

    Args:
        fileName: path to a tab-delimited text file.

    Returns:
        (dataMat, labelMat): list of feature rows (list of floats) and
        list of target values.
    """
    # Read the whole file once inside a context manager so the handle
    # is always closed (the original leaked two open file objects).
    with open(fileName) as fr:
        lines = fr.readlines()
    # Number of feature columns: total fields minus the label column.
    numFeat = len(lines[0].split('\t')) - 1
    dataMat = []
    labelMat = []
    for line in lines:
        curLine = line.strip().split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve ridge regression weights: w = (X^T X + lam*I)^-1 X^T y.

    Args:
        xMat: (m, n) numpy matrix of features.
        yMat: (m, 1) numpy matrix of targets.
        lam: L2 regularization strength; lam=0 reduces to ordinary
            least squares.

    Returns:
        (n, 1) numpy matrix of weights, or None when the regularized
        normal matrix is singular (only possible for lam == 0).
    """
    xTx = xMat.T * xMat
    # Adding lam along the diagonal keeps the matrix invertible for lam > 0.
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular,cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws
def ridgeTest(xArr, yArr):
    """Compute ridge weights for 30 lambdas exp(-10) .. exp(19).

    The targets are mean-centered (so no intercept term is needed) and
    the features are standardized (column mean subtracted, divided by
    column variance) before solving.

    Args:
        xArr: (m, n) array-like of features.
        yArr: length-m array-like of targets.

    Returns:
        (30, n) ndarray; row i holds the weights for lam = exp(i - 10).
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # Center y to eliminate the intercept term.
    yMean = mean(yMat, 0)
    yMat = yMat - yMean
    # Regularize X: subtract the column means, then divide by the
    # column variances.
    xMeans = mean(xMat, 0)
    xVar = var(xMat, 0)
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    # Sweep lambda over exponentially spaced values.
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
(三) sklearn中岭回归举例:
import numpy as np
from numpy import *

# Load comma-separated rows from disk; the last column is the target.
filename = "data.txt"
X = []
y = []
with open(filename, 'r') as f:
    for line in f.readlines():
        xt = [float(i) for i in line.split(',')]
        X.append(xt[:-1])
        y.append(xt[-1])
X = np.array(X)
y = np.array(y).T
print(shape(X), shape(y))

# 70/30 train/test split (rows are taken in file order, no shuffling).
num_training = int(0.7 * len(X))
num_test = len(X) - num_training
X_train = np.array(X[:num_training])
y_train = np.array(y[:num_training])
X_test = np.array(X[num_training:])
y_test = np.array(y[num_training:])

from sklearn import linear_model

# alpha is the regularization strength (the "lam" of the hand-rolled code).
Ridge_regressor = linear_model.Ridge(alpha=0.1, fit_intercept=True, max_iter=10000)
Ridge_regressor.fit(X_train, y_train)
y_test_pred = Ridge_regressor.predict(X_test)
# Compare each held-out target with its prediction.
for i, yy in enumerate(y_test):
    print("true:", yy, " predict:", y_test_pred[i])