结论：在一般数据上，标准回归、岭回归、前向逐步回归效果差不多。可利用交叉验证比较出相对较优的模型。一般来说，训练数据的相关系数会高于测试数据的相关系数。

1.引入regression.py和导入数据

# Third-party libraries used by the snippets below. `mat` is bound to
# numpy.asmatrix because the `numpy.mat` alias no longer exists in NumPy 2.0.
import pandas as pd
from numpy import asmatrix as mat, corrcoef

# Import the local regression.py module (listed at the end of the article).
import regression

# Load the spreadsheet: every column except the last is a feature,
# the last column is the target.
df = pd.read_excel('Row_data-array.xlsx')
xArr = df.iloc[:,:-1].values
yArr = df.iloc[:,-1].values

print(xArr.shape)
print(yArr.shape)
#(1007, 9)
#(1007,)

df.head()

在这里插入图片描述
.

2. 最小二乘法的标准回归分析

# Fit ordinary least squares on the full data set; ws is a 9x1 weight column.
ws = regression.standRegres(xArr,yArr)
ws
#matrix([[ 9.87108037e+00],
#        [ 1.33547703e+01],
#        [ 5.53508696e-02],
#        [ 2.54700518e-01],
#        [-2.48875176e-01],
#        [-1.42790710e+00],
#        [ 2.72100042e-03],
#        [ 2.76569295e-02],
#        [ 5.47646170e-01]])

# Correlation coefficient between the predictions and the true values.
# yArr is 1-D (shape (1007,) above), so mat(yArr) is a single row; yHat is a
# column and is transposed so both arguments are rows.
xMat = mat(xArr);yMat = mat(yArr)
yHat = xMat*ws
corrcoef(yHat.T,yMat)
#array([[1.        , 0.76793521],
#       [0.76793521, 1.        ]])

# Predicted value for the first sample.
yHat1 = mat(xArr[0])*ws
yHat1
#matrix([[327.12552844]])

3. 岭回归

# Reload the same spreadsheet: features are all columns but the last,
# the target is the last column.
df = pd.read_excel('Row_data-array.xlsx')
xArr = df.iloc[:,:-1].values
yArr = df.iloc[:,-1].values

# One row of ridge weights per lambda value tried by ridgeTest2.
ridgeWeights = regression.ridgeTest2(xArr,yArr)

# Plot how each coefficient changes across the lambda values.
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()

在这里插入图片描述
.

# Weights for the first lambda tried by ridgeTest2, exp(-10); they are almost
# identical to the ordinary least-squares weights printed in section 2.
ridgeWeights[0]
#array([ 9.87107951e+00,  1.33547677e+01,  5.53508773e-02,  2.54700520e-01,
#       -2.48877259e-01, -1.42790686e+00,  2.72100114e-03,  2.76569309e-02,
#        5.47646173e-01])

# Predicted values. Here yMat is stored as a column (note the .T).
xMat = mat(xArr)
yMat = mat(yArr).T
print(xMat[:2])
#[[  1.    2.  570.  380.    1.    8.  225.  275.  302.5]
# [  1.    3.  640.  640.    0.    9.  360.  340.  475. ]]
print(yMat[:2])
#[[345]
# [455]]

# ridgeWeights[0] is a flat array, so turn it into a column before multiplying.
yHat = xMat*mat(ridgeWeights[0]).T
yHat
#matrix([[327.12552905],
#        [506.03196428],
#        [348.00154736],
#        ...,
#        [333.09528552],
#        [336.24559748],
#        [560.58028032]])

# Correlation coefficient of the ridge predictions (both arguments as rows).
corrcoef(yHat.T,yMat.T)
#array([[1.        , 0.76793521],
#       [0.76793521, 1.        ]])

4.前向逐步回归

# Reload the same spreadsheet: features are all columns but the last,
# the target is the last column.
df = pd.read_excel('Row_data-array.xlsx')
xArr = df.iloc[:,:-1].values
yArr = df.iloc[:,-1].values

4.1 第1种参数：步长0.05，迭代1000次

# Parameter set 1: step size 0.05, 1000 iterations.
# wMat holds one row of weights per iteration.
wMat = regression.stageWise2(xArr,yArr,0.05,1000)

# Plot how the forward-stagewise coefficients change over the iterations,
# one line per coefficient (legend entries are the column indices).
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(wMat)
plt.legend(range(wMat.shape[1]))
plt.show()

在这里插入图片描述

# Final coefficients w (weights after the last iteration).
wMat[-1]
#array([ 8.7 , -1.2 ,  0.65,  0.1 , -2.65, -4.35,  0.1 ,  0.  ,  0.05])

# Correlation between the predictions and the true values.
# The predictions must use this section's stagewise weights (the last row of
# wMat, turned into a column). The earlier version multiplied by `ws`, the
# standard-regression weights from section 2, and therefore just reproduced
# the section 2 numbers (0.76793521 and 327.12552843866297).
xMat = mat(xArr);yMat = mat(yArr)
yHat = xMat*mat(wMat[-1]).T
corrcoef(yHat.T,yMat)
# (re-run to obtain the correlation for these stagewise weights)

# Predicted value for the first sample.
yHat1 = mat(xArr[0])*mat(wMat[-1]).T
yHat1.A[0][0]
# (re-run to obtain the prediction for these stagewise weights)

4.2 第2种参数：步长0.1，迭代1000次

# Parameter set 2: step size 0.1, 1000 iterations.
# wMat holds one row of weights per iteration.
wMat = regression.stageWise2(xArr,yArr,0.1,1000)

# Plot how the forward-stagewise coefficients change over the iterations,
# one line per coefficient (legend entries are the column indices).
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(wMat)
plt.legend(range(wMat.shape[1]))
plt.show()

在这里插入图片描述

# Final coefficients w (weights after the last iteration).
wMat[-1]
#array([ 8.8, -2.3,  0.7,  0.1, -3.6, -4.8,  0.1,  0. ,  0. ])

# Correlation between the predictions and the true values.
# The predictions must use this section's stagewise weights (the last row of
# wMat, turned into a column). The earlier version multiplied by `ws`, the
# standard-regression weights from section 2, and therefore just reproduced
# the section 2 numbers (0.76793521 and 327.12552843866297).
xMat = mat(xArr);yMat = mat(yArr)
yHat = xMat*mat(wMat[-1]).T
corrcoef(yHat.T,yMat)
# (re-run to obtain the correlation for these stagewise weights)

# Predicted value for the first sample.
yHat1 = mat(xArr[0])*mat(wMat[-1]).T
yHat1.A[0][0]
# (re-run to obtain the prediction for these stagewise weights)

4.3 第3种参数：步长0.01，迭代1000次

# Parameter set 3: step size 0.01, 1000 iterations (the call below passes
# 1000, matching the section heading).
# wMat holds one row of weights per iteration.
wMat = regression.stageWise2(xArr,yArr,0.01,1000)

# Plot how the forward-stagewise coefficients change over the iterations,
# one line per coefficient (legend entries are the column indices).
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(wMat)
plt.legend(range(wMat.shape[1]))
plt.show()

在这里插入图片描述

# Final coefficients w (weights after the last iteration).
wMat[-1]
#array([ 4.64,  0.  ,  0.45,  0.11, -1.58, -2.34,  0.11,  0.05,  0.22])

# Correlation between the predictions and the true values.
# The predictions must use this section's stagewise weights (the last row of
# wMat, turned into a column). The earlier version multiplied by `ws`, the
# standard-regression weights from section 2, and therefore just reproduced
# the section 2 numbers (0.76793521 and 327.12552843866297).
xMat = mat(xArr);yMat = mat(yArr)
yHat = xMat*mat(wMat[-1]).T
corrcoef(yHat.T,yMat)
# (re-run to obtain the correlation for these stagewise weights)

# Predicted value for the first sample.
yHat1 = mat(xArr[0])*mat(wMat[-1]).T
yHat1.A[0][0]
# (re-run to obtain the prediction for these stagewise weights)

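注意：如果4.1–4.3中用第2节标准回归得到的ws（而不是本节前向逐步回归的系数wMat[-1]）来计算，三种参数的相关系数和预测值必然都与标准回归完全相同（0.76793521、327.12552843866297），这并不反映前向逐步回归本身的效果。另外，在交叉验证之前，未区分训练数据和测试数据，以上相关系数都是在全部数据上计算的。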

5. 交叉验证

# Load the data: features are all columns but the last, the target is the
# last column.
df = pd.read_excel('Row_data-array.xlsx')
xArr = df.iloc[:,:-1].values
yArr = df.iloc[:,-1].values

# Randomly split the samples into training data (90%) and test data.
trainX,trainY,testX,testY = regression.crossValidation2(xArr,yArr,traning_rate=0.9)

# Convert the returned lists to matrices; the targets are stored as columns.
mattrainX = mat(trainX); mattrainY=mat(trainY).T
mattestX = mat(testX);mattestY = mat(testY).T

5.1 交叉验证标准回归分析

# standRegres transposes its y argument itself, so the column of training
# targets is passed back as a row here.
ws = regression.standRegres(mattrainX,mattrainY.T)

# Correlation between predictions and true values on the test data.
yHat = mattestX*ws
corrcoef(yHat.T,mattestY.T)[0][1]
#0.6376006824230555

5.2 交叉验证岭回归

# ridgeRegres uses its matrix arguments as given, so the column of training
# targets is passed directly; lam is left at its default of 0.2.
ws = regression.ridgeRegres(mattrainX,mattrainY)
# Correlation between predictions and true values on the test data.
yHat = mattestX*ws
corrcoef(yHat.T,mattestY.T)[0][1]
#0.6376050542255823

5.3 交叉验证前向逐步回归

# stageWise2 transposes its y argument itself (like standRegres in 5.1), so
# the column of training targets has to be passed as a row. Passing the
# column makes the residual inside stageWise2 broadcast to an m-by-m array,
# so the search minimises the wrong error. The value recorded earlier for this
# step (0.5642825810183151) was produced that way; re-run for the new value.
ws = regression.stageWise2(mattrainX,mattrainY.T,eps=0.1,numIt=500)
# Correlation between predictions and true values on the test data, using the
# weights from the last iteration.
yHat = mattestX*mat(ws[-1]).T
corrcoef(yHat.T,mattestY.T)[0][1]

regression.py

from numpy import *
import pandas as pd

def standRegres(xArr,yArr):
    """Fit ordinary least squares through the normal equations.

    Args:
        xArr: feature rows, one sample per row (anything ``asmatrix`` accepts).
        yArr: target values as a flat sequence or a single row; it is
            transposed into a column internally.

    Returns:
        The weight column ``(X^T X)^-1 X^T y`` as a matrix, or ``None`` (after
        printing a message) when the determinant of ``X^T X`` is exactly zero.
    """
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    xTx = xMat.T*xMat
    # Only an exactly-zero determinant is treated as singular.
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T*yMat)
    return ws

def rssError(yArr,yHatArr):
    """Return the residual sum of squares between two arrays.

    Both arguments need to be arrays so that the subtraction and the
    squaring are applied element by element.
    """
    residual = yArr - yHatArr
    squared = residual**2
    return squared.sum()

def ridgeRegres(xMat,yMat,lam=0.2):
    """Solve ridge regression for a single penalty value.

    Args:
        xMat: feature matrix, one sample per row.
        yMat: target column matrix.
        lam: penalty added to the diagonal of ``X^T X``.

    Returns:
        The weight column ``(X^T X + lam*I)^-1 X^T y``, or ``None`` (after
        printing a message) when the determinant of the penalised matrix is
        exactly zero.
    """
    nFeatures = shape(xMat)[1]
    penalized = xMat.T*xMat + eye(nFeatures)*lam
    if linalg.det(penalized) != 0.0:
        return penalized.I * (xMat.T*yMat)
    print("This matrix is singular, cannot do inverse")
    return
    
def ridgeTest2(xArr,yArr):
    """Compute ridge weights for 30 exponentially spaced penalty values.

    The data is used as given; no centring or scaling is applied here.

    Args:
        xArr: feature rows, one sample per row.
        yArr: target values as a flat sequence or a single row; it is
            transposed into a column internally.

    Returns:
        A ``(30, n_features)`` array whose row ``i`` holds the weights from
        ``ridgeRegres`` with ``lam = exp(i - 10)``.
    """
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    numTestPts = 30
    wMat = zeros((numTestPts,shape(xMat)[1]))
    for i in range(numTestPts):
        # lam runs from exp(-10) up to exp(19).
        ws = ridgeRegres(xMat,yMat,exp(i-10))
        wMat[i,:]=ws.T
    return wMat

def regularize(xMat):#regularize by columns
    """Return a column-wise centred and scaled copy of ``xMat``.

    Each column has its mean subtracted and is then divided by its variance
    (the variance itself, not the standard deviation). The input is not
    modified.
    """
    data = xMat.copy()
    colMeans = mean(data,0)
    colVars = var(data,0)
    centered = data - colMeans
    return centered/colVars

def stageWise2(xArr,yArr,eps=0.01,numIt=100):
    """Greedy forward-stagewise linear regression.

    Starting from all-zero weights, every iteration tries moving each single
    weight by ``+eps`` and ``-eps`` and keeps the one move that gives the
    lowest residual sum of squares. The data is used as given; no centring
    or scaling is applied here.

    Args:
        xArr: feature rows, one sample per row.
        yArr: target values; a flat sequence, a single row or a single
            column are all accepted.
        eps: step size applied to one weight per iteration.
        numIt: number of iterations.

    Returns:
        A ``(numIt, n_features)`` array; row ``i`` holds the weights after
        iteration ``i``.

    Side effects:
        Prints the current weights at the start of every iteration.
    """
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    xMat = asmatrix(xArr)
    # Force the targets into an (m, 1) column array. A plain transpose turns
    # an already-column input into a row, and the residual below would then
    # silently broadcast to an m-by-m array.
    yCol = asmatrix(yArr).reshape(-1,1).A
    n = shape(xMat)[1]
    returnMat = zeros((numIt,n))
    ws = zeros((n,1)); wsMax = ws.copy()
    for i in range(numIt):
        print (ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1,1]:
                wsTest = ws.copy()
                wsTest[j] += eps*sign
                yTest = xMat*wsTest
                # Residual sum of squares of this candidate (the quantity
                # rssError computes), on two (m, 1) arrays.
                rssE = ((yCol - yTest.A)**2).sum()
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i,:]=ws.T
    return returnMat

def crossValidation2(xArr,yArr,traning_rate=0.9):
    """Randomly split the samples into a training part and a test part.

    The sample indices are shuffled; positions in the shuffled order that
    are below ``len(yArr) * traning_rate`` go to the training lists and the
    remaining ones go to the test lists.

    Args:
        xArr: indexable collection of feature rows.
        yArr: indexable collection of targets, same length as ``xArr``.
        traning_rate: fraction of the samples used for training.

    Returns:
        ``(trainX, trainY, testX, testY)`` as four plain lists.
    """
    total = len(yArr)
    order = list(range(total))
    trainX, trainY = [], []
    testX, testY = [], []
    random.shuffle(order)
    cutoff = total*traning_rate
    for position, idx in enumerate(order):
        if position < cutoff:
            bucketX, bucketY = trainX, trainY
        else:
            bucketX, bucketY = testX, testY
        bucketX.append(xArr[idx])
        bucketY.append(yArr[idx])
    return trainX,trainY,testX,testY

机器学习之回归分析--预测值

结论：在一般数据上，标准回归、岭回归、前向逐步回归效果差不多。可利用交叉验证比较出相对较优的模型。一般来说，训练数据的相关系数会高于测试数据的相关系数。

1.引入regression.py和导入数据

2. 最小二乘法的标准回归分析

3. 岭回归

4.前向逐步回归

4.1 第1种参数：步长0.05，迭代1000次

4.2 第2种参数：步长0.1，迭代1000次

4.3 第3种参数：步长0.01，迭代1000次

5. 交叉验证

5.1 交叉验证标准回归分析

5.2 交叉验证岭回归

5.3 交叉验证前向逐步回归

regression.py

猜你喜欢