第五章 Logistic回归

PS：本笔记是个人根据《机器学习实战》这本书、Jack-Cui的博客以及深度眸的视频学习整理而成。

1 改进的随机梯度上升算法

from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
import numpy as np
import random

def loadDataSet(filename='testSet.txt'):
    """Load the two-feature training set from a whitespace-delimited text file.

    Every non-blank line is read as ``x1 x2 label``. A constant 1.0 is
    prepended to each sample so that the first weight acts as the intercept.

    Parameters:
        filename - path of the data file (defaults to 'testSet.txt')
    Returns:
        dataMat - list of [1.0, x1, x2] rows
        labelMat - list of integer class labels, one per row of dataMat
    """
    dataMat = []
    labelMat = []
    # The with-block closes the file even if a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                # Tolerate blank lines (for example a trailing empty line).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^(-inX)).

    The exponential is evaluated with np.exp, so inX may be a scalar or a
    NumPy array/matrix; the result has the shape NumPy gives that expression.
    """
    decay = np.exp(-inX)
    return 1.0 / (1 + decay)

def plotBestFit(weights):
    """Scatter the training samples by class and overlay the fitted boundary.

    The samples are read with loadDataSet(). The plotted line is the set of
    points where weights[0] + weights[1]*x1 + weights[2]*x2 equals zero,
    evaluated for x1 from -3.0 up to (not including) 3.0 in steps of 0.1.

    Parameters:
        weights - indexable holding three coefficients (intercept, x1, x2)
    Returns:
        None; the figure is shown with plt.show()
    """
    samples, labels = loadDataSet()
    sampleArr = np.array(samples)

    # Split the (x1, x2) coordinates into the label-1 group and the rest.
    class_one_x, class_one_y = [], []
    other_x, other_y = [], []
    for row, label in zip(sampleArr, labels):
        if int(label) == 1:
            xs, ys = class_one_x, class_one_y
        else:
            xs, ys = other_x, other_y
        xs.append(row[1])
        ys.append(row[2])

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(class_one_x, class_one_y, s=20, c='red', marker='s', alpha=.5)
    axes.scatter(other_x, other_y, s=20, c='green', alpha=.5)

    # Solve w0 + w1*x1 + w2*x2 = 0 for x2 to draw the decision boundary.
    line_x1 = np.arange(-3.0, 3.0, 0.1)
    line_x2 = (-weights[0] - weights[1] * line_x1) / weights[2]
    axes.plot(line_x1, line_x2)

    plt.title('BestFit')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent for logistic regression.

    Each pass visits every sample exactly once in random order and updates
    the weights after each single sample.

    Parameters:
        dataMatrix - 2-D array of samples, one row per sample
        classLabels - labels, indexable by sample number
        numIter - number of passes over the whole data set
    Returns:
        weights - 1-D array of fitted regression coefficients
    """
    dataMatrix = np.asarray(dataMatrix, dtype=float)
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    for j in range(numIter):
        # Sample numbers that have not been used yet in this pass.
        dataIndex = list(range(m))
        for i in range(m):
            # The step size shrinks as j and i grow; the constant 0.01 keeps
            # it from ever reaching zero.
            alpha = 4 / (1.0 + j + i) + 0.01
            # randIndex is a position inside dataIndex, not a row number:
            # look the row number up (and remove it) before touching the
            # data, otherwise rows repeat and late draws only hit low rows.
            randIndex = random.randrange(len(dataIndex))
            sampleIndex = dataIndex.pop(randIndex)
            h = sigmoid(np.dot(dataMatrix[sampleIndex], weights))
            error = classLabels[sampleIndex] - h
            # Single-sample update: weights += alpha * (y - h) * x
            weights = weights + alpha * error * dataMatrix[sampleIndex]
    return weights

if __name__ == '__main__':
    # Load the samples, fit the weights with the improved stochastic
    # gradient ascent, then draw the samples with the resulting boundary.
    samples, labels = loadDataSet()
    plotBestFit(stocGradAscent1(np.array(samples), labels))

两个改进之处：

①alpha在每次迭代的时候都会调整，总体上越来越小，但不会变成0，因为公式中有一个常数项0.01（开始时步长大、后期步长小，符合梯度的变化）。注意j是迭代次数，i是本轮迭代中样本的序号。

②更新回归系数时，只使用一个样本点，并且该样本点是随机选择的，每轮迭代中不重复使用已经用过的样本点（减少计算量）。

2 回归系数与迭代次数的关系

from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
import numpy as np
import random

def loadDataSet(filename='testSet.txt'):
    """Load the two-feature training set from a whitespace-delimited text file.

    Every non-blank line is read as ``x1 x2 label``. A constant 1.0 is
    prepended to each sample so that the first weight acts as the intercept.

    Parameters:
        filename - path of the data file (defaults to 'testSet.txt')
    Returns:
        dataMat - list of [1.0, x1, x2] rows
        labelMat - list of integer class labels, one per row of dataMat
    """
    dataMat = []
    labelMat = []
    # The with-block closes the file even if a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                # Tolerate blank lines (for example a trailing empty line).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^(-inX)).

    The exponential is evaluated with np.exp, so inX may be a scalar or a
    NumPy array/matrix; the result has the shape NumPy gives that expression.
    """
    decay = np.exp(-inX)
    return 1.0 / (1 + decay)

def gradAscent(dataMatIn, classLabels, alpha=0.01, maxCycles=500):
    """Batch gradient ascent for logistic regression, recording every step.

    Each cycle uses the whole data set for one update:
    weights += alpha * X^T * (y - sigmoid(X * weights)).

    Parameters:
        dataMatIn - 2-D sequence/array of samples, one row per sample
        classLabels - sequence of labels, one per sample
        alpha - step size (default 0.01)
        maxCycles - number of full-batch updates (default 500)
    Returns:
        weights - ndarray of shape (n, 1) with the final coefficients
        weights_array - ndarray of shape (maxCycles, n); row k holds the
                        coefficients after update k
    """
    dataMatrix = np.asarray(dataMatIn, dtype=float)
    labelMat = np.asarray(classLabels, dtype=float).reshape(-1, 1)
    n = np.shape(dataMatrix)[1]
    weights = np.ones((n, 1))
    # Preallocate the history instead of growing it with np.append, which
    # copies the whole array on every call.
    weights_array = np.empty((maxCycles, n))
    # alpha * X^T does not change between cycles, so compute it once.
    scaledTranspose = alpha * dataMatrix.transpose()
    for k in range(maxCycles):
        h = sigmoid(np.dot(dataMatrix, weights))
        error = labelMat - h
        weights = weights + np.dot(scaledTranspose, error)
        weights_array[k] = weights.ravel()
    return weights, weights_array

def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent that also records every update.

    Each pass visits every sample exactly once in random order and updates
    the weights after each single sample.

    Parameters:
        dataMatrix - 2-D array of samples, one row per sample
        classLabels - labels, indexable by sample number
        numIter - number of passes over the whole data set
    Returns:
        weights - 1-D array of fitted regression coefficients
        weights_array - ndarray of shape (numIter * m, n); row t holds the
                        coefficients after the t-th single-sample update
    """
    dataMatrix = np.asarray(dataMatrix, dtype=float)
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    # Preallocate the history: np.append inside the inner loop copies the
    # whole array on every update.
    weights_array = np.empty((numIter * m, n))
    step = 0
    for j in range(numIter):
        # Sample numbers that have not been used yet in this pass.
        dataIndex = list(range(m))
        for i in range(m):
            # The step size shrinks as j and i grow; the constant 0.01 keeps
            # it from ever reaching zero.
            alpha = 4 / (1.0 + j + i) + 0.01
            # randIndex is a position inside dataIndex, not a row number:
            # look the row number up (and remove it) before touching the
            # data, otherwise rows repeat and late draws only hit low rows.
            randIndex = random.randrange(len(dataIndex))
            sampleIndex = dataIndex.pop(randIndex)
            h = sigmoid(np.dot(dataMatrix[sampleIndex], weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            weights_array[step] = weights
            step += 1
    return weights, weights_array

def plotWeights(weights_array1,weights_array2):
    """Plot how each regression coefficient evolves with the update count.

    Draws a 3x2 grid: one row per coefficient (W0, W1, W2), with
    weights_array1 in the left column and weights_array2 in the right one.
    The left column is titled as the improved stochastic gradient ascent and
    the right column as plain gradient ascent, matching the order in which
    this file's __main__ block passes the two histories.

    Parameters:
        weights_array1 - 2-D array, one row per update, columns W0..W2
                         (history of the improved stochastic gradient ascent)
        weights_array2 - 2-D array, one row per update, columns W0..W2
                         (history of the gradient ascent)
    Returns:
        None; the figure is shown with plt.show()
    """
    import os

    # Use SimSun for the Chinese labels when the Windows font file exists;
    # otherwise use the default font so plotting does not depend on a
    # Windows-only path.
    font_path = r"c:\windows\fonts\simsun.ttc"
    if os.path.isfile(font_path):
        font = FontProperties(fname=font_path, size=14)
    else:
        font = FontProperties(size=14)

    fig, axs = plt.subplots(nrows=3, ncols=2,sharex=False, sharey=False, figsize=(20,10))
    columns = (
        (weights_array1, u'改进的随机梯度上升算法：回归系数与迭代次数关系'),
        (weights_array2, u'梯度上升算法：回归系数与迭代次数关系'),
    )
    for col, (history, title) in enumerate(columns):
        x = np.arange(0, len(history), 1)
        for row in range(3):
            # Row r shows coefficient W<r>, i.e. column r of the history.
            ax = axs[row][col]
            ax.plot(x, history[:, row])
            # The text property is spelled in lower case: fontproperties.
            ylabel_text = ax.set_ylabel(u'W%d' % row, fontproperties=font)
            plt.setp(ylabel_text, size=20, weight='bold', color='black')
        title_text = axs[0][col].set_title(title, fontproperties=font)
        plt.setp(title_text, size=20, weight='bold', color='black')
        xlabel_text = axs[2][col].set_xlabel(u'迭代次数', fontproperties=font)
        plt.setp(xlabel_text, size=20, weight='bold', color='black')

    plt.show()

if __name__ == '__main__':
    dataMat, labelMat = loadDataSet()
    # plotWeights draws its first argument in the left column and its second
    # in the right column; only the per-update histories are needed here.
    _, sgd_history = stocGradAscent1(np.array(dataMat), labelMat)
    _, batch_history = gradAscent(dataMat, labelMat)
    plotWeights(sgd_history, batch_history)

上图左侧是改进的随机梯度上升算法：每使用一个样本就更新一次回归系数，在更新约2000次（相当于遍历整个数据集20次）的时候，回归系数已收敛，训练已完成。

上图右侧是梯度上升算法：每次更新回归系数都要遍历整个数据集，当迭代次数为300多次的时候，回归系数才收敛。

《机器学习实战》个人学习记录笔记（十一）———Logistic回归改进与例子

第五章 Logistic回归

1 改进的随机梯度上升算法

2 回归系数与迭代次数的关系

猜你喜欢