Study Notes on "Machine Learning in Action", Chapter 5 (Logistic Regression)

The main idea of classification with Logistic regression is to fit a regression formula for the classification boundary from the existing data and then use it to classify. The word "regression" here comes from "best fit": we want to find the best-fitting set of parameters. Training the classifier therefore means searching for the best-fit parameters, which is done with an optimization algorithm.

1. The Logistic Distribution

Let $X$ be a continuous random variable. $X$ follows a logistic distribution if its cumulative distribution function is

$$F(x) = P(X \le x) = \frac{1}{1 + e^{-(x-\mu)/\gamma}}$$

where $\mu$ is a location parameter and $\gamma > 0$ is a scale parameter. The curve of $F(x)$ is S-shaped (a sigmoid curve), symmetric about the point $(\mu, 1/2)$.

2. The Logistic Regression Model (Binary Classification)

The binary Logistic regression model is the conditional distribution

$$P(Y=1|x) = \frac{e^{w \cdot x}}{1 + e^{w \cdot x}}, \qquad P(Y=0|x) = \frac{1}{1 + e^{w \cdot x}}$$

where the sigmoid function is

$$\sigma(z) = \frac{1}{1 + e^{-z}}$$

In other words, the binary Logistic regression model can be understood directly through the sigmoid function:

$P(Y=1|x) = \sigma(z)$ and $P(Y=0|x) = 1 - P(Y=1|x) = 1 - \sigma(z) = \sigma(-z)$, where $z$ is the linear combination of the inputs:

$$z = w_0 x_0 + w_1 x_1 + \cdots + w_n x_n = w^T x$$
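To make this concrete, here is a minimal NumPy sketch of the forward computation (the weight and feature values below are made up purely for illustration):

import numpy as np

def sigmoid(z):
    """Map any real z to a probability in (0, 1)."""
    return 1.0/(1.0 + np.exp(-z))

#hypothetical weights (w0 is the bias paired with x0 = 1) and one sample
w = np.array([4.12, 0.48, -0.62])    #assumed values, for illustration only
x = np.array([1.0, 1.2, 7.5])        #x0 = 1, followed by two features

z = np.dot(w, x)         #z = w0*x0 + w1*x1 + w2*x2
p1 = sigmoid(z)          #P(Y=1|x)
print(p1, 1.0 - p1)      #classify as 1 when p1 > 0.5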
3. Estimating the Model Parameters

3.1 Parameter estimation by maximum likelihood:

Write $\pi(x_i) = P(Y=1|x_i) = \sigma(w^T x_i)$. For training samples $(x_i, y_i)$ with $y_i \in \{0,1\}$, the likelihood function is

$$L(w) = \prod_{i=1}^{m} [\pi(x_i)]^{y_i}\,[1-\pi(x_i)]^{1-y_i}$$

and the log-likelihood is

$$\ell(w) = \sum_{i=1}^{m} \left[ y_i\, w^T x_i - \log\left(1 + e^{w^T x_i}\right) \right]$$

The parameter estimate is the $w$ that maximizes $\ell(w)$.
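The log-likelihood above is straightforward to evaluate directly; here is a minimal sketch (the function and argument names are my own, not from the book's code):

import numpy as np

def log_likelihood(w, X, y):
    """l(w) for logistic regression.
    X: (m, n) sample matrix with a leading column of ones (x0 = 1),
    y: (m,) labels in {0, 1}, w: (n,) regression coefficients."""
    z = X.dot(w)
    #sum_i [ y_i * z_i - log(1 + exp(z_i)) ]
    return np.sum(y*z - np.log(1.0 + np.exp(z)))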

3.2 Parameter estimation by gradient ascent

The gradient ascent algorithm finds a function's maximum, while the gradient descent algorithm finds its minimum; maximizing $\ell(w)$ by gradient ascent is equivalent to minimizing $-\ell(w)$ by gradient descent.

The gradient ascent iteration formula:

$$w := w + \alpha \nabla_w \ell(w)$$

where $\alpha$ is the step size.
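As a toy illustration of this iteration (the function and step size below are my own choices, not from the book), gradient ascent on f(x) = -(x-2)^2 climbs to the maximizer x = 2:

#toy gradient ascent: maximize f(x) = -(x-2)^2, whose gradient is f'(x) = -2(x-2)
x = 0.0          #starting point
alpha = 0.1      #step size
for _ in range(100):
    grad = -2.0*(x - 2.0)
    x = x + alpha*grad    #step in the direction of the gradient
print(x)         #approximately 2.0, the maximizer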
Deriving the update for the logistic log-likelihood, with sample matrix $X$, class label vector $y$, regression coefficients $w$, and step size $\alpha$: differentiating $\ell(w)$ gives

$$\nabla_w \ell(w) = X^T (y - \sigma(Xw)) = X^T e$$

where $e = y - \sigma(Xw)$ is the error vector, so the iteration becomes

$$w := w + \alpha X^T e$$

Each gradient ascent step moves $w$ in the direction that increases the log-likelihood, i.e. the direction that shrinks the prediction error $e$.
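A quick way to verify the formula $\nabla_w \ell(w) = X^T e$ is to compare it with a finite-difference gradient of $\ell(w)$; here is a minimal self-contained sketch using made-up data:

import numpy as np

def sigmoid(z):
    return 1.0/(1.0 + np.exp(-z))

def log_likelihood(w, X, y):
    z = X.dot(w)
    return np.sum(y*z - np.log(1.0 + np.exp(z)))

rng = np.random.default_rng(0)
X = np.hstack([np.ones((5, 1)), rng.normal(size=(5, 2))])    #x0 = 1 plus two features
y = np.array([1.0, 0.0, 1.0, 1.0, 0.0])                      #made-up labels
w = np.zeros(3)

analytic = X.T.dot(y - sigmoid(X.dot(w)))    #X^T * e

eps = 1e-6
numeric = np.array([
    (log_likelihood(w + eps*np.eye(3)[j], X, y) -
     log_likelihood(w - eps*np.eye(3)[j], X, y))/(2*eps)
    for j in range(3)])
print(np.allclose(analytic, numeric, atol=1e-6))    #True: the gradients agree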

4. Estimating the Best Parameters with Gradient Ascent

4.1 Code implementation

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 18 19:07:15 2018
file name:logRegres.py
@author: lizihua
"""
import numpy as np
from numpy import exp,mat,shape,ones,array,random
import matplotlib.pyplot as plt
#Gradient ascent optimization for Logistic regression
#Load the data set
#Note: the return values are Python lists!
def loadDataSet():
    dataMat = [];labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        #prepend x0 = 1 as the first column (bias term)
        dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat,labelMat

#Sigmoid activation function
def sigmoid(inX):
    return 1.0/(1+exp(-inX))

#Batch gradient ascent
def gradAscent(dataMatIn,classLabels):
    #dataMatIn is a Python list; convert it to a NumPy matrix
    dataMatrix = mat(dataMatIn)
    #convert the class labels to a matrix and transpose into a column vector
    labelMat = mat(classLabels).transpose()
    #m is the number of samples, n the number of features (including x0 = 1)
    m,n = shape(dataMatrix)
    #initialization: alpha is the step size, iterNum the number of iterations,
    #and weights the regression coefficients to be trained
    alpha = 0.001
    iterNum = 500
    weights = ones((n,1))
    for k in range(iterNum):
        h = sigmoid(dataMatrix*weights)    #predicted probabilities, an m x 1 column
        error = labelMat - h               #error vector e = y - h
        weights = weights + alpha*dataMatrix.transpose()*error    #w := w + alpha * X^T * e
    return weights

#Stochastic gradient ascent (initial version): update the regression coefficients
#with one sample at a time, sweeping through the data set in order
def stocGradAscent0(dataMatrix, classLabels,numIter):
    m,n = shape(dataMatrix)
    weights = ones(n)
    alpha=0.01
    weightSet=[]
    for j in range(numIter):
        for i in range(m):
            h = sigmoid(sum(dataMatrix[i]*weights))
            error = classLabels[i]-h
            #Note: the inputs here are Python lists, and list * float raises an
            #error, so convert dataMatrix[i] to a NumPy array first
            weights = weights +alpha *error* array(dataMatrix[i])
            weightSet.append(weights)
    return weights,array(weightSet)

#Improved stochastic gradient ascent
def stocGradAscent1(dataMatrix, classLabels,numIter=150):
    m,n = shape(dataMatrix)
    weights = ones(n)
    weightSet=[]
    for j in range(numIter):
        #in Python 3, range returns a range object; del (a list operation) is used below, so convert it to a list
        dataIndex = list(range(m))
        for i in range(m):
            #Improvement 1: adjust alpha at every iteration; the 0.01 constant keeps it from ever reaching 0
            alpha=4/(1.0+i+j)+0.01
            #Improvement 2: update with a randomly selected sample to reduce periodic fluctuations
            randIndex = int(random.uniform(0,len(dataIndex)))
            h = sigmoid(sum(dataMatrix[randIndex]*weights))
            error = classLabels[randIndex]-h
            #Note: the inputs here are Python lists, and list * float raises an
            #error, so convert dataMatrix[randIndex] to a NumPy array first
            weights = weights +alpha *error* array(dataMatrix[randIndex])
            weightSet.append(weights)
            del(dataIndex[randIndex])
    return weights,array(weightSet)


#Plot the data set and the Logistic regression best-fit line
def plotBestFit(weights):
    dataMat,labelMat= loadDataSet()
    dataArr = array(dataMat)
    m = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord0 = []; ycord0 = []
    for i in range(m):
        if int(labelMat[i])==1:
            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
        else:
            xcord0.append(dataArr[i,1]); ycord0.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1,ycord1,s=30,c='red',marker='s')
    ax.scatter(xcord0,ycord0,s=30,c='green')
    #Draw the fitted separating line. The boundary is where the sigmoid input is 0,
    #i.e. the solution of 0 = w0*x0 + w1*x1 + w2*x2, so x2 = -(w0 + w1*x1)/w2
    x = np.arange(-3.0, 3.0, 0.1)    #range of x1 values
    #when weights comes from gradAscent it is a matrix, so y below is a 1 x 60
    #matrix while x has 60 elements; plot needs matching lengths, hence y.T
    y = -(weights[0]+weights[1]*x)/weights[2]
    ax.plot(x,y.T)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
"""    
def plotWeightAndnumIter(weights,numIter):
    w0=weight
"""
if __name__=="__main__":
    dataArr,labelMat = loadDataSet()
    """
    #Relationship between iteration count and regression coefficients, before the improvements
    weights,weightSet=stocGradAscent0(dataArr,labelMat,2)
    numIter = np.arange(0,2*100,1)
    plotBestFit(weights)
    """
    #Relationship between iteration count and regression coefficients, after the improvements
    weights,weightSet=stocGradAscent1(dataArr,labelMat,200)
    plotBestFit(weights)
    #200 passes over the 100 samples = 20000 recorded weight updates
    numIter = np.arange(0,200*100,1)
    fig1 = plt.figure()
    #plot each regression coefficient against the update index
    ax1 = fig1.add_subplot(311)
    ax1.plot(numIter,weightSet[:,0])
    ax2 = fig1.add_subplot(312)
    ax2.plot(numIter,weightSet[:,1])
    ax3 = fig1.add_subplot(313)
    ax3.plot(numIter,weightSet[:,2])
    plt.show()

4.2 Results

4.2.1 Results of the batch gradient ascent algorithm:


4.2.2 Stochastic gradient ascent before the improvements: iteration count vs. regression coefficients:

4.2.3 Stochastic gradient ascent after the improvements: iteration count vs. regression coefficients:

5. Predicting the Mortality Rate of Horses from Colic Symptoms

Code implementation:

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 17 20:58:36 2018
*************************Predicting the mortality rate of horses with colic symptoms*************************
@author: lizihua
"""
from numpy import array,random,exp,shape,ones
import numpy as np

#Sigmoid activation function
def sigmoid(inX):
    return 1.0/(1+exp(-inX))

def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m,n = shape(dataMatrix)
    weights = ones(n)   #initialize to all ones
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            #alpha decreases with each iteration but never reaches 0, because of the 0.0001 constant
            alpha = 4/(1.0+j+i)+0.0001
            #pick a random sample index for this update
            randIndex = int(random.uniform(0,len(dataIndex)))
            h = sigmoid(sum(dataMatrix[randIndex]*weights))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del(dataIndex[randIndex])
    return weights
#Classification function: predict 1.0 when P(Y=1|x) > 0.5, otherwise 0.0
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX*weights))
    if prob > 0.5: return 1.0
    else: return 0.0

def colicTest():
    frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr =[]
        #each row has 22 columns: the first 21 are features, the last is the class label
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr =[]
        for i in range(21):
            lineArr.append(float(currLine[i])) 
        if int(classifyVector(array(lineArr), trainWeights))!= int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount)/numTestVec)
    print("测试集错误率: %f" % errorRate)
    return errorRate

#Run colicTest 10 times and average the resulting error rates
def multiTest():
    numTests = 10; errorSum=0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("第 %d 次迭代的错误率平均值: %f" % (numTests, errorSum/float(numTests)))

if __name__=="__main__":
    multiTest()

Results:


Reposted from blog.csdn.net/LZH_12345/article/details/80023918