"Machine Learning in Practice" Chapter 5 Study Notes (Logistic Regression)

The main idea of using logistic regression for classification is to fit a regression formula to the decision boundary of the existing data, and then use that formula to classify. The word "regression" here comes from "best fit": we look for the set of parameters that best fits the data. Training the classifier therefore means finding these best-fit parameters with an optimization algorithm.

1. Logistic distribution
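
For reference, the logistic distribution has distribution function and density

$$F(x) = P(X \le x) = \frac{1}{1+e^{-(x-\mu)/\gamma}}, \qquad f(x) = F'(x) = \frac{e^{-(x-\mu)/\gamma}}{\gamma\,\bigl(1+e^{-(x-\mu)/\gamma}\bigr)^{2}},$$

where μ is the location parameter and γ > 0 is the scale parameter; the sigmoid function below is the special case μ = 0, γ = 1.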

2. Logistic regression model (binary classification model)
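
The binary logistic regression model is the conditional probability distribution

$$P(Y=1 \mid x) = \frac{e^{w \cdot x}}{1+e^{w \cdot x}}, \qquad P(Y=0 \mid x) = \frac{1}{1+e^{w \cdot x}}.$$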

Among them, the sigmoid function:

$$\sigma(z) = \mathrm{sigmoid}(z) = \frac{1}{1+e^{-z}}$$

In other words, the binary logistic regression model can be read directly in terms of the sigmoid function:

P(Y=1|x) = sigmoid(z); P(Y=0|x) = 1 - P(Y=1|x) = 1 - sigmoid(z). Among them, z is the linear combination of the input features:

$$z = w \cdot x = w_0 x_0 + w_1 x_1 + \cdots + w_n x_n$$


3. Model parameter estimation

3.1 Estimating the parameters by maximum likelihood:
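
With π(x_i) = P(Y=1|x_i) and labels y_i ∈ {0,1}, the likelihood and log-likelihood of the training set are

$$L(w) = \prod_{i=1}^{m} \pi(x_i)^{y_i}\,\bigl(1-\pi(x_i)\bigr)^{1-y_i}, \qquad \ell(w) = \sum_{i=1}^{m} \Bigl[ y_i\,(w \cdot x_i) - \log\bigl(1+e^{w \cdot x_i}\bigr) \Bigr],$$

and the estimate of w is the maximizer of ℓ(w).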

3.2 Gradient ascent method to estimate parameters

The gradient ascent algorithm is used to find the maximum of a function, while gradient descent is used to find the minimum; ascending f is equivalent to descending -f, so the two differ only in the sign of the update.

Gradient ascent iteration formula:

$$w := w + \alpha \nabla_w f(w)$$
Applied to the log-likelihood above, this yields the update used in the code (in matrix form):

$$w := w + \alpha\, x^{T} e, \qquad e = y - \mathrm{sigmoid}(x w)$$

where e is the error vector, x the sample matrix, y the class labels, w the regression coefficients, and α the step size. Maximizing the log-likelihood by gradient ascent is therefore the same as driving this error toward zero.
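
As a sanity check of this update rule, here is a small self-contained sketch (toy data of my own, not from the book) showing that one small batch step does not decrease the log-likelihood:

import numpy as np

# Toy data (hypothetical, for illustration only): 4 samples with the x0 = 1
# bias column prepended, as in loadDataSet below
x = np.array([[1.0, 0.5], [1.0, -1.2], [1.0, 2.0], [1.0, -0.3]])
y = np.array([1.0, 0.0, 1.0, 0.0])
w = np.zeros(2)

def loglik(w):
    # l(w) = sum_i [ y_i*(w.x_i) - log(1 + exp(w.x_i)) ]
    z = x @ w
    return np.sum(y * z - np.log(1.0 + np.exp(z)))

before = loglik(w)
error = y - 1.0 / (1.0 + np.exp(-(x @ w)))  # e = y - sigmoid(x w)
w = w + 0.1 * (x.T @ error)                 # w := w + alpha * x^T * e
print(before, loglik(w))  # the second value is larger: the step raised l(w)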

4. Estimation of optimal parameters based on gradient ascent algorithm

4.1 Code Implementation

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 18 19:07:15 2018
file name:logRegres.py
@author: lizihua
"""
import numpy as np
from numpy import exp,mat,shape,ones,array,random
import matplotlib.pyplot as plt
#Logistic regression gradient ascent optimization algorithm
#Load the data set
#Note: returns plain Python lists, not NumPy arrays
def loadDataSet():
    dataMat = [];labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        #Add a column x0=1
        dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat,labelMat

#define activation function
def sigmoid(inX):
    return 1.0/(1+exp(-inX))
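#Aside (not from the book): for inputs with large magnitude, exp(-inX) can
#overflow float64 and NumPy raises a RuntimeWarning; clipping the argument
#first (e.g. np.clip(inX, -500, 500)) or using scipy.special.expit avoids it.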

#gradient ascent
def gradAscent(dataMatIn,classLabels):
    #Convert the data into a matrix; dataMatIn is a list, dataMatrix is a matrix
    dataMatrix = mat(dataMatIn)
    #Convert the label data into a matrix and transpose
    labelMat = mat(classLabels).transpose()
    #m is the number of samples, n the number of features (including the added x0 = 1 column)
    m,n = shape(dataMatrix)
    #Initialization: alpha is the moving step size, iterNum is the number of iterations, weights is the trained regression coefficient
    alpha = 0.001
    iterNum = 500
    weights = ones((n,1))
    for k in range(iterNum):
        h = sigmoid(dataMatrix*weights)
        error = labelMat - h
        weights = weights + alpha*dataMatrix.transpose()*error
    return weights

#Stochastic gradient ascent: update the regression coefficients using one
#sample at a time instead of the whole data matrix
def stocGradAscent0(dataMatrix, classLabels,numIter):
    m,n = shape(dataMatrix)
    weights = ones(n)
    alpha=0.01
    weightSet=[]
    for j in range(numIter):
        for i in range(m):
            h = sigmoid(sum(dataMatrix[i]*weights))
            error = classLabels[i]-h
            #The rows of dataMatrix are plain Python lists here; a list cannot be
            #multiplied by a float, so convert the row to a NumPy array first
            weights = weights + alpha*error*array(dataMatrix[i])
            weightSet.append(weights)
    return weights,array(weightSet)

#Improved version
def stocGradAscent1(dataMatrix, classLabels,numIter=150):
    m,n = shape(dataMatrix)
    weights = ones(n)
    weightSet=[]
    for j in range(numIter):
        #In Python 3, range returns a range object, but del (a list operation)
        #is used below, so convert it to a list
        dataIndex = list(range(m))
        for i in range(m):
            #Improvement 1: alpha decreases with each iteration but never reaches 0
            alpha = 4/(1.0+i+j)+0.01
            #Improvement 2: pick the update sample at random, without replacement
            randIndex = int(random.uniform(0,len(dataIndex)))
            #Index through dataIndex so each sample is used once per pass
            #(the book's code indexes dataMatrix with randIndex directly,
            #which samples with replacement)
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex]*weights))
            error = classLabels[sampleIndex]-h
            weights = weights + alpha*error*array(dataMatrix[sampleIndex])
            weightSet.append(weights)
            del(dataIndex[randIndex])
    return weights,array(weightSet)


#Plot the data set and the logistic regression best-fit line
def plotBestFit(weights):
    dataMat,labelMat= loadDataSet()
    dataArr = array(dataMat)
    m = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord0 = []; ycord0 = []
    for i in range(m):
        if int(labelMat[i])==1:
            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
        else:
            xcord0.append(dataArr[i,1]); ycord0.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1,ycord1,s=30,c='red',marker='s')
    ax.scatter(xcord0,ycord0,s=30,c='green')
    # draw the fitted line
    x = np.arange(-3.0, 3.0, 0.1)   #x value range
    #The decision boundary is where the sigmoid argument is 0, i.e.
    #0 = w0*x0 + w1*x1 + w2*x2 with x0 = 1, so x2 = -(w0 + w1*x1)/w2
    #When weights comes from gradAscent it is a matrix, so y is a 1*60 matrix;
    #plot needs x and y to have matching first dimensions, hence y.T
    y = -(weights[0]+weights[1]*x)/weights[2]
    ax.plot(x,y.T)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
"""    
def plotWeightAndnumIter(weights,numIter):
    w0=weight
"""
if __name__=="__main__":
    dataArr,labelMat = loadDataSet()
    """
    #Improve the relationship between the number of iterations and regression coefficients in the former stochastic gradient
    weights,weightSet=stocGradAscent0(dataArr,labelMat,2)
    numIter = np.arange(0,2*100,1)
    plotBestFit (weights)
    """
    #Relationship between iteration count and regression coefficients, after the improvement
    weights,weightSet=stocGradAscent1(dataArr,labelMat,200)
    plotBestFit(weights)
    #One weight vector is recorded per update, so the x-axis length is the
    #actual number of updates
    numIter = np.arange(weightSet.shape[0])
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(311)
    ax1.plot(numIter,weightSet[:,0])
    ax2 = fig1.add_subplot(312)
    ax2.plot(numIter,weightSet[:,1])
    ax3 = fig1.add_subplot(313)
    ax3.plot(numIter,weightSet[:,2])
    plt.show()
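
The main block only exercises the stochastic versions; a minimal sketch of a driver for the batch gradAscent is below (assuming the same testSet.txt is in the working directory):

dataArr, labelMat = loadDataSet()
weights = gradAscent(dataArr, labelMat)
# gradAscent returns an n*1 matrix; getA() converts it to an ndarray for plotting
plotBestFit(weights.getA())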

4.2 Display of results

4.2.1 Result of the gradient ascent algorithm: data scatter plot with the fitted decision boundary (figure omitted)

4.2.2 Stochastic gradient ascent before the improvement: regression coefficients vs. number of iterations (figure omitted)

4.2.3 Improved stochastic gradient ascent: regression coefficients vs. number of iterations (figure omitted)

5. Predicting the mortality of horses from colic symptoms

Code:

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 17 20:58:36 2018
Predicting the mortality of horses from colic symptoms
@author: lizihua
"""
from numpy import array,random,exp,shape,ones
import numpy as np

#define activation function
def sigmoid(inX):
    return 1.0/(1+exp(-inX))

def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m,n = shape(dataMatrix)
    weights = ones(n)   #initialize to all ones
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            #alpha decreases with each iteration but never goes to 0,
            #thanks to the constant term
            alpha = 4/(1.0+j+i)+0.0001
            #Pick a sample at random, without replacement (indexing through
            #dataIndex, as in the first script)
            randIndex = int(random.uniform(0,len(dataIndex)))
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex]*weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del(dataIndex[randIndex])
    return weights
#classification function
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX*weights))
    if prob > 0.5: return 1.0
    else: return 0.0

def colicTest():
    frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr =[]
        #The data has 22 columns, the first 21 are features, and the last is the classification label
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr =[]
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights))!= int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount)/numTestVec)
    print("Test set error rate: %f" % errorRate)
    return errorRate

#Call the colicTest function 10 times, then average the results
def multiTest():
    numTests = 10; errorSum=0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("Mean error rate for iteration %d: %f" % (numTests, errorSum/float(numTests)))

if __name__=="__main__":
    multiTest()

The results show the test-set error rate of each of the 10 runs followed by their average (console output omitted).
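
As a quick illustration of the step-size schedule used above (just the arithmetic, not from the book), alpha starts near 4 and decays toward the 0.0001 floor as the pass index j and sample index i grow:

# Print alpha = 4/(1.0 + j + i) + 0.0001 for a few (j, i) pairs
for j in [0, 1, 10]:
    for i in [0, 50, 150]:
        print(j, i, 4/(1.0 + j + i) + 0.0001)
# Early updates take large steps; later ones settle near the floor, which
# damps the oscillation seen in the unimproved stochastic version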

