Python实现逻辑回归(二分类)

代价函数总是NaN的问题已解决

 
 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/4/3 19:37
# @Author  : HJH
# @Site    : 
# @File    : logistics.py
# @Software: PyCharm

from numpy import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer

class log(object):
    """Binary logistic-regression classifier trained by batch gradient descent.

    ``train`` expects a design matrix that already contains the bias column
    (a trailing column of ones); ``predict`` appends that column itself.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms
    # so that a saturated sigmoid cannot turn the cost into inf/NaN.
    _EPS = 1e-12

    def __init__(self):
        # Weight column vector of shape (n_features, 1); assigned by train().
        self.W = None

    def _weights(self):
        """Return the weights as a float ndarray, or raise if not trained."""
        if self.W is None:
            raise ValueError('model has no weights yet; call train() first')
        return np.asarray(self.W, dtype=float)

    def sigmoid(self, X):
        """Return the logistic function 1 / (1 + exp(-X)) element-wise.

        Written with the identity sigmoid(x) = (1 + tanh(x / 2)) / 2, which
        stays finite for inputs of any magnitude (``exp(-X)`` overflows for
        large negative ``X``).
        """
        return 0.5 * (1.0 + np.tanh(0.5 * X))

    def loss(self, X_train, y_train):
        """Compute the mean cross-entropy cost and its gradient.

        Args:
            X_train: array-like of shape (m, n), bias column included.
            y_train: array-like with m labels in {0, 1}; any shape holding
                m values is accepted and reshaped to a column vector.

        Returns:
            ``(loss, dW)`` where ``loss`` is a Python float and ``dW`` is an
            ndarray of shape (n, 1).

        Raises:
            ValueError: if the model has no weights yet.
        """
        X = np.asarray(X_train, dtype=float)
        # Force a column vector so that h - y cannot broadcast to (m, m).
        y = np.asarray(y_train, dtype=float).reshape(-1, 1)
        m = X.shape[0]

        h = self.sigmoid(X.dot(self._weights()))
        # Only the logarithms use the clipped values; the gradient keeps h.
        h_safe = np.clip(h, self._EPS, 1.0 - self._EPS)
        log_likelihood = np.sum(y * np.log(h_safe)
                                + (1.0 - y) * np.log(1.0 - h_safe))
        loss = -float(log_likelihood) / m

        dW = X.T.dot(h - y) / m
        return loss, dW

    def train(self, X_train, y_train, learn_rate=0.0005, iters=10000):
        """Fit the weights with batch gradient descent.

        Weights are initialised uniformly in [0, 1) with ``np.random.rand``.
        Progress is printed every 500 iterations.

        Args:
            X_train: array-like of shape (m, n), bias column included.
            y_train: array-like with m labels in {0, 1}.
            learn_rate: gradient-descent step size.
            iters: number of gradient steps.

        Returns:
            List with the cost recorded before each of the ``iters`` updates.
        """
        # Convert once here instead of on every iteration inside loss().
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float).reshape(-1, 1)
        n = X.shape[1]
        self.W = np.random.rand(n, 1)
        loss_list = []

        for i in range(iters):
            loss, dW = self.loss(X, y)
            self.W -= learn_rate * dW
            loss_list.append(loss)
            if i % 500 == 0:
                print('iters = %d,loss = %f' % (i, loss))
        return loss_list

    def predict(self, X_test):
        """Classify samples with a 0.5 probability threshold.

        Args:
            X_test: array-like of shape (m, n - 1), WITHOUT the bias column;
                a column of ones is appended here.

        Returns:
            List of m Python ints, each 0 or 1.

        Raises:
            ValueError: if the model has no weights yet.
        """
        W = self._weights()
        X = np.asarray(X_test, dtype=float)
        m = X.shape[0]
        X = np.hstack((X, np.ones((m, 1))))
        probabilities = self.sigmoid(X.dot(W))
        return (probabilities >= 0.5).astype(int).ravel().tolist()

# Load the data set: features X and labels.
def loadDataSet(test_size=10):
    """Load the scikit-learn breast-cancer data and split it into train/test.

    The features are min-max normalised with ``autoNorm``. The last
    ``test_size`` samples form the test set and the rest the training set.
    A column of ones (bias term) is appended to the training features only;
    the test features are returned without it.

    All sizes are derived from the loaded data instead of being hard-coded.
    The arrays are plain ``ndarray`` objects (``numpy.mat`` no longer exists
    in NumPy 2.0).

    Args:
        test_size: number of trailing samples held out for testing.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``; the label arrays are
        column vectors of shape (k, 1).

    Raises:
        ValueError: if ``test_size`` is negative or exceeds the sample count.
    """
    digits = load_breast_cancer()
    norm_digits = np.asarray(autoNorm(digits.data))
    n_samples = norm_digits.shape[0]
    if not 0 <= test_size <= n_samples:
        raise ValueError('test_size must be between 0 and %d' % n_samples)
    # Split on an explicit index: the slice [:-0] would be empty.
    split = n_samples - test_size

    y_total = np.asarray(digits.target).reshape(-1, 1)

    X_train = norm_digits[:split, :]
    X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1))))
    y_train = y_total[:split, :]

    X_test = norm_digits[split:, :]
    y_test = y_total[split:, :]
    return X_train, y_train, X_test, y_test
# Min-max normalise the data (this is what fixes the NaN cost function).
def autoNorm(X):
    """Scale every column of ``X`` linearly to the range [0, 1].

    Each value becomes ``(x - column_min) / (column_max - column_min)``.
    A constant column has a zero range; it is mapped to all zeros instead of
    producing NaN through a 0 / 0 division.

    Args:
        X: 2-D array-like of shape (m, n) with at least one row.

    Returns:
        Float ndarray of shape (m, n).
    """
    X = np.asarray(X, dtype=float)
    minVals = X.min(axis=0)
    ranges = X.max(axis=0) - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN.
    safe_ranges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column values to every row.
    return (X - minVals) / safe_ranges


def plot(loss_list,log):
    """Show a two-panel figure: a feature scatter plot and the loss curve.

    The upper panel plots the first two normalised breast-cancer features,
    one colour per target class. The lower panel plots ``loss_list``.

    Args:
        loss_list: sequence of loss values, one per training iteration.
        log: accepted for the caller's convenience; not used here.
    """
    plt.figure()
    dataset = load_breast_cancer()
    scaled = autoNorm(dataset.data)
    feat_x, feat_y = 0, 1
    palette = ['blue', 'red']

    # Upper panel: one scatter series per class label.
    plt.subplot(211)
    for cls, (cls_name, colour) in enumerate(zip(dataset.target_names, palette)):
        members = dataset.target == cls
        plt.scatter(scaled[members, feat_x],
                    scaled[members, feat_y],
                    label=cls_name,
                    c=colour)
    plt.xlabel(dataset.feature_names[feat_x])
    plt.ylabel(dataset.feature_names[feat_y])
    plt.legend(loc='upper left')

    # Lower panel: training loss per iteration.
    plt.subplot(212)
    plt.plot(loss_list, color='blue')
    plt.xlabel('epochs')
    plt.ylabel('errors')

    plt.show()


if __name__ == '__main__':
    # Train on the breast-cancer data, print the test-set predictions and
    # every recorded loss value, then show the plots.
    train_X, train_y, test_X, test_y = loadDataSet()
    model = log()
    history = model.train(train_X, train_y)
    print(model.predict(test_X))
    for value in history:
        print(value)
    plot(history, model)

网上的做法:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/4/4 11:07
# @Author  : HJH
# @Site    : 
# @File    : temp.py
# @Software: PyCharm

from numpy import *
filename='./testSet.txt' # default location of the data file
def loadDataSet(path=None):   # read the data (two features per sample)
    """Read whitespace-separated samples of the form ``x1 x2 label``.

    Args:
        path: file to read; defaults to the module-level ``filename``.

    Returns:
        Tuple ``(dataMat, labelMat)``: ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows and ``labelMat`` a list of int labels.

    Raises:
        OSError: if the file cannot be opened.
        ValueError, IndexError: if a non-blank line is malformed.
    """
    if path is None:
        path = filename
    dataMat = []
    labelMat = []
    # The context manager closes the file even if parsing fails.
    with open(path) as fr:
        for line in fr:
            lineArr = line.split()
            if not lineArr:
                # Skip blank lines such as a trailing newline.
                continue
            # The leading 1.0 is the constant term: with two features X1, X2
            # the model has three parameters, W1 + W2*X1 + W3*X2.
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat,labelMat

def sigmoid(inX):  # sigmoid function
    """Return the logistic function 1 / (1 + exp(-inX)) element-wise.

    Evaluated through the identity sigmoid(x) = (1 + tanh(x / 2)) / 2 so that
    large negative inputs do not overflow inside ``exp``. Accepts scalars,
    arrays and matrices.
    """
    return 0.5*(1.0+tanh(0.5*inX))

def gradAscent(dataMat, labelMat, alpha=0.001, maxCycles=500): # batch gradient ascent
    """Fit logistic-regression weights with batch gradient ascent.

    Uses ``asmatrix`` because the ``mat`` alias no longer exists in
    NumPy 2.0.

    Args:
        dataMat: sequence of m feature rows with n values each.
        labelMat: sequence of m labels (0 or 1).
        alpha: step size; larger values take bigger steps per iteration.
        maxCycles: number of full passes over the data.

    Returns:
        ``numpy.matrix`` of shape (n, 1) holding the fitted weights.
    """
    dataMatrix = asmatrix(dataMat)               # (m, n) feature matrix
    classLabels = asmatrix(labelMat).transpose() # (m, 1) label column
    m, n = shape(dataMatrix)
    # All weights start at 1. Starting from a matrix keeps the return type
    # the same even when maxCycles is 0.
    weights = asmatrix(ones((n, 1)))
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)   # predicted probabilities
        error = classLabels - h             # label minus prediction
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights

def stocGradAscent0(dataMat, labelMat, alpha=0.01, maxCycles=500):
    """Fit logistic-regression weights with stochastic gradient ascent.

    Batch ascent touches every sample for each update, which is costly on
    large data. Here each update uses a single row, visited in file order.

    Uses ``asmatrix`` because the ``mat`` alias no longer exists in
    NumPy 2.0.

    Args:
        dataMat: sequence of m feature rows with n values each.
        labelMat: sequence of m labels (0 or 1), indexable by position.
        alpha: step size.
        maxCycles: number of passes over the data.

    Returns:
        ``numpy.matrix`` of shape (n, 1) holding the fitted weights.
    """
    dataMatrix = asmatrix(dataMat)
    classLabels = labelMat
    m, n = shape(dataMatrix)
    weights = asmatrix(ones((n, 1)))
    for _ in range(maxCycles):
        for i in range(m):  # one update per row
            row = dataMatrix[i]
            # row * weights is a 1x1 matrix; take its single element so the
            # result does not depend on which ``sum`` is in scope.
            h = sigmoid((row * weights)[0, 0])
            error = classLabels[i] - h
            weights = weights + alpha * error * row.transpose()
    return weights

def stocGradAscent1(dataMat, labelMat, maxCycles=500):
    """Fit weights with stochastic gradient ascent over shuffled samples.

    In every pass each sample is used exactly once, in random order, and the
    step size shrinks as the pass index ``j`` and the position ``i`` grow.

    The previous code used the random position to index the data directly
    instead of looking it up in the list of unused samples, so rows could be
    repeated or skipped. The position is now resolved through ``dataIndex``.

    Uses ``asmatrix`` because the ``mat`` alias no longer exists in
    NumPy 2.0.

    Args:
        dataMat: sequence of m feature rows with n values each.
        labelMat: sequence of m labels (0 or 1), indexable by position.
        maxCycles: number of passes over the data.

    Returns:
        ``numpy.matrix`` of shape (n, 1) holding the fitted weights.
    """
    dataMatrix = asmatrix(dataMat)
    classLabels = labelMat
    m, n = shape(dataMatrix)
    weights = asmatrix(ones((n, 1)))
    for j in range(maxCycles):
        dataIndex = list(range(m))  # samples not yet used in this pass
        for i in range(m):
            # Decaying step size; the constant keeps it above zero.
            alpha = 4.0 / (1.0 + j + i) + 0.0001
            # Pick a random position among the unused samples. min() guards
            # against a draw that lands exactly on the upper bound.
            randIndex = min(int(random.uniform(0, len(dataIndex))),
                            len(dataIndex) - 1)
            # Map the position to the real row index and mark it as used.
            sampleIndex = dataIndex.pop(randIndex)
            row = dataMatrix[sampleIndex]
            h = sigmoid((row * weights)[0, 0])
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * row.transpose()
    return weights

def plotBestFit(weights):  # draw the samples and the fitted boundary
    """Plot the data set and the decision boundary given by ``weights``.

    The boundary is the line where w0 + w1*x1 + w2*x2 = 0, drawn for x1 in
    [-3.0, 3.0). Samples labelled 1 are red squares; all others are green.

    Args:
        weights: three weights (w0, w1, w2). Any array-like holding three
            values is accepted, including the (3, 1) ``numpy.matrix``
            returned by the training functions, because it is flattened
            first.
    """
    import matplotlib.pyplot as plt
    # Flattening lets a matrix be used without calling .getA() first.
    w = asarray(weights, dtype=float).ravel()

    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)

    # Split the samples by label: cords[True] holds class 1, cords[False]
    # everything else. Column 0 is the constant term and is not plotted.
    cords = {True: ([], []), False: ([], [])}
    for row, label in zip(dataArr, labelMat):
        xs, ys = cords[int(label) == 1]
        xs.append(row[1])
        ys.append(row[2])
    xcord1, ycord1 = cords[True]
    xcord2, ycord2 = cords[False]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-w[0] - w[1] * x) / w[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

if __name__=='__main__':
    # Fit the weights with batch gradient ascent and plot the boundary.
    samples, targets = loadDataSet()
    fitted = gradAscent(samples, targets)
    # getA() turns the returned matrix into a plain 2-D ndarray.
    plotBestFit(fitted.getA())

testSet.txt数据集:

-0.017612   14.053064  0
-1.395634  4.662541   1
-0.752157  6.538620   0
-1.322371  7.152853   0
0.423363   11.054677  0
0.406704   7.067335   1
0.667394   12.741452  0
-2.460150  6.866805   1
0.569411   9.548755   0
-0.026632  10.427743  0
0.850433   6.920334   1
1.347183   13.175500  0
1.176813   3.167020   1
-1.781871  9.097953   0
-0.566606  5.749003   1
0.931635   1.589505   1
-0.024205  6.151823   1
-0.036453  2.690988   1
-0.196949  0.444165   1
1.014459   5.754399   1
1.985298   3.230619   1
-1.693453  -0.557540  1
-0.576525  11.778922  0
-0.346811  -1.678730  1
-2.124484  2.672471   1
1.217916   9.597015   0
-0.733928  9.098687   0
-3.642001  -1.618087  1
0.315985   3.523953   1
1.416614   9.619232   0
-0.386323  3.989286   1
0.556921   8.294984   1
1.224863   11.587360  0
-1.347803  -2.406051  1
1.196604   4.951851   1
0.275221   9.543647   0
0.470575   9.332488   0
-1.889567  9.542662   0
-1.527893  12.150579  0
-1.185247  11.309318  0
-0.445678  3.297303   1
1.042222   6.105155   1
-0.618787  10.320986  0
1.152083   0.548467   1
0.828534   2.676045   1
-1.237728  10.549033  0
-0.683565  -2.166125  1
0.229456   5.921938   1
-0.959885  11.555336  0
0.492911   10.993324  0
0.184992   8.721488   0
-0.355715  10.325976  0
-0.397822  8.058397   0
0.824839   13.730343  0
1.507278   5.027866   1
0.099671   6.835839   1
-0.344008  10.717485  0
1.785928   7.718645   1
-0.918801  11.560217  0
-0.364009  4.747300   1
-0.841722  4.119083   1
0.490426   1.960539   1
-0.007194  9.075792   0
0.356107   12.447863  0
0.342578   12.281162  0
-0.810823  -1.466018  1
2.530777   6.476801   1
1.296683   11.607559  0
0.475487   12.040035  0
-0.783277  11.009725  0
0.074798   11.023650  0
-1.337472  0.468339   1
-0.102781  13.763651  0
-0.147324  2.874846   1
0.518389   9.887035   0
1.015399   7.571882   0
-1.658086  -0.027255  1
1.319944   2.171228   1
2.056216   5.019981   1
-0.851633  4.375691   1
-1.510047  6.061992   0
-1.076637  -3.181888  1
1.821096   10.283990  0
3.010150   8.401766   1
-1.099458  1.688274   1
-0.834872  -1.733869  1
-0.846637  3.849075   1
1.400102   12.628781  0
1.752842   5.468166   1
0.078557   0.059736   1
0.089392   -0.715300  1
1.825662   12.693808  0
0.197445   9.744638   0
0.126117   0.922311   1
-0.679797  1.220530   1
0.677983   2.556666   1
0.761349   10.693862  0
-2.168791  0.143632   1
1.388610   9.341997   0
0.317029   14.739025  0

猜你喜欢

转载自blog.csdn.net/m_z_g_y/article/details/79820561
今日推荐