Machine Learning: Logistic Regression

Thanks to:
https://blog.csdn.net/lu597203933/article/details/38468303

https://www.bilibili.com/video/av10590361/?p=31&t=176  logistic regression chapter

and Chapter 5 of "Machine Learning in Action"

 

'''
logistic regression: despite the name, it solves a classification problem.
Gradient descent is used to find the minimum of the loss function.
The regression coefficients are refined iteratively; the process of updating the
classifier's parameters is the model's learning/training process, which can also
be viewed as the coefficients converging to their optimal values.
'''
import numpy as np
import os
import matplotlib.pyplot as plt
import random
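
# A minimal sketch of the math implemented below (illustrative helper functions,
# added as an assumption of this note and not called by the code that follows):
# the sigmoid maps a linear score X·w to a probability, and the binary
# cross-entropy is the loss that gradient descent minimizes, with gradient
# X^T * (pred - label).
def _sigmoid(z):
    # p = 1 / (1 + exp(-z))
    return 1.0 / (1.0 + np.exp(-z))

def _binary_cross_entropy(pred, label):
    # mean of -[y*log(p) + (1-y)*log(1-p)] over the samples
    return float(np.mean(-(label * np.log(pred) + (1 - label) * np.log(1 - pred))))
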
def Read_Data(data_path):
    '''
    Read the whitespace-separated txt file at the given full path and convert it
    into numpy arrays (one row per sample, the last column being the label).
    :param data_path:
    :return:
    '''
    file_obj=open(data_path)
    all_lines=file_obj.readlines()
    num_samples=len(all_lines)
    num_feat=len(all_lines[0].split())-1
    data_array=np.zeros((num_samples,num_feat))
    label_array=np.zeros((num_samples))
    # print(data_array.shape)
    for i,line in enumerate(all_lines):
        line=line.split()
        for j in range(num_feat):
            # float() parses negative values directly, so no explicit sign handling is needed
            data_array[i][j]=float(line[j])
        label_array[i]=float(line[-1])  # float() also tolerates labels written as "1.0"
    file_obj.close()
    return data_array,label_array

class logistic_reg(object):
    def __init__(self,num_feat):
        # self.weight=np.random.rand(num_feat).reshape(num_feat,1)
        self.weight = np.ones(num_feat).reshape(num_feat, 1)
        # self.bias=np.ones((1))
    def forward(self,train_data,train_label,lr,is_training):
        '''
        Forward pass: compute the predicted probabilities for the input data.
        In training mode, also compute the cross-entropy loss from the predictions
        and the training labels, and update the model parameters by gradient descent.
        :param train_data: numpy.array of shape [num_samples, num_feat]
        :param train_label: numpy.array of shape [num_samples] or [num_samples, 1]
        :param lr: learning rate
        :param is_training: if False, only the predicted probabilities are returned
        :return: pred (and loss when is_training is True)
        '''
        # train_data=np.concatenate((train_data,np.ones((train_data.shape[0],1))),axis=1)
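        # prepend a column of ones so that weight[0] acts as the bias term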
        train_data=np.concatenate((np.ones((train_data.shape[0],1)),train_data),axis=1)

        pred=np.dot(train_data,self.weight)
        # pred shape [num_samples,1]
        pred=1/(1+np.exp(-pred))
        # forward pass: sigmoid(X·w) is the probability predicted under the current regression coefficients

        if not is_training:  # in evaluation mode, return the predicted probabilities directly
            return pred
        else:
            log_fore=-np.log(pred)    # -log(p) for samples whose ground-truth label is 1
            log_back=-np.log(1-pred)  # -log(1-p) for samples whose ground-truth label is 0

            if len(train_label.shape)==1:
                train_label=np.expand_dims(train_label,axis=1)

            loss=np.sum(log_fore[np.where(train_label==1)])+np.sum(log_back[np.where(train_label==0)])
            loss/=train_data.shape[0]
            # cross-entropy loss, averaged over the batch

            # gradient descent: for the sigmoid + cross-entropy loss, dL/dW = X^T * (pred - label)
            error=pred-train_label
            dW=np.dot(train_data.T,error)
            # dW = [num_feat,1]
            # dW/=train_data.shape[0]  # the gradient is summed rather than averaged over the batch; the small lr compensates
            db=np.mean(pred-train_label)  # bias gradient, unused because the bias is folded into weight[0]

            self.weight-=lr*dW
            # self.bias-=lr*db
            return pred,loss

if __name__=='__main__':
    data_root_path='F:\\machine_learning\\Ch05'
    data_path=os.path.join(data_root_path,'testSet.txt')
    data_array,label_array=Read_Data(data_path)
    # print(data_array,label_array)
    # print(data_array.shape)

    log_model=logistic_reg(data_array.shape[1]+1)  # +1 for the bias column prepended inside forward()

    epoch=500
    lr=0.001
    for i in range(epoch):
        pred,loss=log_model.forward(data_array,label_array,lr,is_training=True)
        # print('epoch',i,'loss',loss)
    print('weight',log_model.weight)
    # print('bias',log_model.bias)

    print(pred.shape,np.min(pred),np.max(pred))

    prediction=np.where(pred>0.5,1,0)
    prediction=prediction.reshape(-1)

    # print(prediction[:50])
    # print(label_array[:50])

    accuracy=np.sum(prediction==label_array)/label_array.shape[0]
    print('full batch training , accuracy',accuracy)
    '''
    Gradient descent here uses the entire training set for every iteration/update,
    i.e. all samples are processed at once, which is known as (full-)batch processing.
    accuracy 0.96
    '''
    #  [[ 0.82234723]
    #  [-0.27227592]
    #  [ 1.03490603]]

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # set the title
    ax1.set_title('two dimension data')
    # set the x-axis label
    plt.xlabel('x0')
    # set the y-axis label
    plt.ylabel('x1')
    # scatter plot of the two classes
    fore_point=data_array[np.where(label_array==1)]
    back_point=data_array[np.where(label_array==0)]
    ax1.scatter(x=fore_point[:, 0], y=fore_point[:, 1], c='r', marker='o', label='fore_point')
    ax1.scatter(x=back_point[:, 0], y=back_point[:, 1], c='b', marker='^', label='back_point')
    # show the legend
    plt.legend()

    x=np.arange(-3,3,0.1)
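    # decision boundary: w0 + w1*x0 + w2*x1 = 0  =>  x1 = -(w1/w2)*x0 - w0/w2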
    y=-(log_model.weight[1]/log_model.weight[2])*x-(log_model.weight[0]/log_model.weight[2])
    plt.scatter(x,y,c='g', marker='.')

    # display the figure
    plt.show()

    '''
    Batch gradient descent (each update of the regression coefficients uses every
    sample in the training set) becomes computationally expensive on large datasets,
    which motivates stochastic gradient descent.
    Stochastic gradient descent: each update uses the gradient computed from a single
    training sample, which is also known as online learning.

    Stochastic gradient descent can be viewed as mini-batch gradient descent with
    batch size = 1. The usual practice in deep learning is: at the start of every
    epoch, randomly shuffle the whole training set; at each step, draw batch_size
    samples from the shuffled data and perform one coefficient update before moving
    on to the next step. Within one epoch the samples drawn at different steps do not
    overlap, and the samples used over all steps of an epoch add up to the entire
    training set.
    A polynomial learning rate can be combined with mini-batch gradient descent,
    i.e. the learning rate decays along a polynomial curve during training:
    lr(t) = base_lr * (1 - t/T) ** power
    where t is the current step and T is the total number of steps. (The loops below
    use a simpler step decay, multiplying lr by 0.1 every 25 epochs; see the sketch
    after this block for the polynomial schedule.)
    '''
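    # A minimal sketch of the polynomial learning-rate schedule described above
    # (hypothetical helper, an assumption of this note; it is not wired into the
    # training loops below, which use the simple step decay instead):
    def poly_lr(base_lr, step, total_steps, power=0.9):
        # lr(t) = base_lr * (1 - t / T) ** power, decaying from base_lr towards 0
        return base_lr * (1.0 - step / float(total_steps)) ** power
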
    log_model_2 = logistic_reg(data_array.shape[1] + 1)

    epoch = 50
    lr=0.1
    # base_lr = 0.1
    batch_size=1
    if data_array.shape[0]%batch_size==0:
        num_step=data_array.shape[0]//batch_size
    else:
        num_step = int(data_array.shape[0]/ batch_size)+1
    # T = epoch * num_step
    for i in range(epoch):
        # randomly shuffle the whole training set at the start of each epoch
        index=random.sample(range(data_array.shape[0]), data_array.shape[0])
        if i%25==0:
            lr*=0.1
        # print(i,lr)
        for j in range(num_step):
            # lr=base_lr*(num_step*i+j+1)/()
            row_start=j*batch_size
            row_end=(j+1)*batch_size
            if row_end>data_array.shape[0]:
                row_end=data_array.shape[0]
            pred, loss = log_model_2.forward(data_array[index[row_start:row_end]], label_array[index[row_start:row_end]], lr, is_training=True)
            # print('epoch',i,'loss',loss)
    pred=log_model_2.forward(data_array,label_array,lr=0,is_training=False)
    prediction = np.where(pred > 0.5, 1, 0)
    prediction = prediction.reshape(-1)

    accuracy = np.sum(prediction == label_array) / label_array.shape[0]
    print('mini batch accuracy', accuracy)
    # print('weight', log_model.weight)
    # print('bias',log_model.bias)
    # mini batch accuracy 0.96   after 50 epochs this matches the accuracy of 500 epochs of full-batch gradient descent

    '''
    Load the real, larger-scale dataset (the horse colic data).
    '''
    data_root_path = 'F:\\machine_learning\\Ch05'
    data_path = os.path.join(data_root_path, 'testSet.txt')
    data_train,label_train=Read_Data(os.path.join(data_root_path,'horseColicTraining.txt'))
    data_test,label_test=Read_Data(os.path.join(data_root_path,'horseColicTest.txt'))
    print(data_train.shape,label_train.shape)

    # (299, 21) (299,)   each training sample in this dataset has 21 features

    horse_model=logistic_reg(num_feat=data_train.shape[1]+1)

    epoch = 50
    lr=0.1
    # base_lr = 0.1
    batch_size=50
    if data_train.shape[0]%batch_size==0:
        num_step=data_train.shape[0]//batch_size
    else:
        num_step = int(data_train.shape[0]/ batch_size)+1
    # T = epoch * num_step
    for i in range(epoch):
        # randomly shuffle the whole training set at the start of each epoch
        index=random.sample(range(data_train.shape[0]), data_train.shape[0])
        if i%25==0:
            lr*=0.1
        # print(i,lr)
        for j in range(num_step):
            # lr=base_lr*(num_step*i+j+1)/()
            row_start=j*batch_size
            row_end=(j+1)*batch_size
            if row_end>data_train.shape[0]:
                row_end=data_train.shape[0]
            pred, loss = horse_model.forward(data_train[index[row_start:row_end]], label_train[index[row_start:row_end]], lr, is_training=True)
            # print('epoch',i,'loss',loss)
        pred=horse_model.forward(data_test,label_test,lr=0,is_training=False)
        prediction = np.where(pred > 0.5, 1, 0)
        prediction = prediction.reshape(-1)

        accuracy = np.sum(prediction == label_test) / label_test.shape[0]
        print('epoch',i,'horse mini batch accuracy', accuracy)
        '''
        epoch 48 horse mini batch accuracy 0.7611940298507462
        epoch 49 horse mini batch accuracy 0.7014925373134329
        '''


Reprinted from blog.csdn.net/WYXHAHAHA123/article/details/90053223