机器学习第四节,Logistic 回归

最优化算法:解决最优化问题,例如如何在最短时间从A到达B

利用Logistic回归进行分类的主要思想是:根据现有数据对分类边界线建立回归公式,以此分类

训练分类器:利用最优化算法寻找最佳拟合参数

 

梯度上升法用来求函数的最大值

$w := w + \alpha \nabla_w f(w)$

梯度下降法用来求函数的最小值

$w := w - \alpha \nabla_w f(w)$

梯度定义了移动的方向,α(步长)定义了移动的距离



import numpy as np
from math import exp


def load_data():
    data_matrix = []
    label_matrix = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        split_num = line.strip().split()
        data_matrix.append([1.0, float(split_num[0]), float(split_num[1])])
        label_matrix.append(int(split_num[2]))

    return data_matrix, label_matrix


def sigmoid_func(x):
    """Element-wise logistic (sigmoid) function 1 / (1 + exp(-x)).

    Accepts a scalar or any array-like; returns an ndarray (or scalar)
    of the same shape with values in (0, 1).

    The original looped over x with a Python list comprehension, which
    produced a wrongly shaped (m, 1, 1) result when x was an (m, 1)
    matrix product; np.exp vectorizes the whole computation instead
    (overflow for very negative x safely saturates to 0.0 in NumPy).
    """
    return 1.0 / (1.0 + np.exp(-np.asarray(x, dtype=float)))


def grad_ascent(data_matrix, label_matrix, max_loop=500, alpha=0.01):
    """Batch gradient ascent for logistic regression.

    Args:
        data_matrix: m x n samples (first column is the 1.0 bias term).
        label_matrix: m class labels (0/1).
        max_loop: number of full-batch iterations (default matches the
            original hard-coded 500).
        alpha: step size (default matches the original 0.01).

    Returns:
        (n, 1) ndarray of fitted weights.
    """
    # 初始化数据
    X = np.asarray(data_matrix, dtype=float)
    # Labels as an (m, 1) column so `y - h` lines up with h's shape.
    # The original transposed h into a (1, m) row, which cannot be
    # subtracted from the (m, 1) label column.
    y = np.asarray(label_matrix, dtype=float).reshape(-1, 1)
    m, n = X.shape
    weights = np.ones((n, 1))
    # 循环: theta := theta + alpha * X^T (y - sigmoid(X theta))
    for _ in range(max_loop):
        h = 1.0 / (1.0 + np.exp(-(X @ weights)))
        error = y - h
        weights += alpha * (X.T @ error)

    return weights


def plot_best_fit():
    """Scatter the two classes of testSet.txt and draw the fitted boundary.

    Trains with sto_grad_ascent1, then plots the line w0 + w1*x1 + w2*x2 = 0
    solved for x2. Class-0 points are red squares, class-1 points black dots.
    """
    import matplotlib.pyplot as plt
    samples, labels = load_data()
    weights = sto_grad_ascent1(samples, labels)
    pts = np.array(samples)

    class0 = [k for k, lab in enumerate(labels) if lab == 0]
    class1 = [k for k, lab in enumerate(labels) if lab != 0]

    plt.scatter(pts[class0, 1], pts[class0, 2], c='r', marker='s')
    plt.scatter(pts[class1, 1], pts[class1, 2], c='k')

    xs = np.arange(-3.0, 3.0, 0.1)
    # Boundary: sigmoid(w.x) = 0.5  <=>  w0 + w1*x1 + w2*x2 = 0.
    ys = (-weights[0] - weights[1] * xs) / weights[2]
    plt.plot(xs, ys)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


def sto_grad_ascent(data, labels):
    """Plain stochastic gradient ascent: one pass, one sample per update.

    Bug fix: the original mixed a Python list sample with an (n, 1)
    weight column, so `data[i] * weights` broadcast to an (n, n) matrix
    and `math.exp` then crashed on the array argument. Each sample is
    now reshaped to an explicit (n, 1) column so all the shapes agree.

    Returns:
        (n, 1) ndarray of weights (same layout as sto_grad_ascent1).
    """
    data = np.asarray(data, dtype=float)
    m, n = data.shape
    weights = np.ones((n, 1))
    alpha = 0.01
    for i in range(m):
        x = data[i].reshape(n, 1)
        # h = sigmoid(w . x) for this single sample.
        h = 1.0 / (1.0 + np.exp(-float(x.T @ weights)))
        error = labels[i] - h
        weights += alpha * error * x
    return weights


# 改进的随机梯度上升算法
def sto_grad_ascent1(data, labels, num_iter=150):
    m, n = np.shape(data)
    weights = np.ones((n, 1))

    for j in range(num_iter):
        data_index = list(range(m))
        for i in range(m):
            alpha = 4/(1.0+j+i)+0.01
            rand_index = int(np.random.uniform(0, len(data_index)))
            transpose_data = np.mat(data[rand_index]).transpose()
            h = 1 / (1 + np.exp(-sum(np.array(transpose_data) * weights)))
            error = labels[rand_index] - h
            weights += alpha * error * np.array(transpose_data)
            del(data_index[rand_index])

    return weights


def classify(in_x, weights):
    """Classify one sample: 1 if sigmoid(w . x) > 0.5, else 0.

    Bug fix: the exponent must be exp(-w.x). The original used
    exp(+w.x), which computes 1 - sigmoid(w.x) and therefore inverts
    every prediction — this is why colic_test reported a ~60%+ error
    rate (roughly 1 minus the correct rate).
    """
    prob = 1.0 / (1.0 + np.exp(-float(np.sum(in_x * weights))))
    return 1 if prob > 0.5 else 0


def colic_test():
    """Train on horseColicTraining.txt, score on horseColicTest.txt.

    Each tab-separated line has 21 feature columns followed by the label
    in column 21. Prints and returns the test-set error rate.
    """
    train_set = []
    train_label = []
    # `with` closes the handles the original leaked.
    with open('horseColicTraining.txt') as train_file:
        for line in train_file:
            split_line = line.strip().split('\t')
            train_set.append([float(split_line[i]) for i in range(21)])
            train_label.append(float(split_line[21]))

    weights = sto_grad_ascent1(train_set, train_label, 500)

    errors = 0
    num_test = 0
    with open('horseColicTest.txt') as test_file:
        for line in test_file:
            num_test += 1
            split_line = line.strip().split('\t')
            # Build the sample as a (21, 1) column, matching the weights.
            in_x = np.array([[float(split_line[i])] for i in range(21)])
            if int(classify(in_x, weights)) != int(split_line[21]):
                errors += 1

    error_rate = errors / num_test
    print('error_rate:', error_rate)
    return error_rate


def multi_test():
    """Run colic_test ten times and print the mean error rate.

    The stochastic trainer gives a different result each run, so
    averaging several runs gives a steadier estimate.
    """
    num_runs = 10
    total = 0.0
    for _ in range(num_runs):
        total += colic_test()
    print('average_rate:', total / num_runs)


# Guard the script entry point so importing this module for its functions
# does not immediately kick off a full training/evaluation run.
if __name__ == '__main__':
    multi_test()

    # plot_best_fit()

    # data, label = load_data()
    # weight = grad_ascent(data, label)
    # print(weight)


但是运行完错误率在60%+,不知道问题出在哪里。
(审查注:问题出在 classify 函数——`prob = 1/(1+exp(sum(in_x*weights)))` 中指数漏了负号,应为 `exp(-sum(...))`,算出来的是 1-sigmoid,所有预测都被取反,错误率因此约等于 1 减去正确实现的错误率;另外 sto_grad_ascent1 中取样应写 `data[data_index[rand_index]]` 而不是 `data[rand_index]`,否则"无放回随机抽样"并未真正生效。)



猜你喜欢

转载自blog.csdn.net/ll523587181/article/details/78949567