吴恩达机器学习练习2(python)：逻辑回归

吴恩达机器学习课程：网易云课堂

此文转载于：黄海广博士，[email protected]，感谢吴恩达先生、黄海广博士及所有奋斗在AI一线的前辈们，努力学习，自此刻始。

注意：此文只是方便本人学习，侵权立删。

更权威科学的课程和笔记请移步：https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.optimize as opt

# Location of the exercise-2 data set. The raw-string prefix keeps the Windows
# backslashes literal: in a normal string `\y`, `\M` and `\e` are invalid
# escape sequences, which newer Python versions warn about.
path = r'E:\yanwucao/test\ML\ex2\ex2data1.txt'
# `names` is given without `header`, so pandas reads the first line as data
# and labels the three columns with the names below.
data = pd.read_csv(path, names=['exam1', 'exam2', 'admitted'])
# print(data.head())

# 绘制两种类型的散点图
#
# isin函数详解
# 在pandas的DataFrame中，我们经常需要根据某属性来选取指定条件的行，这时isin方法就特别有效
# df = pd.DataFrame([[1,2,3],[1,3,4],[2,4,3]], index=['one', 'two', 'three'], columns=['A', 'B', 'C'])
# mask = df['A'].isin([1])
# print(mask)
# print(df[mask])
# 运行结果如下：
# one       True
# two       True
# three    False
# Name: A, dtype: bool
#      A  B  C
# one  1  2  3
# two  1  3  4

# positive = data['admitted'].isin([1])
# negative = data['admitted'].isin([0])
# fig, ax = plt.subplots(figsize=(12, 8))
# data1 = data[positive]
# data2 = data[negative]
# ax.scatter(data1['exam1'], data1['exam2'], s=50, c='r', marker='o', label='admitted')
# ax.scatter(data2['exam1'], data2['exam2'], s=50, c='b', marker='x', label='not admitted')
# ax.legend()
# ax.set_xlabel('exam1 score')
# ax.set_ylabel('exam2 score')
# plt.show()


def sigmoid(x):
    """Return the logistic (S-shaped) function ``1 / (1 + e**(-x))`` of ``x``.

    The arithmetic is element-wise NumPy, so ``x`` may be a scalar, an
    ndarray or an ``np.matrix``; the result has the same shape as the input.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
# 检测sigmoid函数是否正确
# nums = np.linspace(-10, 10, 20)
# fig1, ax = plt.subplots(figsize=(12, 8))
# ax.plot(nums, sigmoid(nums), c='r')
# plt.show()

# 错误的cost函数示例
# def cost(X, y, theta):     # 计算代价函数
#     theta = np.matrix(theta)
#     X = np.matrix(X)
#     y = np.matrix(y)
#     frist = np.multiply(-y, np.log(sigmoid(X * theta.T)))
#     second = np.multiply((y-1), np.log(1-sigmoid(X * theta.T)))
#     return np.sum(frist + second) / X.shape[0]

# 注意此处的cost、gradient函数里面形参的顺序不能变，因为和后面的fmin_tnc函数有关
def cost(theta, X, y):
    """Return the mean logistic-regression (cross-entropy) cost.

    The parameter order ``(theta, X, y)`` must not change: the optimizer
    calls ``func(theta, *args)`` with ``args=(X, y)``.

    Args:
        theta: Parameter vector; anything ``np.matrix`` turns into a row
            vector with one entry per column of ``X`` (for example a flat
            ndarray or a list).
        X: Design matrix, one row per sample.
        y: Labels (0 or 1). Must convert to a column with one row per
            sample so it lines up element-wise with ``X * theta.T``.

    Returns:
        The scalar cost averaged over the rows of ``X``.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    z = X * theta.T
    # With h = sigmoid(z):
    #   -log(h)     == log(1 + exp(-z)) == logaddexp(0, -z)
    #   -log(1 - h) == log(1 + exp(z))  == logaddexp(0, z)
    # Evaluating the losses this way never takes log(0), so the cost stays
    # finite (instead of nan from 0 * -inf) when the sigmoid saturates.
    positive_loss = np.multiply(y, np.logaddexp(0, -z))
    negative_loss = np.multiply(1 - y, np.logaddexp(0, z))
    return np.sum(positive_loss + negative_loss) / len(X)


def gradient(theta, X, y):
    """Return the gradient of the logistic-regression cost w.r.t. ``theta``.

    The parameter order ``(theta, X, y)`` matches ``cost`` because the
    optimizer calls both with the same arguments.

    Args:
        theta: Parameter vector; anything ``np.matrix`` turns into a row
            vector with one entry per column of ``X``.
        X: Design matrix, one row per sample.
        y: Labels. Must convert to a column with one row per sample so the
            subtraction from ``sigmoid(X * theta.T)`` is element-wise.

    Returns:
        A 1-D ndarray with one partial derivative per column of ``X``.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    errors = sigmoid(X * theta.T) - y
    # One matrix product replaces the per-column loop: row j of X.T * errors
    # is sum(errors * X[:, j]) over all samples.
    return np.asarray(X.T * errors).ravel() / len(X)


def predict(theta, X):
    """Return a 0/1 class prediction for every row of ``X``.

    Inputs are coerced with ``np.matrix`` exactly as in ``cost`` and
    ``gradient``, so ``theta`` may also be the flat ndarray returned by the
    optimizer and ``X`` a plain ndarray.

    Args:
        theta: Parameter vector with one entry per column of ``X``.
        X: Design matrix, one row per sample.

    Returns:
        A list of Python ints: 1 where the predicted probability is at
        least 0.5, otherwise 0.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    # Flatten the (rows, 1) probability column so the threshold test below
    # compares plain scalars rather than 1x1 matrices.
    probability = np.asarray(sigmoid(X * theta.T)).ravel()
    return [1 if p >= 0.5 else 0 for p in probability]


col = data.shape[1]
label_index = col - 1

# Positional slicing needs iloc: every column before the last is a feature,
# the last column is the label.
X = data.iloc[:, :label_index]
# Prepend a column of ones so theta[0] acts as the intercept term.
X.insert(0, 'one', pd.Series(np.ones(len(data))))
X = np.matrix(X.values)
y = np.matrix(data.iloc[:, label_index:].values)
theta = np.matrix(np.zeros(X.shape[1]))
# print(cost(theta, X, y))

# Instead of hand-written gradient descent, let scipy's fmin_tnc search for
# the theta that minimises the cost; the same routine also works for linear
# regression.
result = opt.fmin_tnc(cost, theta, fprime=gradient, args=(X, y))
# print(result)
# print(cost(result[0], X, y))

# result[0] holds the optimised parameters; score them on the training set.
theta = np.matrix(result[0])
prediction = predict(theta, X)
array1 = [int(bool(guess == truth)) for guess, truth in zip(prediction, y)]
accuracy = np.sum(array1) / len(array1)
print(accuracy)



#############################################################################################################

import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# import scipy.optimize as opt
# from sklearn import linear_model
# 
# path = 'E:\yanwucao/test\ML\ex2\ex2data2.txt'
# data = pd.read_csv(path, header=None, names=['test1', 'test2', 'accepted'])
# # print(data.head())
# 
# # 数据可视化绘制出来
# # fig, ax = plt.subplots(figsize=(12, 8))
# # positive = data['accepted'].isin([1])
# # negative = data['accepted'].isin([0])
# # # print(positive.head())   # 测试isin
# # data1 = data['test1']
# # data2 = data['test2']
# # ax.scatter(data1[positive], data2[positive], c='r', marker='o', label='accepted')
# # ax.scatter(data1[negative], data2[negative], c='b', marker='x', label='unaccepted')
# # ax.legend()
# # ax.set_xlabel('test1 score')
# # ax.set_ylabel('test2 score')
# # plt.show()
# 
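# # Feature mapping: expand the two test scores into polynomial terms x1^(i-j) * x2^j so that a linear classifier can fit a non-linear decision boundary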
# degree = 5
# x1 = data['test1']
# x2 = data['test2']
# data.insert(3, 'ones', pd.Series(np.ones(len(data))))
# for i in range(1, degree):
#     for j in range(0, i):
#         data['F' + str(i) + str(j)] = pd.Series(np.power(x1, i-j) * np.power(x2, j))
# # pandas.drop剔除不符合条件的行/列
# data.drop('test1', axis=1, inplace=True)
# data.drop('test2', axis=1, inplace=True)
# # print(data.head())
# 
# 
# def sigmoid(x):
#     return 1 / (1 + np.exp(-x))
# 
# 
# # scipy.optimize.fmin_tnc passes theta to cost/gradient as a flat 1-D ndarray, so both functions first wrap theta, X and y in np.matrix to make X * theta.T a matrix product
# def cost(theta, X, y, learningRate):
#     theta = np.matrix(theta)
#     X = np.matrix(X)
#     y = np.matrix(y)
# 
#     frist = np.multiply(-y, np.log(sigmoid(X     * theta.T)))
#     second = np.multiply((y-1), np.log(1 - sigmoid(X * theta.T)))
#     regularization = learningRate / (2 * len(X)) * np.sum(np.power(theta[0, 1:theta.shape[1]], 2))
#     return np.sum(frist + second) / len(X) + regularization   # len(X)指的就是X的行数
# 
# 
# # scipy.optimize.fmin_tnc passes theta to cost/gradient as a flat 1-D ndarray, so both functions first wrap theta, X and y in np.matrix to make X * theta.T a matrix product
# def gradient(theta, X, y, learningRate):
#     theta = np.matrix(theta)
#     X = np.matrix(X)
#     y = np.matrix(y)
# 
#     temp = np.zeros(X.shape[1])
#     deviation = sigmoid(X * theta.T) - y
#     for i in range(X.shape[1]):
#         if i == 0:
#             temp[i] = np.sum(np.multiply(deviation, X[:, 0])) / len(X)
#         else:
#             temp[i] = (np.sum(np.multiply(deviation, X[:, i])) + learningRate * theta[0, i]) / len(X)
#     return temp
# 
# 
# def predict(theta, X):
#     possibility = sigmoid(X * theta.T)
#     return [1 if i >= 0.5 else 0 for i in possibility]
# 
# 
# locs = data.shape[1]
# X = data.iloc[:, 1:locs]
# X = np.matrix(X.values)
# y = data.iloc[:, 0:1]
# y = np.matrix(y.values)
# theta = np.matrix(np.zeros(X.shape[1]))
# # print(theta.shape)
# # 直观感受len(X) == X.shape[0]
# # print(len(X))
# # print(X.shape[0])
# learningRate = 1
# # print(cost(theta, X, y, learningRate))
# # print(gradient(theta, X, y, learningRate))
# 
# result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y, learningRate))
# # print(result)
# # print(type(predict(theta, X)))   # 返回list形式
# 
# theta1 = np.matrix(result[0])
# prediction = predict(theta1, X)
# error = [1 if a == b else 0 for (a, b) in zip(prediction, y)]
# accuracy = sum(error) / len(error)
# print(accuracy)
# 
# 
# ##########################################################################################
# # 使用sklearn高级包
# model = linear_model.LogisticRegression(penalty='l2', C=1.0)
# model.fit(X, y.ravel())
# print(model.score(X, y))

吴恩达机器学习练习2(python)：逻辑回归

猜你喜欢