Andrew Ng Exercise 2 + Python + Logistic Regression

Reference blog, with thanks to the original author; it can also be used to download the dataset.

# Learning to read data with pandas is important
#https://blog.csdn.net/MOU_IT/article/details/78762196
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'],usecols=[0,1,2],nrows=5)
#data = pd.read_csv('ex2data1.txt', header=None,nrows=3,usecols=[0,1,2],prefix="x")  # prefix only takes effect when header=None; it prefixes the auto-numbered column names (x0, x1, x2)
#data.index=["a","b","c"]   # sets the row labels; data.columns=[...] sets the column labels
print(data.head())   # shows the first 5 rows by default
print(data.describe())    # descriptive statistics
'''
positive = data[data.admitted.isin([1])]  # select the rows with admitted == 1
negative = data[data.admitted.isin([0])]  # select the rows with admitted == 0
# visualize the data
fig, ax = plt.subplots(figsize=(6,5))
ax.scatter(positive['exam1'], positive['exam2'], c='b', label='Admitted')
ax.scatter(negative['exam1'], negative['exam2'], s=50, c='r', marker='x', label='Not Admitted')
# place the legend above the plot
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width, box.height * 0.8])
ax.legend(loc='center left', bbox_to_anchor=(0.2, 1.12), ncol=3)
# set the axis labels
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
'''
print(data.info())  # inspect dtypes and non-null counts; very useful
data.to_csv('1.csv', index=False, header=False)  # save as CSV; index=False drops the row index, sep=',' (the default) is the separator, header=False omits the header row

print("----------------------------------------------")
print(data[0:5])  
#df.loc['20']  # get the row with the given index label; loc[] takes the label and returns a Series
print(type(data.iloc[3]))    # get the row at a given integer position (here the 4th row); iloc[] takes the position

print(data.iloc[0,0])    # get a single element

print(data.values)  # the underlying values as a NumPy array

for a,b,c in data.values:
    print (a,b,c)
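# A minimal loc/iloc illustration (not from the data above; the 'demo' frame and its labels are made up):
# loc[] selects by index label, iloc[] selects by integer position.
demo = pd.DataFrame({'exam1': [34.6, 30.3], 'exam2': [78.0, 43.9]}, index=['a', 'b'])
print(demo.loc['b'])    # the row labelled 'b'
print(demo.iloc[1])     # the second row by position (the same row here)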
# Only the most basic version is implemented here
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
def sigmoid(z):                                   # sigmoid (logistic) function
    return 1 / (1 + np.exp(- z))
def cost(theta, X, y):                            # cost function
    first = (-y) * np.log(sigmoid(X @ theta))     # @ is matrix multiplication on arrays, equivalent to np.dot()
    second = (1 - y)*np.log(1 - sigmoid(X @ theta))
    return np.mean(first - second)

data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
# add a ones column - this makes the matrix multiplication work out easier
if 'Ones' not in data.columns:   # column labels
    data.insert(0, 'Ones', 1)

# set X (training data) and y (target variable)
X = data.iloc[:, :-1].values # convert the DataFrame to its NumPy array representation
y = data.iloc[:, -1].values  # the return value is a NumPy array, not a NumPy matrix
theta = np.zeros(X.shape[1])   # initialize the parameters to be learned
print(cost(theta, X, y))       # print the initial cost
# 0.6931471805599453
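# A quick sanity check (reasoning added here, not in the original post): with theta = 0 every
# prediction is sigmoid(0) = 0.5, so the cost is -ln(0.5) = ln(2) for any data set.
print(-np.log(0.5))   # 0.6931471805599453, matching the initial cost printed above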

# Update the parameters with plain batch gradient descent. It works, although it takes 120000 iterations at a learning rate of 0.001; the built-in optimizers later on are worth trying as well.
# Only the most basic version is implemented here.
# How would numerical differentiation of the cost be carried out? (a finite-difference check is sketched after gradientDescent below)
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
def sigmoid(z):                                   # sigmoid (logistic) function
    return 1 / (1 + np.exp(- z))
def computeCost(X, y, theta):                     # cost function
    first = (-y) * np.log(sigmoid(X @ theta))     # @ is matrix multiplication on arrays, equivalent to np.dot()
    second = (1 - y)*np.log(1 - sigmoid(X @ theta))
    return np.mean(first - second)

def gradientDescent(X, y, theta, alpha, epoch):
    """return theta, cost"""

    cost = np.zeros(epoch)  # ndarray holding the cost after every epoch
    m = X.shape[0]          # number of training examples m

    for i in range(epoch):
        # fully vectorized update: theta := theta - (alpha / m) * X.T @ (sigmoid(X @ theta) - y)
        theta = theta - (X.T @ (sigmoid(X @ theta) - y)) / m * alpha
        cost[i] = computeCost(X, y, theta)

    return theta, cost
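# To answer the question above about numerical differentiation, here is a minimal
# finite-difference gradient check (a sketch added here, not part of the original post).
# X_chk, y_chk and t_chk are tiny made-up arrays used only for this check.
X_chk = np.array([[1.0, 0.5, -1.2], [1.0, -0.3, 0.8], [1.0, 1.5, 0.1]])
y_chk = np.array([1.0, 0.0, 1.0])
t_chk = np.array([0.1, -0.2, 0.3])
analytic = (X_chk.T @ (sigmoid(X_chk @ t_chk) - y_chk)) / len(X_chk)  # analytic gradient used in the update above
eps = 1e-6
numeric = np.zeros_like(t_chk)
for j in range(len(t_chk)):
    e = np.zeros_like(t_chk)
    e[j] = eps
    # central difference of the cost with respect to theta_j
    numeric[j] = (computeCost(X_chk, y_chk, t_chk + e) - computeCost(X_chk, y_chk, t_chk - e)) / (2 * eps)
print(np.max(np.abs(analytic - numeric)))   # should be very close to zero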




data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
#data = (data - data.mean()) / data.std()  # standardize the data; verified to be wrong here, because y is a 0/1 variable and must not be standardized
# add a ones column - this makes the matrix multiplication work out easier
if 'Ones' not in data.columns:   # column labels
    data.insert(0, 'Ones', 1)

# set X (training data) and y (target variable)
X = data.iloc[:, :-1].values # convert the DataFrame to its NumPy array representation
y = data.iloc[:, -1].values  # Return is NOT a Numpy-matrix, rather, a Numpy-array.
theta = np.zeros(X.shape[1])   # initialize the parameters to be learned; theta is now an (n+1)-dimensional vector
print(computeCost(X, y, theta))       # print the initial cost
# 0.6931471805599453
epoch=120000
alpha=0.001
final_theta, cost = gradientDescent(X, y, theta, alpha, epoch)
#print(cost)
print(final_theta)
# the cost curve decreases almost linearly here, which looks a bit odd
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(np.arange(epoch), cost, 'r')  # np.arange() returns an evenly spaced array
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()

# plot the decision boundary
positive = data[data.admitted.isin([1])]  # 1
negative = data[data.admitted.isin([0])]  # 0
x1 = np.arange(130, step=0.1)
x2 = -(final_theta[0] + x1*final_theta[1]) / final_theta[2]   # solve theta0 + theta1*x1 + theta2*x2 = 0 for x2

fig, ax = plt.subplots(figsize=(8,5))
ax.scatter(positive['exam1'], positive['exam2'], c='b', label='Admitted')
ax.scatter(negative['exam1'], negative['exam2'], s=50, c='r', marker='x', label='Not Admitted')
ax.plot(x1, x2)
ax.set_xlim(0, 130)
ax.set_ylim(0, 130)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_title('Decision Boundary')
plt.show()
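# Not in the original post: a quick training-set accuracy check for the gradient-descent fit,
# using the usual 0.5 threshold on sigmoid(X @ theta).
pred = (sigmoid(X @ final_theta) >= 0.5).astype(int)
print((pred == y).mean())   # fraction of training examples classified correctly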

# Verify the fit using the built-in optimization routines.
# These advanced optimization algorithms usually run far faster than plain gradient descent and pick and adapt the step size automatically; the downside is that their internals are considerably more complex.
# Fit with fmin_tnc or with minimize; minimize's method argument selects the algorithm, TNC among them.
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.optimize as opt
def sigmoid(z):                                   # sigmoid (logistic) function
    return 1 / (1 + np.exp(- z))
def computeCost(theta, X, y):     # cost function; theta must be the first argument, in the same order as gradient below, otherwise the optimizers raise an error
    first = (-y) * np.log(sigmoid(X @ theta))     # @ is matrix multiplication on arrays, equivalent to np.dot()
    second = (1 - y)*np.log(1 - sigmoid(X @ theta))
    return np.mean(first - second)
def gradient(theta, X, y):
    return (X.T @ (sigmoid(X @ theta) - y))/len(X)  
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
if 'Ones' not in data.columns:   # column labels
    data.insert(0, 'Ones', 1)

X = data.iloc[:, :-1].values # convert the DataFrame to its NumPy array representation
y = data.iloc[:, -1].values  # the return value is a NumPy array, not a NumPy matrix
theta = np.zeros(X.shape[1])   # initialize the parameters; theta is an (n+1)-dimensional vector
#print(computeCost(theta, X, y))       # print the initial cost
# 0.6931471805599453
# built-in method 1: fmin_tnc
result = opt.fmin_tnc(func=computeCost, x0=theta, fprime=gradient, args=(X, y)) # approx_grad: if set to True, the gradient is approximated numerically instead
print(result)
print("================================================================")
# built-in method 2: minimize with method='TNC'
res = opt.minimize(fun=computeCost, x0=theta, args=(X, y), method='TNC', jac=gradient)
print(res)
# reference blog: https://www.cnblogs.com/tongtong123/p/10634716.html
'''
  NIT   NF   F                       GTG
    0    1  6.931471805599453E-01   2.71082898E+02
    1    3  6.318123602631309E-01   7.89087138E-01
    2    5  5.892425303175509E-01   7.39225780E+01
    3    7  4.227824571707321E-01   1.85266580E+01
    4    9  4.072926869723624E-01   1.68671186E+01
    5   11  3.818855110233386E-01   1.07734961E+01
    6   13  3.786234910783183E-01   2.31585002E+01
tnc: stepmx = 1000
    7   16  2.389267599630711E-01   3.00820661E+00
    8   18  2.047203831000229E-01   1.52223779E-01
    9   20  2.046713862657213E-01   6.62489681E-02
   10   22  2.035303178562529E-01   9.30772355E-04
tnc: fscale = 32.7777
   11   24  2.035293537528956E-01   8.07516648E-06
   12   26  2.035251135439432E-01   1.80172613E-04
   13   28  2.034984114325442E-01   5.02823356E-04
   14   30  2.034978387668955E-01   9.95781243E-06
   15   32  2.034977910481771E-01   3.78736114E-06
   16   34  2.034977391096564E-01   1.95536469E-05
   17   36  2.034977015894747E-01   2.28936970E-13
tnc: |pg| = 1.45975e-08 -> local minimum
   17   36  2.034977015894747E-01   2.28936970E-13
tnc: Local minima reach (|pg| ~= 0)
(array([-25.1613186 ,   0.20623159,   0.20147149]), 36, 0)
================================================================
     fun: 0.20349770158947475
     jac: array([8.86249424e-09, 7.33646598e-08, 4.72732538e-07])
 message: 'Local minimum reached (|pg| ~= 0)'
    nfev: 36
     nit: 17
  status: 0
 success: True
       x: array([-25.1613186 ,   0.20623159,   0.20147149])
Press any key to continue . . .
'''
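# A small sanity check added here (not in the original post): with the fitted parameters,
# a student scoring 45 on exam 1 and 85 on exam 2 gets an admission probability of about 0.776,
# which is the value the course exercise expects.
print(sigmoid(np.array([1, 45, 85]) @ res.x))   # ~0.776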
# A better way to fit this data is to create more features from each data point: map the features onto all polynomial terms of x1 and x2 up to the sixth power.
# Even with this many parameters and features, regularization can still keep overfitting under control. (A quick check of the number of mapped columns is sketched after feature_mapping below.)
'''
a={} 
a["f{}{}".format(0,0)]=1
print(a)  #{'f00': 1}
'''
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
def plot_data(data):            # scatter plot of the raw data
    positive = data[data['Accepted'].isin([1])]
    negative = data[data['Accepted'].isin([0])]

    fig, ax = plt.subplots(figsize=(8,5))
    ax.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
    ax.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Rejected')
    ax.legend()
    ax.set_xlabel('Test 1 Score')
    ax.set_ylabel('Test 2 Score')
    plt.show()

def feature_mapping(x1, x2, power):
    data = {}
    for i in np.arange(power + 1):
        for p in np.arange(i + 1):
            data["f{}{}".format(i - p, p)] = np.power(x1, i - p) * np.power(x2, p)

#     data = {"f{}{}".format(i - p, p): np.power(x1, i - p) * np.power(x2, p)
#                 for i in np.arange(power + 1)
#                 for p in np.arange(i + 1)
#             }
    return pd.DataFrame(data)
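# Quick check (illustrative, not from the original post): mapping two features up to degree 6
# should give (6+1)*(6+2)/2 = 28 polynomial columns, including the constant term f00.
# The input values below are made up just to show the resulting shape.
print(feature_mapping(np.array([1.0]), np.array([2.0]), power=6).shape)   # (1, 28)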
def costReg(theta, X, y, l=1):
    # do not penalize the first parameter theta_0
    _theta = theta[1:]
    reg = (l / (2 * len(X))) * (_theta @ _theta)  # _theta @ _theta == inner product

    return cost(theta, X, y) + reg
def cost(theta, X, y):
    first = (-y) * np.log(sigmoid(X @ theta))
    second = (1 - y)*np.log(1 - sigmoid(X @ theta))
    return np.mean(first - second)
def sigmoid(z):                                   # sigmoid (logistic) function
    return 1 / (1 + np.exp(- z))

def gradientReg(theta, X, y, l=1):
    # regularized gradient: gradient(theta, X, y) + (l / m) * theta, with theta_0 left unregularized
    reg = (l / len(X)) * theta
    reg[0] = 0
    return gradient(theta, X, y) + reg
def gradient(theta, X, y):
    return (X.T @ (sigmoid(X @ theta) - y))/len(X)  
def predict(theta, X):
    probability = sigmoid(X@theta)
    return [1 if x >= 0.5 else 0 for x in probability]  # return a list

data2 = pd.read_csv('ex2data2.txt', names=['Test 1', 'Test 2', 'Accepted'])
#print(data2.head())
#plot_data(data2)
x1 = data2['Test 1'].values
x2 = data2['Test 2'].values
_data2 = feature_mapping(x1, x2, power=6)
#print(_data2.head())   # the mapped features allow us to build a more expressive classifier, but they also make overfitting easier
# regularization helps reduce the overfitting problem
X = _data2.values 
y = data2['Accepted'].values
theta = np.zeros(X.shape[1])
#print(costReg(theta, X, y, l=1))  #     0.6931471805599454

result2 = opt.fmin_tnc(func=costReg, x0=theta, fprime=gradientReg, args=(X, y, 2))
print(result2)  # the final fitted parameters

print("======================================================================")
model = linear_model.LogisticRegression(penalty='l2', C=1.0)
model.fit(X, y.ravel())
# In a notebook the fit call echoes the fitted estimator:
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
#           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
#           verbose=0, warm_start=False)
print(model.score(X, y))    # mean accuracy on the training set
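# Optional variation (a sketch added here, not in the original post): since X already contains
# the constant column f00 from feature_mapping, sklearn's own intercept can be disabled.
# C is the inverse regularization strength, so a smaller C means a stronger L2 penalty.
model2 = LogisticRegression(penalty='l2', C=1.0, fit_intercept=False)
model2.fit(X, y.ravel())
print(model2.score(X, y))   # accuracy of this variant, for comparison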
print("======================================================================")
# compute the accuracy manually
# method 1
final_theta = result2[0]
predictions = predict(final_theta, X)
correct = [1 if a==b else 0 for (a, b) in zip(predictions, y)]
accuracy = sum(correct) / len(correct)
print(accuracy)
# method 2 (requires: from sklearn.metrics import classification_report)
#print(classification_report(y, predictions))

# decision boundary
x = np.linspace(-1, 1.5, 250)
xx, yy = np.meshgrid(x, x)

z = feature_mapping(xx.ravel(), yy.ravel(), 6).values
z = z @ final_theta
z = z.reshape(xx.shape)
positive = data2[data2['Accepted'].isin([1])]
negative = data2[data2['Accepted'].isin([0])]
fig, ax = plt.subplots(figsize=(8,5))
ax.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.legend()
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
#plt.show()   # do not call show() here; the contour below still has to be drawn on the same axes
plt.contour(xx, yy, z, 0)   # the level-0 contour of X @ theta is the decision boundary
plt.ylim(-.8, 1.2)
plt.show()


'''
>>> import numpy as np
>>> a=np.array([1,2,3])
>>> b=np.array([4,5,6])
>>> a*b
array([ 4, 10, 18])
>>> np.dot(a,b)
32
>>> a@b
32
>>>
'''
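# For 2-D arrays the distinction matters more: * stays element-wise while @ (and np.dot)
# performs matrix multiplication. A small illustration (added here for completeness):
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(A * B)   # [[ 5 12] [21 32]]  element-wise product
print(A @ B)   # [[19 22] [43 50]]  matrix product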



Reposted from blog.csdn.net/qzzzxiaosheng/article/details/106804330