版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_27668313/article/details/79048428
from scipy import io as spio
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from numpy import *
from sklearn.preprocessing import StandardScaler
from scipy import optimize
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')  # suppress ALL warnings (pandas/sklearn deprecation noise)
time_start = time.time()  # wall-clock start; elapsed time is printed at the end of the script
# ********************************************************************
# 使用sklearn库中的LR函数
# ********************************************************************
# def loadDataSet(filedir):
# DateSet = pd.read_csv(filedir)
# Data = DateSet.iloc[1:12001, 1:].as_matrix() # dataframe格式转成元组
# print('Data size: ', np.shape(Data))
# label = DateSet.iloc[1:12001, :1].as_matrix()
# # 分配训练数据和测试数据
# train_data, test_data, train_label, test_label = train_test_split(Data, label, test_size=0.3, random_state=0)
# return train_data, test_data, train_label, test_label
# filedir = 'E:\ProgramData\Python3Project\Digit Recognizer/train.csv'
# train_data, test_data, train_label, test_label = loadDataSet(filedir)
# train_label = np.ravel(train_label)
# test_label = np.ravel(test_label)
# model = LogisticRegression()
# model.fit(train_data, train_label)
# predict = model.predict(test_data)
# accuracy = np.mean(np.float64(predict == test_label)*100)
# print(accuracy)
# time_end = time.time()
# print('time: ', (time_end - time_start))
# =======================================================================
# *********************************************************************
# 自己编写LR函数
# *********************************************************************
def loadDataSet(filedir):
    """Load the digit-recognizer training CSV and split it 70/30.

    Parameters
    ----------
    filedir : path to a CSV whose first column is the label and whose
        remaining columns are pixel features (header row expected).

    Returns
    -------
    (train_data, test_data, train_label, test_label) : numpy arrays from
        train_test_split with test_size=0.3 and random_state=0.
    """
    dataset = pd.read_csv(filedir)
    # NOTE(review): iloc[1:42001] skips the first data row (row 0) — looks
    # like a 1-based-indexing off-by-one; kept to preserve original slicing.
    # .as_matrix() was removed in pandas 1.0; .to_numpy() is the supported
    # replacement and returns the same ndarray.
    data = dataset.iloc[1:42001, 1:].to_numpy()
    print('Data size: ', np.shape(data))
    label = dataset.iloc[1:42001, :1].to_numpy()
    # Fixed seed so the train/test split is reproducible across runs.
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=0.3, random_state=0)
    return train_data, test_data, train_label, test_label
def sigmoid(inX):
    """Logistic (sigmoid) activation, applied elementwise.

    Maps any real scalar or numpy array into (0, 1).
    """
    exponent = np.exp(-inX)
    return 1.0 / (1.0 + exponent)
def costfunction(initial_theta, X, y, initial_Lambda):
    """Regularized logistic-regression cost J(theta).

    Parameters
    ----------
    initial_theta : parameter vector, bias term first (fmin_bfgs passes
        it as a flat 1-D array).
    X : (m, n+1) design matrix with a leading column of ones.
    y : length-m vector of 0/1 targets.
    initial_Lambda : L2 regularization strength.

    Returns the (scalar) cost; the bias parameter is not regularized.
    """
    m = len(y)
    h = sigmoid(np.dot(X, np.transpose(initial_theta)))  # per-sample P(y=1)
    # Copy theta and zero the bias so it contributes nothing to the
    # regularization term.
    theta_reg = initial_theta.copy()
    theta_reg[0] = 0
    reg = np.dot(theta_reg, np.transpose(theta_reg))  # sum of squared non-bias weights
    J = (-np.dot(np.transpose(y), np.log(h))
         - np.dot(np.transpose(1 - y), np.log(1 - h))
         + reg * initial_Lambda / 2) / m
    return J
def gradient(initial_theta, X, y, initial_lambda):
    """Gradient of the regularized logistic-regression cost.

    Parameters mirror costfunction. Returns one partial derivative per
    parameter (n+1 entries, bias first); the bias is not regularized.
    """
    m = len(y)
    h = sigmoid(np.dot(X, np.transpose(initial_theta)))  # per-sample predictions
    # Zero the bias component so the regularization term skips it.
    theta_reg = initial_theta.copy()
    theta_reg[0] = 0
    return np.dot(np.transpose(X), h - y) / m + initial_lambda / m * theta_reg
def oneVsAll(X, y, num_labels, Lambda):
    """Train num_labels one-vs-all logistic-regression classifiers.

    Parameters
    ----------
    X : (m, n) feature matrix; an intercept column is prepended here.
    y : (m, 1) integer class labels in [0, num_labels).
    num_labels : number of classes (10 for the digit task).
    Lambda : regularization strength forwarded to costfunction/gradient.

    Returns
    -------
    all_theta : (n+1, num_labels) matrix; column i holds the fitted
        parameters of the binary classifier for class i.
    """
    m, n = X.shape
    all_theta = np.zeros((num_labels, n + 1))
    X = np.hstack((np.ones((m, 1)), X))  # prepend intercept column
    initial_theta = np.zeros((1, n + 1))
    # Build one binary 0/1 target column per class.
    class_y = np.zeros((m, num_labels))
    for i in range(num_labels):
        # ravel() produces the (m,) shape the column slice expects; the
        # original reshape(1, -1) relied on broadcasting a (1, m) array
        # into an (m,) slice, which modern numpy rejects.
        class_y[:, i] = (y == i).astype(np.int32).ravel()
    # Fit each binary classifier with BFGS using the analytic gradient.
    for i in range(num_labels):
        result = optimize.fmin_bfgs(costfunction, initial_theta, fprime=gradient,
                                    args=(X, class_y[:, i], Lambda))
        all_theta[i, :] = result.reshape(1, -1)
    # Transpose so each classifier's parameters occupy a column, matching
    # the layout predict() expects.
    return np.transpose(all_theta)
def predict(all_theta, X, y):
    """Score one-vs-all classifiers and return accuracy in percent.

    Parameters
    ----------
    all_theta : (n+1, num_labels) parameter matrix from oneVsAll.
    X : (m, n) feature matrix; an intercept column is prepended here.
    y : true labels, shape (m,) or (m, 1).

    Returns
    -------
    Percentage (0-100) of samples whose highest-scoring class equals y.
    """
    m = X.shape[0]
    X = np.hstack((np.ones((m, 1)), X))  # intercept column
    h = sigmoid(np.dot(X, all_theta))  # (m, num_labels) class probabilities
    # Predicted class = column index of the largest probability; the
    # index coincides with the digit itself.
    p = np.argmax(np.array(h), axis=1)
    # Vectorized replacement for the original per-sample counting loop.
    nums = np.count_nonzero(p == np.ravel(y))
    accuracy = nums / m * 100
    return accuracy
# ---- Script entry: load data, normalize, train, evaluate ----
# Raw string avoids the invalid "\P"/"\D" escape-sequence warning; the
# literal value is unchanged.
filedir = r'E:\ProgramData\Python3Project\Digit Recognizer/train.csv'
train_data, test_data, train_label, test_label = loadDataSet(filedir)
# Standardize features; scaling markedly improves accuracy here.
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
# BUG FIX: the test set must be scaled with the statistics fitted on the
# training set (transform), not re-fitted (fit_transform) — otherwise
# train and test live in different feature spaces.
test_data = scaler.transform(test_data)
alltheta = oneVsAll(train_data, train_label, 10, 0.001)
print('alltheta size: ', alltheta.shape)
acc = predict(alltheta, test_data, test_label)
print('accuracy: ', acc)
time_end = time.time()
print('time: ', (time_end - time_start))