A Python Implementation of Logistic Regression


There are a few open-source implementations on GitHub; this version makes some modifications on top of them.
The data and code for this article can be downloaded from:
http://download.csdn.net/download/a1b2c3d4123456/10015925

#coding=utf-8

import math
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import metrics


#Min-max scaling / normalization of the feature data to the range [-1, 1]
def Normalization(data):
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    return min_max_scaler.fit_transform(data)


#Load and preprocess the data
def DealData(DataPath):
    df = pd.read_csv(DataPath, header=0)
    df.columns = ['f1', 'f2', 'label']
    X = df[["f1", "f2"]]
    X = np.array(X)
    X = Normalization(X)
    # the labels carry a trailing ';' in the source file, so strip it before casting
    Y = df['label'].map(lambda x: float(x.rstrip(';')))
    Y = np.array(Y)
    return X, Y
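
For reference, DealData assumes a headered CSV with two feature columns and a label column whose values end with a semicolon (hence the rstrip(';')); the header names themselves do not matter because they are overwritten. A hypothetical train.csv might therefore look like:

f1,f2,label
2.3,4.1,1;
0.7,1.9,0;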


#The sigmoid function
def Sigmoid(z):
    return 1.0 / (1.0 + math.exp(-z))


#Compute the hypothesis h(x) for one sample
def Hypothesis(theta, xi):
    z = 0.0
    for i in range(len(theta)):
        z += xi[i] * theta[i]
    return Sigmoid(z)
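
Hypothesis evaluates the standard logistic-regression hypothesis, which in LaTeX form is:

h_\theta(x) = \sigma(\theta^{T} x) = \frac{1}{1 + e^{-\theta^{T} x}}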

#Compute the gradient for the j-th component of theta over the whole training set
#(batch gradient descent), already scaled by the learning rate alpha
def Cost_Function_Derivative(X, Y, theta, j, alpha):
    sum_errors = 0.0
    m = len(Y)
    for i in range(m):
        xi = X[i]
        xij = xi[j]
        hi = Hypothesis(theta, xi)
        sum_errors += (hi - Y[i]) * xij
    constant = float(alpha) / float(m)
    return constant * sum_errors
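
For clarity, the partial derivative of the cross-entropy loss that this function computes is

\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}

and the function returns this value already multiplied by the learning rate alpha, so the caller can subtract it from theta[j] directly.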


#One step of gradient descent: update every component of theta
def Gradient_Descent(X, Y, theta, alpha):
    new_theta = []
    for j in range(len(theta)):
        CFDerivative = Cost_Function_Derivative(X, Y, theta, j, alpha)  # alpha-scaled gradient of the j-th weight
        new_theta.append(theta[j] - CFDerivative)
    return new_theta
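
For comparison, here is a minimal vectorized sketch of the same batch update in NumPy (the name Gradient_Descent_Vectorized is mine, not part of the original code; it assumes theta is a NumPy array); it should produce the same result as the loop above:

def Gradient_Descent_Vectorized(X, Y, theta, alpha):
    # h = sigmoid(X . theta) for all m samples at once
    h = 1.0 / (1.0 + np.exp(-np.dot(X, theta)))
    # average gradient of the cross-entropy loss over the batch
    gradient = np.dot(X.T, h - Y) / len(Y)
    # one gradient-descent step
    return theta - alpha * gradient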


#Loss function: compute the model's cross-entropy loss on the data
def Cost_Function(X, Y, theta):
    sum_errors = 0.0
    m = len(Y)
    for i in range(m):
        xi = X[i]
        hi = Hypothesis(theta, xi)
        # binary cross-entropy term; for Y[i] in {0, 1} one of the two parts vanishes
        sum_errors += Y[i] * math.log(hi) + (1 - Y[i]) * math.log(1 - hi)
    J = (-1.0 / m) * sum_errors
    return J
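
Written out, this is the usual binary cross-entropy (log-loss):

J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h_\theta(x^{(i)}) + \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]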

#The main body of logistic regression
def Logistic_Regression(X, Y, alpha, theta, num_iters):

    for x in range(num_iters):  # iterate num_iters times
        theta = Gradient_Descent(X, Y, theta, alpha)
        # report the loss every 100 iterations; this step is optional,
        # and computing it once at the end would also be enough
        if x % 100 == 0:
            res = Cost_Function(X, Y, theta)
            print('cost is', res)
    return theta
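
One caveat: as written, the model has no intercept (bias) term, because theta holds exactly one weight per feature. A common remedy, shown here as an optional sketch rather than part of the original code, is to prepend a constant column of ones to X after normalization so that the first component of theta acts as the intercept:

# optional: prepend a bias column of ones so theta[0] becomes the intercept
# (remember to extend initial_theta by one component as well)
X = np.hstack([np.ones((X.shape[0], 1)), X])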


#Save the model: write one theta component per line
def Save_Model(model, modelPath):
    with open(modelPath, 'w') as save:
        for i in model:
            save.write(str(i))
            save.write('\n')
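
Since Save_Model writes one theta component per line, a matching loader (a hypothetical helper, not in the original post) is straightforward:

def Load_Model(modelPath):
    # read one float per line, in the order Save_Model wrote them
    with open(modelPath) as f:
        return [float(line.strip()) for line in f if line.strip()]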


#Predict with a trained model; returns a list of each sample's probability of belonging to class 1
def Logistic_Regression_Predict(model, X):
    predictOut = []
    for i in range(len(X)):
        probability = Hypothesis(model, X[i])
        predictOut.append(probability)
    return predictOut


#Compute the AUC from the predicted and true values
def Logistic_Regression_Auc(Predict, TrueValue):
    fpr, tpr, thresholds = metrics.roc_curve(TrueValue, Predict, pos_label=1)
    return metrics.auc(fpr, tpr)
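
AUC is threshold-free; if a hard accuracy number is also wanted, a small sketch using a 0.5 cutoff (my addition, built on sklearn's accuracy_score) could look like:

def Logistic_Regression_Accuracy(Predict, TrueValue, threshold=0.5):
    # binarize the predicted probabilities at the given threshold
    labels = [1 if p >= threshold else 0 for p in Predict]
    return metrics.accuracy_score(TrueValue, labels)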



if __name__ == '__main__':

    train_X, train_Y = DealData('train.csv')  # preprocess the data into training format
    test_X, test_Y = DealData('test.csv')     # preprocess the data into test format

    # initialize the parameters
    initial_theta = [0.0 for i in range(train_X.shape[1])]  # one theta per feature (shape[1], not ndim)
    alpha = 0.1       # learning rate
    iterations = 10   # number of iterations

    model_lr = Logistic_Regression(train_X, train_Y, alpha, initial_theta, iterations)
    Save_Model(model_lr, 'LRModel_v1')
    test_predict = Logistic_Regression_Predict(model_lr, test_X)
    auc = Logistic_Regression_Auc(test_predict, test_Y)
    print(auc)
