[实践篇] 逻辑回归

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/shenziheng1/article/details/83048500

代码、数据已经上传,可以自主下载。https://download.csdn.net/download/shenziheng1/10719760

1. 训练过程

import numpy as np

def load_data(file_name):
    """Load tab-separated training samples from a text file.

    Every non-blank line holds the feature values followed by the label in
    its last column. A constant x0 = 1 is prepended to each feature row.

    input:  file_name(string)  path of the training data file
    output: feature_data(mat)  one row per sample: [1, x1, ..., xk]
            label_data(mat)    one row per sample: [label]
    raises: ValueError if a field cannot be converted to float
    """
    feature_data = []
    label_data = []
    # "with" closes the file even when float() raises on a malformed field.
    with open(file_name, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (for example a trailing empty line).
                continue
            fields = line.split("\t")
            # x0 = 1 comes first; the last column is the label.
            feature_data.append([1] + [float(v) for v in fields[:-1]])
            label_data.append([float(fields[-1])])
    # np.asmatrix is the function np.mat aliased; np.mat is gone in NumPy 2.0.
    return np.asmatrix(feature_data), np.asmatrix(label_data)


def sig(x):
    """Sigmoid function 1 / (1 + exp(-x)), computed with np.exp."""
    exp_neg = np.exp(-x)
    return 1.0 / (1 + exp_neg)


def error_rate(h, label):
    """Mean cross-entropy (log) loss of the predictions.

    Despite its name this is not a misclassification ratio: it sums
    -(y * log(h) + (1 - y) * log(1 - h)) over the samples and divides by
    the number of rows of h.

    input:  h(mat)      predicted values, column 0 is read
            label(mat)  labels, column 0 is read
    output: (float) summed loss divided by m, the number of rows of h
    raises: ZeroDivisionError if h has no rows

    Rows whose prediction is not strictly between 0 and 1 add nothing to
    the sum (their logarithm is undefined) but still count in m.
    """
    m = np.shape(h)[0]
    prob = np.asarray(h, dtype=float)[:, 0]
    truth = np.asarray(label, dtype=float)[:m, 0]
    # Keep only the rows for which both log(h) and log(1 - h) are defined.
    usable = (prob > 0) & ((1 - prob) > 0)
    p = prob[usable]
    y = truth[usable]
    sum_err = 0.0 - np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
    # float() keeps the division a plain Python one (raises when m == 0).
    return float(sum_err) / m


def lr_train_bgd(feature, label, maxCycle, alpha):
    """Train logistic-regression weights with batch gradient descent.

    input:  feature(mat)   one row per sample, one column per feature
            label(mat)     one row per sample
            maxCycle(int)  number of gradient-descent iterations
            alpha(float)   learning rate (step size)
    output: w(mat) weights, shape (n, 1) with n = number of feature columns;
            all ones when maxCycle is 0

    A progress line with the current loss is printed every 100 iterations.
    """
    n = np.shape(feature)[1]  # number of feature columns
    w = np.asmatrix(np.ones((n, 1)))  # one weight per column, start at 1
    # Run exactly maxCycle updates; i is 1-based so the progress line is
    # printed on iterations 100, 200, ...
    for i in range(1, maxCycle + 1):
        h = sig(feature * w)
        err = label - h
        if i % 100 == 0:
            print("\t--------iter=" + str(i) +
                  ", train error rate=" + str(error_rate(h, label)))
        # The update must run on every iteration, not only on the
        # iterations that print progress.
        w = w + alpha * feature.T * err
    return w


def save_model(file_name, w):
    """Write the weights to a text file as one tab-separated line.

    input: file_name(string) the filepath for saving model
           w(mat) weights; column 0 of every row is written
    """
    m = np.shape(w)[0]
    # Build the text first so a bad w cannot leave a truncated file behind.
    w_array = [str(w[i, 0]) for i in range(m)]
    # "with" guarantees the file is flushed and closed.
    with open(file_name, "w") as f_w:
        f_w.write("\t".join(w_array))


#def imgplot(feature, w):


if __name__ == "__main__":
    # Import the training data.
    print("--------load data--------")
    feature, label = load_data("data.txt")
    # Train the logistic regression model (1000 iterations, step 0.01).
    print("--------training--------")
    w = lr_train_bgd(feature, label, 1000, 0.01)
    # Save the model.
    print("--------save model---------")
    save_model("weights", w)

训练结果为:

2. 测试代码

import numpy as np
from logistic_training import sig


def load_weight(w):
    """Load model weights from a tab-separated text file.

    input:  w(string)  path of the weight file
    output: (mat) one row per non-blank line of the file, one column per
            tab-separated value
    raises: ValueError if a field cannot be converted to float
    """
    rows = []
    # "with" closes the file even when float() raises on a malformed field.
    with open(w) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (for example a trailing empty line).
                continue
            rows.append([float(x) for x in line.split("\t")])
    # np.asmatrix is the function np.mat aliased; np.mat is gone in NumPy 2.0.
    return np.asmatrix(rows)


def load_data(file_name, n):
    """Load unlabeled, tab-separated test samples from a text file.

    input:  file_name(string)  path of the test data file
            n(int)             expected row width including the leading 1
    output: (mat) one row per accepted line: [1, x1, ..., x(n-1)]
    raises: ValueError if a field of an accepted line is not a float

    Lines that do not have exactly n - 1 fields are skipped.
    """
    feature_data = []
    # "with" closes the file even when float() raises on a malformed field.
    with open(file_name) as f:
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) != n - 1:
                continue
            # x0 = 1 is prepended, matching the layout used for training.
            feature_data.append([1] + [float(x) for x in fields])
    # np.asmatrix is the function np.mat aliased; np.mat is gone in NumPy 2.0.
    return np.asmatrix(feature_data)


def predict(data, w):
    """Turn sigmoid scores into hard 0.0 / 1.0 class predictions.

    input:  data(mat)  one row per sample
            w(mat)     weights; transposed before the product, so a
                       1 x n row is expected (TODO confirm with callers)
    output: (mat) same shape as sig(data * w.T); 0.0 where the score is
            below 0.5, otherwise 1.0
    """
    h = sig(data * w.T)
    # "not (h < 0.5)" instead of "h >= 0.5": a NaN score fails the
    # "< 0.5" test and is therefore mapped to 1.0.
    return np.logical_not(h < 0.5).astype(float)


def save_result(file_name, result):
    """Write the predictions to a text file as one tab-separated line.

    input: file_name(string)  path of the output file
           result(mat)        predictions; column 0 of every row is written
    """
    m = np.shape(result)[0]
    tmp = [str(result[i, 0]) for i in range(m)]
    # "with" guarantees the file is flushed and closed.
    with open(file_name, "w") as f_result:
        f_result.write("\t".join(tmp))

if __name__ == "__main__":
    # Load the LR model.
    print("--------load model---------")
    w = load_weight("weights")
    n = np.shape(w)[1]  # number of weights = expected row width
    # Load the testing data.
    testData = load_data("test_data", n)
    # Predict the test data.
    print("--------prediction--------")
    h = predict(testData, w)
    print(h)
    # Save the prediction results.
    print("--------save prediction--------")
    save_result("results", h)

3. 补充知识

  • readlines():  用于读取所有行(直到结束符 EOF)并返回列表,该列表可以由 Python 的 for ... in ... 结构进行处理。如果已经到达结束符 EOF,则返回空列表。
  • strip():用于移除字符串头尾指定的字符或字符序列(默认移除空白字符,如空格、换行符、制表符)。注意:该方法只能删除开头或是结尾的字符,不能删除中间部分的字符。
text = "00000003210Runoob01230000000"  # not named "str": that would shadow the built-in
print(text.strip('0'))  # remove the character 0 from both ends

>>> 3210Runoob0123
  • join(): 用于将序列中的元素以指定的字符连接生成一个新的字符串
sep = "-"  # not named "str": that would shadow the built-in
seq = ("a", "b", "c")  # sequence of strings
print(sep.join(seq))

>>> a-b-c

猜你喜欢

转载自blog.csdn.net/shenziheng1/article/details/83048500