机器学习--逻辑回归模型(Logistic-Regression)的C++实现

实验思路

在octave中，利用rand函数生成两类随机数，一类的y > x^2，另一类的y < x^2
将数据导出，存为txt文件
用代价函数衡量模型对训练数据的拟合程度，用precision和recall评估模型在测试集上的性能
采用梯度下降法对模型进行优化

代码展示

//
//  main.cpp
//  logistic regression
//
//  Created by zsp on 16/07/2018.
//  Copyright © 2018 zsp. All rights reserved.
//

#include <iostream>
#include <cmath>
#include <fstream>

const int SAMPLE = 402;     // number of sample rows the data arrays are sized for
const int PARAMETER = 2;    // number of features per sample (each data row also carries one label column)

// Returns the sigmoid hypothesis value for one sample's feature values `fea`
// under the weights `para`; `count` is the number of features to use.
double hypoVal(double para[], double fea[], int count);
// Prints and returns the averaged cross-entropy cost of the weights `para`
// over the first `amount` rows of `allX` with labels `label`.
double costVal(double para[], double allX[][PARAMETER + 1], double label[], int amount, int count);

using namespace std;

int main()
{
    // 定义二维数组存储数据
    double X[SAMPLE][PARAMETER + 1] = { 0 };
    // 打开文件
    fstream infile;
    infile.open("dataset.txt");
    if (!infile)
    {
        cout << "cannot open file!" << endl;
        return -1;
    }
    // 写入数据
    for (int i = 0; i < SAMPLE; i++)
    {
        for (int j = 0; j < PARAMETER + 1; j++)
        {
            infile >> X[i][j];
        }
    }
    // 验证是否写入数据
    for (int i = 0; i < SAMPLE; i++)
    {
        for (int j = 0; j < PARAMETER + 1; j++)
        {
            cout << X[i][j] << "\t\t";
        }
        cout << endl;
    }
    infile.close();

    double y[SAMPLE] = { 0 };   // label vector
    double theta[2 * PARAMETER] = { 0 };   // parameter vector
    for (int i = 0; i < SAMPLE; i++)
    {
        y[i] = X[i][PARAMETER];
    }
    double h = hypoVal(theta, X[0], PARAMETER); // hypothesis function
    double cost = costVal(theta, X, y, SAMPLE, PARAMETER);    // cost function
    double a = 0.314; // set the learning rate as
    // 梯度下降法求最优解
    double tempCost = 0;
    double temp[2 * PARAMETER] = { 0 };
    double der[2 * PARAMETER] = { 0 };
    int cnt = 0;
    do {
        tempCost = cost;
        double sum = 0;
        for (int j = 0; j < PARAMETER; j++)
        {
            for (int i = 0; i < SAMPLE; i++)
            {
                sum += (hypoVal(theta, X[i], PARAMETER) - y[i]) * X[i][j];
            }
            der[j] = (1.0 / double (SAMPLE)) * sum;
            temp[j] = theta[j] - a * der[j];
            sum = 0;
        }
        for (int j = 0; j < PARAMETER; j++)
        {
            for (int i = 0; i < SAMPLE; i++)
            {
                sum += (hypoVal(theta, X[i], PARAMETER) - y[i]) * pow(X[i][j], 2.0);
            }
            der[j + PARAMETER] = (1.0 / double (SAMPLE)) * sum;
            temp[j + PARAMETER] = theta[j + PARAMETER] - a * der[j + PARAMETER];
        }
        cout << "now the theta parameters are: ";
        for (int j = 0; j < 2 * PARAMETER; j++)
        {
            theta[j] = temp[j];
            cout << theta[j] << " ";
        }
        cout << endl;
        cost = costVal(theta, X, y, SAMPLE, PARAMETER);
        cnt++;
    } while (tempCost - cost > 0.00001);
    cout << "共进行" << cnt << "次梯度下降迭代" << endl;
    // 写入测试集
    infile.open("testset.txt");
    if (!infile)
    {
        cout << "cannot open file!" << endl;
        return -1;
    }
    // 写入数据
    for (int i = 0; i < SAMPLE; i++)
    {
        for (int j = 0; j < PARAMETER + 1; j++)
        {
            infile >> X[i][j];
        }
    }
    infile.close();
    double tp = 0, fp = 0, tn = 0, fn = 0;
    for (int i = 0; i < SAMPLE; i++)
    {
        h = hypoVal(theta, X[i], PARAMETER);
        if (h > 0.5 && X[i][PARAMETER] == 1)
        {
            tp++;
        }
        if (h > 0.5 && X[i][PARAMETER] == 0)
        {
            fp++;
        }
        if (h < 0.5 && X[i][PARAMETER] == 0)
        {
            tn++;
        }
        if (h < 0.5 && X[i][PARAMETER] == 1)
        {
            fn++;
        }
    }
    cout << "precision: " << tp / (tp + fp) << " ";
    cout << "recall: " << tp / (tp + fn) << endl;
    return 0;
}
// Evaluates the logistic hypothesis for one sample.
//
// The score is the sum over i in [0, count) of
//     para[i] * fea[i] + para[i + count] * fea[i] * fea[i]
// and the result is 1 / (1 + exp(-score)).
//
// para   weights: `count` linear weights followed by `count` quadratic
//        weights, so at least 2 * count elements are read
// fea    feature values; only the first `count` elements are read
// count  number of features
//
// Returns the hypothesis value for the sample.
double hypoVal(double para[], double fea[], int count)
{
    double score = 0;
    for (int i = 0; i < count; i++)
    {
        // The quadratic weights start right after the `count` linear weights,
        // so the offset follows the argument rather than a global constant.
        score += para[i] * fea[i] + para[i + count] * fea[i] * fea[i];
    }
    return 1.0 / (1.0 + std::exp(-score));
}
// Computes, prints and returns the averaged cross-entropy cost
//     -(1 / amount) * sum_i [ label[i] * log(h_i) + (1 - label[i]) * log(1 - h_i) ]
// where h_i = hypoVal(para, allX[i], count).
//
// para    weights passed through to hypoVal
// allX    sample rows; the first `count` values of each row are the features
// label   one label per sample
// amount  number of samples to average over
// count   number of features per sample
//
// Returns the cost value that was printed.
double costVal(double para[], double allX[][PARAMETER + 1], double label[], int amount, int count)
{
    // Keep the hypothesis strictly inside (0, 1): a saturated value of exactly
    // 0 or 1 would otherwise yield 0 * log(0) = NaN for a correctly classified
    // sample.
    const double kEps = 1e-15;

    double sum = 0;
    for (int i = 0; i < amount; i++)
    {
        double hy = hypoVal(para, allX[i], count);
        if (hy < kEps)
        {
            hy = kEps;
        }
        else if (hy > 1.0 - kEps)
        {
            hy = 1.0 - kEps;
        }
        sum += label[i] * std::log(hy) + (1 - label[i]) * std::log(1 - hy);
    }

    const double cost = -(1.0 / static_cast<double>(amount)) * sum;
    std::cout << "now the cost value is: " << cost << std::endl;
    return cost;
}

验证结果

now the cost value is: 0.693147
now the theta parameters are: -7.8047e-19 0.0785866 -3.03516e-18 0.0549665 
...
now the cost value is: 0.0888141
now the theta parameters are: 0.133379 9.82312 -8.23081 -1.38364 
now the cost value is: 0.0887954
now the theta parameters are: 0.133374 9.82487 -8.23255 -1.38377 
...
now the theta parameters are: 0.127623 11.3934 -9.78998 -1.47413 
now the cost value is: 0.0743945
共进行2996次梯度下降迭代
precision: 0.971014 recall: 1

可以看到代价函数的值为0.074
可以认为训练结果很好
可以看到在测试集上，查准率（precision）为97%，查全率（recall）为100%
可以认为学习器泛化能力很强，分类效果非常好

决策边界

根据结果我们可以绘制出学习结果，如下图所示

机器学习--逻辑回归模型(Logistic-Regression)的C++实现

实验思路

代码展示

验证结果

决策边界

猜你喜欢