实验思路
- 在octave中,利用rand函数生成两类随机数,一类的y > x^2,另一类的y < x^2
- 将数据导出,存为txt文件
- 用代价函数衡量模型的优化(拟合)程度,用 precision(查准率)和 recall(查全率)评估模型的分类性能
- 采用梯度下降法对模型进行优化
代码展示
//
// main.cpp
// logistic regression
//
// Created by zsp on 16/07/2018.
// Copyright © 2018 zsp. All rights reserved.
//
#include <iostream>
#include <cmath>
#include <fstream>
const int SAMPLE = 402; // number of samples (rows) read from each data file
const int PARAMETER = 2; // number of features per sample; each row stores PARAMETER features followed by the label
// Logistic hypothesis for one sample: sigmoid of the sum over the first `count`
// features of para[i] * fea[i] + para[i + PARAMETER] * fea[i]^2.
double hypoVal(double para[], double fea[], int count);
// Mean cross-entropy cost of `para` over the first `amount` rows of `allX`
// against `label`; also prints the cost to standard output.
double costVal(double para[], double allX[][PARAMETER + 1], double label[], int amount, int count);
using namespace std;
int main()
{
// 定义二维数组存储数据
double X[SAMPLE][PARAMETER + 1] = { 0 };
// 打开文件
fstream infile;
infile.open("dataset.txt");
if (!infile)
{
cout << "cannot open file!" << endl;
return -1;
}
// 写入数据
for (int i = 0; i < SAMPLE; i++)
{
for (int j = 0; j < PARAMETER + 1; j++)
{
infile >> X[i][j];
}
}
// 验证是否写入数据
for (int i = 0; i < SAMPLE; i++)
{
for (int j = 0; j < PARAMETER + 1; j++)
{
cout << X[i][j] << "\t\t";
}
cout << endl;
}
infile.close();
double y[SAMPLE] = { 0 }; // label vector
double theta[2 * PARAMETER] = { 0 }; // parameter vector
for (int i = 0; i < SAMPLE; i++)
{
y[i] = X[i][PARAMETER];
}
double h = hypoVal(theta, X[0], PARAMETER); // hypothesis function
double cost = costVal(theta, X, y, SAMPLE, PARAMETER); // cost function
double a = 0.314; // set the learning rate as
// 梯度下降法求最优解
double tempCost = 0;
double temp[2 * PARAMETER] = { 0 };
double der[2 * PARAMETER] = { 0 };
int cnt = 0;
do {
tempCost = cost;
double sum = 0;
for (int j = 0; j < PARAMETER; j++)
{
for (int i = 0; i < SAMPLE; i++)
{
sum += (hypoVal(theta, X[i], PARAMETER) - y[i]) * X[i][j];
}
der[j] = (1.0 / double (SAMPLE)) * sum;
temp[j] = theta[j] - a * der[j];
sum = 0;
}
for (int j = 0; j < PARAMETER; j++)
{
for (int i = 0; i < SAMPLE; i++)
{
sum += (hypoVal(theta, X[i], PARAMETER) - y[i]) * pow(X[i][j], 2.0);
}
der[j + PARAMETER] = (1.0 / double (SAMPLE)) * sum;
temp[j + PARAMETER] = theta[j + PARAMETER] - a * der[j + PARAMETER];
}
cout << "now the theta parameters are: ";
for (int j = 0; j < 2 * PARAMETER; j++)
{
theta[j] = temp[j];
cout << theta[j] << " ";
}
cout << endl;
cost = costVal(theta, X, y, SAMPLE, PARAMETER);
cnt++;
} while (tempCost - cost > 0.00001);
cout << "共进行" << cnt << "次梯度下降迭代" << endl;
// 写入测试集
infile.open("testset.txt");
if (!infile)
{
cout << "cannot open file!" << endl;
return -1;
}
// 写入数据
for (int i = 0; i < SAMPLE; i++)
{
for (int j = 0; j < PARAMETER + 1; j++)
{
infile >> X[i][j];
}
}
infile.close();
double tp = 0, fp = 0, tn = 0, fn = 0;
for (int i = 0; i < SAMPLE; i++)
{
h = hypoVal(theta, X[i], PARAMETER);
if (h > 0.5 && X[i][PARAMETER] == 1)
{
tp++;
}
if (h > 0.5 && X[i][PARAMETER] == 0)
{
fp++;
}
if (h < 0.5 && X[i][PARAMETER] == 0)
{
tn++;
}
if (h < 0.5 && X[i][PARAMETER] == 1)
{
fn++;
}
}
cout << "precision: " << tp / (tp + fp) << " ";
cout << "recall: " << tp / (tp + fn) << endl;
return 0;
}
/**
 * Evaluates the logistic hypothesis for a single sample.
 *
 * The score is the sum over i in [0, count) of
 *     para[i] * fea[i] + para[i + count] * fea[i]^2,
 * i.e. `para` holds `count` linear weights followed by `count` quadratic
 * weights. The score is then mapped through the sigmoid 1 / (1 + e^-score).
 *
 * @param para   weight vector; entries [0, 2 * count) are read
 * @param fea    feature vector; entries [0, count) are read
 * @param count  number of features to use
 * @return the sigmoid of the score (0.5 when count is 0)
 */
double hypoVal(double para[], double fea[], int count)
{
    double score = 0;
    for (int i = 0; i < count; i++)
    {
        // The quadratic weights start right after the `count` linear ones.
        // Using `count` for the offset keeps the weight layout tied to the
        // argument rather than to a file-level constant.
        score += para[i] * fea[i] + para[i + count] * fea[i] * fea[i];
    }
    return 1.0 / (1.0 + std::exp(-score));
}
/**
 * Computes the mean cross-entropy (logistic) cost of `para` over a data set
 * and prints it to standard output.
 *
 * @param para    weight vector forwarded to hypoVal
 * @param allX    sample matrix; row i is forwarded to hypoVal as its features
 * @param label   label of each sample
 * @param amount  number of rows of `allX` / entries of `label` to use
 * @param count   number of features per sample, forwarded to hypoVal
 * @return -(1/amount) * sum(label * log(h) + (1 - label) * log(1 - h)),
 *         or 0 when `amount` is not positive
 */
double costVal(double para[], double allX[][PARAMETER + 1], double label[], int amount, int count)
{
    // Keep predictions strictly inside (0, 1): a saturated sigmoid would make
    // log() return -inf, and 0 * -inf then turns the whole cost into NaN.
    const double EPS = 1e-15;

    double logLikelihood = 0;
    for (int i = 0; i < amount; i++)
    {
        double hy = hypoVal(para, allX[i], count);
        if (hy < EPS)
        {
            hy = EPS;
        }
        else if (hy > 1.0 - EPS)
        {
            hy = 1.0 - EPS;
        }
        logLikelihood += label[i] * std::log(hy) + (1 - label[i]) * std::log(1 - hy);
    }

    // With no samples there is nothing to average; avoid the 0/0 NaN.
    const double cost = (amount > 0)
        ? -(1.0 / static_cast<double>(amount)) * logLikelihood
        : 0.0;
    std::cout << "now the cost value is: " << cost << std::endl;
    return cost;
}
验证结果
now the cost value is: 0.693147
now the theta parameters are: -7.8047e-19 0.0785866 -3.03516e-18 0.0549665
...
now the cost value is: 0.0888141
now the theta parameters are: 0.133379 9.82312 -8.23081 -1.38364
now the cost value is: 0.0887954
now the theta parameters are: 0.133374 9.82487 -8.23255 -1.38377
...
now the theta parameters are: 0.127623 11.3934 -9.78998 -1.47413
now the cost value is: 0.0743945
共进行2996次梯度下降迭代
precision: 0.971014 recall: 1
可以看到代价函数的值为0.074
可以认为训练结果很好
可以看到在测试集上,查准率(precision)为97%,查全率(recall)为100%
可以认为学习器泛化能力很强,分类效果非常好
决策边界
根据结果我们可以绘制出学习结果,如下图所示