Gaussian Naive Bayes written from scratch in C - guessing gender from height, weight and lung capacity (machine learning)

Table of contents

brief description

data

the code

result


brief description

This post builds on "Gaussian Naive Bayes - predicting gender from height and weight" and adds lung capacity (vital capacity) as a third feature. The project consists of the following four parts; this post is part 2:

1. Deduce gender by height and weight

2. Estimate gender based on height, weight, and vital capacity

3. MPI optimization

4. OpenMP optimization

data

Part of the data is shown below; the four columns are gender, height, weight and vital capacity.

the code

The code is still split into three files. Strictly speaking, handleData.cpp and naiveBayes.cpp should be merged, but they are kept separate here to make debugging easier; the two files will be merged later.

//allHead.h

#ifndef ALL_HEAD_H
#define ALL_HEAD_H

/*
 * Shared includes, constants and prototypes for the Gaussian naive Bayes
 * gender classifier.
 *
 * The include guard is named after this header. The previous guard,
 * _STDIO_H_, is a reserved identifier (underscore followed by an uppercase
 * letter) and is the very macro some C libraries (e.g. BSD-derived ones) use
 * to guard <stdio.h>; defining it here would make <stdio.h> expand to nothing
 * on those platforms.
 */

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>

#define PI 3.1415926535898

//Size of the buffer used to read one line of the data file
#define MAX_LINE 100
//Maximum number of rows the data set can hold
#define DATA_LEN 100000
//Number of columns per row: the gender label plus 3 features (height, weight, vital capacity)
#define EIGEN_NUM 4


/* Function declarations */
//Load the data set from a CSV file
void getData(const char *fileLocation);

//Compute the mean and standard deviation of every feature for both genders
void getParameter(void);
//Compute the mean and standard deviation of one feature column for one gender
double* calParameter(int column,int sex);

//Likelihood p(feature column = x | sex)
double getProbability(double x,int column,int sex);
//Normal distribution density; x: value of the random variable, u: sample mean, p: standard deviation
double gaussianDistribution(double x,double u,double p);

//Predict the gender from height and weight, returned as a string
char* sexResult(float height,float weight);
//Predict the gender from height and weight, returned as an ID (0 unknown, 1 male, 2 female)
int sexIDResult(float height,float weight);
//Accuracy of the height + weight classifier on the loaded data set
float precision(void);

//Data set with vital capacity (VC): predict the gender, returned as a string
char* addVCSexResult(float height,float weight,float VC);
//Data set with vital capacity (VC): predict the gender, returned as an ID
int addVCSexIDResult(float height,float weight,float VC);
//Data set with vital capacity (VC): accuracy on the loaded data set
float addVCPrecision(void);

#endif /* ALL_HEAD_H */
//handleData.cpp

#include"allHead.h"

/*
Column layout of the CSV files stored in dataSet:
basicData.csv: gender, height, weight [i][0,1,2]
addVitalCapacityData.csv: gender, height, weight, vital capacity [i][0,1,2,3]
*/

//Gender encoding: 0 = unknown, 1 = male, 2 = female

float dataSet[DATA_LEN][EIGEN_NUM];	//the data set, one row per sample
double maleNum=0;//total number of male samples
double femaleNum=0;//total number of female samples
double meanValue[2][3];			    //feature means for male and female; [0][0,1,2] = [male][height, weight, vital capacity]
double standardDeviation[2][3];     //feature standard deviations for male and female; [0][0,1,2] = [male][height, weight, vital capacity]

int dataLen;				//number of rows loaded into dataSet (a count starting at 1, used in the calculations)
char *basicInfo[] = {"性别","身高","体重","肺活量"};	//column names (gender, height, weight, vital capacity), printed by calParameter
//char *addVitalCapacityInfo[] = {"性别","身高","体重","肺活量"};

//读取数据
void getData(const char *fileLocation)
{
	char buf[MAX_LINE];		//缓冲区
	FILE *fp;				//文件指针s
	int len;				//行字符个数

	//printf("正在读取文件\n");

	//读取文件
	if((fp = fopen(fileLocation,"r")) == NULL)
	{
		perror("fail to read");
		exit (1) ;
	}

	//逐行读取及写入数组
	char *token;
	const char s[2] = ",";
	int i=0;
	int j=0;
	while(fgets(buf,MAX_LINE,fp) != NULL && i< DATA_LEN-1)
	{
		len = strlen(buf);
		buf[len-1] = '\0';  //删去换行符
		//分割字符串
		token = strtok(buf, s);
		//继续分割字符串
		j = 0;
		while( token != NULL ) {
			dataSet[i][j]=atof(token);
			//printf("dataSet[%d][%d] = %f\n", i,j, dataSet[i][j] );
			//printf( "%f\n", dataRuslt[i][j]);
			token = strtok(NULL, s);
			j = j+1;
		 }
		i = i + 1;
	}
	dataLen=i;
	//printf("%d条数据读取完毕\n",dataLen);
	//计算男女个数
	for(i=0;i<dataLen;i++){
		if(dataSet[i][0]==1){maleNum=maleNum+1;}
		if(dataSet[i][0]==2){femaleNum=femaleNum+1;}
		}
	getParameter();
}

//Fill meanValue and standardDeviation for both genders and every feature column
void getParameter()
{
	//sex: 1 = male, 2 = female; column: 1..EIGEN_NUM-1 are the feature columns of dataSet
	for(int sex=1;sex<=2;sex++)
	{
		for(int column=1;column<EIGEN_NUM;column++)
		{
			//calParameter hands back {mean, standard deviation}
			const double *stats=calParameter(column,sex);
			meanValue[sex-1][column-1]=stats[0];
			standardDeviation[sex-1][column-1]=stats[1];
		}
	}
}

/*
 * Compute the mean and the population standard deviation of one column of
 * dataSet over all rows whose gender label (column 0) equals `sex`, and print
 * both values.
 *
 * column: index of the column in dataSet to summarise.
 * sex:    gender label to select (1 = male, 2 = female).
 *
 * Returns a pointer to a static two-element array {mean, standard deviation}.
 * The array is overwritten by the next call, so copy the values out before
 * calling again. If no row matches, both values are 0.
 */
double* calParameter(int column,int sex)
{
	//static: a pointer to this array is returned, so it must outlive the call
	static double r[2];
	const char *gender="未知";	//label for any sex value other than 1 or 2
	double count=0;		//number of rows with the requested gender
	double total=0;		//sum of the column over those rows
	double u=0;			//mean
	double p=0;			//standard deviation
	int i;

	if(sex==1)
	{
		gender="男性";
	}
	else if(sex==2)
	{
		gender="女性";
	}

	//First pass: mean
	for(i=0;i<dataLen;i++){
		if(dataSet[i][0]==sex){
			count=count+1;
			total=total+dataSet[i][column];
		}
	}

	//Guard against an empty class instead of dividing by zero
	if(count>0){
		u=total/count;

		//Second pass: standard deviation, divided by the same count as the
		//mean so the two statistics always describe the same set of rows
		double squares=0;
		for(i=0;i<dataLen;i++){
			if(dataSet[i][0]==sex){
				const double diff=dataSet[i][column]-u;
				squares=squares+diff*diff;
			}
		}
		p=sqrt(squares/count);
	}

	printf("%s%s平均值=%.16lf\n标准差=%.16lf\n",gender,basicInfo[column],u,p);

	r[0]=u;
	r[1]=p;
	return r;
}


//Likelihood p(feature column = x | sex): evaluates gaussianDistribution with
//the mean and standard deviation stored for that gender and feature.
//column and sex are 1-based and are shifted by one to index the parameter tables.
double getProbability(double x,int column,int sex)
{
	const int genderIdx=sex-1;
	const int featureIdx=column-1;

	return gaussianDistribution(x,
	                            meanValue[genderIdx][featureIdx],
	                            standardDeviation[genderIdx][featureIdx]);
}


//Normal (Gaussian) probability density.
//x: value of the random variable, u: mean, p: standard deviation.
//Returns exp(-(x-u)^2 / (2*p^2)) / sqrt(2*PI*p^2).
//The normalising factor is 1/sqrt(2*PI*variance); using 1/(2*PI*variance)
//instead would weight each class by an extra 1/(sigma*sqrt(2*PI)) and bias
//the comparison between classes with different standard deviations.
double gaussianDistribution(double x,double u,double p)
{
	const double variance=p*p;
	const double diff=x-u;

	return exp( -(diff*diff) / (2*variance) ) / sqrt(2*PI*variance);
}

/*
 * Predict the gender from height and weight and return it as text.
 *
 * Each class score is the prior (0.5 for both genders) multiplied by the
 * likelihoods of the height (column 1) and the weight (column 2) for that
 * gender.
 *
 * Returns "男性" (male) or "女性" (female) for the higher-scoring class, and
 * "未知" (unknown) when neither score is greater than the other, which
 * covers both equal scores and scores that cannot be compared (NaN).
 */
char* sexResult(float height,float weight)
{
	const double a=0.5; //prior: male and female assumed 50% each

	const double maleP = a * getProbability(height,1,1) * getProbability(weight,2,1);
	const double femaleP = a * getProbability(height,1,2) * getProbability(weight,2,2);

	if(maleP > femaleP){return "男性";}
	if(maleP < femaleP){return "女性";}
	//Unconditional fallback so that every path returns a value: with NaN
	//scores all three comparisons >, < and == are false
	return "未知";
}

/*
 * Predict the gender from height and weight and return it as an ID.
 *
 * Each class score is the prior (0.5 for both genders) multiplied by the
 * likelihoods of the height (column 1) and the weight (column 2) for that
 * gender.
 *
 * Returns 1 (male) or 2 (female) for the higher-scoring class, and 0
 * (unknown) when neither score is greater than the other, which covers both
 * equal scores and scores that cannot be compared (NaN).
 */
int sexIDResult(float height,float weight)
{
	const double a=0.5; //prior: male and female assumed 50% each

	const double maleP = a * getProbability(height,1,1) * getProbability(weight,2,1);
	const double femaleP = a * getProbability(height,1,2) * getProbability(weight,2,2);

	if(maleP > femaleP){return 1;}
	if(maleP < femaleP){return 2;}
	//Unconditional fallback so that every path returns a value: with NaN
	//scores all three comparisons >, < and == are false
	return 0;
}

//Accuracy of the height + weight classifier on the loaded data set:
//predicts every row with sexIDResult, compares the prediction with the
//gender label in column 0, prints both tallies and returns the hit ratio.
float precision()
{
	float hits=0;
	float misses=0;

	for(int row=0;row<dataLen;row++)
	{
		const float predicted=sexIDResult(dataSet[row][1],dataSet[row][2]);
		if(predicted==dataSet[row][0])
		{
			hits+=1;
		}
		else
		{
			misses+=1;
		}
	}

	printf("Right:%f\nError:%f\n",hits,misses);
	return hits/(misses+hits);
}


//In the functions below, VC stands for vital capacity
/*
 * Predict the gender from height, weight and vital capacity and return it as
 * text.
 *
 * Each class score is the prior (0.5 for both genders) multiplied by the
 * likelihoods of the height (column 1), the weight (column 2) and the vital
 * capacity (column 3) for that gender.
 *
 * Returns "男性" (male) or "女性" (female) for the higher-scoring class, and
 * "未知" (unknown), the same string sexResult uses, when neither score is
 * greater than the other, which covers both equal scores and scores that
 * cannot be compared (NaN).
 */
char* addVCSexResult(float height,float weight,float VC)
{
	const double a=0.5; //prior: male and female assumed 50% each

	const double maleP = a * getProbability(height,1,1) * getProbability(weight,2,1) * getProbability(VC,3,1);
	const double femaleP = a * getProbability(height,1,2) * getProbability(weight,2,2) * getProbability(VC,3,2);

	if(maleP > femaleP){return "男性";}
	if(maleP < femaleP){return "女性";}
	//Unconditional fallback so that every path returns a value: with NaN
	//scores all three comparisons >, < and == are false
	return "未知";
}

/*
 * Predict the gender from height, weight and vital capacity (VC) and return
 * it as an ID.
 *
 * Each class score is the prior (0.5 for both genders) multiplied by the
 * likelihoods of the height (column 1), the weight (column 2) and the vital
 * capacity (column 3) for that gender.
 *
 * Returns 1 (male) or 2 (female) for the higher-scoring class, and 0
 * (unknown) when neither score is greater than the other, which covers both
 * equal scores and scores that cannot be compared (NaN).
 */
int addVCSexIDResult(float height,float weight,float VC)
{
	const double a=0.5; //prior: male and female assumed 50% each

	const double maleP = a * getProbability(height,1,1) * getProbability(weight,2,1) * getProbability(VC,3,1);
	const double femaleP = a * getProbability(height,1,2) * getProbability(weight,2,2) * getProbability(VC,3,2);

	if(maleP > femaleP){return 1;}
	if(maleP < femaleP){return 2;}
	//Unconditional fallback so that every path returns a value: with NaN
	//scores all three comparisons >, < and == are false
	return 0;
}

//Accuracy of the height + weight + vital capacity (VC) classifier on the
//loaded data set: predicts every row with addVCSexIDResult, compares the
//prediction with the gender label in column 0, prints both tallies and
//returns the hit ratio.
float addVCPrecision()
{
	float hits=0;
	float misses=0;

	for(int row=0;row<dataLen;row++)
	{
		const float predicted=addVCSexIDResult(dataSet[row][1],dataSet[row][2],dataSet[row][3]);
		if(predicted==dataSet[row][0])
		{
			hits+=1;
		}
		else
		{
			misses+=1;
		}
	}

	printf("Right:%f\nError:%f\n",hits,misses);
	return hits/(misses+hits);
}
//naiveBayes.cpp

#include"allHead.h"
#include<time.h>

//Globals defined in handleData.cpp
extern float dataSet[DATA_LEN][EIGEN_NUM];
extern int dataLen;
extern double meanValue[2][3];			    //feature means for male and female; [0][0,1,2] = [male][height, weight, vital capacity]
extern double standardDeviation[2][3];     //feature standard deviations for male and female; [0][0,1,2] = [male][height, weight, vital capacity]

/*
	Gender encoding: 0 = unknown, 1 = male, 2 = female
	basicData.csv: gender, height, weight [i][0,1,2]
	addVitalCapacityData.csv: gender, height, weight, vital capacity
*/

/*
 * Program entry point: loads addVitalCapacityData.csv and prints the accuracy
 * of the height + weight + vital capacity classifier on that same data.
 *
 * Declared as int main(void) and returning 0: the C standard requires main to
 * return int, and `void main()` is not portable.
 */
int main(void){
	//Fill the global dataSet; getData also computes the per-gender means and
	//standard deviations
	getData("addVitalCapacityData.csv");

	//To spot-check one loaded value, print e.g. dataSet[dataLen-1][2].
	//To classify a single person, call
	//addVCSexResult(height, weight, VC), e.g. addVCSexResult(178, 60, 4200).

	printf("Accuracy:%f\n",addVCPrecision());
	return 0;
}

result

The screenshots of the experiment show that Gaussian naive Bayes infers gender from height, weight and lung capacity with an accuracy of 89.2%, which is quite good and about 4.4 percentage points higher than the 84.7% obtained from height and weight alone. Predicting gender from weight and lung capacity also reaches an accuracy of 86.41%. However, the highest accuracy, 90.76%, comes from predicting with height and lung capacity. This suggests that vital capacity is the feature most strongly correlated with gender, followed by height and then weight.

The ranking of the accuracy of predicting gender for each feature combination is:

Height + vital capacity (90%) > height + weight + vital capacity (89%) > weight + vital capacity (86%) > height + weight (84%)

Guess you like

Origin blog.csdn.net/admiz/article/details/109758947