OpenMP并行优化高斯朴素贝叶斯算法 - 通过身高、体重和肺活量推测性别(机器学习)

目录

OpenMP并行设计

用OpenMP进行共享内存并行计算

使用pragma omp parallel for提高训练速度

程序代码

结果


本项目由以下四部分组成,本节是第4节:

1、通过身高、体重推测性别

2、通过身高、体重、肺活量推测性别

3、MPI优化

4、OpenMP优化

OpenMP并行设计

在串行代码的基础上,OpenMP通过# pragma omp parallel for num_threads(thread_count) 指令,对所有涉及大型数组的for循环进行多线程处理,以此达到并行效果。

用OpenMP进行共享内存并行计算

使用pragma omp parallel for提高训练速度

由于本项目的串行版本需要多次通过for(i=0;i<dataLen;i++)循环遍历整个数组来求值,因此十分适合使用pragma omp parallel for指令来进行共享内存编程。该指令会把循环的迭代平均分给各个线程,每个线程大约处理(dataLen / thread_count)个数据,循环结束后各线程的计算结果再汇总到主线程,与MPI的MPI_Scatterv()函数异曲同工。

pragma omp parallel for指令后面的子句有:num_threads(thread_count)、reduction(+:maleNum)、shared(dataSet,dataLen)、private(i)。num_threads(thread_count)表示线程数;reduction(+:maleNum)表示maleNum是归约变量,循环结束后要将所有线程各自计算的maleNum累加(下面示例中的A、B同理);shared(dataSet,dataLen)表示dataSet和dataLen是所有线程都可以共享的;private(i)则表示每个线程的i都是私有的,不受其它线程影响。

参考代码如下(求sum[2]为例):

/*初始化,reduction内的变量一定要有初始值*/

double A,B;

A=0;B=0;

double sum[2]={0,0};

# pragma omp parallel for num_threads(thread_count) \

reduction(+:A) reduction(+:B)\

shared(dataSet,dataLen) private(i)

for(i=0;i<dataLen;i++){ ;

A+=dataSet[i*EIGEN_NUM+1];

B+=dataSet[i*EIGEN_NUM+2];}

# pragma omp barrier

sum[0]=A;sum[1]=B;

程序代码

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
#include <time.h>
#include <omp.h>


// Value of pi used by the Gaussian density function getProbability().
#define PI 3.1415926535898

// Size in bytes of the line buffer used when reading one record from the file.
#define MAX_LINE 20
// Maximum number of records (rows) the data set buffer can hold (counted from 1).
#define DATA_LEN 11000000

// Number of float values stored per record; main() reads column 0 as the sex
// label and passes columns 1-3 to sexIDResult() as height, weight and vital capacity.
#define EIGEN_NUM 4

// The data set: DATA_LEN rows of EIGEN_NUM floats, stored row after row.
// It is heap-allocated instead of being a static array
// `float dataSet[DATA_LEN * EIGEN_NUM]`.
// NOTE(review): a function call in a file-scope initializer is accepted by C++
// but not by C, and the allocation result is not checked at this point.
float (*dataSet)=(float(*))malloc(sizeof(float)*DATA_LEN*EIGEN_NUM);  

int dataLen;// number of rows actually read into dataSet
double maleNum=0;// total number of male records
double femaleNum=0;// total number of female records

int main(int argc, char **argv) {

	int i=0;
	int j=0;

	double start = omp_get_wtime( );


	/************************读取文件************************/

	char buf[MAX_LINE];		//缓冲区
	FILE *fp;				//文件指针s
	int len;				//行字符个数

		//读取文件
		const char* fileLocation="E:\\test\\addVitalCapacityData.csv";
		fp = fopen(fileLocation,"r");
		if(fp  == NULL)
		{
			perror("fp  == NULL");
			exit (1) ;
		}

		//逐行读取及写入数组
		char *token;
		const char s[2] = ",";
		while(fgets(buf,MAX_LINE,fp) != NULL && i< DATA_LEN)
		{
			len = strlen(buf);
			//删去换行符
			buf[len-1] = '\0';
			//分割字符串
			token = strtok(buf, s);
			//继续分割字符串
			j = 0;
			while( token != NULL ) 
			{
				dataSet[i*EIGEN_NUM + j]=atof(token);
				token = strtok(NULL, s);
				j = j+1;
			 }
			i = i + 1;
		}
		dataLen=i;
		printf("%d行4列的数据读取完毕\n",dataLen);
		fclose(fp);

		

		double readTime = omp_get_wtime( );



	/************************开始OpenMP计算************************/

	int thread_count = strtol(argv[1],NULL,10);

	/***********计算高斯分布***********/
	char *maenInf[6]={"maleLength","maleWeight","maleVC","femaleLength","femaleWeight","femaleVC"};

	double A,B,C,D,E,F,G;
	A=0;B=0;C=0;D=0;E=0;F=0;G=0;

	double sum[6]={0,0,0,0,0,0};
	double mean[6]={0,0,0,0,0,0};


#	pragma omp parallel for num_threads(thread_count) \
	reduction(+:maleNum) reduction(+:femaleNum) \
	reduction(+:A) reduction(+:B) reduction(+:C)\
	reduction(+:D) reduction(+:E) reduction(+:F)\
	shared(dataSet,dataLen) private(i)
	for(i=0;i<dataLen;i++)
	{
		if(dataSet[i*EIGEN_NUM]==1)
		{
			maleNum=maleNum+1;
			A+=dataSet[i*EIGEN_NUM+1];
			B+=dataSet[i*EIGEN_NUM+2];
			C+=dataSet[i*EIGEN_NUM+3];
		}
		else if(dataSet[i*EIGEN_NUM]==2)
		{
			femaleNum=femaleNum+1;
			D+=dataSet[i*EIGEN_NUM+1];
			E+=dataSet[i*EIGEN_NUM+2];
			F+=dataSet[i*EIGEN_NUM+3];
		}
		else
		{
			printf("dataSet[%d]=%f,性别有误\n",i*EIGEN_NUM,dataSet[i*EIGEN_NUM]);
		}
		//printf("sum[0]=%f \n",sum[0]);
		//printf("%d行4列的数据求和完毕\n",i);
	}
#	pragma omp barrier
	sum[0]=A;
	sum[1]=B;
	sum[2]=C;
	sum[3]=D;
	sum[4]=E;
	sum[5]=F;


	//printf("maleNum=%.0f\nfemaleNum=%.0f\n",maleNum,femaleNum);


	/*for(i=0;i<6;i++)
	{
		printf("sum[%d]=%.0f\n",i,sum[i]);
	}*/

	//计算平均值
	for(i=0;i<6;i++)
	{
		if(i<3){mean[i]=sum[i]/maleNum;}
		if(i>2){mean[i]=sum[i]/femaleNum;}
		//printf("mean-%s = %.5f \n",maenInf[i],mean[i]);
	}

	//计算累加
	A=0;B=0;C=0;D=0;E=0;F=0;G=0;
	double Sigma[6]={0,0,0,0,0,0};
#	pragma omp parallel for num_threads(thread_count) default(none) \
	reduction(+:A) reduction(+:B) reduction(+:C)\
	reduction(+:D) reduction(+:E) reduction(+:F)\
	shared(dataSet,dataLen,mean) private(i)
	for(i=0;i<dataLen;i++)
	{
		if(dataSet[i*EIGEN_NUM]==1)
		{
			A+=pow(dataSet[i*EIGEN_NUM+1]-mean[0] , 2 );
			B+=pow(dataSet[i*EIGEN_NUM+2]-mean[1] , 2 );
			C+=pow(dataSet[i*EIGEN_NUM+3]-mean[2] , 2 );
		}
		else if(dataSet[i*EIGEN_NUM]==2)
		{
			D+=pow(dataSet[i*EIGEN_NUM+1]-mean[3] , 2 );
			E+=pow(dataSet[i*EIGEN_NUM+2]-mean[4] , 2 );
			F+=pow(dataSet[i*EIGEN_NUM+3]-mean[5] , 2 );
		}
		else
		{
			printf("dataSet[i*EIGEN_NUM]=%f,性别有误",dataSet[i*EIGEN_NUM]);
		}
	}
#	pragma omp barrier
	Sigma[0]=A;
	Sigma[1]=B;
	Sigma[2]=C;
	Sigma[3]=D;
	Sigma[4]=E;
	Sigma[5]=F;


	//计算标准差
	double standardDeviation[6];	//标准差
	double sexNum;//各性别人数
	for(i=0;i<6;i++){
		if(i<3){sexNum=maleNum;}
		if(i>=3){sexNum=femaleNum;}
		standardDeviation[i]=sqrt(Sigma[i]/sexNum);
		//printf("Sigma[%d]=%f maleNum=%f",i,Sigma[i],sexNum);
		//printf("第%d个标准差=%.5f\n",i,standardDeviation[i]);
		}



	/*********** 朴素贝叶斯 & 准确率测试 ***********/
	//数据集有肺活量(VC),准确度判断
	float preSexID;
	float Right=0;
	float Error=0;
	//声明性别ID判断函数
	int sexIDResult(float height,float weight,float VC,double *mean,double *standardDeviation);

#	pragma omp parallel for num_threads(thread_count)  default(none) \
	reduction(+:Right) reduction(+:Error) \
	shared(dataSet,dataLen,mean,standardDeviation) private(i,preSexID)
	for(i=0;i<dataLen;i++){
		preSexID=sexIDResult(dataSet[i*EIGEN_NUM+1],dataSet[i*EIGEN_NUM+2],dataSet[i*EIGEN_NUM+3],mean,standardDeviation);
		if(dataSet[i*EIGEN_NUM]==preSexID){
			Right=Right+1;
		}
		else{
			Error=Error+1;
			//printf("预测ID:%.0f  实际ID:%.0f \n",preSexID,receiveBuf[i*EIGEN_NUM]);
			//printf("性别:%.0f,身高:%.2f,体重:%.2f,肺活量:%.0f \n",receiveBuf[i*EIGEN_NUM],receiveBuf[i*EIGEN_NUM+1],receiveBuf[i*EIGEN_NUM+2],receiveBuf[i*EIGEN_NUM+3]);
			}
	}

	printf("Right:%.0f\nError:%.0f\n",Right,Error);
	double accuracy  = Right/(Error+Right);
	printf("Accuracy:%f\n",accuracy);

	double end = omp_get_wtime( );

	//printf("start = %.16g\nend = %.16g\ndiff = %.16g\n", start, end, end - start);
	printf("整体耗时 = %.16f\n", end - start);
	printf("读取时长 = %.16f\n", readTime - start);
	printf("计算时长 = %.16f\n", end - readTime);

	return 0;
}






/*****************函数*****************/



/***********高斯分布函数***********/
/*
 * Adds up one column over all rows whose first value equals `sex`.
 *
 * data       flat array of rows, EIGEN_NUM floats per row
 * recDatalen total number of floats in `data` (rows = recDatalen / EIGEN_NUM)
 * sex        label value a row's first element must equal to be counted
 * column     offset inside a row of the value to add
 *
 * Returns the accumulated sum as a double (0 when no row matches).
 */
double getSum(float *data,int recDatalen,int sex,int column)
{
	const int rowCount = recDatalen / EIGEN_NUM;
	double total = 0;

	for(int row = 0; row < rowCount; ++row)
	{
		const float *record = data + row * EIGEN_NUM;
		if(record[0] != sex)
		{
			continue;	/* row belongs to the other label */
		}
		total += record[column];
	}

	return total;
}

/*
 * Accumulates pow(value - mean, 2) for one column over all rows whose first
 * value equals `sex`.
 *
 * data       flat array of rows, EIGEN_NUM floats per row
 * recDatalen total number of floats in `data` (rows = recDatalen / EIGEN_NUM)
 * mean       value subtracted from each entry before squaring
 * sex        label value a row's first element must equal to be counted
 * column     offset inside a row of the value to use
 *
 * Returns the sum of squared differences (0 when no row matches).
 */
double getSigma(float *data,int recDatalen,double mean,int sex,int column){
	const int rowCount = recDatalen / EIGEN_NUM;
	double squaredSum = 0;
	int row = 0;

	while(row < rowCount){
		const float *record = &data[row * EIGEN_NUM];
		if(record[0] == sex){
			squaredSum += pow(record[column] - mean , 2 );
		}
		row++;
	}

	return squaredSum;
}



/***********朴素贝叶斯函数***********/

/* Fallback so this helper also builds on its own; the file-level PI macro,
   when present, takes precedence. */
#ifndef PI
#define PI 3.1415926535898
#endif

/*
 * Gaussian (normal) probability density of feature value x for one class:
 *
 *     f(x) = 1 / sqrt(2 * PI * sigma^2) * exp( -(x - mu)^2 / (2 * sigma^2) )
 *
 * x                 value of the feature being evaluated
 * column            feature column index (kept for interface compatibility, unused)
 * sex               class label (kept for interface compatibility, unused)
 * mean              sample mean mu of the feature within the class
 * standardDeviation sample standard deviation sigma of the feature within the class
 *
 * Returns the density value. The previous implementation normalised with
 * 1 / (2 * PI * sigma^2), which is not the normal density and skews the
 * comparison between classes whose variances differ.
 * A standardDeviation of 0 gives a non-finite result (division by zero).
 */
double getProbability(double x,int column,int sex,double mean,double standardDeviation)
{
	(void)column;
	(void)sex;

	const double variance = standardDeviation * standardDeviation;
	const double diff = x - mean;

	return exp( -(diff * diff) / (2.0 * variance) ) / sqrt(2.0 * PI * variance);
}

/*
 * Predicts the sex label for one sample with Gaussian naive Bayes.
 *
 * height, weight, VC  feature values of the sample
 * mean                six means: [0..2] male height/weight/VC, [3..5] female
 * standardDeviation   six standard deviations in the same order as `mean`
 *
 * Returns 1 when the male score is larger, 2 when the female score is larger,
 * and 0 when neither is larger. That covers exact ties and also the case where
 * a score is NaN (e.g. a zero standard deviation), for which all ordered
 * comparisons are false; previously that path fell off the end of the function
 * without returning a value.
 */
int sexIDResult(float height,float weight,float VC,double *mean,double *standardDeviation)
{
	const double prior = 0.5; /* both classes are assumed equally likely */

	/* Class score = prior * product of the per-feature densities. */
	double maleP = prior
		* getProbability(height,1,1,mean[0],standardDeviation[0])
		* getProbability(weight,2,1,mean[1],standardDeviation[1])
		* getProbability(VC,3,1,mean[2],standardDeviation[2]);

	double femaleP = prior
		* getProbability(height,1,2,mean[3],standardDeviation[3])
		* getProbability(weight,2,2,mean[4],standardDeviation[4])
		* getProbability(VC,3,2,mean[5],standardDeviation[5]);

	if(maleP > femaleP){return 1;}
	if(maleP < femaleP){return 2;}
	return 0;
}

结果

linux服务器:

使用gcc编译,代码:https://download.csdn.net/download/admiz/16162424

猜你喜欢

转载自blog.csdn.net/admiz/article/details/109831771