海思NNIE开发(三):FasterRCNN在海思NNIE平台上的执行流程(二)

系列文章

海思NNIE开发(一):海思Hi3559AV100/Hi3519AV100 NNIE深度学习模块开发与调试记录

海思NNIE开发(二):FasterRCNN在海思NNIE平台上的执行流程(一)

海思NNIE开发(三):FasterRCNN在海思NNIE平台上的执行流程(二)

海思NNIE开发(四):NNIE模块读入JPEG图片或视频

海思NNIE开发(五):基于Hi3559AV100的FasterRCNN、RFCN、SSD、Yolov2、Yolov3性能综合测评

------------------------------------------------------------------------------------------------------------------------------------

正文

本篇文章我们接着上一篇(海思NNIE开发二),继续分析FasterRCNN在海思NNIE平台上的执行流程。

1. 解析网络模型信息

首先我们来看以下加载模型网络信息的函数:

s32Ret = SAMPLE_COMM_SVP_NNIE_LoadModel(pcModelName,&s_stFasterRcnnModel);

这里的pcModelName就是wk文件的路径,s_stFasterRcnnModel是SAMPLE_SVP_NNIE_MODEL_S结构体,我们在上一篇(海思NNIE开发二)文章中已对该结构体做分析。我们进入该函数分析:

/*
 * Load an NNIE model (*.wk) file into a freshly allocated buffer and parse it.
 *
 * pszModelFile : path of the model file to open in binary mode.
 * pstNnieModel : on success, stModelBuf describes the buffer holding the raw
 *                file contents and stModel holds the parsed network model.
 *
 * Returns HI_SUCCESS when the file was read and parsed, HI_FAILURE otherwise.
 * On every failure path the file is closed; once the model buffer has been
 * allocated it is also released through SAMPLE_SVP_MMZ_FREE and u32Size is
 * reset to 0.
 */
HI_S32 SAMPLE_COMM_SVP_NNIE_LoadModel(HI_CHAR * pszModelFile,
    SAMPLE_SVP_NNIE_MODEL_S *pstNnieModel)
{
    HI_S32 s32Ret = HI_INVALID_VALUE;
    HI_U64 u64PhyAddr = 0;
    HI_U8 *pu8VirAddr = NULL;
    HI_SL slFileSize = 0;
    FILE *fp = NULL;

    /* Reject NULL arguments before touching them; FAIL_0 tolerates fp == NULL. */
    SAMPLE_SVP_CHECK_EXPR_GOTO(NULL == pszModelFile || NULL == pstNnieModel,FAIL_0,SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, invalid parameter!\n");

    /* Open the model file and measure its size by seeking to the end. */
    fp = fopen(pszModelFile,"rb");
    SAMPLE_SVP_CHECK_EXPR_GOTO(NULL == fp,FAIL_0,SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, open model file failed!\n");
    s32Ret = fseek(fp,0L,SEEK_END);
    SAMPLE_SVP_CHECK_EXPR_GOTO(0 != s32Ret,FAIL_0,SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, fseek failed!\n");
    slFileSize = ftell(fp);
    /* ftell reports -1 on error; an empty file cannot hold a model either. */
    SAMPLE_SVP_CHECK_EXPR_GOTO(slFileSize <= 0,FAIL_0,SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, ftell failed!\n");
    s32Ret = fseek(fp,0L,SEEK_SET);
    SAMPLE_SVP_CHECK_EXPR_GOTO(0 != s32Ret,FAIL_0,SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error, fseek failed!\n");

    /* Allocate a buffer (physical + virtual address pair) sized to the file. */
    s32Ret = SAMPLE_COMM_SVP_MallocMem("SAMPLE_NNIE_MODEL",NULL,(HI_U64*)&u64PhyAddr,(void**)&pu8VirAddr,slFileSize);
    SAMPLE_SVP_CHECK_EXPR_GOTO(HI_SUCCESS != s32Ret,FAIL_0,SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error,Malloc memory failed!\n");

    pstNnieModel->stModelBuf.u32Size = (HI_U32)slFileSize;
    pstNnieModel->stModelBuf.u64PhyAddr = u64PhyAddr;
    pstNnieModel->stModelBuf.u64VirAddr = (HI_U64)pu8VirAddr;

    /* Read the whole file as a single item, so a short read yields 0, not 1. */
    s32Ret = fread(pu8VirAddr, slFileSize, 1, fp);
    SAMPLE_SVP_CHECK_EXPR_GOTO(1 != s32Ret,FAIL_1,SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error,read model file failed!\n");

    /* Parse the raw buffer into the network model structure. */
    s32Ret = HI_MPI_SVP_NNIE_LoadModel(&pstNnieModel->stModelBuf,
        &pstNnieModel->stModel);
    SAMPLE_SVP_CHECK_EXPR_GOTO(HI_SUCCESS != s32Ret,FAIL_1,SAMPLE_SVP_ERR_LEVEL_ERROR,
        "Error,HI_MPI_SVP_NNIE_LoadModel failed!\n");

    fclose(fp);
    return HI_SUCCESS;
FAIL_1:
    /* The buffer was allocated and recorded in stModelBuf: release it. */
    SAMPLE_SVP_MMZ_FREE(pstNnieModel->stModelBuf.u64PhyAddr,pstNnieModel->stModelBuf.u64VirAddr);
    pstNnieModel->stModelBuf.u32Size  = 0;
FAIL_0:
    if (NULL != fp)
    {
        fclose(fp);
    }

    return HI_FAILURE;
}

这个函数执行以下步骤:

  • 获取wk文件字节大小
  • 分配存储wk文件的内存空间
  • 读取wk文件到内存空间
  • 从wk文件的内存buf中解析出网络模型信息

执行完后,模型信息在s_stFasterRcnnModel.stModel结构体里,这个结构体里存储的是什么信息,可参考我上一篇文章(海思NNIE开发二),这里简单罗列各个段、输入输出节点的信息如下:

段类型/段类型值 输入/输出 节点名 节点类型/节点类型值

第1段

SVP_NNIE_NET_TYPE_CNN/0 输入 data SVP_BLOB_TYPE_S32/0
输出 conv5 SVP_BLOB_TYPE_S32/0
rpn_cls_score SVP_BLOB_TYPE_S32/0
rpn_bbox_pred SVP_BLOB_TYPE_S32/0
rpn_cls_prob_reshape SVP_BLOB_TYPE_S32/0

第2段

SVP_NNIE_NET_TYPE_ROI/1 输入 conv5 SVP_BLOB_TYPE_S32/0
输出 bbox_pred SVP_BLOB_TYPE_VEC_S32/4
cls_prob SVP_BLOB_TYPE_VEC_S32/4

2. 初始化

解析完网络模型信息之后,结构体指针给到 SAMPLE_SVP_NNIE_PARAM_S s_stFasterRcnnNnieParam这个结构体中,如下:

s_stFasterRcnnNnieParam.pstModel = &s_stFasterRcnnModel.stModel;

我们接着看以下初始化函数:

s32Ret = SAMPLE_SVP_NNIE_FasterRcnn_ParamInit(
&stNnieCfg,
&s_stFasterRcnnNnieParam,
&s_stFasterRcnnSoftwareParam);

这个函数里面执行稍复杂,简单来说就是使用stNnieCfg等信息来初始化s_stFasterRcnnNnieParam,再使用s_stFasterRcnnNnieParam等来初始化s_stFasterRcnnSoftwareParam。该函数的实现如下:

/*
 * Initialise the FasterRCNN hardware (NNIE) parameters, then the software
 * parameters.
 *
 * pstFasterRcnnCfg : sample configuration passed to both init steps.
 * pstNnieParam     : NNIE parameter block, filled by the hardware init step.
 * pstSoftWareParam : software parameter block, filled by the software init step.
 *
 * Returns HI_SUCCESS when both steps succeed. If either step fails, whatever
 * was set up is torn down through SAMPLE_SVP_NNIE_FasterRcnn_Deinit and
 * HI_FAILURE is returned.
 */
static HI_S32 SAMPLE_SVP_NNIE_FasterRcnn_ParamInit(SAMPLE_SVP_NNIE_CFG_S* pstFasterRcnnCfg,
    SAMPLE_SVP_NNIE_PARAM_S *pstNnieParam, SAMPLE_SVP_NNIE_FASTERRCNN_SOFTWARE_PARAM_S* pstSoftWareParam)
{
    HI_S32 s32Ret = HI_SUCCESS;

    /* init hardware parameter */
    s32Ret = SAMPLE_COMM_SVP_NNIE_ParamInit(pstFasterRcnnCfg,pstNnieParam);
    if (HI_SUCCESS != s32Ret)
    {
        /* Do not run the software init on top of a failed hardware init. */
        goto INIT_FAIL_0;
    }

    /* init software parameter */
    s32Ret = SAMPLE_SVP_NNIE_FasterRcnn_SoftwareInit(
        pstFasterRcnnCfg,
        pstNnieParam,
        pstSoftWareParam);
    if (HI_SUCCESS != s32Ret)
    {
        goto INIT_FAIL_0;
    }

    return HI_SUCCESS;
INIT_FAIL_0:
    /* Best-effort cleanup: the init failure is what gets reported. */
    (void)SAMPLE_SVP_NNIE_FasterRcnn_Deinit(pstNnieParam,pstSoftWareParam,NULL);

    return HI_FAILURE;

}

分为SAMPLE_COMM_SVP_NNIE_ParamInit与SAMPLE_SVP_NNIE_FasterRcnn_SoftwareInit两个函数。我们首先看SAMPLE_COMM_SVP_NNIE_ParamInit,这个函数的实现里做了一些输入参数的有效性判断后,就直接调用SAMPLE_SVP_NNIE_ParamInit,因此我们就直接看SAMPLE_SVP_NNIE_ParamInit的实现,在这个函数里首先调用:

s32Ret = SAMPLE_SVP_NNIE_FillForwardInfo(pstNnieCfg,pstNnieParam);

这个函数的实质就是使用pstNnieParam->pstModel->astSeg的信息来初始化pstNnieParam->astForwardWithBboxCtrl与pstNnieParam->astSegData这两个结构体,其实现如下:

/*
 * Fill the per-segment forward control blocks and the type/shape description
 * of every source and destination blob, using the segment information of the
 * loaded model (pstNnieParam->pstModel->astSeg).
 *
 * pstNnieCfg   : supplies the NNIE core id per segment, u32MaxInputNum,
 *                u32MaxRoiNum and the step-address table for sequence blobs.
 * pstNnieParam : model to read from; astForwardCtrl / astForwardWithBboxCtrl
 *                and astSegData are written.
 *
 * Always returns HI_SUCCESS.
 */
static HI_S32 SAMPLE_SVP_NNIE_FillForwardInfo(
	SAMPLE_SVP_NNIE_CFG_S *pstNnieCfg,
	SAMPLE_SVP_NNIE_PARAM_S *pstNnieParam)
{
	HI_U32 u32Seg = 0;        /* segment index */
	HI_U32 u32Node = 0;       /* node index inside the current segment */
	HI_U32 u32TskOffset = 0;  /* running offset of this segment's slice inside stTaskBuf */
	HI_U32 u32DstBlobNum = 0; /* u32Num written to every destination blob of the segment */

	for (u32Seg = 0; u32Seg < pstNnieParam->pstModel->u32NetSegNum; u32Seg++)
	{
		switch (pstNnieParam->pstModel->astSeg[u32Seg].enNetType)
		{
		case SVP_NNIE_NET_TYPE_ROI:
			/* ROI segment: uses the control block for forward-with-bbox. */
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].u32NetSegId = u32Seg;
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].enNnieId = pstNnieCfg->aenNnieCoreId[u32Seg]; /* configured NNIE core id */
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].u32SrcNum = pstNnieParam->pstModel->astSeg[u32Seg].u16SrcNum;
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].u32DstNum = pstNnieParam->pstModel->astSeg[u32Seg].u16DstNum;
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].u32ProposalNum = 1;
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTmpBuf = pstNnieParam->stTmpBuf;
			/* Task buffer slice: shared base address plus the sizes of all earlier segments. */
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTskBuf.u32Size = pstNnieParam->au32TaskBufSize[u32Seg];
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTskBuf.u64PhyAddr = pstNnieParam->stTaskBuf.u64PhyAddr + u32TskOffset;
			pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTskBuf.u64VirAddr = pstNnieParam->stTaskBuf.u64VirAddr + u32TskOffset;
			break;

		case SVP_NNIE_NET_TYPE_CNN:
		case SVP_NNIE_NET_TYPE_RECURRENT:
			/* CNN / recurrent segment: uses the plain forward control block. */
			pstNnieParam->astForwardCtrl[u32Seg].u32NetSegId = u32Seg;
			pstNnieParam->astForwardCtrl[u32Seg].enNnieId = pstNnieCfg->aenNnieCoreId[u32Seg]; /* configured NNIE core id */
			pstNnieParam->astForwardCtrl[u32Seg].u32SrcNum = pstNnieParam->pstModel->astSeg[u32Seg].u16SrcNum;
			pstNnieParam->astForwardCtrl[u32Seg].u32DstNum = pstNnieParam->pstModel->astSeg[u32Seg].u16DstNum;
			pstNnieParam->astForwardCtrl[u32Seg].stTmpBuf = pstNnieParam->stTmpBuf;
			pstNnieParam->astForwardCtrl[u32Seg].stTskBuf.u32Size = pstNnieParam->au32TaskBufSize[u32Seg];
			pstNnieParam->astForwardCtrl[u32Seg].stTskBuf.u64PhyAddr = pstNnieParam->stTaskBuf.u64PhyAddr + u32TskOffset;
			pstNnieParam->astForwardCtrl[u32Seg].stTskBuf.u64VirAddr = pstNnieParam->stTaskBuf.u64VirAddr + u32TskOffset;
			break;

		default:
			/* Any other segment type gets no control block filled in. */
			break;
		}
		/* The offset advances for every segment, whatever its type. */
		u32TskOffset += pstNnieParam->au32TaskBufSize[u32Seg];

		/* Source blobs: copy type and shape from the model's source nodes. */
		for (u32Node = 0; u32Node < pstNnieParam->pstModel->astSeg[u32Seg].u16SrcNum; u32Node++)
		{
			pstNnieParam->astSegData[u32Seg].astSrc[u32Node].enType = pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].enType;
			pstNnieParam->astSegData[u32Seg].astSrc[u32Node].u32Num = pstNnieCfg->u32MaxInputNum;

			if (SVP_BLOB_TYPE_SEQ_S32 != pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].enType)
			{
				/* Non-sequence blob: channel / height / width shape. */
				pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stWhc.u32Chn = pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].unShape.stWhc.u32Chn;
				pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stWhc.u32Height = pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].unShape.stWhc.u32Height;
				pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stWhc.u32Width = pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].unShape.stWhc.u32Width;
			}
			else
			{
				/* Sequence blob: dimension plus the source step-address entry of this segment. */
				pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stSeq.u32Dim = pstNnieParam->pstModel->astSeg[u32Seg].astSrcNode[u32Node].unShape.u32Dim;
				pstNnieParam->astSegData[u32Seg].astSrc[u32Node].unShape.stSeq.u64VirAddrStep = pstNnieCfg->au64StepVirAddr[u32Seg*SAMPLE_SVP_NNIE_EACH_SEG_STEP_ADDR_NUM];
			}
		}

		/* ROI segments emit one result per ROI per input; other segments one per input. */
		u32DstBlobNum = (SVP_NNIE_NET_TYPE_ROI == pstNnieParam->pstModel->astSeg[u32Seg].enNetType)
			? pstNnieCfg->u32MaxRoiNum*pstNnieCfg->u32MaxInputNum
			: pstNnieCfg->u32MaxInputNum;

		/* Destination blobs: copy type and shape from the model's destination nodes. */
		for (u32Node = 0; u32Node < pstNnieParam->pstModel->astSeg[u32Seg].u16DstNum; u32Node++)
		{
			pstNnieParam->astSegData[u32Seg].astDst[u32Node].enType = pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].enType;
			pstNnieParam->astSegData[u32Seg].astDst[u32Node].u32Num = u32DstBlobNum;

			if (SVP_BLOB_TYPE_SEQ_S32 != pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].enType)
			{
				pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stWhc.u32Chn = pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].unShape.stWhc.u32Chn;
				pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stWhc.u32Height = pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].unShape.stWhc.u32Height;
				pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stWhc.u32Width = pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].unShape.stWhc.u32Width;
			}
			else
			{
				/* Sequence blob: the destination step-address entry is the one after the source entry. */
				pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stSeq.u32Dim =
					pstNnieParam->pstModel->astSeg[u32Seg].astDstNode[u32Node].unShape.u32Dim;
				pstNnieParam->astSegData[u32Seg].astDst[u32Node].unShape.stSeq.u64VirAddrStep =
					pstNnieCfg->au64StepVirAddr[u32Seg*SAMPLE_SVP_NNIE_EACH_SEG_STEP_ADDR_NUM+1];
			}
		}
	}
	return HI_SUCCESS;
}

这个函数还对pstNnieParam->astForwardWithBboxCtrl[i].stTskBuf、pstNnieParam->astForwardCtrl[i].stTskBuf、pstNnieParam->astForwardWithBboxCtrl[i].stTmpBuf、pstNnieParam->astForwardCtrl[i].stTmpBuf等参数进行初始化,其实这里没有必要,因为pstNnieParam->stTaskBuf、pstNnieParam->stTmpBuf等结构体的值也是空的,并没有做过申请内存的操作。

我们再来看第2个关键函数:

/*1. Get the task buffer size of every network segment
2. Get the blob memory size for the source node(s) of the first segment
3. Get the blob memory size for the destination node(s) of every segment
(the blob sizes come from SAMPLE_SVP_NNIE_GetBlobMemSize, which is not shown here)
*/
	s32Ret = SAMPLE_SVP_NNIE_GetTaskAndBlobBufSize(pstNnieCfg,pstNnieParam,
	&u32TotalTaskBufSize,    /* in/out: 0 on entry; on return, the sum of all segments' task buffer sizes */
		&u32TmpBufSize,      /* in/out: 0 on entry; on return, the model's tmp buffer size */
		astBlobSize,        /* in/out: empty on entry; on return, per-segment source/destination blob sizes */
		&u32TotalSize        /* in/out: 0 on entry; on return, task buffers + tmp buffer + the blob sizes added by the callee */);

这个函数是计算各个段、各个段中的各个节点的辅助内存大小。我们知道,在之前的load模型的步骤中,是已经获取到模型的辅助内存(pstNnieParam->pstModel->u32TmpBufSize),但各段、段中各个节点的辅助内存是不知道的,因此该函数就是获取这些辅助内存。该函数的实现如下:

/*
 * Compute the memory sizes needed to run the model: per-segment task buffers,
 * the model tmp buffer, and the blob memory reported by
 * SAMPLE_SVP_NNIE_GetBlobMemSize.
 *
 * pstNnieCfg           : supplies u32MaxInputNum and u32MaxRoiNum.
 * pstNnieParam         : model to query; au32TaskBufSize[] is filled in.
 * pu32TotalTaskBufSize : out, sum of all segments' task buffer sizes.
 * pu32TmpBufSize       : out, the model's u32TmpBufSize.
 * astBlobSize          : out, per-segment source/destination blob sizes.
 * pu32TotalSize        : in/out, the sizes are ADDED to the incoming value, so
 *                        the caller must initialise it (normally to 0).
 *
 * Returns the result of HI_MPI_SVP_NNIE_GetTskBufSize (HI_SUCCESS on success).
 */
static HI_S32 SAMPLE_SVP_NNIE_GetTaskAndBlobBufSize(SAMPLE_SVP_NNIE_CFG_S *pstNnieCfg,
    SAMPLE_SVP_NNIE_PARAM_S *pstNnieParam,
	HI_U32*pu32TotalTaskBufSize,
	HI_U32*pu32TmpBufSize,
    SAMPLE_SVP_NNIE_BLOB_SIZE_S astBlobSize[],
	HI_U32*pu32TotalSize)
{
	HI_S32 s32Ret = HI_SUCCESS;
	HI_U32 i = 0, j = 0;
    HI_U32 u32TotalStep = 0;

	/*Get each seg's task buf size*/
	s32Ret = HI_MPI_SVP_NNIE_GetTskBufSize(pstNnieCfg->u32MaxInputNum,
		pstNnieCfg->u32MaxRoiNum,
		pstNnieParam->pstModel,
		pstNnieParam->au32TaskBufSize,          /* out: task buffer size of each segment */
		pstNnieParam->pstModel->u32NetSegNum);  /* number of segments */
	SAMPLE_SVP_CHECK_EXPR_RET(HI_SUCCESS != s32Ret,s32Ret,SAMPLE_SVP_ERR_LEVEL_ERROR,
		"Error,HI_MPI_SVP_NNIE_GetTaskSize failed!\n");

    /*Get total task buf size*/
	*pu32TotalTaskBufSize = 0;
	for(i = 0; i < pstNnieParam->pstModel->u32NetSegNum; i++)
	{
		*pu32TotalTaskBufSize += pstNnieParam->au32TaskBufSize[i];
	}

	/*Get tmp buf size*/
	*pu32TmpBufSize = pstNnieParam->pstModel->u32TmpBufSize;
	/* Accumulates onto the caller-provided value: task buffers + tmp buffer. */
	*pu32TotalSize += *pu32TotalTaskBufSize + *pu32TmpBufSize;

	/*calculate Blob mem size*/
	for(i = 0; i < pstNnieParam->pstModel->u32NetSegNum; i++)
	{
        if(SVP_NNIE_NET_TYPE_RECURRENT == pstNnieParam->pstModel->astSeg[i].enNetType)
        {
            /* Recurrent segment: add up the step values stored at the first
               source blob's u64VirAddrStep, one entry per u32Num. */
            for(j = 0; j < pstNnieParam->astSegData[i].astSrc[0].u32Num; j++)
            {
                u32TotalStep += *((HI_S32*)pstNnieParam->astSegData[i].astSrc[0].unShape.stSeq.u64VirAddrStep+j);
            }
        }
		/*the first seg's Src Blob mem size, other seg's src blobs from the output blobs of
		those segs before it or from software output results*/
		if(i == 0)
		{
			SAMPLE_SVP_NNIE_GetBlobMemSize(
				&(pstNnieParam->pstModel->astSeg[i].astSrcNode[0]), /* first source node of segment i */
				pstNnieParam->pstModel->astSeg[i].u16SrcNum,        /* source node count of segment i */
				u32TotalStep,
				&(pstNnieParam->astSegData[i].astSrc[0]),           /* first source blob of segment i */
				SAMPLE_SVP_NNIE_ALIGN_16,
				pu32TotalSize,                                      /* in/out: running total */
				&(astBlobSize[i].au32SrcSize[0]));                  /* out: source blob size(s) */
		}

		/*Get each seg's Dst Blob mem size*/
		SAMPLE_SVP_NNIE_GetBlobMemSize(&(pstNnieParam->pstModel->astSeg[i].astDstNode[0]),
			pstNnieParam->pstModel->astSeg[i].u16DstNum,u32TotalStep,&(pstNnieParam->astSegData[i].astDst[0]),
			SAMPLE_SVP_NNIE_ALIGN_16, pu32TotalSize, &(astBlobSize[i].au32DstSize[0]));
	}
	return s32Ret;
}

在这个函数中,首先调用底层API  HI_MPI_SVP_NNIE_GetTskBufSize获取到网络任务的各段的辅助内存pstNnieParam->au32TaskBufSize,然后再调用SAMPLE_SVP_NNIE_GetBlobMemSize计算第1段的第1个输入节点Blob的辅助内存,以及每段的第1个输出节点的Blob辅助内存。

回到SAMPLE_SVP_NNIE_ParamInit函数中,SAMPLE_SVP_NNIE_GetTaskAndBlobBufSize执行完后,u32TotalSize为总的辅助内存大小(含模型、段、节点),此时调用:

s32Ret = SAMPLE_COMM_SVP_MallocCached("SAMPLE_NNIE_TASK",NULL,(HI_U64*)&u64PhyAddr,(void**)&pu8VirAddr,u32TotalSize);

分配内存空间。接着后面,再根据得到的虚拟内存地址、物理内存地址来初始化pstNnieParam->stTaskBuf、pstNnieParam->stTmpBuf、pstNnieParam->astForwardWithBboxCtrl[i].stTmpBuf、pstNnieParam->astForwardWithBboxCtrl[i].stTskBuf、pstNnieParam->astForwardCtrl[i].stTskBuf、pstNnieParam->astSegData[i].astSrc[j]这些结构体中的内存地址值,这个才是真正的初始化,之前在SAMPLE_SVP_NNIE_FillForwardInfo函数中也有对这些结构体做初始化,但那是“假初始化”。

至此,SAMPLE_SVP_NNIE_ParamInit函数执行完毕。

发布了40 篇原创文章 · 获赞 51 · 访问量 5万+

猜你喜欢

转载自blog.csdn.net/zh8706/article/details/98031231