背景：之前的代码中无用代码过多，而我们只需要神经网络前馈运算的部分。卷积运算不采用gemm的方式，而是改用更简单的直接卷积来实现。

目的：读懂简化版的yolo2_light代码，并将其中卷积改为9个并行的基本单元。

代码地址： https://github.com/AlexeyAB/yolo2_light

1.函数的功能与位置

~/datasets/xxr/yolo2_light/bin$ ./darknet detector test obj.data obj.cfg obj_6000.weights 17.jpg
layer     filters    size              input                output
    0 conv     16  3 x 3 / 1   416 x 416 x   3   ->   416 x 416 x  16
    1 max          2 x 2 / 2   416 x 416 x  16   ->   208 x 208 x  16
    2 conv     32  3 x 3 / 1   208 x 208 x  16   ->   208 x 208 x  32
    3 max          2 x 2 / 2   208 x 208 x  32   ->   104 x 104 x  32
    4 conv     64  3 x 3 / 1   104 x 104 x  32   ->   104 x 104 x  64
    5 max          2 x 2 / 2   104 x 104 x  64   ->    52 x  52 x  64
    6 conv    128  3 x 3 / 1    52 x  52 x  64   ->    52 x  52 x 128
    7 max          2 x 2 / 2    52 x  52 x 128   ->    26 x  26 x 128
    8 conv    256  3 x 3 / 1    26 x  26 x 128   ->    26 x  26 x 256
    9 max          2 x 2 / 2    26 x  26 x 256   ->    13 x  13 x 256
   10 conv    512  3 x 3 / 1    13 x  13 x 256   ->    13 x  13 x 512
   11 max          2 x 2 / 1    13 x  13 x 512   ->    13 x  13 x 512
   12 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024
   13 conv   1024  3 x 3 / 1    13 x  13 x1024   ->    13 x  13 x1024
   14 conv     30  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x  30
   15 detection
Loading weights from obj_6000.weights...Done!
 Fuse Convolutional layer                l->size = 3
 Skip layer: 3
 Fuse Convolutional layer                l->size = 3
 Skip layer: 3
 Fuse Convolutional layer                l->size = 3
 Skip layer: 3
 Fuse Convolutional layer                l->size = 3
 Skip layer: 3
 Fuse Convolutional layer                l->size = 3
 Skip layer: 3
 Fuse Convolutional layer                l->size = 3
 Skip layer: 3
 Fuse Convolutional layer                l->size = 3
 Fuse Convolutional layer                l->size = 3
 Fuse Convolutional layer                l->size = 1
 Skip layer: 21

 running yolov2_forward_netwrok_cpu
 layer num: 0    CONVOLUTIONAL
FilterNum:16,Input Channels:3,Input Height:416,input width:416,FilterSize:3
layer num: 1     MAXPOOL
layer num: 2     CONVOLUTIONAL
FilterNum:32,Input Channels:16,Input Height:208,input width:208,FilterSize:3
layer num: 3     MAXPOOL
layer num: 4     CONVOLUTIONAL
FilterNum:64,Input Channels:32,Input Height:104,input width:104,FilterSize:3
layer num: 5     MAXPOOL
layer num: 6     CONVOLUTIONAL
FilterNum:128,Input Channels:64,Input Height:52,input width:52,FilterSize:3
layer num: 7     MAXPOOL
layer num: 8     CONVOLUTIONAL
FilterNum:256,Input Channels:128,Input Height:26,input width:26,FilterSize:3
layer num: 9     MAXPOOL
layer num: 10    CONVOLUTIONAL
FilterNum:512,Input Channels:256,Input Height:13,input width:13,FilterSize:3
layer num: 11    MAXPOOL
layer num: 12    CONVOLUTIONAL
FilterNum:1024,Input Channels:512,Input Height:13,input width:13,FilterSize:3
layer num: 13    CONVOLUTIONAL
FilterNum:1024,Input Channels:1024,Input Height:13,input width:13,FilterSize:3
layer num: 14    CONVOLUTIONAL
FilterNum:30,Input Channels:1024,Input Height:13,input width:13,FilterSize:1
layer num: 15    REGION
Check done!
17.jpg: Predicted in 1.542322 seconds.
classes= 1  : 80%       (left_x:  124   top_y:   82   width:   45   height:   53)
classes= 1  : 70%       (left_x:  187   top_y:   17   width:   28   height:   40)
classes= 1  : 83%       (left_x:  195   top_y:  157   width:   50   height:   64)
Not compiled with OpenCV, saving to predictions.png instead

1.函数的功能与位置

只有前馈之后，网络的代码简短了不少。

主函数：main.c中的main函数，运用network_predict_cpu这个函数来获得图片的输出。

前馈计算：yolov2_forward_network.c中定义了network_predict_cpu函数和yolov2_forward_network_cpu函数，后者的作用就是一层一层地进行前馈计算。

卷积运算：forward_convolutional_layer_cpu 函数也在这个文件（yolov2_forward_network.c）中。

2.检验

2.1 程序输出的检验

~/datasets/xxr/yolo2_light/bin$ ./darknet detector test obj.data obj.cfg obj_6000.weights 17.jpg

predictions.png

2.2 卷积的检验

将每次卷积的结果（l.output）写入check.txt文件：

// yolov2_forward_network.c
/*
 * Runs the network forward, one layer at a time, and dumps the raw float
 * output of every convolutional layer to "check.txt" so that a later run
 * can compare its results against this reference.
 *
 * net   - the network whose layers are executed in order
 * state - forward state; state.input is advanced to each layer's output
 *
 * If "check.txt" cannot be opened the forward pass still runs, but nothing
 * is dumped and the failure is reported on stderr.
 */
void yolov2_forward_network_cpu(network net, network_state state)
{
    /* The file holds raw floats, so it must be opened in binary mode. */
    FILE *check = fopen("check.txt", "wb");
    if (!check) {
        perror("check.txt");
    }

    printf("\n running yolov2_forward_netwrok_cpu\n ");
    state.workspace = net.workspace;
    int i;
    for (i = 0; i < net.n; ++i) {
        printf("layer num: %d \t", i);
        state.index = i;
        layer l = net.layers[i];

        if (l.type == CONVOLUTIONAL) {
            printf(" CONVOLUTIONAL \n");
            forward_convolutional_layer_cpu(l, state);
            /* fwrite(ptr, element size, element count, stream): the return
             * value is the number of whole elements written. */
            if (check &&
                fwrite(l.output, sizeof(float), (size_t)l.outputs, check) != (size_t)l.outputs) {
                fprintf(stderr, "check.txt: short write at layer %d\n", i);
            }
        }
        else if (l.type == MAXPOOL) {
            printf(" MAXPOOL \n");
            forward_maxpool_layer_cpu(l, state);
        }

        /* The output of this layer is the input of the next one. */
        state.input = l.output;
    }

    /* fclose flushes buffered data, so a failure here means a bad dump. */
    if (check && fclose(check) != 0) {
        perror("check.txt");
    }
}

写入文件后，将程序更改为下面这样，把每层卷积的l.output与check.txt中保存的结果逐一进行比较检验。


/*
 * Runs the network forward, one layer at a time, and compares the output of
 * every convolutional layer with the reference values previously dumped to
 * "check.txt". Prints "Check error!" for a layer whose output differs from
 * the reference (or whose reference data cannot be read), and "Check done!"
 * once all layers have been processed.
 *
 * net   - the network whose layers are executed in order
 * state - forward state; state.input is advanced to each layer's output
 *
 * If "check.txt" cannot be opened the forward pass still runs, the failure
 * is reported on stderr, and "Check done!" is not printed because nothing
 * was checked.
 */
void yolov2_forward_network_cpu(network net, network_state state)
{
    /* The file holds raw floats, so it must be opened in binary mode. */
    FILE *check = fopen("check.txt", "rb");
    if (!check) {
        perror("check.txt");
    }

    printf("\n running yolov2_forward_netwrok_cpu\n ");
    state.workspace = net.workspace;
    int i;
    for (i = 0; i < net.n; ++i) {
        printf("layer num: %d \t", i);
        state.index = i;
        layer l = net.layers[i];

        if (l.type == CONVOLUTIONAL) {
            printf(" CONVOLUTIONAL \n");
            forward_convolutional_layer_cpu(l, state);

            if (check) {
                size_t count = (size_t)l.outputs;
                float *buffer = malloc(count * sizeof *buffer);

                /* A failed allocation or a short read means this layer
                 * cannot be verified, which is reported as a check error. */
                if (!buffer || fread(buffer, sizeof *buffer, count, check) != count) {
                    printf("Check error!\n");
                } else {
                    /* Exact comparison: the reference is expected to come
                     * from the same computation, bit for bit. */
                    for (size_t k = 0; k < count; ++k) {
                        if (buffer[k] != l.output[k]) {
                            printf("Check error!\n");
                            break;
                        }
                    }
                }
                /* Release the per-layer scratch buffer (free(NULL) is a no-op). */
                free(buffer);
            }
        }
        else if (l.type == MAXPOOL) {
            printf(" MAXPOOL \n");
            forward_maxpool_layer_cpu(l, state);
        }

        /* The output of this layer is the input of the next one. */
        state.input = l.output;
    }

    if (check) {
        fclose(check);
        printf("Check done!\n");
    }
}

//input
~/datasets/xxr/yolo2_light/bin$ ./darknet detector test voc.data head-hw-v2.cfg obj_9000.weights dog.jpg

//output
......
FilterNum:1024,Input Channels:1024,Input Height:13,input width:13,FilterSize:3
layer num: 14    CONVOLUTIONAL
FilterNum:30,Input Channels:1024,Input Height:13,input width:13,FilterSize:1
layer num: 15    REGION
Check done!
dog.jpg: Predicted in 1.412451 seconds.

程序输出Check done!，并且中间没有出现Check error!，表示验证正确。

3.卷积运算

3.1 zynqNet中卷积的运算

oid ProcessingElement::macc2d(const data_t pixels[9], const data_t weights[9],
                               data_t& result) {
#pragma HLS inline

  data_t accumulator = 0.0f;
  data_t multresult[9];
#pragma HLS ARRAY_PARTITION variable = multresult complete dim = 0

L_MACC_multiply:
  for (int i = 0; i < 9; i++) {
#pragma HLS UNROLL
    multresult[i] = pixels[i] * weights[i];
  }

L_MACC_accumulate:
  for (int i = 0; i < 9; i++) {
#pragma HLS UNROLL
    accumulator = accumulator + multresult[i];
  }

  LOG("PE: macc2D -> %.2f \n", accumulator);

  result = accumulator;
}

运用ARRAY_PARTITION指令（配合UNROLL指令将循环展开）可以使9个乘法一起并行运算。先乘再加。

3.2 原始的卷积语句

yolov2_forward_network.c文件中，forward_convolutional_layer_cpu 函数

    // l.n - number of filters on this layer
    // l.c - channels of input-array
    // l.h - height of input-array
    // l.w - width of input-array
    // l.size - width and height of filters (the same size for all filters)

	printf("FilterNum:%d,Input Channels:%d,Input Height:%d,input width:%d,FilterSize:%d\n",
	l.n,l.c,l.h,l.w,l.size);

    // 1. Convolution !!!
#ifndef GEMMCONV
    // Direct (non-GEMM) convolution: for every filter, input channel and
    // pixel position, the products of the l.size x l.size window with the
    // filter weights are summed and added into l.output.
    // NOTE(review): the output is indexed with the input's l.w and l.h, so
    // this presumably assumes stride 1 and an output of the same size as the
    // input - confirm against the layer setup.
    int fil;
    // filter index
#pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
    for (fil = 0; fil < l.n; ++fil) {
        // Loop variables are declared inside the parallel loop body.
        int chan, y, x, f_y, f_x;
        // channel index
        for (chan = 0; chan < l.c; ++chan)
            // input - y
            for (y = 0; y < l.h; ++y)
                // input - x
                for (x = 0; x < l.w; ++x)
                {
                    // Flat offsets: output is laid out as [filter][y][x],
                    // weights as [filter][channel][f_y][f_x] and
                    // input as [channel][y][x].
                    int const output_index = fil*l.w*l.h + y*l.w + x;
                    int const weights_pre_index = fil*l.c*l.size*l.size + chan*l.size*l.size;
                    int const input_pre_index = chan*l.w*l.h;
                    // Partial sum for this (filter, channel, y, x) only.
                    float sum = 0;

                    // filter - y
                    for (f_y = 0; f_y < l.size; ++f_y)
                    {
                        int input_y = y + f_y - l.pad;
                        // filter - x
                        for (f_x = 0; f_x < l.size; ++f_x)
                        {
                            int input_x = x + f_x - l.pad;
                            // Positions outside the input contribute nothing,
                            // which has the same effect as zero padding.
                            if (input_y < 0 || input_x < 0 || input_y >= l.h || input_x >= l.w) continue;

                            int input_index = input_pre_index + input_y*l.w + input_x;
                            int weights_index = weights_pre_index + f_y*l.size + f_x;

                            sum += state.input[input_index] * l.weights[weights_index];
                        }
                    }
                    // l.output[filters][width][height] +=
                    //        state.input[channels][width][height] *
                    //        l.weights[filters][channels][filter_width][filter_height];
                    // Accumulated across channels with "+=".
                    // NOTE(review): l.output is presumably initialised before
                    // this loop - not visible in this excerpt, confirm.
                    l.output[output_index] += sum;
                }
    }

3.3 改为定值卷积

第14层之外的卷积层的卷积核大小均为3*3，只有第14层为1*1卷积核。因此除第14层单独处理外，将卷积核大小固定为定值3*3。

if(state.index!=14){
	for (f_y = 0; f_y < 3; ++f_y)
	{
		int input_y = y + f_y - l.pad;
		// filter - x
		for (f_x = 0; f_x < 3; ++f_x)
		{
			int input_x = x + f_x - l.pad;
			if (input_y < 0 || input_x < 0 || input_y >= l.h || input_x >= l.w) continue;

			int input_index = input_pre_index + input_y*l.w + input_x;
			int weights_index = weights_pre_index + f_y*l.size + f_x;

			sum += state.input[input_index] * l.weights[weights_index];
		}
	}
	
else{
	// filter - y
	f_y = 0;
	int input_y = y + f_y - l.pad;
		// filter - x
	f_x = 0;
	
	int input_x = x + f_x - l.pad;
	if (input_y < 0 || input_x < 0 || input_y >= l.h || input_x >= l.w) continue;

	int input_index = input_pre_index + input_y*l.w + input_x;
	int weights_index = weights_pre_index + f_y*l.size + f_x;

	sum += state.input[input_index] * l.weights[weights_index];
	
}

3.4 3*3的9并行

改为9的并行，将3*3的卷积作为基本的卷积单元。

// 3x3 convolution as one basic unit of 9 multiply-accumulate lanes:
// gather the window and its weights into flat 9-element buffers, multiply
// lane by lane, then add the 9 products to sum.
//
// The input and weight buffers are zero-initialised so that lanes belonging
// to positions outside the input (skipped by "continue" below) contribute
// exactly 0 instead of being read uninitialised.
float input_buffer[9] = {0}, weight_buffer[9] = {0}, product_buffer[9];
for (f_y = 0; f_y < 3; ++f_y) {
	int input_y = y + f_y - l.pad;
	// filter - x
	for (f_x = 0; f_x < 3; ++f_x)
	{
		int input_x = x + f_x - l.pad;
		// Outside the input: leave the lane at 0 (zero padding).
		if (input_y < 0 || input_x < 0 || input_y >= l.h || input_x >= l.w) continue;

		int input_index = input_pre_index + input_y*l.w + input_x;
		int weights_index = weights_pre_index + f_y*l.size + f_x;

		// Row-major lane index inside the 3x3 window.
		int buffer_idx = 3*f_y + f_x;
		input_buffer[buffer_idx] = state.input[input_index];
		weight_buffer[buffer_idx] = l.weights[weights_index];
	}
}

// The lane index is declared locally so that it is private to each
// iteration of any enclosing parallel loop.
// Step 1: 9 independent multiplications.
for (int lane = 0; lane < 9; lane++) {
	product_buffer[lane] = input_buffer[lane] * weight_buffer[lane];
}
// Step 2: accumulate the 9 products.
for (int lane = 0; lane < 9; lane++) {
	sum += product_buffer[lane];
}

至此，我们将卷积改为了最基本的9个并行的运算，并用2.2中的方法（与check.txt中保存的卷积结果逐一比较）验证通过。

YOLOv3：Darknet代码解析（五）简化的程序与卷积拆分

1.函数的功能与位置

2.检验

2.1 程序输出的检验

2.2 卷积的检验

3.卷积运算

3.1 zynqNet中卷积的运算

3.2 原始的卷积语句

3.3 改为定值卷积

3.4 3*3的9并行

猜你喜欢