YOLO源码详解（三）- 前向传播（forward）

本系列作者：木凌
时间：2016年11月。
文章链接：http://blog.csdn.net/u014540717

一、主函数void forward_network(network net, network_state state)

//network.c
/*
 * Run the forward pass of every layer of `net`, in order.
 *
 * `state` is received by value: its workspace is pointed at the network's
 * workspace, and after each layer has run, that layer's output becomes the
 * input handed to the next layer.
 */
void forward_network(network net, network_state state)
{
    int idx = 0;

    state.workspace = net.workspace;
    while(idx < net.n){
        layer cur = net.layers[idx];

        state.index = idx;
        // If the layer owns a delta buffer, scale it by 0 before the pass.
        // NOTE(review): scal_cpu is presumably the BLAS-style "x *= alpha",
        // so a factor of 0 clears the buffer -- confirm against blas.c.
        if(cur.delta){
            scal_cpu(cur.outputs * cur.batch, 0, cur.delta, 1);
        }
        // Dispatch through the layer's own forward function pointer. The
        // walkthrough that follows covers these layer types in turn:
        // [convolutional], [maxpool], [local], [dropout], [connected],
        // [detection].
        cur.forward(cur, state);
        state.input = cur.output;
        ++idx;
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23

1、前向传播-convolutional层

//convolutional_layer.c
/*
 * Forward pass of a convolutional layer on the CPU.
 *
 * For every image in the batch the input is unrolled with im2col_cpu() into
 * state.workspace and multiplied by the filter weights with gemm(); the
 * result is accumulated into l.output. Optional batch normalisation, the
 * per-filter bias and the layer's activation are then applied to the whole
 * output buffer.
 *
 * l     - the layer (passed by value; pointer members still refer to the
 *         layer's real buffers).
 * state - carries the input pointer, the shared workspace and the train flag.
 */
void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    // Height and width of the output feature maps.
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i;

    // Initialise the whole output buffer (all batch items) to 0; gemm() below
    // is called with BETA = 1 and therefore accumulates into it.
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    /*
       if(l.binary){
       binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
       binarize_weights2(l.weights, l.n, l.c*l.size*l.size, l.cweights, l.scales);
       swap_binary(&l);
       }
     */

    /*
       if(l.binary){
       int m = l.n;
       int k = l.size*l.size*l.c;
       int n = out_h*out_w;

       char  *a = l.cweights;
       float *b = state.workspace;
       float *c = l.output;

       for(i = 0; i < l.batch; ++i){
       im2col_cpu(state.input, l.c, l.h, l.w, 
       l.size, l.stride, l.pad, b);
       gemm_bin(m,n,k,1,a,k,b,n,c,n);
       c += n*m;
       state.input += l.c*l.h*l.w;
       }
       scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
       add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
       activate_array(l.output, m*n*l.batch, l.activation);
       return;
       }
     */

    // XNOR mode: binarise the weights and the input, and read the input from
    // l.binary_input from here on.
    // NOTE(review): swap_binary() presumably exchanges l.weights with
    // l.binary_weights on this local copy of the layer -- confirm.
    if(l.xnor){
        binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
        state.input = l.binary_input;
    }
    // m: number of filters; k: number of weights per filter (l.size is the
    // kernel side length); n: number of pixels in one output feature map.
    int m = l.n;
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;

    if (l.xnor && l.c%32 == 0 && AI2) {
        forward_xnor_layer(l, state);
        printf("xnor\n");
    } else {

        // a: filter weights (per the original walkthrough, allocated as
        //    calloc(c*n*size*size, sizeof(float)) -- allocation not visible here),
        // b: shared workspace that receives the im2col matrix,
        // c: output buffer.
        float *a = l.weights;
        float *b = state.workspace;
        float *c = l.output;

        for(i = 0; i < l.batch; ++i){
            // im2col ("image to column"): unroll the image according to the
            // kernel size so the convolution becomes one matrix product.
            im2col_cpu(state.input, l.c, l.h, l.w,
                    l.size, l.stride, l.pad, b);
            // The matrix product (m x k) * (k x n) is the convolution itself.
            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
            // Advance to the output slot of the next image ...
            c += n*m;
            // ... and to the next input image.
            state.input += l.c*l.h*l.w;
        }
    }
    // Optional batch normalisation.
    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    }
    // Add the per-filter bias.
    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    // Apply the activation selected by l.activation to every output value
    // (the original walkthrough states this is leaky ReLU for the YOLO config).
    activate_array(l.output, m*n*l.batch, l.activation);
    // NOTE(review): binary/xnor appear to select binarised-weight variants of
    // the layer; this second swap_binary() presumably undoes the swap made
    // above -- confirm against swap_binary(). Since `l` is a by-value copy,
    // the swap only affects this local struct.
    if(l.binary || l.xnor) swap_binary(&l);
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
   
   49
   
   50
   
   51
   
   52
   
   53
   
   54
   
   55
   
   56
   
   57
   
   58
   
   59
   
   60
   
   61
   
   62
   
   63
   
   64
   
   65
   
   66
   
   67
   
   68
   
   69
   
   70
   
   71
   
   72
   
   73
   
   74
   
   75
   
   76
   
   77
   
   78
   
   79
   
   80
   
   81
   
   82
   
   83
   
   84
   
   85
   
   86
   
   87
   
   88
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
   
   49
   
   50
   
   51
   
   52
   
   53
   
   54
   
   55
   
   56
   
   57
   
   58
   
   59
   
   60
   
   61
   
   62
   
   63
   
   64
   
   65
   
   66
   
   67
   
   68
   
   69
   
   70
   
   71
   
   72
   
   73
   
   74
   
   75
   
   76
   
   77
   
   78
   
   79
   
   80
   
   81
   
   82
   
   83
   
   84
   
   85
   
   86
   
   87
   
   88

函数剖析

a. im2col_cpu()

这个函数还是很重要的，我们来分析下。这个函数是从caffe中移植过来的

//im2col.c
//From Berkeley Vision's Caffe!
//https://github.com/BVLC/caffe/blob/master/LICENSE
/*
 * Read one pixel of a channel-major (channel, row, col) image.
 *
 * `row` and `col` are coordinates in the padded image; `pad` is subtracted
 * to obtain coordinates in the real image, and anything that falls outside
 * the image reads as 0 (zero padding).
 *
 * Defined before im2col_cpu() so the call below sees a proper declaration:
 * implicit function declarations are not valid since C99, and an implicit
 * `int` return type would conflict with this `float` definition.
 *
 * `channels` is unused; it is kept so the signature stays unchanged.
 */
float im2col_get_pixel(float *im, int height, int width, int channels,
                        int row, int col, int channel, int pad)
{
    row -= pad;
    col -= pad;

    if (row < 0 || col < 0 ||
        row >= height || col >= width) {
        return 0;
    }
    return im[col + width*(row + height*channel)];
}

/*
 * Unroll an image into the "column" matrix used to express convolution as a
 * matrix product.
 *
 * data_im  - input image, channels x height x width, channel-major.
 * ksize    - kernel side length; stride - kernel step; pad - zero padding.
 * data_col - output matrix with (channels*ksize*ksize) rows and
 *            (height_col*width_col) columns, row-major. Row `c` holds, for
 *            every kernel position, the pixel at kernel offset
 *            (h_offset, w_offset) of input channel c_im.
 */
void im2col_cpu(float* data_im,
     int channels,  int height,  int width,
     int ksize,  int stride, int pad, float* data_col)
{
    int c,h,w;
    // Number of kernel positions along each axis.
    int height_col = (height + 2*pad - ksize) / stride + 1;
    int width_col = (width + 2*pad - ksize) / stride + 1;

    int channels_col = channels * ksize * ksize;
    // Outer loop: one iteration per weight of a single filter.
    for (c = 0; c < channels_col; ++c) {
        // Decompose c into (input channel, kernel row, kernel column).
        int w_offset = c % ksize;
        int h_offset = (c / ksize) % ksize;
        int c_im = c / ksize / ksize;
        // Inner loops: slide the kernel over every output position.
        for (h = 0; h < height_col; ++h) {
            for (w = 0; w < width_col; ++w) {
                int im_row = h_offset + h * stride;
                int im_col = w_offset + w * stride;
                int col_index = (c * height_col + h) * width_col + w;
                data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
                        im_row, im_col, c_im, pad);
            }
        }
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40

我画了下面的图来帮助理解下im2col_cpu()这个函数，为了方便理解，这里假设图像尺寸是5*5， stride=2，kernel_size=3
这里写图片描述
float *b指向state.workspace这个工作空间，也就是把每个卷积窗口内的像素展开成矩阵的一列（整个矩阵共 channels×ksize×ksize 行、height_col×width_col 列）放到工作空间里，然后通过矩阵乘法完成卷积计算

b. gemm()

这个函数实现通用矩阵乘法（GEMM）：C = ALPHA·op(A)·op(B) + BETA·C，其中TA、TB决定A、B是否转置。卷积层先用im2col_cpu()把输入展开成矩阵，再调用它完成卷积的运算

//gemm.c
/*
 * Public matrix-multiply entry point.
 *
 * Forwards every argument, unchanged and in the same order, to gemm_cpu(),
 * which scales C by BETA and then accumulates ALPHA times the product of A
 * and B into it. TA / TB select which multiplication kernel gemm_cpu() uses.
 */
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    gemm_cpu(TA, TB, M, N, K, ALPHA,
             A, lda,
             B, ldb,
             BETA,
             C, ldc);
}

/*
 * CPU matrix multiply with accumulation.
 *
 * First multiplies the M x N block of C (row stride ldc) by BETA in place,
 * then hands off to one of four kernels, chosen by the TA / TB flags, which
 * add ALPHA times the product into C:
 *   TA == 0, TB == 0 -> gemm_nn
 *   TA != 0, TB == 0 -> gemm_tn
 *   TA == 0, TB != 0 -> gemm_nt
 *   TA != 0, TB != 0 -> gemm_tt
 */
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    int row, col;

    // Pre-scale the destination block by BETA.
    for(row = 0; row < M; ++row){
        int base = row*ldc;
        for(col = 0; col < N; ++col){
            C[base + col] *= BETA;
        }
    }

    if(TA){
        if(TB){
            gemm_tt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
        } else {
            gemm_tn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
        }
    } else {
        if(TB){
            gemm_nt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
        } else {
            // The path taken by the convolutional layer: gemm(0,0,...).
            gemm_nn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
        }
    }
}

/*
 * C += ALPHA * A * B with neither operand transposed.
 *
 * A is M x K (row stride lda), B is K x N (row stride ldb) and C is M x N
 * (row stride ldc), all row-major. The loops run row / inner / column so the
 * innermost loop walks B and C contiguously.
 */
void gemm_nn(int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float *C, int ldc)
{
    int row, inner, col;
    for(row = 0; row < M; ++row){
        int a_base = row*lda;
        int c_base = row*ldc;
        for(inner = 0; inner < K; ++inner){
            // `register` is only a hint asking the compiler to keep this
            // loop-invariant factor out of memory.
            register float scaled_a = ALPHA*A[a_base + inner];
            int b_base = inner*ldb;
            for(col = 0; col < N; ++col){
                C[c_base + col] += scaled_a*B[b_base + col];
            }
        }
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
   
   49
   
   50
   
   51
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
   
   49
   
   50
   
   51

这个函数将卷积后的值放入c所指向的内存中（在上面5*5图像、stride=2、kernel_size=3的例子里，最终生成number of kernel个2*2的feature map）
这里写图片描述
每处理完batch中的一张图像，指针c就向后移动n*m个元素，于是下一张图像的卷积计算结果依次向后存放

c. forward_batchnorm_layer()

这个函数实现batch normalization，加速了训练的收敛过程，详细见这篇论文 Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift

//batchnorm_layer.c
/*
 * Batch-normalise l.output in place and then apply the per-channel scale.
 *
 * - For a stand-alone BATCHNORM layer the input is first copied to l.output.
 * - For a CONNECTED layer the local copy of `l` is reshaped so that every
 *   output is its own channel with a 1x1 spatial extent.
 * - When state.train is set, mean and variance are computed from the current
 *   batch; otherwise the stored rolling statistics are used.
 */
void forward_batchnorm_layer(layer l, network_state state)
{
    int spatial;

    if(l.type == BATCHNORM){
        copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    }
    if(l.type == CONNECTED){
        l.out_c = l.outputs;
        l.out_h = 1;
        l.out_w = 1;
    }
    // Number of values per channel in one image.
    spatial = l.out_h*l.out_w;

    if(!state.train){
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, spatial);
    } else {
        mean_cpu(l.output, l.batch, l.out_c, spatial, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, spatial, l.variance);
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, spatial);
    }
    scale_bias(l.output, l.scales, l.batch, l.out_c, spatial);
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17

//blas.c
/*
 * Per-filter mean over the batch and spatial dimensions.
 *
 * x is laid out as [batch][filters][spatial]; mean[f] receives the average
 * of all batch*spatial values belonging to filter f, i.e. the statistic is
 * shared by every image in the batch.
 */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    float scale = 1./(batch * spatial);
    int f, b, s;
    for(f = 0; f < filters; ++f){
        mean[f] = 0;
        for(b = 0; b < batch; ++b){
            // Offset of filter f inside image b.
            int base = b*filters*spatial + f*spatial;
            for(s = 0; s < spatial; ++s){
                mean[f] += x[base + s];
            }
        }
        mean[f] *= scale;
    }
}
/*
 * Per-filter sample variance over the batch and spatial dimensions.
 *
 * x is laid out as [batch][filters][spatial] and mean[f] must already hold
 * the mean of filter f. variance[f] receives
 *     sum((x - mean[f])^2) / (batch*spatial - 1).
 *
 * When there is at most one sample per filter (batch*spatial <= 1) the
 * divisor would be 0; the original then produced 0 * inf = NaN for every
 * filter. Such inputs now yield a variance of 0.
 */
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
{
    int count = batch * spatial;
    // Guard the single-sample case instead of dividing by zero.
    float scale = (count > 1) ? 1./(count - 1) : 0;
    int i,j,k;
    for(i = 0; i < filters; ++i){
        variance[i] = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + i*spatial + k;
                // Square in double, as pow(d, 2) did; the square of a float
                // difference is exactly representable in double.
                double diff = x[index] - mean[i];
                variance[i] += diff*diff;
            }
        }
        variance[i] *= scale;
    }
}
//归一化
void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
{
    int b, f, i;
    for(b = 0; b < batch; ++b){
        for(f = 0; f < filters; ++f){
            for(i = 0; i < spatial; ++i){
                int index = b*filters*spatial + f*spatial + i;
                //公式中的ε=.000001f
                x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
            }
        }
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48

//convolutional_layer.c
/*
 * Multiply every value of channel i by scales[i], in place.
 *
 * output is laid out as [batch][n][size]: `n` channels per image and `size`
 * values per channel.
 * NOTE(review): the original walkthrough says scales is the layer's l.scales
 * array and is initialised to all ones -- not visible in this file.
 */
void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    int img, ch, px;
    for(img = 0; img < batch; ++img){
        for(ch = 0; ch < n; ++ch){
            int base = (img*n + ch)*size;
            for(px = 0; px < size; ++px){
                output[base + px] *= scales[ch];
            }
        }
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13

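上面这几个函数（mean_cpu、variance_cpu、normalize_cpu、scale_bias）分别对应论文中的如下公式：
$$\mu_B=\frac{1}{m}\sum_{i=1}^{m}x_i,\qquad \sigma_B^2=\frac{1}{m}\sum_{i=1}^{m}(x_i-\mu_B)^2$$
$$\hat{x}_i=\frac{x_i-\mu_B}{\sqrt{\sigma_B^2+\epsilon}},\qquad y_i=\gamma\hat{x}_i+\beta$$
（注意代码与论文略有差别：variance_cpu除以的是m-1而不是m，normalize_cpu中的ε是加在开方之后的。）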
这里写图片描述

d. add_bias()

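这个函数的循环结构和scale_bias()完全相同，区别只在最内层的那一条语句：scale_bias()是把每个通道的输出乘以scales[i]（对应BN公式中的γ），而add_bias()是把每个通道的输出加上偏置biases[i]，即 output[(b*n + i)*size + j] += biases[i]。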

//convolutional_layer.c
/*
 * Add the per-channel bias biases[i] to every value of channel i, in place.
 *
 * output is laid out as [batch][n][size]: `n` channels per image and `size`
 * values per channel. The loop structure mirrors scale_bias(); only the
 * innermost statement differs (addition instead of multiplication).
 *
 * This section is titled add_bias(), and forward_convolutional_layer() calls
 * add_bias(), but the listing here was a second copy of scale_bias(), which
 * is a duplicate definition. It is replaced by the function the section
 * describes.
 */
void add_bias(float *output, float *biases, int batch, int n, int size)
{
    int i,j,b;
    for(b = 0; b < batch; ++b){
        for(i = 0; i < n; ++i){
            for(j = 0; j < size; ++j){
                output[(b*n + i)*size + j] += biases[i];
            }
        }
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12

2、前向传播-maxpool layer

//池化层相对要简单很多，如果理解了卷积层，这一层就很好理解

/*
 * Forward pass of a max-pooling layer.
 *
 * For every output cell the l.size x l.size input window is scanned;
 * positions outside the input count as -FLT_MAX. The largest value is
 * written to l.output and the flat input index it came from to l.indexes
 * (-1 if no position in the window beat -FLT_MAX).
 */
void forward_maxpool_layer(const maxpool_layer l, network_state state)
{
    int b, ch, oy, ox, ky, kx;
    // The window of output cell 0 starts `pad` positions before the input.
    int x_start = -l.pad;
    int y_start = -l.pad;

    // Output height / width and channel count.
    int out_h = l.out_h;
    int out_w = l.out_w;
    int channels = l.c;

    for(b = 0; b < l.batch; ++b){
        for(ch = 0; ch < channels; ++ch){
            for(oy = 0; oy < out_h; ++oy){
                for(ox = 0; ox < out_w; ++ox){
                    int out_index = ox + out_w*(oy + out_h*(ch + channels*b));
                    float best = -FLT_MAX;
                    int best_index = -1;
                    // Scan the pooling window for its maximum.
                    for(ky = 0; ky < l.size; ++ky){
                        int in_y = y_start + oy*l.stride + ky;
                        for(kx = 0; kx < l.size; ++kx){
                            int in_x = x_start + ox*l.stride + kx;
                            int index = in_x + l.w*(in_y + l.h*(ch + b*l.c));
                            int inside = (in_y >= 0 && in_y < l.h &&
                                          in_x >= 0 && in_x < l.w);
                            float val = inside ? state.input[index] : -FLT_MAX;
                            if(val > best){
                                best_index = index;
                                best = val;
                            }
                        }
                    }
                    l.output[out_index] = best;
                    l.indexes[out_index] = best_index;
                }
            }
        }
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38

3、前向传播-local layer

顾名思义，local层（局部连接层）和卷积层类似，每个输出位置只看前一层的一个局部窗口；不同的是各个位置之间不共享权重，每个输出位置都有自己独立的一组卷积核参数（代码中按位置j偏移l.weights）

/*
 * Forward pass of a locally connected layer.
 *
 * The output of every image starts as a copy of l.biases. The image is then
 * unrolled with im2col_cpu() into l.col_image and, for each output location
 * separately, that location's own block of weights is multiplied with the
 * matching column of the unrolled image. The weights are therefore not
 * shared between locations. Finally the activation is applied.
 */
void forward_local_layer(const local_layer l, network_state state)
{
    int out_h = local_out_height(l);
    int out_w = local_out_width(l);
    int locations = out_h * out_w;
    int b, loc;

    // Seed every image's output with the biases.
    for(b = 0; b < l.batch; ++b){
        copy_cpu(l.outputs, l.biases, 1, l.output + b*l.outputs, 1);
    }

    for(b = 0; b < l.batch; ++b){
        float *input = state.input + b*l.w*l.h*l.c;
        float *output = l.output + b*l.outputs;

        im2col_cpu(input, l.c, l.h, l.w, 
                l.size, l.stride, l.pad, l.col_image);

        for(loc = 0; loc < locations; ++loc){
            // Weights, input column and output column of this location.
            float *a = l.weights + loc*l.size*l.size*l.c*l.n;
            float *col = l.col_image + loc;
            float *dst = output + loc;

            int filters = l.n;
            // One column per call: each gemm handles a single location, and
            // the stride `locations` steps down that column.
            int one = 1;
            int patch = l.size*l.size*l.c;

            gemm(0,0,filters,one,patch,1,a,patch,col,locations,1,dst,locations);
        }
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31

4、前向传播-dropout layer

dropout层主要是为了防止过拟合，详细可以自己搜索，这里不做过多介绍

/*
 * Forward pass of a dropout layer; modifies state.input in place.
 *
 * Does nothing unless state.train is set. During training one random number
 * is drawn per input value and stored in l.rand; values whose draw is below
 * l.probability are zeroed, all others are multiplied by l.scale.
 */
void forward_dropout_layer(dropout_layer l, network_state state)
{
    int idx;
    if (state.train) {
        for(idx = 0; idx < l.batch * l.inputs; ++idx){
            // NOTE(review): rand_uniform(0, 1) presumably returns a uniform
            // sample between 0 and 1 -- confirm against utils.c.
            float draw = rand_uniform(0, 1);
            l.rand[idx] = draw;
            if(draw < l.probability){
                state.input[idx] = 0;
            } else {
                state.input[idx] *= l.scale;
            }
        }
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14

5、前向传播-connected layer

/*
 * Forward pass of a fully connected layer.
 *
 * Computes output = input * weights^T for the whole batch, optionally
 * batch-normalises the result, adds the biases to every sample and applies
 * the layer's activation.
 */
void forward_connected_layer(connected_layer l, network_state state)
{
    int sample;
    int rows = l.batch;      // one row per sample
    int depth = l.inputs;    // shared inner dimension
    int cols = l.outputs;    // one column per output unit
    float *in = state.input;
    float *w = l.weights;
    float *out = l.output;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    // TA = 0, TB = 1: gemm_cpu() dispatches this to gemm_nt(), i.e. the
    // weight matrix is used transposed (stored as outputs x inputs).
    gemm(0,1,rows,cols,depth,1,in,depth,w,depth,1,out,cols);

    if(l.batch_normalize){
        if(state.train){
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

            // Moving averages of the batch statistics. Assuming BLAS-style
            // scal (x *= a) and axpy (y += a*x), this is
            //   rolling = 0.95 * rolling + 0.05 * current
            // NOTE(review): confirm scal_cpu / axpy_cpu semantics in blas.c.
            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);

            // Keep the pre-normalisation activations in l.x ...
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            // ... normalise with the current batch statistics ...
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            // ... and keep the normalised activations in l.x_norm.
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            // Outside training the rolling statistics are used instead.
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }

    // Add the bias vector to the output row of every sample.
    for(sample = 0; sample < l.batch; ++sample){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + sample*l.outputs, 1);
    }
    // Apply the activation selected by l.activation (per the original
    // walkthrough a linear activation leaves the values unchanged).
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42

//gemm.c
/*
 * C += ALPHA * A * B^T.
 *
 * A is M x K (row stride lda) and B is stored as N x K (row stride ldb), so
 * each output element is the dot product of a row of A with a row of B.
 * For the connected layer M is the batch size, N the number of outputs and
 * K the number of inputs.
 */
void gemm_nt(int M, int N, int K, float ALPHA, 
        float *A, int lda, 
        float *B, int ldb,
        float *C, int ldc)
{
    int row, col, inner;
    for(row = 0; row < M; ++row){
        int a_base = row*lda;
        for(col = 0; col < N; ++col){
            int b_base = col*ldb;
            register float dot = 0;
            // Multiply matching input and weight entries and accumulate.
            for(inner = 0; inner < K; ++inner){
                dot += ALPHA*A[a_base + inner]*B[b_base + inner];
            }
            C[row*ldc + col] += dot;
        }
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20

6、前向传播-detection layer

这是最后一层了，这也是作者论文的精髓所在，希望大家能对比论文认真看一下。
前向传播终于要结束了，这篇博文有点小长～～

/*
 * Forward pass of the detection layer.
 *
 * Copies state.input into l.output, optionally applies softmax to each cell's
 * class scores, and -- only when state.train is set -- fills l.delta with the
 * scaled error terms, stores the resulting cost in *(l.cost) and prints
 * running averages.
 *
 * Layout of one batch item in l.output / l.delta, as indexed below
 * (locations = l.side*l.side):
 *   [locations * l.classes]          class scores
 *   [locations * l.n]                per-box confidences
 *   [locations * l.n * l.coords]     box values
 *
 * Layout of one cell in state.truth, as indexed below:
 *   [1 object flag][l.classes class targets][l.coords box values]
 */
void forward_detection_layer(const detection_layer l, network_state state)
{
    int locations = l.side*l.side;
    int i,j;
    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
    //if(l.reorg) reorg(l.output, l.w*l.h, size*l.n, l.batch, 1);
    int b;
    // Article author's note: softmax=0 in the cfg discussed here, so this branch
    // is skipped and no softmax is applied at the end.
    if (l.softmax){
        for(b = 0; b < l.batch; ++b){
            int index = b*l.inputs;
            for (i = 0; i < locations; ++i) {
                int offset = i*l.classes;
                softmax(l.output + index + offset, l.classes, 1,
                        l.output + index + offset);
            }
        }
    }
    // The cost function is only needed while training.
    if(state.train){
        float avg_iou = 0;
        float avg_cat = 0;
        float avg_allcat = 0;
        float avg_obj = 0;
        float avg_anyobj = 0;
        int count = 0;
        *(l.cost) = 0;
        int size = l.inputs * l.batch;
        /* void *memset(void *s, int ch, size_t n);
        memset sets the first n bytes of the memory pointed to by s to the
        value ch. */
        memset(l.delta, 0, size * sizeof(float));// l.delta receives every per-element error term written below
        for (b = 0; b < l.batch; ++b){
            int index = b*l.inputs;
            // locations = l.side*l.side (7*7 according to the article)
            for (i = 0; i < locations; ++i) {
                // Each truth cell holds 1 object flag + l.classes class targets
                // + l.coords box values (x, y, w, h per the article).
                // truth_index is the start of this cell's ground-truth record.
                int truth_index = (b*locations + i)*(1+l.coords+l.classes);
                int is_obj = state.truth[truth_index];
                // Confidence loss.
                for (j = 0; j < l.n; ++j) {
                    // p_index is the confidence slot of box j in cell i; every cell
                    // predicts l.n boxes (article author's note: l.n=3 from the cfg
                    // "num" value, whereas the paper uses 2).
                    int p_index = index + locations*l.classes + i*l.n + j;
                    l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
                    // Matches the paper's formula: start by assuming none of the
                    // boxes in this cell contains an object.
                    *(l.cost) += l.noobject_scale*pow(l.output[p_index], 2);
                    avg_anyobj += l.output[p_index];
                }

                int best_index = -1;
                float best_iou = 0;
                // Upper bound for the rmse fallback: a box is only selected by
                // rmse when its value is below the current best_rmse.
                float best_rmse = 20;

                // Cells without an object contribute only the no-object
                // confidence terms above.
                if (!is_obj){
                    continue;
                }

                int class_index = index + i*l.classes;                
                for(j = 0; j < l.classes; ++j) {
                    // Classification loss.
                    l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
                    *(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
                    if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
                    avg_allcat += l.output[class_index+j];
                }

                // Localization loss.
                box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
                // NOTE(review): x and y are divided by l.side before comparison --
                // presumably converting cell-relative offsets to image-relative
                // units; confirm against the data loader.
                truth.x /= l.side;
                truth.y /= l.side;

                /* Find the responsible predictor box (We only predict one set of class probabilities per grid cell, 
                regardless of the number of boxes B) */
                for(j = 0; j < l.n; ++j){
                    int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
                    box out = float_to_box(l.output + box_index);
                    out.x /= l.side;
                    out.y /= l.side;

                    // With l.sqrt set, the stored w/h are squared before comparing
                    // (the deltas further down use sqrt of the truth w/h).
                    if (l.sqrt){
                        out.w = out.w*out.w;
                        out.h = out.h*out.h;
                    }

                    // IoU between the predicted box and the truth box.
                    float iou  = box_iou(out, truth);
                    //iou = 0;
                    // Root-mean-square error between the two boxes.
                    float rmse = box_rmse(out, truth);
                    // Pick the box with the largest IoU; if no box so far has a
                    // positive IoU, fall back to the one with the smallest rmse.
                    if(best_iou > 0 || iou > 0){
                        if(iou > best_iou){
                            best_iou = iou;
                            best_index = j;
                        }
                    }else{
                        if(rmse < best_rmse){
                            best_rmse = rmse;
                            best_index = j;
                        }
                    }
                }
                // Forced assignment: truth boxes with area below .1 go to box 1,
                // all others to box 0.
                if(l.forced){
                    if(truth.w*truth.h < .1){
                        best_index = 1;
                    }else{
                        best_index = 0;
                    }
                }
                // Random assignment while fewer than 64000 samples have been seen.
                if(l.random && *(state.net.seen) < 64000){
                    best_index = rand()%l.n;
                }

                // Index of the selected predicted box.
                int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
                // Index of the ground-truth box.
                int tbox_index = truth_index + 1 + l.classes;

                box out = float_to_box(l.output + box_index);
                out.x /= l.side;
                out.y /= l.side;
                if (l.sqrt) {
                    out.w = out.w*out.w;
                    out.h = out.h*out.h;
                }
                float iou  = box_iou(out, truth);

                //printf("%d,", best_index);
                int p_index = index + locations*l.classes + i*l.n + best_index;
                // The no-object term was added for every box above; remove it for
                // the selected box and add the object term instead.
                *(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
                *(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
                avg_obj += l.output[p_index];
                l.delta[p_index] = l.object_scale * (1.-l.output[p_index]);

                // With l.rescore set, the confidence target is the IoU rather than 1.
                if(l.rescore){
                    l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
                }

                l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
                l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
                l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
                l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
                // With l.sqrt set, the w/h targets are the square roots of the truth values.
                if(l.sqrt){
                    l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
                    l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
                }

                // Adds (1-iou)^2 as a cost term covering x, y, w, h together. This
                // running value is not what ends up in *(l.cost): it is overwritten
                // after the loops from l.delta.
                *(l.cost) += pow(1-iou, 2);
                avg_iou += iou;
                ++count;
            }
        }
        // Not used in the paper; the if(0) guard means this block never runs.
        if(0){
            float *costs = calloc(l.batch*locations*l.n, sizeof(float));
            for (b = 0; b < l.batch; ++b) {
                int index = b*l.inputs;
                for (i = 0; i < locations; ++i) {
                    for (j = 0; j < l.n; ++j) {
                        int p_index = index + locations*l.classes + i*l.n + j;
                        costs[b*locations*l.n + i*l.n + j] = l.delta[p_index]*l.delta[p_index];
                    }
                }
            }
            int indexes[100];
            top_k(costs, l.batch*locations*l.n, 100, indexes);
            float cutoff = costs[indexes[99]];
            for (b = 0; b < l.batch; ++b) {
                int index = b*l.inputs;
                for (i = 0; i < locations; ++i) {
                    for (j = 0; j < l.n; ++j) {
                        int p_index = index + locations*l.classes + i*l.n + j;
                        if (l.delta[p_index]*l.delta[p_index] < cutoff) l.delta[p_index] = 0;
                    }
                }
            }
            free(costs);
        }

        // Every earlier update of *(l.cost) is discarded here: the final cost is
        // assigned (not accumulated) from l.delta.
        // NOTE(review): mag_array presumably returns the L2 norm, making this the
        // sum of squared deltas -- confirm.
        *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);

        // Print the training log line.
        printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count);
        //if(l.reorg) reorg(l.delta, l.w*l.h, size*l.n, l.batch, 0);
    }
}
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
   
   49
   
   50
   
   51
   
   52
   
   53
   
   54
   
   55
   
   56
   
   57
   
   58
   
   59
   
   60
   
   61
   
   62
   
   63
   
   64
   
   65
   
   66
   
   67
   
   68
   
   69
   
   70
   
   71
   
   72
   
   73
   
   74
   
   75
   
   76
   
   77
   
   78
   
   79
   
   80
   
   81
   
   82
   
   83
   
   84
   
   85
   
   86
   
   87
   
   88
   
   89
   
   90
   
   91
   
   92
   
   93
   
   94
   
   95
   
   96
   
   97
   
   98
   
   99
   
   100
   
   101
   
   102
   
   103
   
   104
   
   105
   
   106
   
   107
   
   108
   
   109
   
   110
   
   111
   
   112
   
   113
   
   114
   
   115
   
   116
   
   117
   
   118
   
   119
   
   120
   
   121
   
   122
   
   123
   
   124
   
   125
   
   126
   
   127
   
   128
   
   129
   
   130
   
   131
   
   132
   
   133
   
   134
   
   135
   
   136
   
   137
   
   138
   
   139
   
   140
   
   141
   
   142
   
   143
   
   144
   
   145
   
   146
   
   147
   
   148
   
   149
   
   150
   
   151
   
   152
   
   153
   
   154
   
   155
   
   156
   
   157
   
   158
   
   159
   
   160
   
   161
   
   162
   
   163
   
   164
   
   165
   
   166
   
   167
   
   168
   
   169
   
   170
   
   171
   
   172
   
   173
   
   174
   
   175
   
   176
   
   177
   
   178
   
   179
   
   180
   
   181
   
   182
   
   183
   
   184
   
   185
   
   186
   
   187
   
   188
   
   189
   
   190
   
   191
   
   192
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
   
   49
   
   50
   
   51
   
   52
   
   53
   
   54
   
   55
   
   56
   
   57
   
   58
   
   59
   
   60
   
   61
   
   62
   
   63
   
   64
   
   65
   
   66
   
   67
   
   68
   
   69
   
   70
   
   71
   
   72
   
   73
   
   74
   
   75
   
   76
   
   77
   
   78
   
   79
   
   80
   
   81
   
   82
   
   83
   
   84
   
   85
   
   86
   
   87
   
   88
   
   89
   
   90
   
   91
   
   92
   
   93
   
   94
   
   95
   
   96
   
   97
   
   98
   
   99
   
   100
   
   101
   
   102
   
   103
   
   104
   
   105
   
   106
   
   107
   
   108
   
   109
   
   110
   
   111
   
   112
   
   113
   
   114
   
   115
   
   116
   
   117
   
   118
   
   119
   
   120
   
   121
   
   122
   
   123
   
   124
   
   125
   
   126
   
   127
   
   128
   
   129
   
   130
   
   131
   
   132
   
   133
   
   134
   
   135
   
   136
   
   137
   
   138
   
   139
   
   140
   
   141
   
   142
   
   143
   
   144
   
   145
   
   146
   
   147
   
   148
   
   149
   
   150
   
   151
   
   152
   
   153
   
   154
   
   155
   
   156
   
   157
   
   158
   
   159
   
   160
   
   161
   
   162
   
   163
   
   164
   
   165
   
   166
   
   167
   
   168
   
   169
   
   170
   
   171
   
   172
   
   173
   
   174
   
   175
   
   176
   
   177
   
   178
   
   179
   
   180
   
   181
   
   182
   
   183
   
   184
   
   185
   
   186
   
   187
   
   188
   
   189
   
   190
   
   191
   
   192

二、总结

读到这里，我们已经完全掌握了YOLO代码的框架，我们大概总结下darknet的优缺点。
优点：

代码依赖项少，只有cuda，甚至连opencv都可以不需要，如果你在cpu平台，cuda都可以扔了（当然darknet的cpu代码并没有做什么优化，跑起来就很慢）。这对于做工程的人来说是非常好的消息，因为我们可以很easy的将代码移植到其他平台

缺点：

在darknet中，所有层的lr都一样，这对微调造成了很大的困难，因为微调需要把前面几层的lr都设置的很小很小，然后主要训练最后一层的权重
总的来说就是darknet的接口确实很差，如果想把网络改成inception或者resnet的构架，需要改大量的代码，这对于验证模型可行性来说，非常浪费时间。你也应该能理解到为什么我们想要将代码移植到mxnet或者caffe上，然后在mxnet上做模型压缩了

下一篇是反向传播部分的代码，这部分是很重要的，但对于移植或者进一步改进网络来说，其实没必要理解反向传播部分的代码，但如果你想对CNN更深入的了解，可以继续看下一篇关于反向传播部分的内容。

(END)