YOLOv3 yolo_layer.c

// Apply the logistic activation in place to selected slices of l.output,
// once per batch item b and per anchor slot n of this layer.
for (b = 0; b < l.batch; ++b) {
    for (n = 0; n < l.n; ++n) {
        // tx, ty: 2*l.w*l.h values starting at entry 0 of this anchor slot.
        const int xy_start = entry_index(l, b, n*l.w*l.h, 0);
        activate_array(l.output + xy_start, 2*l.w*l.h, LOGISTIC);

        // Confidence plus the l.classes class scores:
        // (1+l.classes)*l.w*l.h values starting at entry 4.
        // Entries 2 and 3 are not passed to activate_array here.
        const int conf_start = entry_index(l, b, n*l.w*l.h, 4);
        activate_array(l.output + conf_start, (1+l.classes)*l.w*l.h, LOGISTIC);
    }
}

// For every grid cell (i, j) and every anchor slot n of this layer, compute
// the objectness delta of that prediction from its best IoU against the
// ground-truth boxes of batch item b.
// NOTE(review): `b` is used below but no loop over it is visible in this
// excerpt; presumably this code sits inside a per-batch loop -- confirm
// against the full source.
for (j = 0; j < l.h; ++j) {
    for (i = 0; i < l.w; ++i) {
        for (n = 0; n < l.n; ++n) {
            // For each predicted bounding box,
            // find the ground truth with the largest IoU.
            int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
            box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);
            float best_iou = 0;
            int best_t = 0;
            for(t = 0; t < l.max_boxes; ++t){
                // Each truth record occupies 4 + 1 floats; batch item b starts at b*l.truths.
                box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
                // Stop scanning at the first truth whose x is zero.
                if(!truth.x) break;
                float iou = box_iou(pred, truth);
                if (iou > best_iou) {
                    best_iou = iou;
                    best_t = t;
                }
            }
            int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
            avg_anyobj += l.output[obj_index];
            // Compute the gradient.
            // If best_iou is above ignore_thresh, the prediction is ignored (delta = 0).
            // Otherwise the objectness target is 0.
            // diff = -gradient = target - output
            // (the text that follows this listing explains why this form is used).
            l.delta[obj_index] = 0 - l.output[obj_index];
            if (best_iou > l.ignore_thresh) {
                l.delta[obj_index] = 0;
            }
            // Original author's open question: why compare against truth_thresh
            // here? Its value is 1, and an IoU should never exceed 1, so this
            // branch would not be expected to run with that setting.
            if (best_iou > l.truth_thresh) {
                // confidence target = 1
                l.delta[obj_index] = 1 - l.output[obj_index];
                // The class id is the fifth float of the best-matching truth record.
                int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
                if (l.map) class = l.map[class];
                int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                // Delta for the class scores (no avg_cat accumulator is passed).
                delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
                box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
                // Delta for the box position parameters, scaled by (2 - truth.w*truth.h).
                delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
            }
        }
    }
}

其中 $h_i$ 为经过 logistic 激活后的输出，$y_i$ 为 target。logistic 损失函数对激活前的输入求导，结果就是 $h_i - y_i$（推导过程从略）；因此代码中的 delta（即负梯度）为 $y_i - h_i$，也就是 target - output。

// Fill in the class-score deltas (target - output) for one prediction.
//
//   output  - activated network outputs
//   delta   - delta buffer written by this function
//   index   - position of the first class score of this prediction in the
//             flattened feature map; class k lives at index + stride*k
//   class   - ground-truth class id
//   classes - total number of classes
//   stride  - distance between consecutive class scores
//   avg_cat - optional accumulator (may be NULL); the output score of the
//             ground-truth class is added to it
void delta_yolo_class(float *output, float *delta, int index, 
  int class, int classes, int stride, float *avg_cat) {
    int k;

    // Original author's note (translated, and cut short in the source text):
    // YOLOv3 uses a per-class sigmoid loss for classification rather than a
    // softmax loss. A ground-truth centre is quantised to an integer cell of
    // the final feature map, so two ground-truth boxes whose centres lie close
    // together can land on the same cell. In that case the class deltas were
    // already written while handling the earlier ground truth; the later one
    // does not need to overwrite all of them and only refreshes the sigmoid
    // delta of its own class label. One box may therefore end up with several
    // high class probabilities (multi-label). With a softmax loss this would
    // be unnecessary, because softmax class probabilities are mutually
    // exclusive, whereas each sigmoid is scored independently.

    // A zero delta in the first class slot is treated as "not visited yet":
    // write a delta for every class.
    if (!delta[index]) {
        for (k = 0; k < classes; ++k) {
            const int pos = index + stride*k;
            const float target = (k == class) ? 1 : 0;
            // diff = target - prediction
            delta[pos] = target - output[pos];
            if (k == class && avg_cat) *avg_cat += output[pos];
        }
        return;
    }

    // Already visited: update only the ground-truth class and leave the
    // deltas of the other classes as they are.
    {
        const int hit = index + stride*class;
        delta[hit] = 1 - output[hit];
        if (avg_cat) *avg_cat += output[hit];
    }
}
// Box-regression deltas. Per the original author there is little to explain:
// this is the derivative of a squared-error loss, i.e. scale * (target - x).
//
//   truth          - ground-truth box
//   x              - prediction buffer; the four box values of this
//                    prediction sit at index + k*stride, k = 0..3
//   biases, n      - anchor table and anchor id; width is biases[2*n],
//                    height is biases[2*n + 1]
//   i, j           - grid cell offsets subtracted from the scaled truth centre
//   lw, lh         - factors applied to truth.x / truth.y
//   w, h           - factors applied to truth.w / truth.h
//   delta          - delta buffer written at the same four positions
//   scale          - multiplier applied to each delta
//
// Returns the IoU between the decoded prediction and the truth box.
float delta_yolo_box(box truth, float *x, float *biases, int n, 
  int index, int i, int j, int lw, int lh, int w, int h, 
  float *delta, float scale, int stride) {
    int k;
    float target[4];

    // The IoU is measured before any delta is written.
    const float overlap = box_iou(get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride), truth);

    // Centre targets: scaled truth centre relative to cell (i, j).
    target[0] = (truth.x*lw - i);
    target[1] = (truth.y*lh - j);
    // Size targets: log of the scaled truth size over the anchor size.
    target[2] = log(truth.w*w / biases[2*n]);
    target[3] = log(truth.h*h / biases[2*n + 1]);

    for (k = 0; k < 4; ++k) {
        delta[index + k*stride] = scale * (target[k] - x[index + k*stride]);
    }
    return overlap;
}

// Iterate over the ground-truth boxes of batch item b and, for each one,
// write deltas for the prediction slot matched to it.
// NOTE(review): `b`, the accumulators (avg_obj, avg_cat, avg_iou, recall,
// recall75, count, class_count) and i/j are declared outside this excerpt.
for(t = 0; t < l.max_boxes; ++t){
    // Each truth record occupies 4 + 1 floats; batch item b starts at b*l.truths.
    box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
    // Stop at the first truth whose x is zero.
    if(!truth.x) break;
    // Find the anchor box with the largest IoU.
    float best_iou = 0;
    int best_n = 0;
    // Grid position derived from the truth centre
    // (presumably truth.x / truth.y are normalised to [0, 1) -- confirm).
    i = (truth.x * l.w);
    j = (truth.y * l.h);
    // Compare shapes only: both the truth copy and each anchor box are placed at the origin.
    box truth_shift = truth;
    truth_shift.x = truth_shift.y = 0;
    for(n = 0; n < l.total; ++n){
        box pred = {0};
        pred.w = l.biases[2*n]/net.w;
        pred.h = l.biases[2*n+1]/net.h;
        float iou = box_iou(pred, truth_shift);
        if (iou > best_iou){
            best_iou = iou;
            best_n = n;
        }
    }
    
    // NOTE(review): int_index presumably returns the position of best_n within
    // l.mask, or a negative value when this layer does not use that anchor --
    // confirm against its definition. Deltas are only written when it is >= 0.
    int mask_n = int_index(l.mask, best_n, l.n);
    if(mask_n >= 0){
        int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
        // Box deltas, scaled by (2 - truth.w*truth.h); the returned IoU feeds the statistics below.
        float iou = delta_yolo_box(truth, l.output, l.biases, best_n, 
          box_index, i, j, l.w, l.h, net.w, net.h, l.delta, 
          (2-truth.w*truth.h), l.w*l.h);
        int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
        avg_obj += l.output[obj_index];
        // The matched slot gets objectness target = 1.
        l.delta[obj_index] = 1 - l.output[obj_index];
        // The class id is the fifth float of this truth record.
        int class = net.truth[t*(4 + 1) + b*l.truths + 4];
        if (l.map) class = l.map[class];
        int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
        delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);
        // Running statistics: match counts, recall at IoU > .5 and > .75, and summed IoU.
        ++count;
        ++class_count;
        if(iou > .5) recall += 1;
        if(iou > .75) recall75 += 1;
        avg_iou += iou;
    }
}

猜你喜欢