Faster R-CNN 源码解析（四）—— proposal_target_layer.py

功能：

根据GT box和topN proposals选择满足要求的128个proposals（包括fg和bg），然后加上物体类别标签和bbox的回归目标（只有在该类别的对应位置上面才会有位置信息），并计算权重weights。（采样之前GT box已被加入候选roi集合，因此这128个proposals中可能包含GT box。）

输入：

bottom[0]: rpn_rois，从proposal_layer提取到的proposals
bottom[1]: gt_boxes:  GroundTruth boxes

输出：

top[0]: 'rois'：采样得到的roi，每行为(batch索引, x1, y1, x2, y2)，即batch索引加上左上角和右下角坐标
top[1]: 'labels'：所有提取出的roi的标签，bg = 0
top[2]: 'bbox_targets'：所有roi相对于与其有最大IOU的GTboxes的偏移量，是一个[4*classes]的vector，偏移量存放在对应label位置
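top[3]: 'bbox_inside_weights'：前景roi在其类别对应的4个位置上取cfg.TRAIN.BBOX_INSIDE_WEIGHTS，其余位置为0
top[4]: 'bbox_outside_weights'：由bbox_inside_weights > 0得到，即bbox_inside_weights大于0的位置为1，其余位置为0（见forward函数末尾）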

源码：

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import caffe
import yaml
import numpy as np
import numpy.random as npr
from fast_rcnn.config import cfg
from fast_rcnn.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps

DEBUG = False
class ProposalTargetLayer(caffe.Layer):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.

    Bottom blobs:
        bottom[0]: proposal RoIs, rows of (batch index, x1, y1, x2, y2)
        bottom[1]: ground-truth boxes, rows of (x1, y1, x2, y2, label)

    Top blobs:
        top[0]: sampled RoIs, rows of (batch index, x1, y1, x2, y2)
        top[1]: classification labels of the sampled RoIs (background = 0)
        top[2]: bbox regression targets, 4 * num_classes columns per RoI
        top[3]: bbox inside weights, same shape as top[2]
        top[4]: bbox outside weights, 1 wherever the inside weight is > 0
    """

    def setup(self, bottom, top):
        """Parse the layer parameters and give the top blobs initial shapes."""
        # safe_load only builds plain Python objects; yaml.load without an
        # explicit Loader is unsafe and is rejected by recent PyYAML releases.
        layer_params = yaml.safe_load(self.param_str_)
        # Total number of classes
        self._num_classes = layer_params['num_classes']

        # Running statistics that forward() accumulates and prints when DEBUG
        # is enabled; they must exist before the first forward pass.
        self._count = 0
        self._fg_num = 0
        self._bg_num = 0

        # The real shapes are set on every forward pass; these are placeholders.
        # sampled rois (0, x1, y1, x2, y2)
        top[0].reshape(1, 5)
        # labels
        top[1].reshape(1, 1)
        # bbox_targets
        top[2].reshape(1, self._num_classes * 4)
        # bbox_inside_weights
        top[3].reshape(1, self._num_classes * 4)
        # bbox_outside_weights
        top[4].reshape(1, self._num_classes * 4)

    def forward(self, bottom, top):
        """Sample RoIs from proposals + ground truth and fill the top blobs."""
        # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
        # (i.e., rpn.proposal_layer.ProposalLayer), or any other source
        all_rois = bottom[0].data
        # GT boxes (x1, y1, x2, y2, label)
        # TODO(rbg): it's annoying that sometimes I have extra info before
        # and other times after box coordinates -- normalize to one format
        gt_boxes = bottom[1].data

        # Include ground-truth boxes in the set of candidate rois.
        # Drop the label column of gt_boxes and prepend a column of zeros (the
        # batch index) so the rows match the 5-column layout of all_rois, then
        # stack them below the proposals.
        zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
        all_rois = np.vstack(
            (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
        )

        # Sanity check: single batch only
        assert np.all(all_rois[:, 0] == 0), \
                'Only single item batches are supported'

        num_images = 1
        # cfg.TRAIN.BATCH_SIZE is the number of RoIs in a minibatch. Floor
        # division keeps the count an integer under Python 3 as well.
        rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
        # cfg.TRAIN.FG_FRACTION is the target fraction of foreground RoIs.
        # np.round returns a float; the count is later used as a sample size
        # and slice index, which must be integers.
        fg_rois_per_image = int(
            np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))

        # Sample rois with classification labels and bounding box regression
        # targets
        labels, rois, bbox_targets, bbox_inside_weights = _sample_rois(
            all_rois, gt_boxes, fg_rois_per_image,
            rois_per_image, self._num_classes)

        if DEBUG:
            num_fg = (labels > 0).sum()
            num_bg = (labels == 0).sum()
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('num fg: {}'.format(num_fg))
            print('num bg: {}'.format(num_bg))
            self._count += 1
            self._fg_num += num_fg
            self._bg_num += num_bg
            print('num fg avg: {}'.format(self._fg_num / self._count))
            print('num bg avg: {}'.format(self._bg_num / self._count))
            # Avoid dividing by zero when no background RoI has been seen yet.
            if self._bg_num > 0:
                ratio = float(self._fg_num) / float(self._bg_num)
            else:
                ratio = float('inf')
            print('ratio: {:.3f}'.format(ratio))

        # sampled rois
        top[0].reshape(*rois.shape)
        top[0].data[...] = rois

        # classification labels
        top[1].reshape(*labels.shape)
        top[1].data[...] = labels

        # bbox_targets
        top[2].reshape(*bbox_targets.shape)
        top[2].data[...] = bbox_targets

        # bbox_inside_weights
        top[3].reshape(*bbox_inside_weights.shape)
        top[3].data[...] = bbox_inside_weights

        # bbox_outside_weights: 1 wherever the inside weight is positive
        top[4].reshape(*bbox_inside_weights.shape)
        top[4].data[...] = np.array(bbox_inside_weights > 0).astype(np.float32)

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass

# Expand the compact bbox_target_data into per-class regression targets
def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Bounding-box regression targets (bbox_target_data) are stored in a
    compact form N x (class, tx, ty, tw, th)

    This function expands those targets into the 4-of-4*K representation used
    by the network (i.e. only one class has non-zero targets).

    Args:
        bbox_target_data (ndarray): N x 5 array of (class, tx, ty, tw, th)
        num_classes (int): number of classes K

    Returns:
        bbox_target (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """

    # Class label of every row (one entry per box)
    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    # Only foreground rows (class > 0) receive targets and weights
    inds = np.where(clss > 0)[0]
    for ind in inds:
        # The label comes out of a float array; slice bounds must be integers,
        # so convert before computing the column range.
        cls = int(clss[ind])
        # Fill only the 4 columns that belong to this row's class
        start = 4 * cls
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
    return bbox_targets, bbox_inside_weights

# Pair every label with its regression offsets: rows of [label, offsets]
def _compute_targets(ex_rois, gt_rois, labels):
    """Build the compact regression targets for the RoIs of one image.

    Each row of the result holds the RoI's label followed by the offsets
    returned by bbox_transform(ex_rois, gt_rois), optionally normalized with
    the means and standard deviations configured in cfg.TRAIN. The result is
    float32.
    """

    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    # Offsets between each RoI and its assigned ground-truth box
    deltas = bbox_transform(ex_rois, gt_rois)

    # Optionally normalize targets by a precomputed mean and stdev
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        means = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
        stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)
        deltas = (deltas - means) / stds

    label_column = labels[:, np.newaxis]
    stacked = np.hstack((label_column, deltas))
    return stacked.astype(np.float32, copy=False)
# Sample foreground and background RoIs from the candidates of one image
def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Args:
        all_rois (ndarray): candidate RoIs; columns 1:5 are the box coordinates
        gt_boxes (ndarray): ground-truth boxes; columns 0:4 are the box
            coordinates and column 4 is the class label
        fg_rois_per_image: maximum number of foreground RoIs to sample
        rois_per_image: maximum total number of RoIs to sample
        num_classes (int): number of classes

    Returns:
        labels, rois, bbox_targets, bbox_inside_weights for the sampled RoIs,
        foreground RoIs first.
    """
    # Callers may pass float counts (e.g. the result of np.round or of a true
    # division); sample sizes and slice bounds must be integers.
    fg_rois_per_image = int(fg_rois_per_image)
    rois_per_image = int(rois_per_image)

    # overlaps: (rois x gt_boxes) IoU matrix.
    # np.float was only an alias of the builtin float (float64) and has been
    # removed from NumPy, so name the dtype explicitly.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    # For every RoI: index of the ground-truth box with the largest IoU
    gt_assignment = overlaps.argmax(axis=1)
    # ... and the value of that largest IoU
    max_overlaps = overlaps.max(axis=1)
    # Label of the ground-truth box assigned to each RoI
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = int(min(fg_rois_per_image, fg_inds.size))
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired): whatever is left of the budget
    # after the foreground RoIs.
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = int(min(bg_rois_per_this_image, bg_inds.size))
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg), foreground first
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0 (they follow the foreground
    # entries in keep_inds)
    labels[fg_rois_per_this_image:] = 0
    rois = all_rois[keep_inds]

    # Compact targets: one row of [label, 4 offsets] per sampled RoI
    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, bbox_targets, bbox_inside_weights

Faster R-CNN 源码解析（四）—— proposal_target_layer.py

猜你喜欢