py-faster-rcnn之AnchorTargetLayer阅读笔记

RPN网络示意图

（一）生成anchor

/lib/rpn/generate_anchors.py

代码生成的9个anchor,分别为:

[[ -84.  -40.   99.   55.]
 [-176.  -88.  191.  103.]
 [-360. -184.  375.  199.]
 [ -56.  -56.   71.   71.]
 [-120. -120.  135.  135.]
 [-248. -248.  263.  263.]
 [ -36.  -80.   51.   95.]
 [ -80. -168.   95.  183.]
 [-168. -344.  183.  359.]]

尺度分别是 base_size (16) *(8,16,32)

[x1,y1,x2,y2]表示的是矩形框的左上角,右下角坐标;

def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6)):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, base_size - 1, base_size - 1) window.

    Args:
        base_size: side length of the square reference window
            (16 gives the (0, 0, 15, 15) window).
        ratios: height/width ratios to enumerate. The default list is only
            read, never mutated.
        scales: multipliers applied to the width and height of every
            ratio anchor.

    Returns:
        Array with one [x1, y1, x2, y2] row per (ratio, scale) pair. Rows
        are grouped by ratio, with the scales varying fastest inside a group.
    """

    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    # Iterate over the rows directly instead of indexing through xrange():
    # same result, and it also runs on Python 3 where xrange does not exist.
    anchors = np.vstack([_scale_enum(ratio_anchor, scales)
                         for ratio_anchor in ratio_anchors])
    return anchors

ratios = [0.5, 1, 2],表示 anchor 的高宽比 (h/w) 分别为 0.5、1、2,即宽高比依次为 2:1、1:1、1:2;

scales=2**np.arange(3, 6)
array([ 8, 16, 32])
代表了三种倍数 , 8,16,32 .

3种比例与3种尺度两两组合,得到9种anchor;

(1)ratio 操作:

def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.

    Every returned anchor keeps the centre of `anchor` and (up to rounding)
    its area, while its height/width ratio follows the entries of `ratios`.

    Example: anchor [0, 0, 15, 15] with ratios [0.5, 1, 2] gives widths
    [23, 16, 11] and heights [12, 16, 22].
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    area = w * h
    # With h = ratio * w and w * h = area, the width is sqrt(area / ratio).
    ws = np.round(np.sqrt(area / ratios))
    # The matching heights follow from the rounded widths.
    hs = np.round(ws * ratios)
    # Centre plus the width/height lists yields one anchor per ratio.
    return _mkanchors(ws, hs, x_ctr, y_ctr)

#得到x,y center  与w,h 函数:
def _whctrs(anchor):
    """
    Return width, height, x center, and y center for an anchor (window).
    """

    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr

def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows).
    """

    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors

基准anchor [0, 0, 15, 15] 按宽高比 2:1、1:1、1:2 变换后,得到的array 为:

[[ -3.5   2.   18.5  13. ]
 [  0.    0.   15.   15. ]
 [  2.5  -3.   12.5  18. ]]

该矩阵再经过乘不同倍数得到

_scale_enum()函数用来将 w,h分别乘 [scale]倍数,得到最终的anchors

def _scale_enum(anchor, scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.

    The width and height of `anchor` are multiplied by every entry of
    `scales`; the centre stays where it was.
    """
    width, height, center_x, center_y = _whctrs(anchor)
    scaled_ws = width * scales
    scaled_hs = height * scales
    return _mkanchors(scaled_ws, scaled_hs, center_x, center_y)

最终anchors 示意图:

image

(二)AnchorTargetLayer

该部分代码位于lib/rpn/anchor_target_layer.py

该层的定义:

layer {
  name: 'rpn-data'
  type: 'Python'
  bottom: 'rpn_cls_score'
  bottom: 'gt_boxes'
  bottom: 'im_info'
  bottom: 'data'
  top: 'rpn_labels'
  top: 'rpn_bbox_targets'
  top: 'rpn_bbox_inside_weights'
  top: 'rpn_bbox_outside_weights'
  python_param {
    module: 'rpn.anchor_target_layer'
    layer: 'AnchorTargetLayer'
    param_str: "'feat_stride': 16"
  }
}

该层接收RoiDataLayer 传来的 gt_boxes、im_info、data 等数据;
此处的feat_stride参数表示特征图上相邻两个位置在原图上对应的步长(这里为16个像素);

bottom[0] 指的就是 rpn_cls_score
bottom[1] 指的是 gt_boxes
bottom[2] 指的是 im_info
bottom[3] 指的是 data

forward

对每一个(h,w),生成9种不同形状的anchor,然后仅仅保留范围在原图中的anchor.

 '''
        当width 取61,height 取36的时候:
        shift_x = np.arange(0, width) * self._feat_stride
        shift_x
array([  0,  16,  32,  48,  64,  80,  96, 112, 128, 144, 160, 176, 192,
       208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400,
       416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608,
       624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816,
       832, 848, 864, 880, 896, 912, 928, 944, 960])
        shift_y = np.arange(0, height) * self._feat_stride
        >>> shift_y
array([  0,  16,  32,  48,  64,  80,  96, 112, 128, 144, 160, 176, 192,
       208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400,
       416, 432, 448, 464, 480, 496, 512, 528, 544, 560])

        #组合成grid
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)
        >>> shift_x
array([[  0,  16,  32, ..., 928, 944, 960],
       [  0,  16,  32, ..., 928, 944, 960],
       [  0,  16,  32, ..., 928, 944, 960],
       ...,
       [  0,  16,  32, ..., 928, 944, 960],
       [  0,  16,  32, ..., 928, 944, 960],
       [  0,  16,  32, ..., 928, 944, 960]])
>>> shift_y
array([[  0,   0,   0, ...,   0,   0,   0],
       [ 16,  16,  16, ...,  16,  16,  16],
       [ 32,  32,  32, ...,  32,  32,  32],
       ...,
       [528, 528, 528, ..., 528, 528, 528],
       [544, 544, 544, ..., 544, 544, 544],
       [560, 560, 560, ..., 560, 560, 560]])

        #最后生成四个坐标(x1,y1,x2,y2)的偏移值:
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),\
                            shift_x.ravel(), shift_y.ravel())).transpose()
        >>> shifts
array([[  0,   0,   0,   0],
       [ 16,   0,  16,   0],
       [ 32,   0,  32,   0],
       ...,
       [928, 560, 928, 560],
       [944, 560, 944, 560],
       [960, 560, 960, 560]])

        '''

A = anchors 的数量,为9
K = width * height

将偏移值与 anchor 相加,得到所有anchor在原图上的坐标


        # A: number of base anchors; K: number of shift rows
        # (one row per feature-map location).
        A = self._num_anchors
        K = shifts.shape[0]
        # Broadcast the (1, A, 4) base anchors against the (K, 1, 4) shifts:
        # the sum is (K, A, 4), i.e. every base anchor moved to every location.
        all_anchors = (self._anchors.reshape((1, A, 4)) +
                       shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
        # Flatten to one [x1, y1, x2, y2] row per anchor.
        all_anchors = all_anchors.reshape((K * A, 4))
        total_anchors = int(K * A)

inds_inside是得到那些在图片内的框的ID


        # only keep anchors inside the image
        inds_inside = np.where(
            (all_anchors[:, 0] >= -self._allowed_border) &
            (all_anchors[:, 1] >= -self._allowed_border) &
            (all_anchors[:, 2] < im_info[1] + self._allowed_border) &  # width
            (all_anchors[:, 3] < im_info[0] + self._allowed_border)    # height
        )[0]
        # keep only inside anchors
        anchors = all_anchors[inds_inside, :]
        if DEBUG:
            print 'anchors.shape', anchors.shape

计算各个预设anchor框与真实框GT 的IOU:与GT最接近(对某个GT的IOU最大,或IOU超过阈值)的anchor标记为前景FG(1),IOU低于阈值的anchor标记为背景(0),用于后续计算loss;

# Compute the IoU between the kept anchors and the ground-truth boxes
        # NOTE(review): the np.float alias no longer exists in recent NumPy
        # releases -- confirm the NumPy version this code is run against.
        overlaps = bbox_overlaps(
            np.ascontiguousarray(anchors, dtype=np.float),
            np.ascontiguousarray(gt_boxes, dtype=np.float))
        argmax_overlaps = overlaps.argmax(axis=1)
        # max_overlaps: for each anchor (row), its largest overlap value
        max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
        # gt_max_overlaps: for each gt box (column), its largest overlap value
        gt_argmax_overlaps = overlaps.argmax(axis=0)
        gt_max_overlaps = overlaps[gt_argmax_overlaps,
                                   np.arange(overlaps.shape[1])]
        # Row indices of every anchor that ties for the best overlap of some gt
        gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
        # fg label: for each gt, anchor with highest overlap
        # Every anchor with the highest IoU for some gt box is labelled 1.
        # NOTE(review): `labels` is created outside this excerpt.
        labels[gt_argmax_overlaps] = 1

        # fg label: above threshold IOU
        # Anchors whose best IoU reaches the positive threshold are also set to 1
        labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

        if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
            # assign bg labels last so that negative labels can clobber positives
            labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

正负样本采样:正样本最多占 RPN_BATCHSIZE 的 RPN_FG_FRACTION,其余名额由负样本补足(正样本足够多时正负比例约为1:1)

        # subsample positive labels if we have too many
        # Cap the positives at RPN_FG_FRACTION * RPN_BATCHSIZE; the negatives
        # (below) then fill whatever is left of the batch.
        num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
        fg_inds = np.where(labels == 1)[0]
        if len(fg_inds) > num_fg:
            # More positives than allowed: randomly pick the surplus ones
            disable_inds = npr.choice(
                fg_inds, size=(len(fg_inds) - num_fg), replace=False)
            # and relabel them -1 so they no longer count as positives
            labels[disable_inds] = -1

        # subsample negative labels if we have too many
        # The negative budget is the batch size minus the positives that are
        # left, so the fg:bg ratio is not forced to be exactly 1:1.
        num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
        bg_inds = np.where(labels == 0)[0]
        if len(bg_inds) > num_bg:
            disable_inds = npr.choice(
                bg_inds, size=(len(bg_inds) - num_bg), replace=False)
            labels[disable_inds] = -1

还原target大小,方便将数据传递上一层网络:

        # map up to original set of anchors
        # Earlier, all_anchors was cut down to the anchors lying inside the
        # image (the note's author estimates roughly 2/3 were dropped).
        # Here the dropped anchors are put back so the outputs cover all
        # total_anchors entries and can be reshaped for the next layer.
        # NOTE(review): _unmap is not shown here -- presumably it scatters the
        # values to positions inds_inside and uses `fill` elsewhere; confirm.
        labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
        bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
        bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
        bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

        if DEBUG:
            print 'rpn: max max_overlap', np.max(max_overlaps)
            print 'rpn: num_positive', np.sum(labels == 1)
            print 'rpn: num_negative', np.sum(labels == 0)
            self._fg_sum += np.sum(labels == 1)
            self._bg_sum += np.sum(labels == 0)
            self._count += 1
            print 'rpn: num_positive avg', self._fg_sum / self._count
            print 'rpn: num_negative avg', self._bg_sum / self._count

        # labels
        labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
        labels = labels.reshape((1, 1, A * height, width))
        top[0].reshape(*labels.shape)
        top[0].data[...] = labels

        # bbox_targets
        bbox_targets = bbox_targets \
            .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
        top[1].reshape(*bbox_targets.shape)
        top[1].data[...] = bbox_targets

        # bbox_inside_weights
        bbox_inside_weights = bbox_inside_weights \
            .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
        assert bbox_inside_weights.shape[2] == height
        assert bbox_inside_weights.shape[3] == width
        top[2].reshape(*bbox_inside_weights.shape)
        top[2].data[...] = bbox_inside_weights

        # bbox_outside_weights
        bbox_outside_weights = bbox_outside_weights \
            .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
        assert bbox_outside_weights.shape[2] == height
        assert bbox_outside_weights.shape[3] == width
        top[3].reshape(*bbox_outside_weights.shape)
        top[3].data[...] = bbox_outside_weights

target bounding-box的回归计算公式:

def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for an image.

    The targets are the offsets from each anchor in `ex_rois` (N x 4) to
    its ground-truth box in `gt_rois` (N x 5; only the first four columns
    are used), returned as float32 for the regression loss.
    """
    # One ground-truth row per anchor row, with the expected column counts.
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    gt_coords = gt_rois[:, :4]
    targets = bbox_transform(ex_rois, gt_coords)
    return targets.astype(np.float32, copy=False)
def bbox_transform(ex_rois, gt_rois):
    """Return the regression targets that map `ex_rois` onto `gt_rois`.

    Both inputs are indexed as rows of [x1, y1, x2, y2]. The result has one
    (dx, dy, dw, dh) row per box pair: centre offsets divided by the
    `ex_rois` width/height, and the log of the width/height ratios.
    """
    def _size_and_center(rois):
        # Extents use inclusive corners, hence the + 1.0.
        widths = rois[:, 2] - rois[:, 0] + 1.0
        heights = rois[:, 3] - rois[:, 1] + 1.0
        return (widths, heights,
                rois[:, 0] + 0.5 * widths,
                rois[:, 1] + 0.5 * heights)

    ex_w, ex_h, ex_cx, ex_cy = _size_and_center(ex_rois)
    gt_w, gt_h, gt_cx, gt_cy = _size_and_center(gt_rois)

    dx = (gt_cx - ex_cx) / ex_w
    dy = (gt_cy - ex_cy) / ex_h
    dw = np.log(gt_w / ex_w)
    dh = np.log(gt_h / ex_h)

    # Stack as four rows, then transpose to one target row per box.
    stacked = np.vstack((dx, dy, dw, dh))
    return stacked.transpose()

总结:AnchorTargetLayer 该类的作用是 :

通过预设步长feat_stride,遍历图像,得到每个小格子的坐标
然后将预设的anchors[x1,y1,x2,y2] 作用到每个格子上,形成了A * K个 anchors
去掉处于图像边缘外面的anchors
分别计算每个anchors 与GT的 IOU, IOU大于阈值的将其label 设置为1(前景),IOU小于阈值的设置为0(背景)
采样,限制正样本数量并用负样本补足batch,使正负样本尽量保持1:1比例