Detailed Mask R-CNN Code (unfinished)

Mask R-CNN model

Anchor generation

import numpy as np

def generate_anchors(base_size=16, ratios=[0.5, 1, 2], scales=2 ** np.arange(3, 6)):
    # Base window of base_size x base_size, i.e. (0, 0, 15, 15) in 0-indexed coordinates
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    # First generate the three basic anchors from the base size: a square, a tall
    # rectangle, and a wide rectangle. The final anchors are these three multiplied
    # by the entries of scales; throughout, the center stays fixed at (7.5, 7.5).
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    # scales holds three values, [8, 16, 32] in Faster R-CNN; Mask R-CNN, using FPN,
    # defaults to one such list per pyramid level. Taking Faster R-CNN as the example,
    # _scale_enum multiplies the width and height of each of the three basic anchors
    # by 8, 16, and 32, yielding 3 * 3 = 9 final anchors.
    anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
                         for i in range(ratio_anchors.shape[0])])
    return anchors
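
The helpers _ratio_enum and _scale_enum are not shown in the post. For reference, a sketch matching the standard py-faster-rcnn implementation (whose behavior the comments above describe) looks like this:

def _whctrs(anchor):
    # Return the width, height, and center (x, y) of an anchor window
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr

def _mkanchors(ws, hs, x_ctr, y_ctr):
    # Build (x1, y1, x2, y2) anchors around a common center
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    return np.hstack((x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1),
                      x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1)))

def _ratio_enum(anchor, ratios):
    # Enumerate one anchor per aspect ratio, keeping the area roughly constant
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size_ratios = w * h / np.asarray(ratios)
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * np.asarray(ratios))
    return _mkanchors(ws, hs, x_ctr, y_ctr)

def _scale_enum(anchor, scales):
    # Enumerate one anchor per scale, multiplying width and height by the scale
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * np.asarray(scales)
    hs = h * np.asarray(scales)
    return _mkanchors(ws, hs, x_ctr, y_ctr)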
# This function effectively maps the anchors onto the input image: height and width
# are the feature map's height and width, and sw / sh shift each anchor to its
# position at the original image scale. On a small feature map the boxes are large
# and sparse; on a large feature map they are small and dense, but either way they
# cover the whole input image.
def canchors_plane(height, width, stride, anchors_base):
    A = anchors_base.shape[0]
    all_anchors = np.zeros((height, width, A, 4), dtype=np.float32)
    for iw in range(width):
        sw = iw * stride
        for ih in range(height):
            sh = ih * stride
            for k in range(A):
                # Shift the k-th base anchor to cell (ih, iw), at image scale
                all_anchors[ih, iw, k, 0] = anchors_base[k, 0] + sw
                all_anchors[ih, iw, k, 1] = anchors_base[k, 1] + sh
                all_anchors[ih, iw, k, 2] = anchors_base[k, 2] + sw
                all_anchors[ih, iw, k, 3] = anchors_base[k, 3] + sh
    return all_anchors
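
A quick shape check with the two functions above (a hypothetical 38x50 feature map at stride 16, i.e. roughly a 600x800 input):

base = generate_anchors()                        # (9, 4) base anchors
all_anchors = canchors_plane(38, 50, 16, base)   # (38, 50, 9, 4)
print(all_anchors.reshape(-1, 4).shape)          # (17100, 4) anchors covering the image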

In Mask R-CNN the regression targets are normalized by the corresponding anchor width and height; the 0.1 and 0.2 factors in the code below undo that normalization at inference time.

First, the classification scores and anchor offsets are read off each FPN feature map. The predicted center offsets are multiplied by the corresponding anchor's width and height and added to the anchor center, and the predicted width/height coefficients are exponentiated and multiplied by the anchor's width and height: x' = x_a + w_a * dx, y' = y_a + h_a * dy, w' = w_a * exp(dw), h' = h_a * exp(dh). This converts the RPN outputs into refined boxes. Since all anchors extracted from the feature maps are already expressed at the original image scale, any refined box that extends past the image border is clipped by the clip_boxes function.

def bbox_transform_inv(boxes, deltas):
    if boxes.shape[0] == 0:
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    boxes = boxes.astype(deltas.dtype, copy=False)

    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    # The 0.1 / 0.2 factors undo the normalization applied to the regression
    # targets during training (see the note above)
    dx = deltas[:, 0::4] * 0.1
    dy = deltas[:, 1::4] * 0.1
    dw = deltas[:, 2::4] * 0.2
    dh = deltas[:, 3::4] * 0.2

    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]

    # exp(dw + log(w)) is equivalent to w * exp(dw)
    pred_w = np.exp(dw + np.log(widths[:, np.newaxis]))
    pred_h = np.exp(dh + np.log(heights[:, np.newaxis]))

    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
    # x2
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1
    # y2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1

    return pred_boxes
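
A toy sanity check of the decoding (hypothetical numbers, not from the repository): a 16x16 anchor at the origin, with a center-x delta that works out to one full anchor width after the 0.1 de-normalization.

anchor = np.array([[0.0, 0.0, 15.0, 15.0]])
deltas = np.array([[10.0, 0.0, 0.0, 0.0]])   # dx = 10.0 * 0.1 = 1.0
print(bbox_transform_inv(anchor, deltas))    # -> [[16. 0. 31. 15.]]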
def clip_boxes(boxes, im_shape):
    """
    Clip boxes to image boundaries.
    """

    # 0 <= x1 <= im_shape[1] - 1 (keep x1 inside the right image border)
    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
    # 0 <= y1 <= im_shape[0] - 1
    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
    # 0 <= x2 <= im_shape[1] - 1
    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
    # 0 <= y2 <= im_shape[0] - 1
    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
    return boxes
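
Continuing the toy example, clipping the decoded box to a hypothetical 20x20 image trims the right edge:

print(clip_boxes(np.array([[16.0, 0.0, 31.0, 15.0]]), (20, 20)))
# -> [[16. 0. 19. 15.]]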

The NMS code is simple enough that it is not reproduced here. Its role: the boxes obtained from each feature map are sorted by classification score in descending order and a first screening drops some of them; NMS is then applied, which, whenever two boxes have a high IoU, keeps the higher-scoring one and suppresses the other; after NMS, a second screening removes more of the low-scoring boxes.
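
Since the post omits it, here is a sketch of standard greedy NMS in the py-faster-rcnn style (the screening thresholds around it are configuration-dependent):

def nms(dets, thresh):
    # dets: (N, 5) array of (x1, y1, x2, y2, score)
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores = dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]   # indices sorted by descending score
    keep = []
    while order.size > 0:
        i = order[0]                 # highest-scoring remaining box
        keep.append(i)
        # IoU of box i against every remaining box
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # suppress boxes that overlap box i too much; keep the rest
        order = order[np.where(iou <= thresh)[0] + 1]
    return keep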

Computing overlaps: here boxes are the generated anchors, whose number N is very large, and query_boxes are the ground-truth boxes (gt_boxes), whose number K (the number of target boxes in the image) is usually much smaller than N. The function takes each anchor and each gt box, computes their IoU, and returns an array of shape (N, K).

def bbox_overlaps(boxes, query_boxes):
    """
    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
    """
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    overlaps = np.zeros((N, K), dtype=np.float32)
    for k in range(K):
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    ua = float(
                        (boxes[n, 2] - boxes[n, 0] + 1) *
                        (boxes[n, 3] - boxes[n, 1] + 1) +
                        box_area - iw * ih
                    )
                    overlaps[n, k] = iw * ih / ua
    return overlaps
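
A small usage example (hypothetical boxes): two 16x16 anchors against one 16x16 ground-truth box.

anchors = np.array([[0, 0, 15, 15], [8, 8, 23, 23]], dtype=np.float32)
gt = np.array([[4, 4, 19, 19]], dtype=np.float32)
print(bbox_overlaps(anchors, gt))   # (2, 1) IoU matrix, both entries ~0.39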

The thresholds for selecting foreground differ between the bounding boxes and the masks. For the masks, any box whose IoU with a ground-truth box exceeds 0.5 counts as foreground, so relatively many foregrounds are selected. For the bounding boxes, foreground selection works as described below, is more involved, and keeps fewer foregrounds.

def sample_rpn_outputs_wrt_gt_boxes(boxes, scores, gt_boxes, is_training=False, only_positive=False):
    """sample boxes for refined output"""
    boxes, scores, batch_inds = sample_rpn_outputs(boxes, scores, is_training, only_positive)

    if gt_boxes.size > 0:
        overlaps = cython_bbox.bbox_overlaps(
            np.ascontiguousarray(boxes[:, 0:4], dtype=np.float64),
            np.ascontiguousarray(gt_boxes[:, 0:4], dtype=np.float64))
        # For every anchor, find the index of the gt box with which it has the
        # highest IoU (discarding the lower-IoU gt boxes); the result has shape (N,)
        gt_assignment = overlaps.argmax(axis=1)  # B
        max_overlaps = overlaps[np.arange(boxes.shape[0]), gt_assignment]  # B
        # Among all anchors, drop those whose max IoU is below 0.7
        # cfg.FLAGS.fg_threshold = 0.7
        fg_inds = np.where(max_overlaps >= cfg.FLAGS.fg_threshold)[0]
        # The mask threshold is lower than the bounding-box one: IoU above 0.5
        # already qualifies a box as a mask ROI
        # cfg.FLAGS.mask_threshold = 0.5, cfg.FLAGS.masks_per_image = 64
        mask_fg_inds = np.where(max_overlaps >= cfg.FLAGS.mask_threshold)[0]
        if mask_fg_inds.size > cfg.FLAGS.masks_per_image:
            mask_fg_inds = np.random.choice(mask_fg_inds, size=cfg.FLAGS.masks_per_image, replace=False)

        if True:
            # For each gt box, find the index of the anchor with the highest IoU
            # with it (dropping the lower-IoU anchors); the result has shape (K,)
            gt_argmax_overlaps = overlaps.argmax(axis=0)  # G
            # By default, the anchor with the highest IoU with each gt box is
            # labeled foreground; there are only as many of these as gt boxes.
            # The anchors with IoU above 0.7 are added on top, and are usually
            # more numerous. union1d merges the two sets and removes duplicates.
            fg_inds = np.union1d(gt_argmax_overlaps, fg_inds)
        # Cap the number of foregrounds at 64: if fg_inds holds more than 64,
        # sample 64 of them at random. This keeps the foreground:background
        # ratio at no less than 1:3 later on.
        # cfg.FLAGS.rois_per_image * cfg.FLAGS.fg_roi_fraction = 256 * 0.25 = 64
        fg_rois = int(min(fg_inds.size, cfg.FLAGS.rois_per_image * cfg.FLAGS.fg_roi_fraction))
        if fg_inds.size > 0 and fg_rois < fg_inds.size:
            fg_inds = np.random.choice(fg_inds, size=fg_rois, replace=False)

        # TODO: sampling strategy
        # Backgrounds are the anchors whose max IoU is below 0.3
        # cfg.FLAGS.bg_threshold = 0.3
        bg_inds = np.where((max_overlaps < cfg.FLAGS.bg_threshold))[0]
        bg_rois = max(min(cfg.FLAGS.rois_per_image - fg_rois, fg_rois * 3), 8)  # 192 when fg_rois == 64
        if bg_inds.size > 0 and bg_rois < bg_inds.size:
            bg_inds = np.random.choice(bg_inds, size=bg_rois, replace=False)

        keep_inds = np.append(fg_inds, bg_inds)
        # print(gt_boxes[np.argmax(overlaps[fg_inds],axis=1),4])
    else:
        bg_inds = np.arange(boxes.shape[0])
        bg_rois = min(int(cfg.FLAGS.rois_per_image * (1 - cfg.FLAGS.fg_roi_fraction)), 8)  # 8 with the default flags
        if bg_rois < bg_inds.size:
            bg_inds = np.random.choice(bg_inds, size=bg_rois, replace=False)

        keep_inds = bg_inds
        mask_fg_inds = np.arange(0)
    # One open question: every element of batch_inds here is always 0
    return boxes[keep_inds, :], scores[keep_inds], batch_inds[keep_inds], \
           boxes[mask_fg_inds, :], scores[mask_fg_inds], batch_inds[mask_fg_inds]
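
With the default flags quoted in the comments above, the sampling arithmetic works out as follows (a worked example, not extra code from the repository):

fg_rois = int(256 * 0.25)                # at most 64 foreground ROIs
bg_rois = max(min(256 - 64, 64 * 3), 8)  # 192 background ROIs
# i.e. a foreground:background ratio of at least 1:3 per image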

In the whole process above, the ordering of scores and anchor_boxes never changes, and the feature maps are kept sorted from small to large. In assign_boxes there is an indexing operation that orders the boxes by area; my guess is that the index puts the largest boxes at the front and the smallest boxes at the back.
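
The post only guesses at what assign_boxes does. A common way to assign boxes to FPN levels by area, following the FPN paper (Lin et al., 2017), is sketched below; the function name and level range are assumptions, not this repository's code.

def assign_boxes_to_levels(boxes, k_min=2, k_max=5,
                           canonical_scale=224, canonical_level=4):
    # FPN heuristic: k = k0 + log2(sqrt(w * h) / 224), clipped to [k_min, k_max];
    # larger boxes are routed to coarser pyramid levels
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    k = np.floor(canonical_level + np.log2(np.sqrt(ws * hs) / canonical_scale))
    return np.clip(k, k_min, k_max).astype(np.int32)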

Origin: blog.csdn.net/weixin_33681778/article/details/90977675