功能:
根据 GT box 和 topN proposals 选出满足要求的 128 个 proposals(包括 fg 和 bg),然后为它们加上物体类别标签和 bbox 的回归目标(只有在该类别对应的位置上才会有位置信息),并计算权重 weights。注意:GT box 在采样之前会被加入候选 rois 集合,因此这 128 个 proposals 中可能包含 GT box。
输入:
bottom[0]: rpn_rois,从 proposal_layer 提取到的 proposals,每行为 (0, x1, y1, x2, y2)
bottom[1]: gt_boxes,GroundTruth boxes,每行为 (x1, y1, x2, y2, label)
输出:
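top[0]: 'rois':采样得到的所有 roi,每行为 (0, x1, y1, x2, y2),即第一列为 0,后四列为左上和右下角坐标
top[1]: 'labels':所有提取出的 roi 的标签,bg = 0
top[2]: 'bbox_targets':所有 roi 相对于与其有最大 IOU 的 GT box 的偏移量,是一个长度为 4*num_classes 的 vector,偏移量存放在对应 label 的 4 个位置,其余位置为 0
top[3]: 'bbox_inside_weights':与 bbox_targets 形状相同,fg 在对应 label 的 4 个位置取 cfg.TRAIN.BBOX_INSIDE_WEIGHTS(源码注释中为 1),其余位置为 0
top[4]: 'bbox_outside_weights':在 forward 中由 (bbox_inside_weights > 0) 得到,即 inside weights 为正的位置为 1,其余位置为 0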
源码:
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import caffe
import yaml
import numpy as np
import numpy.random as npr
from fast_rcnn.config import cfg
from fast_rcnn.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps

DEBUG = False


class ProposalTargetLayer(caffe.Layer):
    """
    Assign object detection proposals to ground-truth targets. Produces
    proposal classification labels and bounding-box regression targets.
    """

    def setup(self, bottom, top):
        """Read the layer parameters and give the top blobs initial shapes."""
        # NOTE(security): yaml.load can construct arbitrary objects; this is
        # only acceptable because param_str_ comes from the network
        # definition. Prefer yaml.safe_load if that text can be untrusted.
        layer_params = yaml.load(self.param_str_)
        # Total number of classes.
        self._num_classes = layer_params['num_classes']

        # Running statistics that are only read by the DEBUG branch of
        # forward(); they must exist before the first forward pass.
        self._count = 0
        self._fg_num = 0
        self._bg_num = 0

        # sampled rois (0, x1, y1, x2, y2)
        top[0].reshape(1, 5)
        # labels
        top[1].reshape(1, 1)
        # bbox_targets
        top[2].reshape(1, self._num_classes * 4)
        # bbox_inside_weights
        top[3].reshape(1, self._num_classes * 4)
        # bbox_outside_weights
        top[4].reshape(1, self._num_classes * 4)

    def forward(self, bottom, top):
        """Sample RoIs from the proposals plus GT boxes and fill the tops."""
        # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
        # (i.e., rpn.proposal_layer.ProposalLayer), or any other source
        all_rois = bottom[0].data
        # GT boxes (x1, y1, x2, y2, label)
        # TODO(rbg): it's annoying that sometimes I have extra info before
        # and other times after box coordinates -- normalize to one format
        gt_boxes = bottom[1].data

        # Include ground-truth boxes in the set of candidate rois.
        # gt_boxes has five columns (x1, y1, x2, y2, label); only the four
        # coordinates are kept and a zero column is prepended so the rows
        # have the same (0, x1, y1, x2, y2) layout as the proposals.
        zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
        all_rois = np.vstack(
            (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
        )

        # Sanity check: single batch only
        assert np.all(all_rois[:, 0] == 0), \
            'Only single item batches are supported'

        num_images = 1
        # Number of RoIs to sample per image. Floor division keeps this an
        # integer on Python 3 as well.
        rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
        # Number of foreground RoIs per image. np.round returns a float, so
        # cast: the value is later used as a sample size and a slice bound.
        fg_rois_per_image = int(
            np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))

        # Sample rois with classification labels and bounding box regression
        # targets
        labels, rois, bbox_targets, bbox_inside_weights = _sample_rois(
            all_rois, gt_boxes, fg_rois_per_image,
            rois_per_image, self._num_classes)

        if DEBUG:
            print('num fg: {}'.format((labels > 0).sum()))
            print('num bg: {}'.format((labels == 0).sum()))
            self._count += 1
            self._fg_num += (labels > 0).sum()
            self._bg_num += (labels == 0).sum()
            print('num fg avg: {}'.format(self._fg_num / self._count))
            print('num bg avg: {}'.format(self._bg_num / self._count))
            # Avoid a ZeroDivisionError when no background has been seen.
            if self._bg_num:
                ratio = float(self._fg_num) / float(self._bg_num)
            else:
                ratio = float('inf')
            print('ratio: {:.3f}'.format(ratio))

        # sampled rois
        top[0].reshape(*rois.shape)
        top[0].data[...] = rois

        # classification labels
        top[1].reshape(*labels.shape)
        top[1].data[...] = labels

        # bbox_targets
        top[2].reshape(*bbox_targets.shape)
        top[2].data[...] = bbox_targets

        # bbox_inside_weights
        top[3].reshape(*bbox_inside_weights.shape)
        top[3].data[...] = bbox_inside_weights

        # bbox_outside_weights: 1 wherever the inside weight is positive
        top[4].reshape(*bbox_inside_weights.shape)
        top[4].data[...] = np.array(bbox_inside_weights > 0).astype(np.float32)

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass


def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Bounding-box regression targets (bbox_target_data) are stored in a
    compact form N x (class, tx, ty, tw, th)

    This function expands those targets into the 4-of-4*K representation used
    by the network (i.e. only one class has non-zero targets).

    Returns:
        bbox_target (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """
    # One class label per row of bbox_target_data.
    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    # Only foreground rows (label > 0) receive targets and weights.
    inds = np.where(clss > 0)[0]
    for ind in inds:
        # Labels arrive as floats; slice bounds must be integers.
        cls = int(clss[ind])
        # Write only into the four columns that belong to this class.
        start = 4 * cls
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
    return bbox_targets, bbox_inside_weights


def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Returns an N x 5 float32 array whose first column is the label and whose
    remaining four columns are the (optionally normalized) targets.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    # Offsets between each RoI and its assigned GT box.
    targets = bbox_transform(ex_rois, gt_rois)
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize targets by a precomputed mean and stdev
        targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
                   / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
    return np.hstack(
        (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)


def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image,
                 num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Returns:
        labels, rois, bbox_targets, bbox_inside_weights for the sampled RoIs,
        with the foreground RoIs placed before the background RoIs.
    """
    # overlaps: (rois x gt_boxes), the IoU of every RoI with every GT box.
    # np.float64 replaces the removed alias np.float (same dtype).
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    # Index of the GT box with the highest IoU for each RoI.
    gt_assignment = overlaps.argmax(axis=1)
    # The corresponding highest IoU.
    max_overlaps = overlaps.max(axis=1)
    # Label of the GT box each RoI is assigned to.
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs. The int() cast keeps the value usable as a sample
    # size and slice bound even if the caller passed a float.
    fg_rois_per_this_image = int(min(fg_rois_per_image, fg_inds.size))
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image,
                             replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = int(min(bg_rois_per_this_image, bg_inds.size))
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image,
                             replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[fg_rois_per_this_image:] = 0
    rois = all_rois[keep_inds]

    # Compact targets: one row of (label, four offsets) per sampled RoI.
    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, bbox_targets, bbox_inside_weights