SSD-TensorFlow 源码解析

本文解析的是 SSD 的 TensorFlow 实现源码，来源 GitHub：balancap/SSD-Tensorflow。

1.anchor boxes生成：

通过模型获得位置信息与分类信息，并获取默认anchors信息，得到预测结果。

2.ground truth预处理

将Ground Truth进行预处理（使得Ground Truth与预测结果一一对应）。

3.网络结构

4.损失函数

通过预测结果与Ground Truth计算损失函数，通过优化器进行训练。

1.anchor boxes生成：

#####SSD-Tensorflow-master\train_ssd_network.py#####
# Default anchors for every feature layer: a list holding one (y, x, h, w)
# entry per layer (six entries with the default SSD-300 parameters), i.e. the
# anchor centre grids plus the anchor heights and widths of that layer.
ssd_anchors = ssd_net.anchors(ssd_shape)


#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def anchors(self, img_shape, dtype=np.float32):
    """Build the default anchor boxes of every feature layer.

    Forwards the anchor configuration stored in ``self.params`` to
    ``ssd_anchors_all_layers`` and returns its result: a list with one
    ``(y, x, h, w)`` entry per feature layer.
    """
    cfg = self.params
    return ssd_anchors_all_layers(
        img_shape,
        cfg.feat_shapes,
        cfg.anchor_sizes,
        cfg.anchor_ratios,
        cfg.anchor_steps,
        cfg.anchor_offset,
        dtype)


#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Compute the default anchor boxes of all feature layers.

    Example arguments (default SSD-300 configuration):
      img_shape = (300, 300)
      layers_shape = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
      anchor_sizes = [(21., 45.), (45., 99.), (99., 153.),
                      (153., 207.), (207., 261.), (261., 315.)]
      anchor_ratios = [[2, .5], [2, .5, 3, 1./3], [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3], [2, .5], [2, .5]]
      anchor_steps = [8, 16, 32, 64, 100, 300]

    Return:
      A list with one ``(y, x, h, w)`` result of ``ssd_anchor_one_layer``
      per entry of ``layers_shape`` (six entries for the example above).
    """
    # The per-layer settings are looked up by index (not zipped) so that a
    # configuration list shorter than layers_shape still raises IndexError.
    return [
        ssd_anchor_one_layer(img_shape, feat_shape,
                             anchor_sizes[idx],
                             anchor_ratios[idx],
                             anchor_steps[idx],
                             offset=offset, dtype=dtype)
        for idx, feat_shape in enumerate(layers_shape)
    ]

#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute the SSD default anchor boxes of one feature layer.

    Builds the grid of anchor centres and the list of anchor heights and
    widths, all expressed relative to the image size.

    Arguments:
      img_shape: Image shape (height, width) used to normalise every value;
      feat_shape: Feature map shape (rows, columns) defining the centre grid;
      sizes: Absolute reference sizes of the anchors;
      ratios: Aspect ratios applied to ``sizes[0]``;
      step: Distance between neighbouring cell centres, in image pixels;
      offset: Grid offset added to each cell index;
      dtype: Numpy dtype of the returned arrays.

    Return:
      y, x, h, w: centre grids ``y`` and ``x`` of shape
      ``feat_shape + (1,)``, and the heights ``h`` and widths ``w`` of shape
      ``(len(sizes) + len(ratios),)``.

    Example: img_shape=(300, 300), feat_shape=(38, 38), sizes=(21., 45.),
    ratios=[2, .5], step=8 gives y and x of shape (38, 38, 1) and h, w of
    shape (4,).
    """
    # Cell indices: rows[r, c] == r and cols[r, c] == c.
    # A simpler grid would be (index + offset) / feat_shape; the step-based
    # formula below follows the SSD-Caffe computation instead.
    rows, cols = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]

    # Normalised centre coordinates, e.g. (index + 0.5) * 8 / 300, so they
    # are directly comparable with relative ground-truth boxes. The trailing
    # axis of length 1 lets them broadcast against the (num_anchors,) sizes.
    y = np.expand_dims(
        (rows.astype(dtype) + offset) * step / img_shape[0], axis=-1)
    x = np.expand_dims(
        (cols.astype(dtype) + offset) * step / img_shape[1], axis=-1)

    # Relative heights / widths, in the order of the original SSD code:
    # first the square anchor of size sizes[0] ...
    heights = [sizes[0] / img_shape[0]]
    widths = [sizes[0] / img_shape[1]]
    # ... then the square anchor of size sqrt(sizes[0] * sizes[1]) ...
    if len(sizes) > 1:
        heights.append(math.sqrt(sizes[0] * sizes[1]) / img_shape[0])
        widths.append(math.sqrt(sizes[0] * sizes[1]) / img_shape[1])
    # ... and finally one anchor per aspect ratio, derived from sizes[0].
    for ratio in ratios:
        heights.append(sizes[0] / img_shape[0] / math.sqrt(ratio))
        widths.append(sizes[0] / img_shape[1] * math.sqrt(ratio))

    # The output length is len(sizes) + len(ratios); any slot that was not
    # filled above (only possible when len(sizes) > 2) stays at zero.
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)
    h[:len(heights)] = heights
    w[:len(widths)] = widths
    return y, x, h, w

2.ground truth预处理

#####SSD-Tensorflow-master\train_ssd_network.py#####
# Encode the ground-truth labels and boxes against the default anchors.
# The matching is based on the Jaccard overlap (IoU) and yields, for every
# feature layer:
#   gclasses        class assigned to each anchor box;
#   glocalisations  encoded box offsets, the targets of the localisation loss;
#   gscores         overlap between each anchor box and its matched GT box.
gclasses, glocalisations, gscores = ssd_net.bboxes_encode(
    glabels, gbboxes, ssd_anchors)


#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def bboxes_encode(self, labels, bboxes, anchors, scope=None):
    """Encode ground-truth labels and boxes against the given anchors.

    Thin wrapper around ``ssd_common.tf_ssd_bboxes_encode`` that supplies
    the class count, the no-annotation label and the prior scaling from
    ``self.params`` and uses a fixed ``ignore_threshold`` of 0.5.
    """
    cfg = self.params
    return ssd_common.tf_ssd_bboxes_encode(
        labels, bboxes, anchors,
        cfg.num_classes,
        cfg.no_annotation_label,
        ignore_threshold=0.5,
        prior_scaling=cfg.prior_scaling,
        scope=scope)


#####SSD-Tensorflow-master\nets\ssd_common.py#####
def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=(0.1, 0.1, 0.2, 0.2),
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode groundtruth labels and bounding boxes using SSD net anchors.
    Encoding boxes for all feature layers.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
      anchors: List with one (y, x, h, w) anchors entry per feature layer;
      num_classes: Number of classes, forwarded to the per-layer encoder;
      no_annotation_label: Label marking boxes without annotation, forwarded
        to the per-layer encoder;
      ignore_threshold: Threshold forwarded to the per-layer encoder;
      prior_scaling: Scaling of encoded coordinates;
      dtype: Float type of the produced score / localization Tensors;
      scope: Optional name scope. When None, the default name
        'ssd_bboxes_encode' is used.

    Return:
      (target_labels, target_localizations, target_scores):
        Each element is a list of target Tensors, one per feature layer.
    """
    # Passing the default name as second argument matters: callers such as
    # SSDNet.bboxes_encode forward scope=None, and tf.name_scope(None) alone
    # would reset the ops to the root name scope instead of grouping them.
    with tf.name_scope(scope, 'ssd_bboxes_encode'):
        target_labels = []
        target_localizations = []
        target_scores = []
        for i, anchors_layer in enumerate(anchors):
            with tf.name_scope('bboxes_encode_block_%i' % i):
                t_labels, t_loc, t_scores = \
                    tf_ssd_bboxes_encode_layer(labels, bboxes, anchors_layer,
                                               num_classes, no_annotation_label,
                                               ignore_threshold,
                                               prior_scaling, dtype)
                target_labels.append(t_labels)
                target_localizations.append(t_loc)
                target_scores.append(t_scores)
        return target_labels, target_localizations, target_scores

#####SSD-Tensorflow-master\nets\ssd_common.py#####
def tf_ssd_bboxes_encode_layer(labels,
                               bboxes,
                               anchors_layer,
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2],
                               dtype=tf.float32):
    """Encode ground-truth labels and boxes against the anchors of one layer.

    Every anchor is matched with the ground-truth box it overlaps most
    (Jaccard score); the matched box is then expressed relative to the anchor.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates,
        read as (ymin, xmin, ymax, xmax);
      anchors_layer: (y, x, h, w) anchors of the layer;
      num_classes: Boxes whose label is not below this value are never
        assigned to an anchor;
      no_annotation_label: Not used by the active code (only referenced by
        a disabled step);
      ignore_threshold: Not used by the active code (only referenced by a
        disabled step);
      prior_scaling: Scaling of encoded coordinates;
      dtype: Float type of the score / coordinate Tensors.

    Return:
      (target_labels, target_localizations, target_scores): Target Tensors.
      Labels and scores have shape [height, width, num_anchors]; the
      localizations carry an extra last axis of size 4.
    """
    # Anchor centres and sizes. For the first SSD-300 layer yref / xref have
    # shape (38, 38, 1) and href / wref shape (4,).
    yref, xref, href, wref = anchors_layer
    # Corner coordinates of every anchor. Broadcasting gives each of them the
    # shape (38, 38, 4): one value per grid cell and per anchor of that cell.
    anc_ymin = yref - href / 2.
    anc_xmin = xref - wref / 2.
    anc_ymax = yref + href / 2.
    anc_xmax = xref + wref / 2.
    # Area ("volume") of every anchor, same (38, 38, 4) shape.
    vol_anchors = (anc_xmax - anc_xmin) * (anc_ymax - anc_ymin)

    # Per-anchor state updated while scanning the ground-truth boxes:
    # matched label, best score so far and matched box (initially the whole
    # image, [0, 0, 1, 1]).
    grid_shape = (yref.shape[0], yref.shape[1], href.size)
    feat_labels = tf.zeros(grid_shape, dtype=tf.int64)
    feat_scores = tf.zeros(grid_shape, dtype=dtype)

    feat_ymin = tf.zeros(grid_shape, dtype=dtype)
    feat_xmin = tf.zeros(grid_shape, dtype=dtype)
    feat_ymax = tf.ones(grid_shape, dtype=dtype)
    feat_xmax = tf.ones(grid_shape, dtype=dtype)

    def overlap_volume(bbox):
        """Area of the intersection between one box and every anchor."""
        # Each bbox coordinate is a scalar compared against all anchors at
        # once, so every intermediate has the anchors' (38, 38, 4) shape.
        top = tf.maximum(anc_ymin, bbox[0])
        left = tf.maximum(anc_xmin, bbox[1])
        bottom = tf.minimum(anc_ymax, bbox[2])
        right = tf.minimum(anc_xmax, bbox[3])
        # Clamp at zero: disjoint boxes have no intersection.
        height = tf.maximum(bottom - top, 0.)
        width = tf.maximum(right - left, 0.)
        return height * width

    def jaccard_with_anchors(bbox):
        """Jaccard score (intersection over union) of a box with all anchors."""
        inter_vol = overlap_volume(bbox)
        union_vol = vol_anchors - inter_vol \
            + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        return tf.div(inter_vol, union_vol)

    def intersection_with_anchors(bbox):
        """Fraction of every anchor that is covered by the box."""
        # Only needed by the disabled no-annotation step mentioned in body().
        return tf.div(overlap_volume(bbox), vol_anchors)

    def condition(i, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Loop condition: keep going while i indexes a ground-truth label."""
        # tf.shape(labels) is a 1-element vector, so the element-wise
        # comparison is reduced to a scalar by taking its first entry.
        return tf.less(i, tf.shape(labels))[0]

    def body(i, cur_labels, cur_scores,
             cur_ymin, cur_xmin, cur_ymax, cur_xmax):
        """Loop body: let ground-truth box i claim the anchors it fits best.

        An anchor is taken over when its Jaccard score with box i beats the
        best score recorded so far, so after the last iteration every anchor
        holds the label, score and coordinates of its best-overlapping box.
        """
        label = labels[i]
        bbox = bboxes[i]
        # Overlap of box i with every anchor of the layer, shape (38, 38, 4).
        jaccard = jaccard_with_anchors(bbox)
        # An anchor is updated when the new score is strictly better than the
        # stored one, the stored score is not a negative "ignore" marker and
        # the label is a valid class. (A further check against a matching
        # threshold exists upstream but is disabled.)
        mask = tf.logical_and(
            tf.logical_and(tf.greater(jaccard, cur_scores),
                           cur_scores > -0.5),
            label < num_classes)
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)

        def keep_or_replace(new_value, old_value):
            # new_value where the mask is set, old_value everywhere else.
            return fmask * new_value + (1 - fmask) * old_value

        cur_labels = imask * label + (1 - imask) * cur_labels
        cur_scores = tf.where(mask, jaccard, cur_scores)

        cur_ymin = keep_or_replace(bbox[0], cur_ymin)
        cur_xmin = keep_or_replace(bbox[1], cur_xmin)
        cur_ymax = keep_or_replace(bbox[2], cur_ymax)
        cur_xmax = keep_or_replace(bbox[3], cur_xmax)

        # Disabled upstream step: anchors covered by more than
        # ignore_threshold by a box labelled no_annotation_label would get
        # their score replaced by -1 here.
        return [i+1, cur_labels, cur_scores,
                cur_ymin, cur_xmin, cur_ymax, cur_xmax]

    # Main loop. tf.while_loop(cond, body, loop_vars) repeats body while cond
    # holds, passing loop_vars to both - the graph equivalent of
    #     while cond(*loop_vars): loop_vars = body(*loop_vars)
    # Here it compares all anchors of the layer with every ground-truth box.
    loop_vars = [0, feat_labels, feat_scores,
                 feat_ymin, feat_xmin,
                 feat_ymax, feat_xmax]
    (_, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax) = tf.while_loop(condition, body, loop_vars)

    # Matched ground-truth boxes as centre / size, each of shape (38, 38, 4).
    gt_cy = (feat_ymax + feat_ymin) / 2.
    gt_cx = (feat_xmax + feat_xmin) / 2.
    gt_h = feat_ymax - feat_ymin
    gt_w = feat_xmax - feat_xmin
    # Encode relative to the anchors: scaled centre offsets and log size
    # ratios. These are the targets of the localisation loss.
    enc_cy = (gt_cy - yref) / href / prior_scaling[0]
    enc_cx = (gt_cx - xref) / wref / prior_scaling[1]
    enc_h = tf.log(gt_h / href) / prior_scaling[2]
    enc_w = tf.log(gt_w / wref) / prior_scaling[3]
    # Stack along a new last axis using the SSD ordering x / y / w / h,
    # giving shape (38, 38, 4, 4).
    feat_localizations = tf.stack([enc_cx, enc_cy, enc_w, enc_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores

3.网络结构

#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
# SSD class definition.
class SSDNet(object):
    """SSD detector built on a VGG backbone for 300x300 inputs.

    With a 300x300 input image the feature layers used for detection are:
      conv4 ==> 38 x 38
      conv7 ==> 19 x 19
      conv8 ==> 10 x 10
      conv9 ==> 5 x 5
      conv10 ==> 3 x 3
      conv11 ==> 1 x 1
    300x300 is also the default image size used for training.
    """
    default_params = SSDParams(
        img_shape=(300, 300),    # input image size
        num_classes=21,    # object classes + background
        no_annotation_label=21,
        feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],
        feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
        anchor_size_bounds=[0.15, 0.90],
        # anchor_size_bounds=[0.20, 0.90],
        anchor_sizes=[(21., 45.),
                      (45., 99.),
                      (99., 153.),
                      (153., 207.),
                      (207., 261.),
                      (261., 315.)],
        # Alternative sizes matching the [0.20, 0.90] bounds:
        # anchor_sizes=[(30., 60.),
        #               (60., 111.),
        #               (111., 162.),
        #               (162., 213.),
        #               (213., 264.),
        #               (264., 315.)],
        anchor_ratios=[[2, .5],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5],
                       [2, .5]],
        anchor_steps=[8, 16, 32, 64, 100, 300],  # feature-map cell size, in input pixels
        anchor_offset=0.5,
        normalizations=[20, -1, -1, -1, -1, -1],
        prior_scaling=[0.1, 0.1, 0.2, 0.2]
        )

    def __init__(self, params=None):
        """Store the network parameters.

        ``params`` is kept only when it is an ``SSDParams`` instance; any
        other value (including None) selects ``SSDNet.default_params``.
        """
        self.params = (params if isinstance(params, SSDParams)
                       else SSDNet.default_params)

    # ======================================================================= #
    def net(self, inputs,
            is_training=True,
            update_feat_shapes=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
        """Build the SSD network on ``inputs`` (the training images).

        Returns whatever ``ssd_net`` returns. When ``update_feat_shapes`` is
        set, ``self.params.feat_shapes`` is refreshed from the first element
        of that result.
        """
        cfg = self.params
        outputs = ssd_net(inputs,
                          num_classes=cfg.num_classes,
                          feat_layers=cfg.feat_layers,
                          anchor_sizes=cfg.anchor_sizes,
                          anchor_ratios=cfg.anchor_ratios,
                          normalizations=cfg.normalizations,
                          is_training=is_training,
                          dropout_keep_prob=dropout_keep_prob,
                          prediction_fn=prediction_fn,
                          reuse=reuse,
                          scope=scope)
        if update_feat_shapes:
            # Best effort: derive the real feature shapes from the predictions.
            new_shapes = ssd_feat_shapes_from_net(outputs[0],
                                                  self.params.feat_shapes)
            self.params = self.params._replace(feat_shapes=new_shapes)
        return outputs

    def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
        """Return the arg_scope of the network (see ``ssd_arg_scope``)."""
        return ssd_arg_scope(weight_decay, data_format=data_format)

    def arg_scope_caffe(self, caffe_scope):
        """Return the arg_scope used when importing Caffe weights."""
        return ssd_arg_scope_caffe(caffe_scope)

    # ======================================================================= #
    def update_feature_shapes(self, predictions):
        """Refresh ``self.params.feat_shapes`` from a predictions collection
        (Tensor or Numpy array).
        """
        new_shapes = ssd_feat_shapes_from_net(predictions,
                                              self.params.feat_shapes)
        self.params = self.params._replace(feat_shapes=new_shapes)

    def anchors(self, img_shape, dtype=np.float32):
        """Compute the default anchor boxes for an image shape.

        Returns the list built by ``ssd_anchors_all_layers``: one
        (y, x, h, w) entry per feature layer.
        """
        cfg = self.params
        return ssd_anchors_all_layers(img_shape,
                                      cfg.feat_shapes,
                                      cfg.anchor_sizes,
                                      cfg.anchor_ratios,
                                      cfg.anchor_steps,
                                      cfg.anchor_offset,
                                      dtype)

    def bboxes_encode(self, labels, bboxes, anchors,
                      scope=None):
        """Encode ground-truth labels and bounding boxes against ``anchors``."""
        cfg = self.params
        return ssd_common.tf_ssd_bboxes_encode(
            labels, bboxes, anchors,
            cfg.num_classes,
            cfg.no_annotation_label,
            ignore_threshold=0.5,
            prior_scaling=cfg.prior_scaling,
            scope=scope)

    def bboxes_decode(self, feat_localizations, anchors,
                      scope='ssd_bboxes_decode'):
        """Decode localizations into bounding boxes using ``anchors``."""
        return ssd_common.tf_ssd_bboxes_decode(
            feat_localizations, anchors,
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def detected_bboxes(self, predictions, localisations,
                        select_threshold=None, nms_threshold=0.5,
                        clipping_bbox=None, top_k=400, keep_top_k=200):
        """Turn the network output into the final detections.

        The boxes are selected, sorted (keeping ``top_k``), filtered by
        non-maximum suppression (keeping ``keep_top_k``) and, when
        ``clipping_bbox`` is given, clipped to it.
        """
        scores, boxes = ssd_common.tf_ssd_bboxes_select(
            predictions, localisations,
            select_threshold=select_threshold,
            num_classes=self.params.num_classes)
        scores, boxes = tfe.bboxes_sort(scores, boxes, top_k=top_k)
        # Non-maximum suppression.
        scores, boxes = tfe.bboxes_nms_batch(scores, boxes,
                                             nms_threshold=nms_threshold,
                                             keep_top_k=keep_top_k)
        if clipping_bbox is None:
            return scores, boxes
        return scores, tfe.bboxes_clip(clipping_bbox, boxes)

    def losses(self, logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               scope='ssd_losses'):
        """Define the SSD network losses (delegates to ``ssd_losses``)."""
        return ssd_losses(logits, localisations,
                          gclasses, glocalisations, gscores,
                          match_threshold=match_threshold,
                          negative_ratio=negative_ratio,
                          alpha=alpha,
                          label_smoothing=label_smoothing,
                          scope=scope)

#####SSD-Tensorflow-master\train_ssd_network.py#####            
            # Build the SSD network under the model's arg_scope.
            # (Printing arg_scope is a handy way to inspect how the nested
            # scopes are combined.)
            arg_scope = ssd_net.arg_scope(
                weight_decay=FLAGS.weight_decay, data_format=DATA_FORMAT)

            with slim.arg_scope(arg_scope):
                # b_image is the batch of training images. The network
                # returns:
                #   predictions    class scores after the activation function;
                #   localisations  predicted boxes;
                #   logits         class scores before the activation function;
                #   end_points     output of every block of the network.
                (predictions, localisations,
                 logits, end_points) = ssd_net.net(b_image, is_training=True)

#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
    """Return the arg_scope of the network, as built by ``ssd_arg_scope``."""
    network_scope = ssd_arg_scope(weight_decay, data_format=data_format)
    return network_scope


#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
    """Define the arg scope of the SSD VGG network.

    An arg_scope supplies default arguments to the listed layer functions.
    Scopes can be nested and reused, and a call made inside one may still
    override any of the defaults.

    Args:
      weight_decay: The l2 regularization coefficient.
      data_format: Data format applied to the convolution, pooling and
        custom layers.

    Returns:
      The scope object yielded by the innermost arg_scope.
      NOTE(review): presumably it carries the defaults of all three levels -
      confirm against slim.arg_scope.
    """
    # Defaults shared by the convolutional and fully connected layers.
    conv_fc_defaults = dict(
        activation_fn=tf.nn.relu,
        weights_regularizer=slim.l2_regularizer(weight_decay),
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=tf.zeros_initializer())
    # A single multi-item "with" enters the three scopes in the same order
    # as three nested "with" statements would.
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        **conv_fc_defaults), \
            slim.arg_scope([slim.conv2d, slim.max_pool2d],
                           padding='SAME',
                           data_format=data_format), \
            slim.arg_scope([custom_layers.pad2d,
                            custom_layers.l2_normalization,
                            custom_layers.channel_to_last],
                           data_format=data_format) as sc:
        return sc

#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition.

    Arguments:
      inputs: Input images (the training images);
      num_classes: Number of classes predicted by every multibox layer;
      feat_layers: Names of the end points used as detection feature layers;
      anchor_sizes: Per feature layer anchor sizes;
      anchor_ratios: Per feature layer anchor aspect ratios;
      normalizations: Per feature layer normalization setting, forwarded to
        ``ssd_multibox_layer``;
      is_training: Whether dropout is active;
      dropout_keep_prob: Probability of KEEPING a unit in the dropout layers;
      prediction_fn: Function applied to the class logits (softmax by
        default);
      reuse: Variable reuse flag of the enclosing variable scope;
      scope: Name of the enclosing variable scope.

    Return:
      (predictions, localisations, logits, end_points): per feature layer
      lists of activated class predictions, predicted boxes and raw class
      logits, plus the dictionary holding the output of every block.
    """
    # if data_format == 'NCHW':
    #     inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))

    # tf.layers.dropout takes the fraction of units to DROP, whereas this
    # function receives the fraction to KEEP: convert once here. With the
    # default of 0.5 both conventions give the same value.
    dropout_rate = 1. - dropout_keep_prob

    # end_points collects the output of every block. SSD predicts from
    # several feature maps rather than a single one, so the multibox layers
    # below read their inputs from this dictionary.
    end_points = {}
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
        end_points['block1'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        # Block 2.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
        end_points['block5'] = net
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Additional SSD blocks.
        # Block 6: 3x3 dilated (rate 6) convolution.
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
        end_points['block6'] = net
        net = tf.layers.dropout(net, rate=dropout_rate, training=is_training)
        # Block 7: 1x1 convolution.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=dropout_rate, training=is_training)

        # Block 8/9/10/11: 1x1 and 3x3 convolutions. Blocks 8 and 9 pad
        # explicitly and use stride 2; blocks 10 and 11 use the default
        # stride with VALID padding.
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net

        # Prediction and localisations layers.
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                # p: class predictions, l: box predictions of this layer.
                p, l = ssd_multibox_layer(end_points[layer],
                                          num_classes,
                                          anchor_sizes[i],
                                          anchor_ratios[i],
                                          normalizations[i])
            # predictions holds the activated class scores (softmax by
            # default), logits the raw ones and localisations the boxes.
            predictions.append(prediction_fn(p))
            logits.append(p)
            localisations.append(l)
        return predictions, localisations, logits, end_points

4.损失函数

#####SSD-Tensorflow-master\train_ssd_network.py#####
            # Add the SSD loss: the network outputs (logits, localisations)
            # are compared with the encoded ground truth (b_gclasses,
            # b_glocalisations, b_gscores). The remaining settings come from
            # the command-line FLAGS; the return value is not used here.
            ssd_net.losses(logits, localisations,
                           b_gclasses, b_glocalisations, b_gscores,
                           match_threshold=FLAGS.match_threshold,
                           negative_ratio=FLAGS.negative_ratio,
                           alpha=FLAGS.loss_alpha,
                           label_smoothing=FLAGS.label_smoothing)

#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
# SSD loss function.
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Build the SSD training losses and register them with `tf.losses`.

    The per-layer predictions and ground-truth tensors are flattened and
    concatenated, a positive mask and a hard-negative mask are derived from
    `gscores`, and three scalar losses are added through
    `tf.losses.add_loss`: cross-entropy on positives, cross-entropy on the
    selected hard negatives, and a localisation loss on positives.

    Args:
      logits: list (one entry per feature layer) of class logits, i.e. the
        convolution outputs before softmax. Each entry is reshaped to
        `[-1, num_classes]`.
      localisations: list of per-layer predicted box offsets; each entry is
        reshaped to `[-1, 4]`.
      gclasses: list of per-layer ground-truth class labels, one per anchor
        (per the original notes, assigned from IoU matching).
      glocalisations: list of per-layer encoded ground-truth box targets;
        each entry is reshaped to `[-1, 4]`.
      gscores: list of per-layer matching scores between each anchor and its
        ground-truth box (per the original notes, IoU based).
      match_threshold: anchors whose score is strictly greater than this
        value are treated as positives.
      negative_ratio: multiplier applied to the number of positives when
        choosing how many hard negatives to keep.
      alpha: weight applied to the localisation loss.
      label_smoothing: accepted but not used in this function body.
      device: accepted but not used in this function body.
      scope: optional name passed to `tf.name_scope`; the default name is
        'ssd_losses'.

    Returns:
      None. The function has no return statement; the losses are exposed
      only through `tf.losses.add_loss`.
    """
    with tf.name_scope(scope, 'ssd_losses'):
        # Read the class count and batch size from the first layer's logits.
        # NOTE(review): presumably tfe.get_shape returns the rank-5 shape as
        # a list with the batch first and classes last -- confirm in tfe.
        lshape = tfe.get_shape(logits[0], 5)
        num_classes = lshape[-1]
        batch_size = lshape[0]

        # Flatten out all vectors!
        flogits = []
        fgclasses = []
        fgscores = []
        flocalisations = []
        fglocalisations = []
        # Loop over the SSD feature layers, flattening each tensor so that
        # every anchor becomes one row (or one scalar).
        for i in range(len(logits)):
            flogits.append(tf.reshape(logits[i], [-1, num_classes]))
            fgclasses.append(tf.reshape(gclasses[i], [-1]))
            fgscores.append(tf.reshape(gscores[i], [-1]))
            flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
            fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
        # Concatenate the per-layer pieces into single tensors.
        # The string literal below is kept verbatim: it records example
        # tensors (names, shapes, dtypes) the original author observed for
        # the five concatenations that follow.
        '''
        [<tf.Tensor 'ssd_losses/concat:0' shape=(279424, 21) dtype=float32>,
         <tf.Tensor 'ssd_losses/concat_1:0' shape=(279424,) dtype=int64>,
         <tf.Tensor 'ssd_losses/concat_2:0' shape=(279424,) dtype=float32>,
         <tf.Tensor 'ssd_losses/concat_3:0' shape=(279424, 4) dtype=float32>,
         <tf.Tensor 'ssd_losses/concat_4:0' shape=(279424, 4) dtype=float32>]
        '''
        logits = tf.concat(flogits, axis=0)
        gclasses = tf.concat(fgclasses, axis=0)
        gscores = tf.concat(fgscores, axis=0)
        localisations = tf.concat(flocalisations, axis=0)
        glocalisations = tf.concat(fglocalisations, axis=0)
        dtype = logits.dtype

        # Compute positive matching mask...
        pmask = gscores > match_threshold
        # Cast the boolean mask to the logits dtype so it can weight losses.
        fpmask = tf.cast(pmask, dtype)
        # Number of positive anchors (sum of the 0/1 mask).
        n_positives = tf.reduce_sum(fpmask)

        # Hard negative mining...
        # Binary labels: 1 for positive (foreground) anchors, 0 otherwise.
        no_classes = tf.cast(pmask, tf.int32)
        # Turn each row of logits into class probabilities.
        predictions = slim.softmax(logits)
        # Negative candidates: anchors that are not positive and whose score
        # is above -0.5.
        nmask = tf.logical_and(tf.logical_not(pmask),
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype)
        # For negative candidates take the predicted class-0 probability;
        # everywhere else use 1 - fnmask, which is 1.0 there, so those
        # entries rank last when the smallest values are picked below.
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        # Reshape with [-1] to obtain a 1-D tensor for top_k.
        nvalues_flat = tf.reshape(nvalues, [-1])
        # Number of negative entries to select.
        # negative_ratio times the number of positives, plus batch_size,
        # capped at the number of available negative candidates.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)

        # The string literal below is the original author's note, kept
        # verbatim. In short: negatives are limited relative to the number
        # of positives, and the negatives kept are those with the smallest
        # class-0 probability, i.e. the hardest ones.
        '''
        负样本不超过正样本数目的3倍，确保能够收敛，
        由于知道这些负样本都属于背景（和真实框IOU不足），所以理论上其class 0预测值越大越好，
        取class 0预测值最小的3倍正样本数目的负样本，最大化其class 0预测值，达到最小化损失函数的目的。
        筛选后的负样本（fnmask标记）为原负样本中class 0预测值最小的目标数目的点。
        '''
        # top_k on the negated values selects the n_neg smallest nvalues,
        # i.e. the anchors least confidently predicted as background.
        val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
        # Last of the selected values = largest nvalue among the n_neg picks.
        # NOTE(review): if n_neg evaluates to 0, val is empty and val[-1]
        # presumably fails at run time -- confirm.
        max_hard_pred = -val[-1]
        # Final negative mask.
        # The comparison is strict, so entries equal to max_hard_pred are
        # excluded from the final mask.
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Add cross-entropy loss.
        # Classification loss on positive anchors.
        # Per the original notes, gclasses holds labels in the range 0-20.
        with tf.name_scope('cross_entropy_pos'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=gclasses)
            # fpmask zeroes the loss of every non-positive anchor; the sum is
            # then divided by batch_size.
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Classification loss on the selected hard negatives.
        # no_classes holds 0/1 labels; it is 0 on every non-positive anchor.
        with tf.name_scope('cross_entropy_neg'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=no_classes)
            # fnmask keeps only the selected negatives (positives are zeroed);
            # the sum is then divided by batch_size.
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Add localization loss: smooth L1, L2, ...
        with tf.name_scope('localization'):
            # Weights: alpha on positive anchors, 0 elsewhere; the trailing
            # axis added here lets the weight broadcast over the 4 box
            # coordinates.
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            # NOTE(review): custom_layers.abs_smooth is presumably an
            # element-wise smooth-L1 -- confirm in custom_layers.
            loss = custom_layers.abs_smooth(localisations - glocalisations)
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
            tf.losses.add_loss(loss)

SSD-TensorFlow 源码解析

1.anchor boxes生成：

2.ground truth预处理

3.网络结构

4.损失函数

猜你喜欢