Faster-RCNN_TF code interpretation (1): lib/networks/VGGnet_test.py

import tensorflow as tf
from networks.network import Network
#Number of classes: 20 object categories (PASCAL VOC) + 1 background class
n_classes = 21
#The conv layers downsample the input by a factor of 16, so each feature-map cell corresponds to a 16x16 patch of the (rescaled) MxN image
_feat_stride = [16,]
#Three anchor scales; combined with the 3 aspect ratios this gives 9 anchors per feature-map location
anchor_scales = [8, 16, 32]

class VGGnet_test(Network):
    def __init__(self, trainable=True):
        self.inputs = []
        #Input image tensor: [batch_size, H, W, channels]
        self.data = tf.placeholder(tf.float32, shape=[None, None, None, 3])
        #An image of arbitrary size PxQ is first rescaled to a fixed MxN before entering Faster RCNN; im_info = [M, N, scale_factor] records everything about this rescaling (a sketch of the computation appears after the class definition)
        self.im_info = tf.placeholder(tf.float32, shape=[None, 3])
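        # e.g. (illustrative numbers) a 375x500 image rescaled by a factor of 1.6 gives im_info = [[600., 800., 1.6]]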
        # The keep probability for dropout (the placeholder is defined here, though no dropout layer appears in this test-time graph)
        self.keep_prob = tf.placeholder(tf.float32)
        self.layers = dict({'data':self.data, 'im_info':self.im_info})
        self.trainable = trainable
        self.setup()

    def setup(self):
        '''
        The MxN image is fed into the network; the Conv layers consist of
        13 conv layers + 13 ReLU layers + 4 pooling layers.
        All conv layers use: kernel_size=3, pad=1, stride=1
        All pooling layers use: kernel_size=2, pad=0, stride=2
        Because every convolution pads the border with one ring of zeros,
        the MxN input grows to (M+2)x(N+2) and the 3x3 convolution then
        outputs MxN again, so the conv layers never change the spatial size.
        Each kernel_size=2, stride=2 pooling layer halves the height and
        width of whatever passes through it, so after the four poolings an
        MxN image leaves the Conv layers as a fixed (M/16)x(N/16) feature map.
        :return:
        '''
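        # Worked example (illustrative numbers, not from the original code):
        # a 608x800 input passes through four 2x2/stride-2 poolings, so
        # conv5_3 yields a 608/16 x 800/16 = 38x50 feature map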
        (self.feed('data')
             .conv(3, 3, 64, 1, 1, name='conv1_1', trainable=False)
             .conv(3, 3, 64, 1, 1, name='conv1_2', trainable=False)
             .max_pool(2, 2, 2, 2, padding='VALID', name='pool1')
             .conv(3, 3, 128, 1, 1, name='conv2_1', trainable=False)
             .conv(3, 3, 128, 1, 1, name='conv2_2', trainable=False)
             .max_pool(2, 2, 2, 2, padding='VALID', name='pool2')
             .conv(3, 3, 256, 1, 1, name='conv3_1')
             .conv(3, 3, 256, 1, 1, name='conv3_2')
             .conv(3, 3, 256, 1, 1, name='conv3_3')
             .max_pool(2, 2, 2, 2, padding='VALID', name='pool3')
             .conv(3, 3, 512, 1, 1, name='conv4_1')
             .conv(3, 3, 512, 1, 1, name='conv4_2')
             .conv(3, 3, 512, 1, 1, name='conv4_3')
             .max_pool(2, 2, 2, 2, padding='VALID', name='pool4')
             .conv(3, 3, 512, 1, 1, name='conv5_1')
             .conv(3, 3, 512, 1, 1, name='conv5_2')
             .conv(3, 3, 512, 1, 1, name='conv5_3'))
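        # conv5_3 output: a (1, M/16, N/16, 512) feature map, shared by the
        # RPN below and by the RoI pooling branch further down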

        #The RPN uses anchors plus a softmax classifier to extract foreground anchors as initial candidate regions
        (self.feed('conv5_3')
             .conv(3,3,512,1,1,name='rpn_conv/3x3')
             #3 scales * 3 aspect ratios = 9 anchors at each feature-map point; every anchor gets a foreground and a background score, so the 1x1 conv outputs 3*3*2 = 18 channels
             .conv(1,1,len(anchor_scales)*3*2,1,1,padding='VALID',relu = False,name='rpn_cls_score'))

        (self.feed('rpn_conv/3x3')
             #Box regression for each of the 9 anchors, predicting (dx, dy, dw, dh) per anchor: 3*3*4 = 36 channels
             .conv(1,1,len(anchor_scales)*3*4,1,1,padding='VALID',relu = False,name='rpn_bbox_pred'))

        # Reshape first, then apply the softmax activation
        (self.feed('rpn_cls_score')
             # shape (1, 9*H, W, 2)
             .reshape_layer(2,name = 'rpn_cls_score_reshape')
             # shape unchanged: (1, 9*H, W, 2)
             .softmax(name='rpn_cls_prob'))
        # Reshape back
        (self.feed('rpn_cls_prob')
             # shape (1, H, W, 18): the layout of 'rpn_cls_score' is restored.
             # The two reshape_layer calls simply 1. rearrange the scores into
             # a softmax-friendly format and 2. restore the original layout,
             # now holding softmax probabilities instead of raw scores (see
             # the runnable sketch after the class definition)
             .reshape_layer(len(anchor_scales)*3*2,name = 'rpn_cls_prob_reshape'))
        '''
        The Proposal Layer takes 3 inputs:
        1. the fg/bg anchor classification probabilities, rpn_cls_prob_reshape;
        2. the corresponding bbox regression deltas [d_{x}(A), d_{y}(A), d_{w}(A), d_{h}(A)], rpn_bbox_pred;
        3. im_info.
        In addition, _feat_stride=16 records the downsampling factor between the input image and the feature map.
        '''
        (self.feed('rpn_cls_prob_reshape','rpn_bbox_pred','im_info')
             #Calls network.proposal_layer with cfg_key='TEST' and outputs the predicted proposal boxes (rois);
             #a sketch of the bbox transform it applies appears after the class definition
             .proposal_layer(_feat_stride, anchor_scales, 'TEST', name = 'rois'))
        
        (self.feed('conv5_3', 'rois')
             .roi_pool(7, 7, 1.0/16, name='pool_5')
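             # spatial_scale = 1.0/16 maps each RoI from image coordinates back
             # onto the conv5_3 feature map; every RoI is pooled to a fixed
             # 7x7x512 block so it can feed the fully connected layers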
             .fc(4096, name='fc6')
             .fc(4096, name='fc7')
             .fc(n_classes, relu=False, name='cls_score')
             .softmax(name='cls_prob'))

        (self.feed('fc7')
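             # Per-class box regression: n_classes*4 = 84 outputs per RoI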
             .fc(n_classes*4, relu=False, name='bbox_pred'))
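
A quick way to see what the two reshape_layer calls do is to replay them in NumPy. The sketch below is illustrative only; the repo's real implementation lives in networks/network.py and operates on tf tensors, but it emulates exactly this Caffe-style NCHW reshape:

import numpy as np

def caffe_style_reshape(x, d):
    # Sketch of reshape_layer's effect: go NHWC -> NCHW, regroup the channels
    # into d groups (folding the surplus into H), then return to NHWC
    n, h, w, c = x.shape
    x = x.transpose(0, 3, 1, 2)            # NHWC -> NCHW
    x = x.reshape(n, d, (c * h) // d, w)   # regroup the channels
    return x.transpose(0, 2, 3, 1)         # back to NHWC

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

# Toy scores: batch 1, a 4x5 feature map, 9 anchors * 2 classes = 18 channels
scores   = np.random.randn(1, 4, 5, 18)
reshaped = caffe_style_reshape(scores, 2)   # (1, 36, 5, 2): fg/bg axis last
probs    = softmax(reshaped, axis=-1)       # softmax over the fg/bg axis
restored = caffe_style_reshape(probs, 18)   # (1, 4, 5, 18): layout restored
print(reshaped.shape, restored.shape)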

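The heart of proposal_layer is applying the predicted deltas to the anchor boxes. In this repo that step is bbox_transform_inv (under lib/fast_rcnn/); the following is a minimal NumPy rendition for illustration, not the repo's exact code:

import numpy as np

def bbox_transform_inv(boxes, deltas):
    # Apply (dx, dy, dw, dh) deltas to anchor boxes given as (x1, y1, x2, y2)
    widths  = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    dx, dy, dw, dh = deltas.T
    pred_ctr_x = dx * widths + ctr_x    # shift the center by dx * anchor width
    pred_ctr_y = dy * heights + ctr_y
    pred_w = np.exp(dw) * widths        # rescale the width by exp(dw)
    pred_h = np.exp(dh) * heights

    return np.stack([pred_ctr_x - 0.5 * pred_w,
                     pred_ctr_y - 0.5 * pred_h,
                     pred_ctr_x + 0.5 * pred_w,
                     pred_ctr_y + 0.5 * pred_h], axis=1)

anchor = np.array([[0., 0., 15., 15.]])           # a 16x16 anchor box
delta  = np.array([[0.1, 0.2, np.log(2.), 0.]])   # shift it and double its width
print(bbox_transform_inv(anchor, delta))          # [[-6.4  3.2 25.6 19.2]]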

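Finally, where does im_info come from? The numbers below (short side rescaled to 600, long side capped at 1000) are the conventional Faster R-CNN test-time settings, assumed here for illustration rather than read from this file:

import numpy as np

def compute_im_info(im_h, im_w, target_size=600, max_size=1000):
    # Rescale so the short side reaches target_size, but never let the long
    # side exceed max_size; return the [M, N, scale_factor] row that feeds
    # the im_info placeholder
    im_size_min = min(im_h, im_w)
    im_size_max = max(im_h, im_w)
    scale = float(target_size) / im_size_min
    if round(scale * im_size_max) > max_size:
        scale = float(max_size) / im_size_max
    return np.array([[round(im_h * scale), round(im_w * scale), scale]],
                    dtype=np.float32)

print(compute_im_info(375, 500))   # -> [[600. 800. 1.6]]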