nillboy/yolo代码解读6：/yolo/net/yolo_tiny_net.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import re

from yolo.net.net import Net 

class YoloTinyNet(Net):

  def __init__(self, common_params, net_params, test=False):
    """
    common params: a params dict
    net_params   : a params dict
    """
    super(YoloTinyNet, self).__init__(common_params, net_params)
    # process params
    self.image_size = int(common_params['image_size'])# 448
    self.num_classes = int(common_params['num_classes'])# 20
    self.cell_size = int(net_params['cell_size'])# 7
    self.boxes_per_cell = int(net_params['boxes_per_cell'])# 2
    self.batch_size = int(common_params['batch_size'])# 16
    self.weight_decay = float(net_params['weight_decay'])# 0.0005

    if not test:
      self.object_scale = float(net_params['object_scale'])# 1
      self.noobject_scale = float(net_params['noobject_scale'])# 0.5
      self.class_scale = float(net_params['class_scale'])# 1
      self.coord_scale = float(net_params['coord_scale'])# 5

  def inference(self, images):
    """Build the yolo model

    Args:
      images:  4-D tensor [batch_size, image_height, image_width, channels]
    Returns:
      predicts: 4-D tensor [batch_size, cell_size, cell_size, num_classes + 5 * boxes_per_cell]
    """
    conv_num = 1

    temp_conv = self.conv2d('conv' + str(conv_num), images, [3, 3, 3, 16], stride=1)
    conv_num += 1

    temp_pool = self.max_pool(temp_conv, [2, 2], 2)

    temp_conv = self.conv2d('conv' + str(conv_num), temp_pool, [3, 3, 16, 32], stride=1)
    conv_num += 1

    temp_pool = self.max_pool(temp_conv, [2, 2], 2)

    temp_conv = self.conv2d('conv' + str(conv_num), temp_pool, [3, 3, 32, 64], stride=1)
    conv_num += 1

    temp_conv = self.max_pool(temp_conv, [2, 2], 2)

    temp_conv = self.conv2d('conv' + str(conv_num), temp_conv, [3, 3, 64, 128], stride=1)
    conv_num += 1

    temp_conv = self.max_pool(temp_conv, [2, 2], 2)

    temp_conv = self.conv2d('conv' + str(conv_num), temp_conv, [3, 3, 128, 256], stride=1)
    conv_num += 1

    temp_conv = self.max_pool(temp_conv, [2, 2], 2)

    temp_conv = self.conv2d('conv' + str(conv_num), temp_conv, [3, 3, 256, 512], stride=1)
    conv_num += 1

    temp_conv = self.max_pool(temp_conv, [2, 2], 2)

    temp_conv = self.conv2d('conv' + str(conv_num), temp_conv, [3, 3, 512, 1024], stride=1)
    conv_num += 1     

    temp_conv = self.conv2d('conv' + str(conv_num), temp_conv, [3, 3, 1024, 1024], stride=1)
    conv_num += 1 
    # shape:(N,7,7,1024)
    temp_conv = self.conv2d('conv' + str(conv_num), temp_conv, [3, 3, 1024, 1024], stride=1)
    conv_num += 1 
    # shape:(N,1024,7,7)
    temp_conv = tf.transpose(temp_conv, (0, 3, 1, 2))

    #Fully connected layer
    # shape:(N,256)
    local1 = self.local('local1', temp_conv, self.cell_size * self.cell_size * 1024, 256)
    # shape:(N,4096)
    local2 = self.local('local2', local1, 256, 4096)
    # shape:(N,7*7*30) 
    local3 = self.local('local3', local2, 4096, self.cell_size * self.cell_size * (self.num_classes + self.boxes_per_cell * 5), leaky=False, pretrain=False, train=True)
    # n1=7*7*20
    n1 = self.cell_size * self.cell_size * self.num_classes
    # n2=7*7*22
    n2 = n1 + self.cell_size * self.cell_size * self.boxes_per_cell
    # (N,7,7,20)
    class_probs = tf.reshape(local3[:, 0:n1], (-1, self.cell_size, self.cell_size, self.num_classes))
    # (N,7,7,2)
    scales = tf.reshape(local3[:, n1:n2], (-1, self.cell_size, self.cell_size, self.boxes_per_cell))
    # (N,7,7,8)
    boxes = tf.reshape(local3[:, n2:], (-1, self.cell_size, self.cell_size, self.boxes_per_cell * 4))
    # (N,7,7,30)
    local3 = tf.concat([class_probs, scales, boxes], 3)

    predicts = local3

    return predicts

  def iou(self, boxes1, boxes2):
    """calculate ious
    Args:
      boxes1: 4-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4]  ====> (x_center, y_center, w, h)
      boxes2: 1-D tensor [4] ===> (x_center, y_center, w, h)
    Return:
      iou: 3-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
    """
    # boxes1 (7,7,2,4) [x,y,w,h]----(4,7,7,2) [x1,y1,x2,y2]
    boxes1 = tf.stack([boxes1[:, :, :, 0] - boxes1[:, :, :, 2] / 2, boxes1[:, :, :, 1] - boxes1[:, :, :, 3] / 2,
                      boxes1[:, :, :, 0] + boxes1[:, :, :, 2] / 2, boxes1[:, :, :, 1] + boxes1[:, :, :, 3] / 2])
    # boxes1 (7,7,2,4) [x1,y1,x2,y2]
    boxes1 = tf.transpose(boxes1, [1, 2, 3, 0])
    # boxes2 (4) [x1,y1,x2,y2]
    boxes2 =  tf.stack([boxes2[0] - boxes2[2] / 2, boxes2[1] - boxes2[3] / 2,
                      boxes2[0] + boxes2[2] / 2, boxes2[1] + boxes2[3] / 2])

    # calculate the left up point
    # (7,7,2,2)
    lu = tf.maximum(boxes1[:, :, :, 0:2], boxes2[0:2])
    rd = tf.minimum(boxes1[:, :, :, 2:], boxes2[2:])

    # intersection
    # 两个boxes交集的宽和长(7,7,2,2)
    intersection = rd - lu 
    # 交集的面积(7,7,2,)
    inter_square = intersection[:, :, :, 0] * intersection[:, :, :, 1]
    # 确保两个boxes有交集，小于0两者交集为0
    mask = tf.cast(intersection[:, :, :, 0] > 0, tf.float32) * tf.cast(intersection[:, :, :, 1] > 0, tf.float32)

    inter_square = mask * inter_square

    # calculate the boxs1 square and boxs2 square
    square1 = (boxes1[:, :, :, 2] - boxes1[:, :, :, 0]) * (boxes1[:, :, :, 3] - boxes1[:, :, :, 1])
    square2 = (boxes2[2] - boxes2[0]) * (boxes2[3] - boxes2[1])
    # (7,7,2)
    return inter_square/(square1 + square2 - inter_square + 1e-6)

  def cond1(self, num, object_num, loss, predict, label, nilboy):
    """
    if num < object_num
    """
    # 0 < object_num
    return num < object_num


  def body1(self, num, object_num, loss, predict, labels, nilboy):
    """
    calculate loss
    Args:
      predict: 3-D tensor [cell_size, cell_size, 5 * boxes_per_cell]
      labels : [max_objects, 5]  (x_center, y_center, w, h, class)
    """
    # 
    label = labels[num:num+1, :]
    label = tf.reshape(label, [-1])

    # calculate objects  tensor [CELL_SIZE, CELL_SIZE]
    # objects (7,7)，boxes映射到网格的区域全是1，其余用0补充
    # 计算x1,x2,y1,y2，boxes映射到7*7网格上
    min_x = (label[0] - label[2] / 2) / (self.image_size / self.cell_size)
    max_x = (label[0] + label[2] / 2) / (self.image_size / self.cell_size)
    min_y = (label[1] - label[3] / 2) / (self.image_size / self.cell_size)
    max_y = (label[1] + label[3] / 2) / (self.image_size / self.cell_size)
    # 向下取整
    min_x = tf.floor(min_x)
    min_y = tf.floor(min_y)
    # 向上取整
    max_x = tf.ceil(max_x)
    max_y = tf.ceil(max_y)    
    temp = tf.cast(tf.stack([max_y - min_y, max_x - min_x]), dtype=tf.int32)
    objects = tf.ones(temp, tf.float32)
    temp = tf.cast(tf.stack([min_y, self.cell_size - max_y, min_x, self.cell_size - max_x]), tf.int32)
    temp = tf.reshape(temp, (2, 2))
    objects = tf.pad(objects, temp, "CONSTANT")

    # calculate response tensor [CELL_SIZE, CELL_SIZE]
    # response (7,7)，目标中心映射到网格的点是1，其余用0补充
    # 当前目标中心映射到7*7网格上一个点
    center_x = label[0] / (self.image_size / self.cell_size)
    center_x = tf.floor(center_x)
    center_y = label[1] / (self.image_size / self.cell_size)
    center_y = tf.floor(center_y)
    response = tf.ones([1, 1], tf.float32)
    temp = tf.cast(tf.stack([center_y, self.cell_size - center_y - 1, center_x, self.cell_size -center_x - 1]), tf.int32)
    temp = tf.reshape(temp, (2, 2))
    response = tf.pad(response, temp, "CONSTANT")

    # calculate iou_predict_truth [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
    # predict.shape (7, 7, 30) predict_boxes.shape (7, 7, 8)
    predict_boxes = predict[:, :, self.num_classes + self.boxes_per_cell:]
    # (7, 7, 2, 4)
    predict_boxes = tf.reshape(predict_boxes, [self.cell_size, self.cell_size, self.boxes_per_cell, 4])

    predict_boxes = predict_boxes * [self.image_size / self.cell_size, self.image_size / self.cell_size, self.image_size, self.image_size]
    # (7, 7, 4)
    base_boxes = np.zeros([self.cell_size, self.cell_size, 4])

    for y in range(self.cell_size):
      for x in range(self.cell_size):
        #nilboy
        base_boxes[y, x, :] = [self.image_size / self.cell_size * x, self.image_size / self.cell_size * y, 0, 0]
    # base_boxes(7, 7, 2, 4),第三维度复制
    base_boxes = np.tile(np.resize(base_boxes, [self.cell_size, self.cell_size, 1, 4]), [1, 1, self.boxes_per_cell, 1])

    predict_boxes = base_boxes + predict_boxes
    # (7,7,2)
    iou_predict_truth = self.iou(predict_boxes, label[0:4])
    # calculate C [cell_size, cell_size, boxes_per_cell]
    # (7,7,2)只在负责这个目标的网格下有值，其余是0
    C = iou_predict_truth * tf.reshape(response, [self.cell_size, self.cell_size, 1])
    # calculate I tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
    # (7,7,2)只在负责这个目标的网格下有值，其余是0
    I = iou_predict_truth * tf.reshape(response, (self.cell_size, self.cell_size, 1))
    # 找出第三维度上IOU最大的(7,7,1)
    max_I = tf.reduce_max(I, 2, keep_dims=True)
    # (7,7,2) 只在负责这个目标的网格下并且IOU最大的位置是1,其余是0
    I = tf.cast((I >= max_I), tf.float32) * tf.reshape(response, (self.cell_size, self.cell_size, 1))
    # calculate no_I tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
    # no_I和I相反，是1的时候是0，是0的时候是1
    no_I = tf.ones_like(I, dtype=tf.float32) - I 


    p_C = predict[:, :, self.num_classes:self.num_classes + self.boxes_per_cell]

    #calculate truth x,y,sqrt_w,sqrt_h 0-D
    x = label[0]
    y = label[1]

    sqrt_w = tf.sqrt(tf.abs(label[2]))
    sqrt_h = tf.sqrt(tf.abs(label[3]))

    #calculate predict p_x, p_y, p_sqrt_w, p_sqrt_h 3-D [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
    # 预测中心坐标
    p_x = predict_boxes[:, :, :, 0]
    p_y = predict_boxes[:, :, :, 1]
    # 预测宽度和高度必须大于0小于448 (7,7,2)
    p_sqrt_w = tf.sqrt(tf.minimum(self.image_size * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 2])))
    p_sqrt_h = tf.sqrt(tf.minimum(self.image_size * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 3])))

    # calculate truth p 1-D tensor [NUM_CLASSES]
    P = tf.one_hot(tf.cast(label[4], tf.int32), self.num_classes, dtype=tf.float32)
    # calculate predict p_P 3-D tensor [CELL_SIZE, CELL_SIZE, NUM_CLASSES]
    # (7,7,20)
    p_P = predict[:, :, 0:self.num_classes]

    #class_loss
    class_loss = tf.nn.l2_loss(tf.reshape(objects, (self.cell_size, self.cell_size, 1)) * (p_P - P)) * self.class_scale
    #class_loss = tf.nn.l2_loss(tf.reshape(response, (self.cell_size, self.cell_size, 1)) * (p_P - P)) * self.class_scale
    #object_loss
    object_loss = tf.nn.l2_loss(I * (p_C - C)) * self.object_scale
    #noobject_loss？？？
    #noobject_loss = tf.nn.l2_loss(no_I * (p_C - C)) * self.noobject_scale
    noobject_loss = tf.nn.l2_loss(no_I * (p_C)) * self.noobject_scale
    # coord_loss
    # 为什么中心坐标除以64,宽度高度除以448？？？
    coord_loss = (tf.nn.l2_loss(I * (p_x - x)/(self.image_size/self.cell_size)) +
                 tf.nn.l2_loss(I * (p_y - y)/(self.image_size/self.cell_size)) +
                 tf.nn.l2_loss(I * (p_sqrt_w - sqrt_w))/ self.image_size +
                 tf.nn.l2_loss(I * (p_sqrt_h - sqrt_h))/self.image_size) * self.coord_scale

    nilboy = I

    return num + 1, object_num, [loss[0] + class_loss, loss[1] + object_loss, loss[2] + noobject_loss, loss[3] + coord_loss], predict, labels, nilboy



  def loss(self, predicts, labels, objects_num):
    """Add Loss to all the trainable variables

    Args:
      predicts: 4-D tensor [batch_size, cell_size, cell_size, 5 * boxes_per_cell]
      ===> (num_classes, boxes_per_cell, 4 * boxes_per_cell)
      labels  : 3-D tensor of [batch_size, max_objects, 5]
      objects_num: 1-D tensor [batch_size]
    """
    class_loss = tf.constant(0, tf.float32)
    object_loss = tf.constant(0, tf.float32)
    noobject_loss = tf.constant(0, tf.float32)
    coord_loss = tf.constant(0, tf.float32)
    loss = [0, 0, 0, 0]
    # 对于每一个图片
    for i in range(self.batch_size):
      predict = predicts[i, :, :, :]
      label = labels[i, :, :]
      object_num = objects_num[i]
      nilboy = tf.ones([7,7,2])
      tuple_results = tf.while_loop(self.cond1, self.body1, [tf.constant(0), object_num, [class_loss, object_loss, noobject_loss, coord_loss], predict, label, nilboy])
      for j in range(4):
        loss[j] = loss[j] + tuple_results[2][j]
      nilboy = tuple_results[5]

    tf.add_to_collection('losses', (loss[0] + loss[1] + loss[2] + loss[3])/self.batch_size)

    tf.summary.scalar('class_loss', loss[0]/self.batch_size)
    tf.summary.scalar('object_loss', loss[1]/self.batch_size)
    tf.summary.scalar('noobject_loss', loss[2]/self.batch_size)
    tf.summary.scalar('coord_loss', loss[3]/self.batch_size)
    tf.summary.scalar('weight_loss', tf.add_n(tf.get_collection('losses')) - (loss[0] + loss[1] + loss[2] + loss[3])/self.batch_size )

    return tf.add_n(tf.get_collection('losses'), name='total_loss'), nilboy
nillboy/yolo代码解读6：/yolo/net/yolo_tiny_net.py

猜你喜欢