import numpy as np
import tensorflow as tf
import yolo.config as cfg
slim = tf.contrib.slim
class YOLONet(object):
    """YOLO v1 detection network: builds the forward graph and, when
    training, the loss and its summaries."""

    def __init__(self, is_training=True):
        # Class names and count (e.g. the 20 PASCAL VOC categories).
        self.classes = cfg.CLASSES
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE        # input resolution, e.g. 448
        self.cell_size = cfg.CELL_SIZE          # grid size S, e.g. 7 (7x7 feature map)
        self.boxes_per_cell = cfg.BOXES_PER_CELL  # predictors per cell B, e.g. 2
        # Flattened prediction length per image: S*S*(C + B*5),
        # e.g. 7*7*(20 + 2*5) = 1470.
        self.output_size = (self.cell_size * self.cell_size) *\
            (self.num_class + self.boxes_per_cell * 5)
        self.scale = 1.0 * self.image_size / self.cell_size  # pixels per cell
        # Split points into the flat prediction vector:
        # [0, boundary1) class probabilities,
        # [boundary1, boundary2) box confidences,
        # [boundary2, end) box coordinates.
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell
        self.object_scale = cfg.OBJECT_SCALE      # weight of object-confidence loss
        self.noobject_scale = cfg.NOOBJECT_SCALE  # weight of no-object-confidence loss
        self.class_scale = cfg.CLASS_SCALE        # weight of classification loss
        self.coord_scale = cfg.COORD_SCALE        # weight of coordinate loss
        self.learning_rate = cfg.LEARNING_RATE
        self.batch_size = cfg.BATCH_SIZE
        self.alpha = cfg.ALPHA                    # leaky-ReLU negative slope
        # Per-cell column offsets: build [B, S, S] where each slice holds
        # arange(S) rows, then transpose to [S, S, B] so offset[i, j, b] == j.
        self.offset = np.transpose(np.reshape(np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
            (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))
        # Build the forward graph; logits is the flat prediction vector.
        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
        self.logits = self.build_network(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)
        if is_training:
            # Ground truth per cell: [response, x, y, w, h] + one-hot classes,
            # i.e. 5 + num_class (25 with the default 20 classes).
            self.labels = tf.placeholder(
                tf.float32,
                [None, self.cell_size, self.cell_size, 5 + self.num_class])
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)
# 构造网络图
def build_network(self,
images,
num_outputs,
alpha,
keep_prob=0.5,
is_training=True,
scope='yolo'):
with tf.variable_scope(scope):
with slim.arg_scope(
[slim.conv2d, slim.fully_connected],
activation_fn=leaky_relu(alpha),
weights_regularizer=slim.l2_regularizer(0.0005),
weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
):
net = tf.pad(
images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]), #对输入数据的宽高进行填充,batch_size和channel不做填充
name='pad_1')
net = slim.conv2d(
net, 64, 7, 2, padding='VALID', scope='conv_2')# conv:64个7x7的卷积核,以2为步伐进行卷积,out:224x224*64
net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')# pool:最大池化kernel=2,stride = 2,out:112x112x64
net = slim.conv2d(net, 192, 3, scope='conv_4')# conv: num_kernel = 192,kernel_size=3,out:112x112x192
net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')# pool: kernel_size = 2,stride = 2,out:56x56x192
net = slim.conv2d(net, 128, 1, scope='conv_6')# conv: num_kernel = 128,kernel_size = 1,out:56x56x128
net = slim.conv2d(net, 256, 3, scope='conv_7')# conv: num_kernel = 256,kernel_size = 3,out:56x56x256
net = slim.conv2d(net, 256, 1, scope='conv_8')# conv: num_kernel = 256,kernel_size = 1,out:56x56x256
net = slim.conv2d(net, 512, 3, scope='conv_9')# conv: num_kernel = 512,kernel_size = 3,out:56x56x512
net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')# pool: kernel_size = 2,stride = 2,out:28x28x512
net = slim.conv2d(net, 256, 1, scope='conv_11')# conv: num_kernel = 256,kernel_size = 1,out:28x28x256
net = slim.conv2d(net, 512, 3, scope='conv_12')# conv: num_kernel = 512,kernel_size = 3,out:28x28x512
net = slim.conv2d(net, 256, 1, scope='conv_13')# conv: num_kernel = 256,kernel_size = 1,out:28x28x256
net = slim.conv2d(net, 512, 3, scope='conv_14')# conv: num_kernel = 512,kernel_size = 3,out:28x28x512
net = slim.conv2d(net, 256, 1, scope='conv_15')# conv: num_kernel = 256,kernel_size = 1,out:28x28x256
net = slim.conv2d(net, 512, 3, scope='conv_16')# conv: num_kernel = 512,kernel_size = 3,out:28x28x512
net = slim.conv2d(net, 256, 1, scope='conv_17')# conv: num_kernel = 256,kernel_size = 1,out:28x28x256
net = slim.conv2d(net, 512, 3, scope='conv_18')# conv: num_kernel = 512,kernel_size = 3,out:28x28x512
net = slim.conv2d(net, 512, 1, scope='conv_19')# conv: num_kernel = 512,kernel_size = 1,out:28x28x512
net = slim.conv2d(net, 1024, 3, scope='conv_20') # conv: num_kernel = 1024,kernel_size = 3,out:28x28x1024
net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21') # pool:kernel_size = 2,stride = 2,out: 14x14x1024
net = slim.conv2d(net, 512, 1, scope='conv_22') # conv: num_kernel = 512,kernel_size = 1,out: 14x14x512
net = slim.conv2d(net, 1024, 3, scope='conv_23')# conv: num_kernel = 1024,kernel_size = 3,out:14x14x1024
net = slim.conv2d(net, 512, 1, scope='conv_24')# conv: num_kernel = 512,kernel_size = 1,out: 14x14x512
net = slim.conv2d(net, 1024, 3, scope='conv_25')# conv: num_kernel = 1024,kernel_size = 3,out:14x14x1024
net = slim.conv2d(net, 1024, 3, scope='conv_26')# conv: num_kernel = 1024,kernel_size = 3,out:14x14x1024
net = tf.pad( # 对特征图进行填充
net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
name='pad_27')
net = slim.conv2d(
net, 1024, 3, 2, padding='VALID', scope='conv_28')# conv: num_kernel = 1024, kernel_size = 3,stride = 2,out:7x7x1024
net = slim.conv2d(net, 1024, 3, scope='conv_29') # conv: num_kernel = 1024,kernel_size = 3,out:7x7x1024
net = slim.conv2d(net, 1024, 3, scope='conv_30') # conv: num_kernel = 1024,kernel_size = 3,out:7x7x1024
net = tf.transpose(net, [0, 3, 1, 2], name='trans_31') #[batchsize 1024 7 7]
net = slim.flatten(net, scope='flat_32')
net = slim.fully_connected(net, 512, scope='fc_33') #7*7*1024--512
net = slim.fully_connected(net, 4096, scope='fc_34')#512--4096
net = slim.dropout(
net, keep_prob=keep_prob, is_training=is_training,
scope='dropout_35')
net = slim.fully_connected(
net, num_outputs, activation_fn=None, scope='fc_36')#输出维度1470=7*7*30(20+2*5)
return net
def calc_iou(self, boxes1, boxes2, scope='iou'):
"""calculate ious
Args:
boxes1: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ====> (x_center, y_center, w, h)
boxes2: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ===> (x_center, y_center, w, h)
Return:
iou: 4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
"""
with tf.variable_scope(scope):
# transform (x_center, y_center, w, h) to (x1, y1, x2, y2)
boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0, # x - w / 2
boxes1[..., 1] - boxes1[..., 3] / 2.0, # y - h / 2
boxes1[..., 0] + boxes1[..., 2] / 2.0, # x + w / 2
boxes1[..., 1] + boxes1[..., 3] / 2.0], # y + h / 2
axis=-1)
boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
boxes2[..., 1] - boxes2[..., 3] / 2.0,
boxes2[..., 0] + boxes2[..., 2] / 2.0,
boxes2[..., 1] + boxes2[..., 3] / 2.0],
axis=-1)
# calculate the left up point & right down point
lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])
# intersection
intersection = tf.maximum(0.0, rd - lu)
inter_square = intersection[..., 0] * intersection[..., 1]
# calculate the boxs1 square and boxs2 square
square1 = boxes1[..., 2] * boxes1[..., 3]
square2 = boxes2[..., 2] * boxes2[..., 3]
union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)
return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)
    def loss_layer(self, predicts, labels, scope='loss_layer'):
        """Add the four YOLO v1 losses (class / object / no-object / coord)
        to the tf.losses collection and emit their summaries.

        Args:
            predicts: [batch_size, output_size] flat network output, laid
                out as [class probs | box confidences | box coordinates].
            labels: [batch_size, cell_size, cell_size, 5 + num_class] ground
                truth; per cell: [response, x, y, w, h, one-hot classes].
            scope: variable scope name.
        """
        with tf.variable_scope(scope):
            # Slice the flat prediction vector into its three components.
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [self.batch_size, self.cell_size, self.cell_size, self.num_class])  # per-cell class probabilities
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])  # per-box confidence
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])  # per-box (x, y, w, h)
            # response: 1 for cells that contain an object center, else 0.
            response = tf.reshape(
                labels[..., 0],
                [self.batch_size, self.cell_size, self.cell_size, 1])
            boxes = tf.reshape(
                labels[..., 1:5],
                [self.batch_size, self.cell_size, self.cell_size, 1, 4])
            # The label holds a single box per cell; tile it across the
            # boxes_per_cell predictors and normalize to [0, 1] image coords.
            boxes = tf.tile(
                boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
            classes = labels[..., 5:]  # one-hot class targets
            # Column index of each cell (offset[i, j, b] == j), broadcast
            # over the batch; the transposed copy supplies the row index.
            offset = tf.reshape(
                tf.constant(self.offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))
            # Predicted x/y are offsets within their cell: add the cell index
            # and divide by cell_size to get [0, 1] image coordinates.  w/h
            # are predicted as square roots, so square them for the IoU.
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),
                 tf.square(predict_boxes[..., 3])], axis=-1)
            # IoU between every predicted box and the (tiled) ground truth.
            iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)
            # Responsible-predictor mask [BATCH, S, S, B]: in each responding
            # cell, the box with the highest IoU gets a 1.
            # NOTE(review): keep_dims is the deprecated spelling of keepdims
            # in later TF 1.x releases; left unchanged for compatibility.
            object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)
            object_mask = tf.cast(
                (iou_predict_truth >= object_mask), tf.float32) * response
            # Complement: predictors not responsible for any object.
            noobject_mask = tf.ones_like(
                object_mask, dtype=tf.float32) - object_mask
            # Transform ground truth into the network's parameterization:
            # cell-relative x/y offsets and sqrt of w/h.
            boxes_tran = tf.stack(
                [boxes[..., 0] * self.cell_size - offset,
                 boxes[..., 1] * self.cell_size - offset_tran,
                 tf.sqrt(boxes[..., 2]),
                 tf.sqrt(boxes[..., 3])], axis=-1)
            # Classification loss: only cells containing an object contribute;
            # drives predicted class probabilities toward the one-hot target.
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name='class_loss') * self.class_scale
            # Object confidence loss: for the responsible predictor, push the
            # confidence toward its IoU with the ground truth.
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name='object_loss') * self.object_scale
            # No-object confidence loss: everywhere else, push the predicted
            # confidence toward zero.
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name='noobject_loss') * self.noobject_scale
            # Coordinate loss: only the responsible predictor regresses
            # toward the transformed ground-truth box.
            coord_mask = tf.expand_dims(object_mask, 4)
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name='coord_loss') * self.coord_scale
            # Register all four losses so tf.losses.get_total_loss() (plus
            # the slim weight regularizers) sums them.
            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)
            tf.summary.scalar('class_loss', class_loss)
            tf.summary.scalar('object_loss', object_loss)
            tf.summary.scalar('noobject_loss', noobject_loss)
            tf.summary.scalar('coord_loss', coord_loss)
            tf.summary.histogram('boxes_delta_x', boxes_delta[..., 0])
            tf.summary.histogram('boxes_delta_y', boxes_delta[..., 1])
            tf.summary.histogram('boxes_delta_w', boxes_delta[..., 2])
            tf.summary.histogram('boxes_delta_h', boxes_delta[..., 3])
            tf.summary.histogram('iou', iou_predict_truth)
def leaky_relu(alpha):
    """Create a leaky-ReLU activation callable with negative slope *alpha*.

    The returned function is suitable for slim's ``activation_fn`` argument.
    """
    def _activation(inputs):
        return tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')
    return _activation
# Corrections and suggestions are welcome — thanks!