Implementing the YOLO Object Detection Algorithm from Scratch with TensorFlow

I have always been interested in object detection algorithms, so this post documents my process of reproducing the YOLO paper with TensorFlow. All of the code here is my own.

I consider the YOLO paper a groundbreaking piece of research: by dividing the image into a grid of cells and predicting the objects in each cell, it greatly outpaces other object detection algorithms such as R-CNN and Faster R-CNN. The paper itself is not easy to follow, though, especially the part on computing the loss function. There are many walkthroughs of the YOLO paper online; in my opinion the one below explains it best, with many of the details spelled out clearly, and I recommend reading it:

https://zhuanlan.zhihu.com/p/37850811

Below is my complete process for implementing YOLO, divided into several parts.

1. Building the YOLO Model

The first step is to build the model following the network structure described in the paper.

The first 20 convolutional layers are pretrained on ImageNet data to extract image features; 4 more convolutional layers and 2 fully connected layers are then added on top for object detection. In my implementation, however, I made one change: I dropped the final two fully connected layers and replaced them with a single 3×3 convolutional layer with 30 filters, whose output is a 7×7×30 tensor. I made this change because training the model exactly as the paper describes converged poorly, and the two fully connected layers carry so many parameters that my GTX 1070 Ti 8G card could not handle the batch size of 64 mentioned in the paper, only a batch size of 24. The YOLOv2 paper likewise replaces the final fully connected layers with convolutions.
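This output shape follows the paper's S × S × (B·5 + C) layout: with S = 7 grid cells per side, B = 2 boxes per cell (each contributing x, y, w, h and a confidence) and C = 20 VOC classes, every cell predicts 2×5 + 20 = 30 values, which is exactly the 7×7×30 tensor.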

The yolonet_model.py code below builds the YOLO model. Note that when pretraining on ImageNet, the inference parameters pretrain_trainable and pretrain_training must be set to True and yolo_training to False. Once pretraining is done and object detection training begins, set pretrain_trainable and pretrain_training to False and yolo_training to True (see the usage sketch after the listing):

import tensorflow as tf

def _conv(name, inputs, kernel_size, in_channels, out_channels, stride, padding, trainable, bias_init, training):
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        kernel = tf.get_variable(shape=[kernel_size,kernel_size,in_channels,out_channels], initializer=tf.contrib.layers.variance_scaling_initializer(factor=2.0,mode='FAN_IN',uniform=False), trainable=trainable, name='weights')
        conv = tf.nn.conv2d(inputs, kernel, [1,stride,stride,1], padding=padding)
        biases = tf.get_variable(initializer=tf.constant(bias_init, shape=[out_channels], dtype=tf.float32), trainable=trainable, name='biases')
        bias = tf.nn.bias_add(conv, biases)
        output = tf.nn.leaky_relu(bias, alpha=0.1, name=name)
        output_bn = tf.layers.batch_normalization(output, axis=3, name='bn', trainable=trainable, training=training, reuse=tf.AUTO_REUSE)
        return output_bn

def inference(images, pretrain_trainable=True, wd=0.0005, pretrain_training=True, yolo_training=True):
    #NOTE: the shape comments assume the 224*224 ImageNet pretraining input;
    #with the 448*448 detection input every spatial size below is doubled
    conv1 = _conv('conv1', images, 7, 3, 64, 2, 'SAME', pretrain_trainable, 0.01, pretrain_training)       #112*112*64
    pool1 = tf.nn.max_pool(conv1, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool1')   #56*56*64
    conv2 = _conv('conv2', pool1, 3, 64, 192, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)      #56*56*192
    pool2 = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool2')   #28*28*192
    conv3 = _conv('conv3', pool2, 1, 192, 128, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)     #28*28*128
    conv4 = _conv('conv4', conv3, 3, 128, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)     #28*28*256
    conv5 = _conv('conv5', conv4, 1, 256, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)     #28*28*256
    conv6 = _conv('conv6', conv5, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)     #28*28*512
    pool6 = tf.nn.max_pool(conv6, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool6')   #14*14*512
    conv7 = _conv('conv7', pool6, 1, 512, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)     #14*14*256
    conv8 = _conv('conv8', conv7, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)     #14*14*512
    conv9 = _conv('conv9', conv8, 1, 512, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)     #14*14*256
    conv10 = _conv('conv10', conv9, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)   #14*14*512
    conv11 = _conv('conv11', conv10, 1, 512, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)  #14*14*256
    conv12 = _conv('conv12', conv11, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)  #14*14*512
    conv13 = _conv('conv13', conv12, 1, 512, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)  #14*14*256
    conv14 = _conv('conv14', conv13, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)  #14*14*512
    conv15 = _conv('conv15', conv14, 1, 512, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training)  #14*14*512
    conv16 = _conv('conv16', conv15, 3, 512, 1024, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*1024
    pool16 = tf.nn.max_pool(conv16, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool16')  #7*7*1024
    conv17 = _conv('conv17', pool16, 1, 1024, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #7*7*512
    conv18 = _conv('conv18', conv17, 3, 512, 1024, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #7*7*1024
    conv19 = _conv('conv19', conv18, 1, 1024, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #7*7*512
    conv20 = _conv('conv20', conv19, 3, 512, 1024, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #7*7*1024

    #For YOLO detection training, add the paper's 4 conv layers plus a final 3*3
    #conv layer with 30 filters in place of the paper's 2 fully connected layers.
    #Remember that the 20 pretrained conv layers above are set to non-trainable.
    if not pretrain_training:
        new_conv21 = _conv('new_conv21', conv20, 3, 1024, 1024, 1, 'SAME', True, 0.01, yolo_training) #14*14*1024
        new_conv22 = _conv('new_conv22', new_conv21, 3, 1024, 1024, 2, 'SAME', True, 0.01, yolo_training) #7*7*1024
        new_conv23 = _conv('new_conv23', new_conv22, 3, 1024, 1024, 1, 'SAME', True, 0.01, yolo_training) #7*7*1024
        new_conv24 = _conv('new_conv24', new_conv23, 3, 1024, 1024, 1, 'SAME', True, 0.01, yolo_training) #7*7*1024
        new_conv25 = _conv('new_conv25', new_conv24, 3, 1024, 30, 1, 'SAME', True, 0.01, yolo_training) #7*7*30
        return new_conv25
    #For Imagenet pretrain
    else:
        avg_layer = tf.reduce_mean(conv20, axis=[1,2], keepdims=True)    #1024
        flatten = tf.layers.flatten(inputs=avg_layer, name='flatten')
        with tf.variable_scope('local', reuse=tf.AUTO_REUSE):
            weights = tf.get_variable(initializer=tf.truncated_normal([1024,1000], dtype=tf.float32, stddev=1/(1000)), trainable=True, name='weights')
            weight_decay = tf.multiply(tf.nn.l2_loss(weights), wd, name='weight_loss')
            tf.add_to_collection('losses', weight_decay)
            biases = tf.get_variable(initializer=tf.constant(1.0, shape=[1000], dtype=tf.float32), trainable=True, name='biases')
            local = tf.nn.xw_plus_b(flatten, weights, biases, name='local')
        return local
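As a quick illustration of the two modes, here is a minimal sketch of how inference would be called in the pretraining script versus the detection script (the placeholder shapes are my assumptions: 224×224 input for ImageNet pretraining, 448×448 for detection):

import tensorflow as tf
import yolonet_model

#ImageNet pretraining: backbone trainable, BN in training mode, YOLO head unused
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
logits = yolonet_model.inference(images, pretrain_trainable=True,
                                 pretrain_training=True, yolo_training=False)

#Detection training (after restoring the pretrained weights):
#backbone frozen, only the new conv layers are trainable
images_detect = tf.placeholder(tf.float32, [None, 448, 448, 3])
predictions = yolonet_model.inference(images_detect, pretrain_trainable=False,
                                      pretrain_training=False, yolo_training=True)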

2. Pretraining on the 1000-Class ImageNet Data

The training procedure is similar to what I described in an earlier blog post, so I won't repeat it here.

3. Preparing the Object Detection Training Data

The paper trains on the PASCAL VOC 2012 train/val data plus the VOC 2007 train/val/test data, so I use the same data here; it can all be downloaded from the PASCAL website. After downloading you will see that the JPEGImages directory holds the images and the Annotations directory holds each image's object bounding box (BBOX) data, so the first step is to extract the BBOXes from the annotations. A trimmed annotation file is shown below, followed by the code that extracts the boxes and saves them to a CSV file:
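For reference, a VOC annotation XML has the following structure (the filename and coordinate values here are purely illustrative):

<annotation>
    <filename>2007_000027.jpg</filename>
    <size>
        <width>486</width>
        <height>500</height>
        <depth>3</depth>
    </size>
    <object>
        <name>person</name>
        <bndbox>
            <xmin>174</xmin>
            <ymin>101</ymin>
            <xmax>349</xmax>
            <ymax>351</ymax>
        </bndbox>
    </object>
</annotation>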

import xml.etree.ElementTree as ET
import os
 
xmlRootDir_train = 'VOC2012_train/Annotations/'
files_train = os.listdir(xmlRootDir_train)
 
def parseXML(filename):
    bbox = [[],[],[],[],[]]
    tree = ET.parse(filename)
    root = tree.getroot()
    size = root.find('size')
    width = float(size.find('width').text)
    height = float(size.find('height').text)
    for node in root.iter("object"):
        bndbox = node.find('bndbox')
        classname = node.find('name').text
        try:
            xmin = int(bndbox.find('xmin').text)
            ymin = int(bndbox.find('ymin').text)
            xmax = int(bndbox.find('xmax').text)
            ymax = int(bndbox.find('ymax').text)
            bbox[0].append(classname)
            bbox[1].append(xmin)
            bbox[2].append(ymin)
            bbox[3].append(xmax)
            bbox[4].append(ymax)
        except:
            #Some annotations carry malformed coordinate values; log the file and skip the box
            print(filename)
    return bbox
 
bboxfile = open('bbox_train.csv', 'w')
content = ''
for xmlfile in files_train:
    bbox = parseXML(xmlRootDir_train+xmlfile)
    content += xmlfile
    for j in range(5):
        content += ','+';'.join([str(x) for x in bbox[j]])
    content += '\n'

bboxfile.writelines(content)
bboxfile.close()
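Each row of the resulting bbox_train.csv holds the annotation filename, the class names joined by ';', and then the xmin, ymin, xmax and ymax lists, each joined by ';'. An illustrative row (the values are made up):

2007_000027.xml,person;dog,174;23,101;8,349;240,351;220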

Once the BBOX data has been extracted, we can store the images together with their BBOXes in TFRECORD files for convenient training later. When generating the training data I resize every image to 538×538 pixels: during training a random 448×448 crop is taken, so the images are enlarged by 20% (448 × 1.2 ≈ 538, leaving a 90-pixel range for the random crop offset, which matches the image_width_delta of 90 used in the training code). The code below saves the images and BBOXes into multiple TFRECORD files; the generated files contain 26,332 images in total.

#-*- encoding: utf-8 -*-
import tensorflow as tf
import cv2
import numpy as np
import os
from multiprocessing import Process, Queue
import sys
import time
import random
import math
 
max_num = 1400  #max record number in one file
train_path = 'VOC2012_train/JPEGImages/'  #the folder that stores the train images
cores = 10   #number of CPU cores to process

resize_width = 538   #448*1.2
resize_height = 538  #448*1.2

grids = 7
 
#The VOC2012_train images fall into 20 classes; build a dict mapping class name (key) to 0-19 (value)
labels_dict = {'person':0, 'bird':1, 'cat':2, 'cow':3, 'dog':4, 'horse':5, 'sheep':6, 'aeroplane':7, 'bicycle':8,
               'boat':9, 'bus':10, 'car':11, 'motorbike':12, 'train':13, 'bottle':14, 'chair':15, 'diningtable':16,
               'pottedplant':17, 'sofa':18, 'tvmonitor':19}

#Read the bbox CSV file
bbox_list = {}
with open('bbox_train.csv', 'r') as bboxfile:
    records = bboxfile.readlines()
    for record in records:
        fields = record.strip().split(',')
        filename = fields[0][:-4]
        labels = [labels_dict[x] for x in fields[1].split(';')]
        xmin = [float(x) for x in fields[2].split(';')]
        ymin = [float(x) for x in fields[3].split(';')]
        xmax = [float(x) for x in fields[4].split(';')]
        ymax = [float(x) for x in fields[5].split(';')]
        bbox_list[filename] = [labels, xmin, ymin, xmax, ymax] 
files = bbox_list.keys()
        
#Build the training file list; each element is the path plus the image filename
train_images_filenames = os.listdir(train_path)
train_images = []
for image_file in train_images_filenames:
    if image_file[:-4] in files:
        train_images.append(train_path+','+image_file)
random.shuffle(train_images)

#Convert the image data and labels into the TFRECORD format
def make_example(image, height, width, label, bbox, filename):
    colorspace = b'RGB'
    channels = 3
    img_format = b'JPEG'
    return tf.train.Example(features=tf.train.Features(feature={
        'image' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),
        'height' : tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
        'width' : tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
        'channels' : tf.train.Feature(int64_list=tf.train.Int64List(value=[channels])),
        'colorspace' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[colorspace])),
        'img_format' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_format])),
        'label' : tf.train.Feature(int64_list=tf.train.Int64List(value=label)),
        'bbox_xmin' : tf.train.Feature(int64_list=tf.train.Int64List(value=bbox[0])),
        'bbox_xmax' : tf.train.Feature(int64_list=tf.train.Int64List(value=bbox[2])),
        'bbox_ymin' : tf.train.Feature(int64_list=tf.train.Int64List(value=bbox[1])),
        'bbox_ymax' : tf.train.Feature(int64_list=tf.train.Int64List(value=bbox[3])),
        'filename': tf.train.Feature(bytes_list=tf.train.BytesList(value=[filename]))
    }))
 
#This function generates the TFRECORD files. The first argument is the list of image filenames,
#the second the output folder, the third the starting file index, and the fourth the queue used
#to send progress messages to the parent process.
def gen_tfrecord(trainrecords, targetfolder, startnum, queue):
    tfrecords_file_num = startnum
    file_num = 0
    total_num = len(trainrecords)
    pid = os.getpid()
    queue.put((pid, file_num))
    writer = tf.python_io.TFRecordWriter(targetfolder+"train_"+str(tfrecords_file_num)+".tfrecord")
    for record in trainrecords:
        file_num += 1
        fields = record.split(',')
        img_raw = cv2.imread(fields[0]+fields[1])
        height, width, _ = img_raw.shape
        img = cv2.resize(img_raw, (resize_width, resize_height))
        height_ratio = resize_height/height
        width_ratio = resize_width/width
        img_jpg = cv2.imencode('.jpg', img)[1].tobytes()
        bbox = bbox_list[fields[1][:-4]]
        bbox[1] = [int(item*width_ratio) for item in bbox[1]]   #xmin
        bbox[3] = [int(item*width_ratio) for item in bbox[3]]   #xmax
        bbox[2] = [int(item*height_ratio) for item in bbox[2]]  #ymin
        bbox[4] = [int(item*height_ratio) for item in bbox[4]]  #ymax
        label = bbox[0]
        ex = make_example(img_jpg, resize_height, resize_width, label, bbox[1:], fields[1].encode())
        writer.write(ex.SerializeToString())
        #Every 100 records, send a message to the parent process to report progress
        if file_num%100==0:
            queue.put((pid, file_num))
        if file_num%max_num==0:
            writer.close()
            tfrecords_file_num += 1
            writer = tf.python_io.TFRecordWriter(targetfolder+"train_"+str(tfrecords_file_num)+".tfrecord")
    writer.close()        
    queue.put((pid, file_num))
 
#This function generates the TFRECORD files with multiple processes. The first argument is the
#list of image filenames to process, the second the number of CPU cores to use, and the third
#the output folder.
def process_in_queues(fileslist, cores, targetfolder):
    total_files_num = len(fileslist)
    each_process_files_num = int(total_files_num/cores)
    files_for_process_list = []
    for i in range(cores-1):
        files_for_process_list.append(fileslist[i*each_process_files_num:(i+1)*each_process_files_num])
    files_for_process_list.append(fileslist[(cores-1)*each_process_files_num:])
    files_number_list = [len(l) for l in files_for_process_list]
    
    each_process_tffiles_num = math.ceil(each_process_files_num/max_num)
    
    queues_list = []
    processes_list = []
    for i in range(cores):
        queues_list.append(Queue())
        processes_list.append(Process(target=gen_tfrecord, 
                                      args=(files_for_process_list[i],targetfolder,
                                      each_process_tffiles_num*i+1,queues_list[i],)))
 
    for p in processes_list:
        p.start()
 
    #The parent process polls the queues for messages and refreshes the progress display every 0.5 seconds
    while(True):
        try:
            total = 0
            progress_str=''
            for i in range(cores):
                msg=queues_list[i].get()
                total += msg[1]
                progress_str+='PID'+str(msg[0])+':'+str(msg[1])+'/'+ str(files_number_list[i])+'|'
            progress_str+='\r'
            print(progress_str, end='')
            if total == total_files_num:
                for p in processes_list:
                    p.terminate()
                    p.join()
                break
            time.sleep(0.5)
        except:
            break
    return total
 
if __name__ == '__main__':
    print('Start processing train data using %i CPU cores:'%cores)
    starttime=time.time()
    total_processed = process_in_queues(train_images, cores, targetfolder='train_tf/')
    endtime=time.time()
    print('\nProcess finish, total process %i images in %i seconds'%(total_processed, int(endtime-starttime)), end='')
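Before moving on it is worth sanity-checking the generated files. Here is a minimal read-back sketch (assuming TensorFlow 1.x; train_tf/train_1.tfrecord is the first file the script produces):

import tensorflow as tf

#Print the stored metadata of the first few records
for i, record in enumerate(tf.python_io.tf_record_iterator('train_tf/train_1.tfrecord')):
    example = tf.train.Example()
    example.ParseFromString(record)
    feat = example.features.feature
    print(feat['filename'].bytes_list.value[0].decode(),
          feat['height'].int64_list.value[0],
          feat['width'].int64_list.value[0],
          list(feat['label'].int64_list.value),
          list(feat['bbox_xmin'].int64_list.value))
    if i >= 2:
        break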

4. Training the Object Detector

Now the actual training can begin. The code is as follows:

import tensorflow as tf
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
from tensorflow import contrib
autograph = contrib.autograph
import yolonet_model
import time

image_width = 448
image_height = 448
image_width_delta = 90   #538-448, the random crop offset range
image_height_delta = 90  #538-448, the random crop offset range
batch_size = 64
valid_batch_size = 1
epoch_size = 26332
grids=7
lambda_noobj = 0.5
lambda_obj = 5.0
grid_width = image_width//grids
grid_height = image_height//grids
labels = ['person','bird','cat','cow','dog','horse','sheep','aeroplane','bicycle',
          'boat','bus','car','motorbike','train','bottle','chair','diningtable',
          'pottedplant','sofa','tvmonitor']

#Define the function to parse the TFRECORD
def _parse_function(example_proto):
    features = {"image": tf.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.VarLenFeature(tf.int64),
                "bbox_xmin": tf.VarLenFeature(tf.int64),
                "bbox_xmax": tf.VarLenFeature(tf.int64),
                "bbox_ymin": tf.VarLenFeature(tf.int64),
                "bbox_ymax": tf.VarLenFeature(tf.int64),
                "filename": tf.FixedLenFeature([], tf.string, default_value="")
               }
    parsed_features = tf.parse_single_example(example_proto, features)
    
    label = tf.expand_dims(parsed_features["label"].values, 0)
    label = tf.cast(label, tf.float32)
    height = parsed_features["height"]
    width = parsed_features["width"]
    channels = parsed_features["channels"]

    #Generate the random crop offset
    random_width_start = tf.random.uniform([1], minval=0, maxval=image_width_delta, dtype=tf.dtypes.int64)
    random_height_start = tf.random.uniform([1], minval=0, maxval=image_height_delta, dtype=tf.dtypes.int64)
    random_start = tf.concat([random_height_start, random_width_start, tf.constant([0], dtype=tf.dtypes.int64)], axis=0)
    
    #Adjust the bbox coordinates with random crop offset
    def f1():
        xmin = tf.expand_dims(parsed_features["bbox_xmin"].values, 0)
        xmax = tf.expand_dims(parsed_features["bbox_xmax"].values, 0)
        ymin = tf.expand_dims(parsed_features["bbox_ymin"].values, 0)
        ymax = tf.expand_dims(parsed_features["bbox_ymax"].values, 0)
        xmin = xmin - random_width_start
        xmin = tf.clip_by_value(xmin, 0, image_width)
        xmax = xmax - random_width_start
        xmax = tf.clip_by_value(xmax, 0, image_width)
        ymin = ymin - random_height_start
        ymin = tf.clip_by_value(ymin, 0, image_height)
        ymax = ymax - random_height_start
        ymax = tf.clip_by_value(ymax, 0, image_height)
        return xmin, xmax, ymin, ymax
    #Adjust the bbox coordinates with image flipped and random crop offset
    def f2():
        xmin = tf.expand_dims(parsed_features["bbox_xmin"].values, 0)
        xmax = tf.expand_dims(parsed_features["bbox_xmax"].values, 0)
        ymin = tf.expand_dims(parsed_features["bbox_ymin"].values, 0)
        ymax = tf.expand_dims(parsed_features["bbox_ymax"].values, 0)
        xmin_temp = xmin - random_width_start
        xmax_temp = xmax - random_width_start
        xmin = image_width - tf.clip_by_value(xmax_temp, 0, image_width)
        xmax = image_width - tf.clip_by_value(xmin_temp, 0, image_width)
        ymin = ymin - random_height_start
        ymin = tf.clip_by_value(ymin, 0, image_height)
        ymax = ymax - random_height_start
        ymax = tf.clip_by_value(ymax, 0, image_height)
        return xmin, xmax, ymin, ymax
    
    #Generate the random flip flag
    random_flip = tf.random.uniform([1], minval=0, maxval=1, dtype=tf.dtypes.float32)
    #Get the random flip and crop image coordinates
    xmin, xmax, ymin, ymax = tf.cond(tf.less(random_flip[0], 0.5), f1, f2)
    image_raw = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    image_sliced = tf.slice(image_raw, random_start, [image_height, image_width, -1])
    image_decoded = tf.image.convert_image_dtype(image_sliced, tf.float32)
    image_flipped = tf.cond(tf.less(random_flip[0], 0.5), lambda:image_decoded, lambda:tf.image.flip_left_right(image_decoded))
    image_train = tf.image.per_image_standardization(image_flipped)
    
    #Calculate the boxes center point
    box_center_x = xmin+(xmax-xmin)//2
    box_center_y = ymin+(ymax-ymin)//2
    #Calculate the boxes relate to which grid
    grid_id = (tf.ceil(box_center_y/grid_height)-1)*grids + tf.ceil(box_center_x/grid_width) - 1
    grid_id = tf.cast(grid_id, tf.float32)
    #Calculate and normalize the bbox center and width by grids
    center_x_percent = box_center_x%grid_width/grid_width
    center_x_percent = tf.cast(center_x_percent, tf.float32)
    center_y_percent = box_center_y%grid_height/grid_height
    center_y_percent = tf.cast(center_y_percent, tf.float32)
    box_width = (xmax-xmin)/image_width
    box_width = tf.cast(box_width, tf.float32)
    box_height = (ymax-ymin)/image_height
    box_height = tf.cast(box_height, tf.float32)
    #Generate the new bbox vector for label
    bbox = tf.concat(axis=0, values=[grid_id, center_x_percent, center_y_percent, box_width, box_height, label])
    bbox = tf.transpose(bbox, [1, 0])

    return image_train, parsed_features["filename"], image_raw, bbox
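#A worked example of the grid assignment above (illustrative numbers): with
#grids=7 and grid_width=grid_height=64, a box center at (x=250, y=130) falls in
#column ceil(250/64)-1 = 3 and row ceil(130/64)-1 = 2, so grid_id = 2*7+3 = 17,
#and center_x_percent = 250%64/64 = 0.90625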

#Construct the train dataset
with tf.device('/cpu:0'):
    train_files = tf.data.Dataset.list_files("train_tf/*.tfrecord")
    dataset_train = train_files.interleave(tf.data.TFRecordDataset, cycle_length=4, num_parallel_calls=4)
    dataset_train = dataset_train.shuffle(buffer_size=epoch_size)
    dataset_train = dataset_train.repeat(100)
    dataset_train = dataset_train.map(_parse_function, num_parallel_calls=12)
    dataset_train = dataset_train.padded_batch(batch_size, \
                                               padded_shapes=([None,None,None], [], \
                                                              [None,None,None], [None,None]))
    dataset_train = dataset_train.prefetch(batch_size)
    iterator = tf.data.Iterator.from_structure(dataset_train.output_types, dataset_train.output_shapes)
    image_train, filename, image_decoded, bbox = iterator.get_next()
    train_init_op = iterator.make_initializer(dataset_train)
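    #Note: padded_batch zero-pads each image's bbox list to the longest one in
    #the batch; the training loop below skips these all-zero padding rows
    #(their width or height is 0)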


#Calculate the IOU; merged is two rect vectors concatenated.
#Each rect vector has 4 elements: xmin, xmax, ymin and ymax.
def calculate_IOU(merged):
    rect1 = merged[:4]
    rect2 = merged[4:8]
    IOU = 0.0
    IOU_area = 0.0
    xmin=0.0
    xmax=0.0
    ymin=0.0
    ymax= 0.0
    rect1_area=0.0
    rect2_area= 0.0
    if (rect1[0]>=rect2[1] or rect2[0]>=rect1[1] or rect1[2]>=rect2[3] or rect2[2]>=rect1[3]):
        IOU = 0.0
    else:
        xmin=tf.maximum(rect1[0], rect2[0])
        xmax=tf.minimum(rect1[1], rect2[1])
        ymin=tf.maximum(rect1[2], rect2[2])
        ymax=tf.minimum(rect1[3], rect2[3])
        IOU_area=(xmax-xmin)*(ymax-ymin)
        rect1_area=(rect1[1]-rect1[0])*(rect1[3]-rect1[2])
        rect2_area=(rect2[1]-rect2[0])*(rect2[3]-rect2[2])
        IOU=IOU_area / (rect1_area+rect2_area-IOU_area)
    return IOU
tf_calculate_IOU = autograph.to_graph(calculate_IOU)

#Calculate the loss. Based on the YOLO V1 paper
#The pred is the prediction vector with shape [batchsize*7*7,30].
#The label is the vector with shape [batchsize*7*7,26].
def loss_func(pred, label):
    #Divide the pred and label vectors by the mask for with object or non object.
    mask_obj = label[:,1]>0.0
    mask_noobj = label[:,1]<1.0
    pred_obj = tf.boolean_mask(pred, mask_obj)
    label_obj = tf.boolean_mask(label, mask_obj)
    pred_noobj = tf.boolean_mask(pred, mask_noobj)
    label_noobj = tf.boolean_mask(label, mask_noobj)
    #Calculate the no obj prediction error
    loss_noobj = lambda_noobj*tf.reduce_sum(tf.square(pred_noobj[:,0])+tf.square(pred_noobj[:,5]))  #weighted by lambda_noobj=0.5 as in the paper
    loss_classes = tf.reduce_sum(tf.square(pred_obj[:,10:]-label_obj[:,6:]))
    #Calculate the prediction box coordinates
    center_x1 = pred_obj[:,1:2]*grid_width
    center_y1 = pred_obj[:,2:3]*grid_height
    xmin_1 = center_x1 - pred_obj[:,3:4]**2*image_width//2
    xmax_1 = center_x1 + pred_obj[:,3:4]**2*image_width//2
    ymin_1 = center_y1 - pred_obj[:,4:5]**2*image_height//2
    ymax_1 = center_y1 + pred_obj[:,4:5]**2*image_height//2
    center_x2 = pred_obj[:,6:7]*grid_width
    center_y2 = pred_obj[:,7:8]*grid_height
    xmin_2 = center_x2 - pred_obj[:,8:9]**2*image_width//2
    xmax_2 = center_x2 + pred_obj[:,8:9]**2*image_width//2
    ymin_2 = center_y2 - pred_obj[:,9:10]**2*image_height//2
    ymax_2 = center_y2 + pred_obj[:,9:10]**2*image_height//2
    #Calculate the label box coordinates
    center_x = label_obj[:,2:3]*grid_width
    center_y = label_obj[:,3:4]*grid_height
    xmin = center_x - label_obj[:,4:5]*image_width//2
    xmax = center_x + label_obj[:,4:5]*image_width//2
    ymin = center_y - label_obj[:,5:6]*image_height//2
    ymax = center_y + label_obj[:,5:6]*image_height//2
    #Concat the prediction box and ground truth box and calculate the IOU
    merged1 = tf.concat([xmin_1,xmax_1,ymin_1,ymax_1,xmin,xmax,ymin,ymax], -1)
    merged2 = tf.concat([xmin_2,xmax_2,ymin_2,ymax_2,xmin,xmax,ymin,ymax], -1)
    IOU1 = tf.map_fn(tf_calculate_IOU, merged1)
    IOU2 = tf.map_fn(tf_calculate_IOU, merged2)
    #Select the higher IOU prediction box for coordination loss calculation
    IOU1_mask = tf.math.greater_equal(IOU1,IOU2)
    IOU2_mask = tf.math.greater(IOU2,IOU1)
    coord_IOU1 = tf.boolean_mask(pred_obj[:,:5], IOU1_mask)
    label_IOU1 = tf.boolean_mask(label_obj[:,2:6], IOU1_mask)
    coord_IOU2 = tf.boolean_mask(pred_obj[:,5:10], IOU2_mask)
    label_IOU2 = tf.boolean_mask(label_obj[:,2:6], IOU2_mask)
    loss_coord = lambda_obj * (tf.reduce_sum( \
                 tf.square(coord_IOU1[:,1]-label_IOU1[:,0]) + \
                 tf.square(coord_IOU1[:,2]-label_IOU1[:,1]) + \
                 tf.square(coord_IOU1[:,3]-tf.sqrt(label_IOU1[:,2])) + \
                 tf.square(coord_IOU1[:,4]-tf.sqrt(label_IOU1[:,3])))+ \
                 tf.reduce_sum( \
                 tf.square(coord_IOU2[:,1]-label_IOU2[:,0]) + \
                 tf.square(coord_IOU2[:,2]-label_IOU2[:,1]) + \
                 tf.square(coord_IOU2[:,3]-tf.sqrt(label_IOU2[:,2])) + \
                 tf.square(coord_IOU2[:,4]-tf.sqrt(label_IOU2[:,3]))))
    #Calculate the confidence for these two prediction boxes
    loss_confidence = tf.reduce_sum(tf.square(pred_obj[:,0]-IOU1)+tf.square(pred_obj[:,5]-IOU2))
    #Sum up all the loss parts
    loss = (loss_noobj+loss_classes+loss_coord+loss_confidence)/batch_size
    return loss
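#For reference, loss_func above follows the YOLO v1 paper's loss:
#  lambda_coord*sum_obj[(x-x')^2+(y-y')^2+(sqrt(w)-sqrt(w'))^2+(sqrt(h)-sqrt(h'))^2]
#  + lambda_noobj*sum_noobj[C^2] + confidence and class squared errors,
#with the responsible predictor chosen by the higher IOU. Two details to note:
#the network predicts sqrt(w) and sqrt(h) directly (hence the tf.sqrt on the
#label values), and both predictors' confidences are regressed toward their IOUs.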

image_train_batch = tf.placeholder(shape=[None, image_height, image_width, 3], dtype=tf.float32)
grids_vector_batch = tf.placeholder(shape=[None, grids*grids, 26], dtype=tf.float32)
result = yolonet_model.inference(image_train_batch, pretrain_trainable=False, wd=0.0005, pretrain_training=False, yolo_training=True)
result = tf.reshape(result, [-1,30])
grids_vector = tf.reshape(grids_vector_batch, [-1,26])
mse_loss = loss_func(result, grids_vector)
tf.add_to_collection('losses', mse_loss)
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

global_step = tf.Variable(0, trainable=False)
epoch_steps = int(epoch_size/batch_size)
boundaries = [epoch_steps*5,epoch_steps*55,epoch_steps*75]
values = [0.001, 0.01, 0.001, 0.0001]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimize_op = optimizer.minimize(loss, global_step=global_step)

#Load the pretrain Imagenet weights
#For the first-time training, uncomment the code below.
'''
variables_list = []
for var in tf.global_variables():
    #Vars with "new" in the name belong to the object detection head and are not part of the Imagenet pretrain
    if 'new' in var.name or 'Variable' in var.name:
        continue
    else:
        variables_list.append(var)
saver=tf.train.Saver(variables_list)
'''
saver_yolo=tf.train.Saver()

with tf.Session() as sess:
    #For the first-time training, uncomment the code below.
    '''
    sess.run(tf.global_variables_initializer())
    #Load the pretrained Imagenet weights
    saver.restore(sess, "/home/roy/AI/model_bn_loss/model.ckpt-105000")   
    sess.run(global_step.initializer)
    '''
    #Comment out the line below for the first-time training
    saver_yolo.restore(sess, "model_yolo/model.ckpt-30000")   
    sess.run([train_init_op])
    total_loss = 0.0 
    starttime = time.time()
    while(True):
        try:
            images_run, bbox_run = sess.run([image_train, bbox])
            #Construct the grids_vector based on bbox_run
            batch_num, box_num, _ = bbox_run.shape
            grids_vector_list = []
            for i in range(batch_num):
                vector = np.zeros([grids*grids, 26], dtype=float)
                for j in range(box_num):
                    if bbox_run[i][j][3]==0.0 or bbox_run[i][j][4]==0.0:
                        continue
                    else:
                        grid_id = int(bbox_run[i][j][0])
                        vector[grid_id][0] = grid_id
                        vector[grid_id][1] = 1.0
                        vector[grid_id][2] = bbox_run[i][j][1]
                        vector[grid_id][3] = bbox_run[i][j][2]
                        vector[grid_id][4] = bbox_run[i][j][3]
                        vector[grid_id][5] = bbox_run[i][j][4]
                        label = int(bbox_run[i][j][5])
                        vector[grid_id][label+6] = 1.0
                grids_vector_list.append(vector)
            grids_vector_run = np.stack(grids_vector_list)
            
            loss_a, step, lr, _ = sess.run([loss, global_step, learning_rate, optimize_op], feed_dict={image_train_batch:images_run, grids_vector_batch:grids_vector_run})
            total_loss += loss_a
        
            if step%100==0:
                print("step: %i, Learning_rate:%f, Time: %is Loss: %f" \
                      %(step, lr, int(time.time()-starttime), total_loss/100))
                total_loss = 0.0
                starttime = time.time()
    
            if step%2000==0:
                save_path = saver_yolo.save(sess, "model_yolo/model.ckpt", global_step=global_step)
        except tf.errors.OutOfRangeError:
            break  

5. Validating the Model

Once training is complete, we can check how well the model performs. The code below loads the trained model and runs detection on an image. At detection time only BBOXes whose class-specific confidence score (box confidence × class probability) exceeds 0.1 are kept, and non-maximum suppression is then applied to drop BBOXes whose IOU with a higher-confidence box exceeds 0.2:

import tensorflow as tf
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import yolonet_model

image_width = 448
image_height = 448
batch_size = 1
grids=7
grid_width = image_width//grids
grid_height = image_height//grids
labels = ['person','bird','cat','cow','dog','horse','sheep','aeroplane','bicycle',
          'boat','bus','car','motorbike','train','bottle','chair','diningtable',
          'pottedplant','sofa','tvmonitor']
category_colors = {'person':(33,36,41), 'bird':(105,128,112), 'cat':(201,230,252), 
                   'cow':(31,102,156), 'dog':(240,32,160), 'horse':(214,112,118),
                   'sheep':(18,153,255), 'aeroplane':(0,215,255), 'bicycle':(3,97,255), 
                   'boat':(42,42,128), 'bus':(143,143,188), 'car':(18,38,94), 
                   'motorbike':(230,224,176), 'train':(205,90,106), 'bottle':(15,94,56), 
                   'chair':(84,46,8), 'diningtable':(64,145,61), 'pottedplant':(50,205,50), 
                   'sofa':(80,127,255), 'tvmonitor':(31,23,176)}


def _parse_img(imgname):
    image_raw = tf.gfile.FastGFile(imgname,'rb').read()
    img = tf.image.decode_jpeg(image_raw)
    image_decoded = tf.image.convert_image_dtype(img, tf.float32)
    img_resized = tf.image.resize(image_decoded, [image_height, image_width])
    img_processed = tf.image.per_image_standardization(img_resized)
    img_processed = tf.expand_dims(img_processed, 0)
    img_resized = tf.expand_dims(img_resized, 0)
    img_original = tf.expand_dims(img, 0)
    return img_processed, img_original

#Calculate the IOU of two rect. 
#Each rect is a list of [(xmin, ymin), (xmax, ymax)]
def IOU_calculate(rect1, rect2):
    if (rect1[0][0]>=rect2[1][0] or rect2[0][0]>=rect1[1][0] or rect1[0][1]>=rect2[1][1] or rect2[0][1]>=rect1[1][1]):
        IOU = 0.0
    else:
        xmin=max(rect1[0][0], rect2[0][0])
        xmax=min(rect1[1][0], rect2[1][0])
        ymin=max(rect1[0][1], rect2[0][1])
        ymax=min(rect1[1][1], rect2[1][1])
        IOU_area=(xmax-xmin)*(ymax-ymin)
        rect1_area=(rect1[1][0]-rect1[0][0])*(rect1[1][1]-rect1[0][1])
        rect2_area=(rect2[1][0]-rect2[0][0])*(rect2[1][1]-rect2[0][1])
        IOU=IOU_area / (rect1_area+rect2_area-IOU_area)
        if IOU<0.0:
            IOU = 0.0
        if IOU>1.0:
            IOU = 1.0
    return IOU
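#A quick sanity check (illustrative numbers): two 100x100 boxes offset by 50px
#overlap in a 50x50 region, so IOU = 2500/(10000+10000-2500) ~= 0.143
#print(IOU_calculate([(0,0),(100,100)], [(50,50),(150,150)]))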

test_image = tf.placeholder(tf.float32, [1, image_height,image_width,3])
result = yolonet_model.inference(test_image, pretrain_trainable=False, wd=0.0005, pretrain_training=False, yolo_training=False)
result = tf.reshape(result, [49,30])
result=tf.clip_by_value(result, 0.0, 1.0)

grid_index = tf.reshape(tf.constant([i for i in range(49)]), [-1,1])
grid_index = tf.cast(grid_index, tf.float32)
grid_infer = tf.concat([grid_index, result], axis=1)
grid_class_index = tf.reshape(tf.argmax(grid_infer[:,11:], axis=1)+11, [-1,1])
grid_class_probability = tf.batch_gather(grid_infer, grid_class_index)
mask1=tf.math.greater_equal(grid_infer[:,1:2]*grid_class_probability, 0.1)  #class-specific score of box 1 (confidence*class probability) > 0.1
mask2=tf.math.greater_equal(grid_infer[:,6:7]*grid_class_probability, 0.1)  #class-specific score of box 2 > 0.1
mask=tf.reshape(mask1|mask2, [-1])
grid_object = tf.boolean_mask(grid_infer, mask)   #Only keep the grids whose score passes the 0.1 threshold
grid_object_class = tf.argmax(grid_object[:,11:], axis=1)   #Get the predict class

saver_yolo=tf.train.Saver()
img_processed, img = _parse_img("/home/roy/AI/darknet/darknet/data/dog.jpg")
with tf.Session() as sess:
    saver_yolo.restore(sess, "model_yolo/model.ckpt-50000")
    img1, image_run = sess.run([img_processed, img])
    grid_object_run,grid_object_class_run,grid_infer_run,m1,m2,m,g_index_t,g_index_class_t = sess.run([grid_object,grid_object_class,grid_infer,mask1,mask2,mask,grid_class_index,grid_class_probability], feed_dict={test_image:img1})

original_height, original_width, _ = image_run[0].shape
height_ratio = original_height/image_height
width_ratio = original_width/image_width
box = []
category = []
confidence = []
probabilty = []
box_by_category = {}
for i in range(len(grid_object_run)):
    coord_start = 2 

    if grid_object_run[i][1]<grid_object_run[i][6]:
        coord_start = 7

    center_x = grid_object_run[i][coord_start]*grid_width+grid_object_run[i][0]%grids*grid_width
    center_y = grid_object_run[i][coord_start+1]*grid_height+grid_object_run[i][0]//grids*grid_height
    xmin = int(max(int(center_x - grid_object_run[i][coord_start+2]**2*image_width//2), 0)*width_ratio)
    xmax = int((center_x + grid_object_run[i][coord_start+2]**2*image_width//2)*width_ratio)
    ymin = int(max(int(center_y - grid_object_run[i][coord_start+3]**2*image_height//2), 0)*height_ratio)
    ymax = int((center_y + grid_object_run[i][coord_start+3]**2*image_height//2)*height_ratio)
    box.append([(xmin,ymin),(xmax,ymax)])

    category.append(labels[grid_object_class_run[i]])
    confidence.append(grid_object_run[i][coord_start-1]*grid_object_run[i][grid_object_class_run[i]+11])
    probabilty.append(grid_object_run[i][grid_object_class_run[i]+11])
   
for i in range(len(category)):
    if category[i] in box_by_category:
        box_by_category[category[i]].append((box[i],confidence[i],probabilty[i]))
    else:
        box_by_category[category[i]]=[(box[i],confidence[i],probabilty[i])]

image_copy = image_run.copy()

#Draw all the predicted bbox that confidence>0.1
fontFace = cv2.FONT_HERSHEY_COMPLEX 
fontScale = 0.6 
thickness = 1 
lineType = 4
for i in range(len(box)):
    cv2.rectangle(image_run[0], box[i][0], box[i][1], category_colors[category[i]], 1)
    cv2.putText(image_run[0], category[i], (box[i][0][0], box[i][0][1]-8), \
                fontFace, fontScale, category_colors[category[i]], thickness, lineType)
plt.imshow(image_run[0])

#Use Non Max Suppression to remove some bbox
nmx_threshold=0.2
box_by_category_nmx = {}
for key in box_by_category:
    nmx_box = sorted(box_by_category[key], key=lambda x:x[1], reverse=True)
    start_index = 0
    dropped_index = []
    while True:
        if start_index >= len(nmx_box):
            break
        if start_index in dropped_index:
            start_index += 1
            continue
        rect1 = nmx_box[start_index][0]  #The rect include [(xmin,ymin), (xmax,ymax)]
        for i in range((start_index+1),len(nmx_box)):
            if i not in dropped_index:
                rect2 = nmx_box[i][0]
                iou = IOU_calculate(rect1, rect2)
                if iou > nmx_threshold:
                    dropped_index.append(i)
        start_index += 1
    box_by_category_nmx[key]=[]
    for i in range(len(nmx_box)):
        if i not in dropped_index:
            box_by_category_nmx[key].append(nmx_box[i])

#Draw the bbox using Non max suppression
for key in box_by_category_nmx:
    for i in range(len(box_by_category_nmx[key])):
        cv2.rectangle(image_copy[0], box_by_category_nmx[key][i][0][0], box_by_category_nmx[key][i][0][1], category_colors[key],2)
        probability = str(round(box_by_category_nmx[key][i][2]*100, 2))+"%"
        cv2.putText(image_copy[0], key+" "+probability, \
                    (box_by_category_nmx[key][i][0][0][0], box_by_category_nmx[key][i][0][0][1]-8), \
                    fontFace, fontScale, category_colors[key], thickness, lineType)

#Convert from RGB (TF decode order) to BGR for cv2.imwrite and save the picture
cv_image = cv2.cvtColor(image_copy[0], cv2.COLOR_RGB2BGR)
cv2.imwrite("predict_image/dog_yolo_50000.jpg", cv_image)

6. Detection Results

Below are detection results on a few images. The model does quite well, roughly on par with the results in the YOLO paper (except that the white horse in the last image was detected as SHEEP; my guess is that most of the sheep in the pretraining images were white, so a white horse is easy to confuse with them ^_^).

Source: blog.csdn.net/gzroy/article/details/88095702