Parameter-tuning experience from training a deep convolutional network on ImageNet

Deep learning is often called alchemy, and parameter tuning is the core skill of the alchemist. I recently used the ImageNet dataset to test how different parameters affect performance; here is a summary.

First, build a deep convolutional neural network. The architecture follows the ImageNet pre-training network from the YOLO paper: 20 convolutional layers followed by a fully connected layer. The network code is as follows:

import tensorflow as tf

def _conv(name, inputs, kernel_size, in_channels, out_channels, stride, padding, trainable, bias_init, training):
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        kernel = tf.get_variable(shape=[kernel_size,kernel_size,in_channels,out_channels], initializer=tf.contrib.layers.variance_scaling_initializer(factor=2.0,mode='FAN_IN',uniform=False), trainable=trainable, name='weights')
        conv = tf.nn.conv2d(inputs, kernel, [1,stride,stride,1], padding=padding)
        biases = tf.get_variable(initializer=tf.constant(bias_init, shape=[out_channels], dtype=tf.float32), trainable=trainable, name='biases')
        bias = tf.nn.bias_add(conv, biases)
        output = tf.nn.leaky_relu(bias, alpha=0.1, name=name)
        output_bn = tf.layers.batch_normalization(output, axis=3, name='bn', trainable=trainable, training=training, reuse=tf.AUTO_REUSE)
        return output_bn

def inference(images, pretrain=True, wd=None, training=True):
    conv1 = _conv('conv1', images, 7, 3, 64, 2, 'SAME', pretrain, 0.01, training)       #112*112*64
    pool1 = tf.nn.max_pool(conv1, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool1')   #56*56*64
    conv2 = _conv('conv2', pool1, 3, 64, 192, 1, 'SAME', pretrain, 0.01, training)      #56*56*192
    pool2 = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool2')   #28*28*192
    conv3 = _conv('conv3', pool2, 1, 192, 128, 1, 'SAME', pretrain, 0.01, training)     #28*28*128
    conv4 = _conv('conv4', conv3, 3, 128, 256, 1, 'SAME', pretrain, 0.01, training)     #28*28*256
    conv5 = _conv('conv5', conv4, 1, 256, 256, 1, 'SAME', pretrain, 0.01, training)     #28*28*256
    conv6 = _conv('conv6', conv5, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)     #28*28*512
    pool6 = tf.nn.max_pool(conv6, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool6')   #14*14*512
    conv7 = _conv('conv7', pool6, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)     #14*14*256
    conv8 = _conv('conv8', conv7, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)     #14*14*512
    conv9 = _conv('conv9', conv8, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)     #14*14*256
    conv10 = _conv('conv10', conv9, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)   #14*14*512
    conv11 = _conv('conv11', conv10, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)  #14*14*256
    conv12 = _conv('conv12', conv11, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  #14*14*512
    conv13 = _conv('conv13', conv12, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)  #14*14*256
    conv14 = _conv('conv14', conv13, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  #14*14*512
    conv15 = _conv('conv15', conv14, 1, 512, 512, 1, 'SAME', pretrain, 0.01, training)  #14*14*512
    conv16 = _conv('conv16', conv15, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training) #14*14*1024
    pool16 = tf.nn.max_pool(conv16, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool16')  #7*7*1024
    conv17 = _conv('conv17', pool16, 1, 1024, 512, 1, 'SAME', pretrain, 0.01, training) #7*7*512
    conv18 = _conv('conv18', conv17, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training) #7*7*1024
    conv19 = _conv('conv19', conv18, 1, 1024, 512, 1, 'SAME', pretrain, 0.01, training) #7*7*512
    conv20 = _conv('conv20', conv19, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training) #7*7*1024

    avg_layer = tf.reduce_mean(conv20, axis=[1,2], keepdims=True)    #1024
    flatten = tf.layers.flatten(inputs=avg_layer, name='flatten')
    with tf.variable_scope('local', reuse=tf.AUTO_REUSE):
        weights = tf.get_variable(initializer=tf.truncated_normal([1024,1000], dtype=tf.float32, stddev=1/(1000)), trainable=pretrain, name='weights')
        weight_decay = tf.multiply(tf.nn.l2_loss(weights), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
        biases = tf.get_variable(initializer=tf.constant(1.0, shape=[1000], dtype=tf.float32), trainable=pretrain, name='biases')
        local = tf.nn.xw_plus_b(flatten, weights, biases, name='local')
    return local
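
As a quick sanity check (my own addition, not part of the original post), the model can be built on a dummy batch to confirm that the logits have shape [batch, 1000]. This assumes the code above is saved as yolonet_model.py:

import numpy as np
import tensorflow as tf
import yolonet_model  # the inference code above, assumed saved as yolonet_model.py

# Build the graph on a dummy batch of 224x224 RGB images.
images = tf.placeholder(tf.float32, shape=[None, 224, 224, 3])
logits = yolonet_model.inference(images, pretrain=True, wd=0.0005, training=False)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(logits, feed_dict={images: np.zeros((2, 224, 224, 3), np.float32)})
    print(out.shape)  # expected: (2, 1000)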

The code for network training is as follows:

import tensorflow as tf
import os
import random
import time
import yolonet_model  # model definition shown above, assumed to be saved as yolonet_model.py

imageWidth = 224
imageHeight = 224
imageDepth = 3
batch_size = 112
resize_min = 256

def distort_color(image, color_ordering=0):
    if color_ordering == 0:
        image = tf.image.random_brightness(image, max_delta=32. / 255.)  # brightness
        image = tf.image.random_saturation(image, lower=0.5, upper=1.5)  # saturation
        image = tf.image.random_hue(image, max_delta=0.2)  # hue
        image = tf.image.random_contrast(image, lower=0.5, upper=1.5)  # contrast
    if color_ordering == 1:
        image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
        image = tf.image.random_hue(image, max_delta=0.2)
        image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
        image = tf.image.random_brightness(image, max_delta=32. / 255.)
    if color_ordering == 2:
        image = tf.image.random_hue(image, max_delta=0.2)
        image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
        image = tf.image.random_brightness(image, max_delta=32. / 255.)
        image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
    if color_ordering == 3:
        image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
        image = tf.image.random_brightness(image, max_delta=32. / 255.)
        image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
        image = tf.image.random_hue(image, max_delta=0.2)
    return tf.clip_by_value(image, 0.0, 1.0)

# Parse TFRECORD and distort the image for train
def _parse_function(example_proto):
    features = {"image": tf.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.VarLenFeature(tf.float32),
                "bbox_xmax": tf.VarLenFeature(tf.float32),
                "bbox_ymin": tf.VarLenFeature(tf.float32),
                "bbox_ymax": tf.VarLenFeature(tf.float32),
                "text": tf.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.FixedLenFeature([], tf.string, default_value="")
               }
    parsed_features = tf.parse_single_example(example_proto, features)

    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    image_decoded = tf.image.convert_image_dtype(image_decoded, tf.float32)
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    resized_height, resized_width = tf.cond(height<width,
        lambda: (resize_min, tf.cast(tf.multiply(tf.cast(width, tf.float64),tf.divide(resize_min,height)), tf.int32)),
        lambda: (tf.cast(tf.multiply(tf.cast(height, tf.float64),tf.divide(resize_min,width)), tf.int32), resize_min))
    resized = tf.image.resize_images(image_decoded, [resized_height, resized_width])
    cropped = tf.random_crop(resized, [imageHeight, imageWidth, 3])
    
    # Flip to add a little more random distortion in.
    flipped = tf.image.random_flip_left_right(cropped)
    image_train = tf.image.per_image_standardization(flipped)
    #distorted_image = distort_color(flipped, np.random.randint(4))
    return image_train, parsed_features["label"][0]

with tf.device('/cpu:0'):
    train_files_names = os.listdir('train_tf/')
    train_files = ['/home/roy/AI/train_tf/'+item for item in train_files_names]
    dataset_train = tf.data.TFRecordDataset(train_files)
    dataset_train = dataset_train.map(_parse_function, num_parallel_calls=4)
    dataset_train = dataset_train.repeat(10)
    dataset_train = dataset_train.batch(batch_size)
    dataset_train = dataset_train.prefetch(batch_size)
    iterator = tf.data.Iterator.from_structure(dataset_train.output_types, dataset_train.output_shapes)
    next_images, next_labels = iterator.get_next()
    train_init_op = iterator.make_initializer(dataset_train)

def _parse_test_function(example_proto):
    features = {"image": tf.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.VarLenFeature(tf.float32),
                "bbox_xmax": tf.VarLenFeature(tf.float32),
                "bbox_ymin": tf.VarLenFeature(tf.float32),
                "bbox_ymax": tf.VarLenFeature(tf.float32),
                "text": tf.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.FixedLenFeature([], tf.string, default_value="")
               }
    parsed_features = tf.parse_single_example(example_proto, features)
    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    image_decoded = tf.image.convert_image_dtype(image_decoded, tf.float32)
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    resized_height, resized_width = tf.cond(height<width,
        lambda: (resize_min, tf.cast(tf.multiply(tf.cast(width, tf.float64),tf.divide(resize_min,height)), tf.int32)),
        lambda: (tf.cast(tf.multiply(tf.cast(height, tf.float64),tf.divide(resize_min,width)), tf.int32), resize_min))
    image_resized = tf.image.resize_images(image_decoded, [resized_height, resized_width])
    
    # calculate how many to be center crop
    shape = tf.shape(image_resized)  
    height, width = shape[0], shape[1]
    amount_to_be_cropped_h = (height - imageHeight)
    crop_top = amount_to_be_cropped_h // 2
    amount_to_be_cropped_w = (width - imageWidth)
    crop_left = amount_to_be_cropped_w // 2
    image_cropped = tf.slice(image_resized, [crop_top, crop_left, 0], [imageHeight, imageWidth, -1])
    image_cropped = tf.image.per_image_standardization(image_cropped)
    return image_cropped, parsed_features["label"][0]

with tf.device('/cpu:0'):
    valid_files_names = os.listdir('valid_tf/')
    valid_files = ['/home/roy/AI/valid_tf/'+item for item in valid_files_names]
    dataset_valid = tf.data.TFRecordDataset(valid_files)
    dataset_valid = dataset_valid.map(_parse_test_function, num_parallel_calls=4)
    dataset_valid = dataset_valid.batch(batch_size)
    dataset_valid = dataset_valid.prefetch(batch_size)
    iterator_valid = tf.data.Iterator.from_structure(dataset_valid.output_types, dataset_valid.output_shapes)
    next_valid_images, next_valid_labels = iterator_valid.get_next()
    valid_init_op = iterator_valid.make_initializer(dataset_valid)

global_step = tf.Variable(0, trainable=False)
epoch_steps = int(1281167/batch_size)
#boundaries = [epoch_steps*7, epoch_steps*11]
boundaries = [60000, 80000]
values = [0.01, 0.001, 0.0001]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
lr_summary = tf.summary.scalar('learning_rate', learning_rate)

result = yolonet_model.inference(next_images, pretrain=True, wd=0.0005, training=True)
output_result_scores = tf.nn.softmax(result)
output_result = tf.argmax(output_result_scores, 1)

#Calculate the cross entropy loss
cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=next_labels, logits=result)
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
 
#Add the l2 weights to the loss
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
loss_summary = tf.summary.scalar('loss', loss)
 
#Define the optimizer
optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimize_op = optimizer.minimize(loss, global_step=global_step)

#Get the inference logits by the model for the validation images
result_valid = yolonet_model.inference(next_valid_images, pretrain=True, wd=0.0005, training=False)
output_valid_scores = tf.nn.softmax(result_valid)
output_valid_result = tf.argmax(output_valid_scores, 1)
accuracy_valid_batch = tf.reduce_mean(tf.cast(tf.equal(next_valid_labels, tf.argmax(output_valid_scores, 1)), tf.float32))
accuracy_valid_top_5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(output_valid_scores, next_valid_labels, k=5), tf.float32))
acc_1_summary = tf.summary.scalar('accuracy_valid_top_1', accuracy_valid_batch)
acc_2_summary = tf.summary.scalar('accuracy_valid_top_5', accuracy_valid_top_5)

# Add ops to save and restore all the variables.
saver = tf.train.Saver()
            
with tf.Session() as sess:
    #saver.restore(sess, "model_bn_loss1/model.ckpt-15000")
    sess.run(tf.global_variables_initializer())
    sess.run([train_init_op, valid_init_op])
    total_loss = 0.0
    epoch = 0
    starttime = time.time()
    while(True):
        try:
            loss_t, output_result_t, lr, step, _ = sess.run([loss, output_result, learning_rate, global_step, optimize_op])
            total_loss += loss_t
            
            if step%100==0:
                print("step: %i, Learning_rate: %f, Time: %is Loss: %f"%(step, lr, int(time.time()-starttime), total_loss/100))
                total_loss = 0.0
                starttime = time.time()
            
            if step%5000==0:
                save_path = saver.save(sess, "model_bn_loss1/model.ckpt", global_step=global_step)
                truepredict = 0.0
                truepredict_top5 = 0.0
                valid_count = 0
                while(True):
                    try:
                        acc_valid_1, acc_valid_5, valid_result_t = sess.run([accuracy_valid_batch, accuracy_valid_top_5, output_valid_result])
                        truepredict += acc_valid_1
                        truepredict_top5 += acc_valid_5
                        valid_count += 1
                        #print(acc_valid_5)
                    except tf.errors.OutOfRangeError:
                        print("valid accuracy of top 1: %f" % (truepredict/valid_count))
                        print("valid accuracy of top 5: %f" % (truepredict_top5/valid_count))
                        break
                starttime = time.time()
                sess.run([valid_init_op])
          
        except tf.errors.OutOfRangeError:
            break

The test conclusions are as follows:

1. Adding Batch Normalization to the output of each convolutional layer speeds up convergence and improves performance. In this network the batch-normalization layer is placed after the activation function. When Batch Normalization is used, the update ops for its moving mean and variance must run together with the optimizer step, which requires the following code:

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimize_op = optimizer.minimize(loss, global_step=global_step)

In addition, the training argument of tf.layers.batch_normalization must be set to True during training and to False during prediction.
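
For reference, here is a minimal sketch (my own illustration; the script above instead builds separate training and validation graphs with variable reuse) of driving this flag through a single boolean placeholder:

import tensorflow as tf

# Hypothetical example: one placeholder controls whether batch norm uses batch statistics
# (training) or the accumulated moving mean/variance (prediction).
is_training = tf.placeholder(tf.bool, name='is_training')
x = tf.placeholder(tf.float32, shape=[None, 28, 28, 64])
bn = tf.layers.batch_normalization(x, axis=3, training=is_training)

# Feed {is_training: True} during training steps and {is_training: False} when evaluating.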

2. For image preprocessing, I tested several different methods:

    a. Scale the pixel values of the image to the range 0-1

    b. Normalize the pixel values of the image to zero mean (per-image standardization)

    c. Randomly change the contrast, saturation, brightness and hue of the image

    In my tests, normalizing the pixel values to zero mean (option b) worked best; a minimal sketch of the three options follows below.
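
The following sketch (my own summary, reusing the same tf.image functions as the parsing code above) shows the three options applied to a decoded image tensor; the method argument is a hypothetical switch added for illustration:

import tensorflow as tf

def preprocess(image, method='standardize'):
    # Option a: convert to float32, which scales pixel values into [0, 1]
    image = tf.image.convert_image_dtype(image, tf.float32)
    if method == 'standardize':
        # Option b: per-image standardization, zero mean and unit variance
        image = tf.image.per_image_standardization(image)
    elif method == 'distort':
        # Option c: random photometric distortion on top of the [0, 1] image
        image = tf.image.random_brightness(image, max_delta=32. / 255.)
        image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
        image = tf.image.random_hue(image, max_delta=0.2)
        image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
        image = tf.clip_by_value(image, 0.0, 1.0)
    return image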

3. For the L2 weight-decay parameter, I tested the values 0, 0.005, 0.0005 and 0.00005, and found that 0.00005 works best (a sketch of such a sweep is shown below).
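
A minimal sketch (hypothetical, not from the original script) of how such a sweep could be organized, rebuilding the graph once per candidate decay value and reusing the model code above:

import tensorflow as tf
import yolonet_model  # model definition above, assumed saved as yolonet_model.py

# Hypothetical sweep over the tested weight-decay values; each value gets a fresh graph.
for wd in [0.0, 0.005, 0.0005, 0.00005]:
    tf.reset_default_graph()
    images = tf.placeholder(tf.float32, shape=[None, 224, 224, 3])
    labels = tf.placeholder(tf.int64, shape=[None])
    logits = yolonet_model.inference(images, pretrain=True, wd=wd, training=True)
    # inference() already added the L2 term to the 'losses' collection; add cross entropy too.
    tf.add_to_collection('losses', tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits))
    loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
    # Train with this loss exactly as in the script above and compare validation accuracy per wd.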

The final result: after training for 10 epochs, top-5 accuracy is 83.3% and top-1 accuracy is 61.5%.

Origin blog.csdn.net/gzroy/article/details/88831481