It is often said that deep learning is alchemy and that hyperparameter tuning is the alchemist's core skill. Using the ImageNet dataset, I recently tested how different settings affect model performance. Here is a summary.
First, build a deep convolutional neural network. The architecture follows the ImageNet pre-training network from the YOLO paper, that is, 20 convolutional layers followed by one fully connected layer. The network code is as follows:
import tensorflow as tf
def _conv(name, inputs, kernel_size, in_channels, out_channels, stride, padding, trainable, bias_init, training):
    """Build one convolution unit: conv2d -> bias -> leaky ReLU -> batch norm.

    All variables are created under the variable scope `name` with
    AUTO_REUSE, so calling this again with the same name shares them.

    Args:
        name: variable scope name; also used as the activation op name.
        inputs: input tensor (batch normalization is applied on axis 3).
        kernel_size: side length of the square convolution kernel.
        in_channels: number of input channels.
        out_channels: number of output channels.
        stride: stride applied to both spatial dimensions.
        padding: padding mode forwarded to tf.nn.conv2d.
        trainable: whether the kernel, bias and batch-norm variables are trainable.
        bias_init: constant used to initialise the bias vector.
        training: forwarded to tf.layers.batch_normalization.

    Returns:
        The batch-normalized activation tensor.
    """
    kernel_shape = [kernel_size, kernel_size, in_channels, out_channels]
    conv_strides = [1, stride, stride, 1]
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        kernel_init = tf.contrib.layers.variance_scaling_initializer(
            factor=2.0, mode='FAN_IN', uniform=False)
        filters = tf.get_variable(
            name='weights',
            shape=kernel_shape,
            initializer=kernel_init,
            trainable=trainable)
        convolved = tf.nn.conv2d(inputs, filters, conv_strides, padding=padding)
        offsets = tf.get_variable(
            name='biases',
            initializer=tf.constant(bias_init, shape=[out_channels], dtype=tf.float32),
            trainable=trainable)
        activated = tf.nn.leaky_relu(
            tf.nn.bias_add(convolved, offsets), alpha=0.1, name=name)
        # Batch normalization is applied to the activation output, not before it.
        return tf.layers.batch_normalization(
            activated,
            axis=3,
            name='bn',
            trainable=trainable,
            training=training,
            reuse=tf.AUTO_REUSE)
def inference(images, pretrain=True, wd=None, training=True):
    """Build the 20-layer convolutional classifier and return 1000-way logits.

    The shape comments on each layer assume a 224x224x3 input image.

    Args:
        images: input image batch tensor.
        pretrain: passed as `trainable` to every variable in the network.
        wd: L2 weight-decay coefficient for the fully connected weights.
            When None, no weight-decay term is created.
        training: forwarded to batch normalization in every conv unit.

    Returns:
        Logits tensor produced by the final fully connected layer.
    """
    conv1 = _conv('conv1', images, 7, 3, 64, 2, 'SAME', pretrain, 0.01, training)  # 112*112*64
    pool1 = tf.nn.max_pool(conv1, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool1')  # 56*56*64
    conv2 = _conv('conv2', pool1, 3, 64, 192, 1, 'SAME', pretrain, 0.01, training)  # 56*56*192
    pool2 = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool2')  # 28*28*192
    conv3 = _conv('conv3', pool2, 1, 192, 128, 1, 'SAME', pretrain, 0.01, training)  # 28*28*128
    conv4 = _conv('conv4', conv3, 3, 128, 256, 1, 'SAME', pretrain, 0.01, training)  # 28*28*256
    conv5 = _conv('conv5', conv4, 1, 256, 256, 1, 'SAME', pretrain, 0.01, training)  # 28*28*256
    conv6 = _conv('conv6', conv5, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  # 28*28*512
    pool6 = tf.nn.max_pool(conv6, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool6')  # 14*14*512
    conv7 = _conv('conv7', pool6, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)  # 14*14*256
    conv8 = _conv('conv8', conv7, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  # 14*14*512
    conv9 = _conv('conv9', conv8, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)  # 14*14*256
    conv10 = _conv('conv10', conv9, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  # 14*14*512
    conv11 = _conv('conv11', conv10, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)  # 14*14*256
    conv12 = _conv('conv12', conv11, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  # 14*14*512
    conv13 = _conv('conv13', conv12, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)  # 14*14*256
    conv14 = _conv('conv14', conv13, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  # 14*14*512
    conv15 = _conv('conv15', conv14, 1, 512, 512, 1, 'SAME', pretrain, 0.01, training)  # 14*14*512
    conv16 = _conv('conv16', conv15, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training)  # 14*14*1024
    pool16 = tf.nn.max_pool(conv16, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool16')  # 7*7*1024
    conv17 = _conv('conv17', pool16, 1, 1024, 512, 1, 'SAME', pretrain, 0.01, training)  # 7*7*512
    conv18 = _conv('conv18', conv17, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training)  # 7*7*1024
    conv19 = _conv('conv19', conv18, 1, 1024, 512, 1, 'SAME', pretrain, 0.01, training)  # 7*7*512
    conv20 = _conv('conv20', conv19, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training)  # 7*7*1024
    # Global average pooling over the spatial axes; flatten then drops the
    # two size-1 spatial dimensions kept by keepdims=True.
    avg_layer = tf.reduce_mean(conv20, axis=[1,2], keepdims=True)  # 1*1*1024
    flatten = tf.layers.flatten(inputs=avg_layer, name='flatten')
    with tf.variable_scope('local', reuse=tf.AUTO_REUSE):
        # 1.0 / 1000 rather than 1/(1000): integer division would make the
        # stddev 0 (all-zero weights) on Python 2.
        weights = tf.get_variable(
            initializer=tf.truncated_normal([1024, 1000], dtype=tf.float32, stddev=1.0 / 1000),
            trainable=pretrain, name='weights')
        # Only build the L2 penalty when a coefficient was supplied; multiplying
        # by the default wd=None would raise while the graph is being built.
        # NOTE: every call with a non-None wd appends another 'weight_loss'
        # tensor to the 'losses' collection.
        if wd is not None:
            weight_decay = tf.multiply(tf.nn.l2_loss(weights), wd, name='weight_loss')
            tf.add_to_collection('losses', weight_decay)
        biases = tf.get_variable(
            initializer=tf.constant(1.0, shape=[1000], dtype=tf.float32),
            trainable=pretrain, name='biases')
        local = tf.nn.xw_plus_b(flatten, weights, biases, name='local')
    return local
The code for network training is as follows:
import tensorflow as tf
import os
import random
import time
# Size of the square RGB crop fed to the network.
imageHeight = imageWidth = 224
imageDepth = 3
# Number of examples per batch.
batch_size = 112
# The shorter image side is rescaled to this many pixels before cropping.
resize_min = 256
def distort_color(image, color_ordering=0):
    """Apply four random colour distortions in one of four fixed orders.

    Args:
        image: image tensor; the result is clipped to [0.0, 1.0].
        color_ordering: 0-3 selects the order in which brightness,
            saturation, hue and contrast are perturbed. Any other value
            applies no distortion and only clips.

    Returns:
        The distorted image, clipped to the range [0.0, 1.0].
    """
    def _brightness(img):
        return tf.image.random_brightness(img, max_delta=32. / 255.)

    def _saturation(img):
        return tf.image.random_saturation(img, lower=0.5, upper=1.5)

    def _hue(img):
        return tf.image.random_hue(img, max_delta=0.2)

    def _contrast(img):
        return tf.image.random_contrast(img, lower=0.5, upper=1.5)

    # Row i lists the distortion order used when color_ordering == i.
    orderings = (
        (_brightness, _saturation, _hue, _contrast),
        (_saturation, _hue, _contrast, _brightness),
        (_hue, _contrast, _brightness, _saturation),
        (_contrast, _brightness, _saturation, _hue),
    )
    for index, steps in enumerate(orderings):
        if color_ordering == index:
            for step in steps:
                image = step(image)
    return tf.clip_by_value(image, 0.0, 1.0)
# Parse one TFRecord example and randomly crop/flip the image for training
def _parse_function(example_proto):
    """Decode a serialized example into a standardized training crop and label.

    The JPEG is decoded to float32, rescaled so that its shorter side equals
    `resize_min`, randomly cropped to imageHeight x imageWidth x 3, randomly
    flipped left-right and finally standardized per image.

    Args:
        example_proto: serialized tf.Example to parse.

    Returns:
        A tuple (image, label) where label is the first element of the
        "label" feature.
    """
    def _string_feature():
        return tf.FixedLenFeature([], tf.string, default_value="")

    def _int_feature(default):
        return tf.FixedLenFeature([1], tf.int64, default_value=[default])

    feature_spec = {
        "image": _string_feature(),
        "height": _int_feature(0),
        "width": _int_feature(0),
        "channels": _int_feature(3),
        "colorspace": _string_feature(),
        "img_format": _string_feature(),
        "label": _int_feature(0),
        "bbox_xmin": tf.VarLenFeature(tf.float32),
        "bbox_xmax": tf.VarLenFeature(tf.float32),
        "bbox_ymin": tf.VarLenFeature(tf.float32),
        "bbox_ymax": tf.VarLenFeature(tf.float32),
        "text": _string_feature(),
        "filename": _string_feature(),
    }
    record = tf.parse_single_example(example_proto, feature_spec)

    pixels = tf.image.decode_jpeg(record["image"], channels=3)
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    dims = tf.shape(pixels)
    height = dims[0]
    width = dims[1]

    def _scale_longer_side(longer, shorter):
        # Length of the longer side once the shorter side becomes resize_min.
        longer_f = tf.cast(longer, tf.float64)
        ratio = tf.divide(resize_min, shorter)
        return tf.cast(tf.multiply(longer_f, ratio), tf.int32)

    def _height_is_shorter():
        return resize_min, _scale_longer_side(width, height)

    def _width_is_shorter():
        return _scale_longer_side(height, width), resize_min

    resized_height, resized_width = tf.cond(
        height < width, _height_is_shorter, _width_is_shorter)

    rescaled = tf.image.resize_images(pixels, [resized_height, resized_width])
    patch = tf.random_crop(rescaled, [imageHeight, imageWidth, 3])
    # A random horizontal flip adds a little more variation.
    mirrored = tf.image.random_flip_left_right(patch)
    image_train = tf.image.per_image_standardization(mirrored)
    # distort_color is not applied here; the call was left disabled.
    return image_train, record["label"][0]
# Training input pipeline, pinned to the CPU.
with tf.device('/cpu:0'):
    # List and open the TFRecord shards from the same directory. Listing a
    # relative 'train_tf/' while opening files under an absolute prefix only
    # works when the current working directory happens to match that prefix.
    train_dir = '/home/roy/AI/train_tf/'
    train_files_names = os.listdir(train_dir)
    train_files = [os.path.join(train_dir, item) for item in train_files_names]
    dataset_train = tf.data.TFRecordDataset(train_files)
    dataset_train = dataset_train.map(_parse_function, num_parallel_calls=4)
    # Ten passes over the data; the iterator then raises OutOfRangeError.
    dataset_train = dataset_train.repeat(10)
    dataset_train = dataset_train.batch(batch_size)
    # prefetch() after batch() counts whole batches, not examples, so a small
    # buffer is enough; prefetch(batch_size) could hold up to 112 batches in memory.
    dataset_train = dataset_train.prefetch(2)
    iterator = tf.data.Iterator.from_structure(dataset_train.output_types, dataset_train.output_shapes)
    next_images, next_labels = iterator.get_next()
    train_init_op = iterator.make_initializer(dataset_train)
def _parse_test_function(example_proto):
    """Decode a serialized example into a standardized centre crop and label.

    The JPEG is decoded to float32, rescaled so that its shorter side equals
    `resize_min`, centre-cropped to imageHeight x imageWidth and standardized
    per image. No random augmentation is applied.

    Args:
        example_proto: serialized tf.Example to parse.

    Returns:
        A tuple (image, label) where label is the first element of the
        "label" feature.
    """
    features = {"image": tf.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.VarLenFeature(tf.float32),
                "bbox_xmax": tf.VarLenFeature(tf.float32),
                "bbox_ymin": tf.VarLenFeature(tf.float32),
                "bbox_ymax": tf.VarLenFeature(tf.float32),
                "text": tf.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.FixedLenFeature([], tf.string, default_value="")
                }
    parsed_features = tf.parse_single_example(example_proto, features)
    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    # Already float32 after this call, so no second dtype conversion is needed.
    image_decoded = tf.image.convert_image_dtype(image_decoded, tf.float32)
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    # Rescale so the shorter side becomes resize_min, keeping the aspect ratio.
    resized_height, resized_width = tf.cond(
        height < width,
        lambda: (resize_min, tf.cast(tf.multiply(tf.cast(width, tf.float64), tf.divide(resize_min, height)), tf.int32)),
        lambda: (tf.cast(tf.multiply(tf.cast(height, tf.float64), tf.divide(resize_min, width)), tf.int32), resize_min))
    image_resized = tf.image.resize_images(image_decoded, [resized_height, resized_width])
    # Work out the top-left corner of the centre crop.
    shape = tf.shape(image_resized)
    height, width = shape[0], shape[1]
    amount_to_be_cropped_h = (height - imageHeight)
    crop_top = amount_to_be_cropped_h // 2
    amount_to_be_cropped_w = (width - imageWidth)
    crop_left = amount_to_be_cropped_w // 2
    image_cropped = tf.slice(image_resized, [crop_top, crop_left, 0], [imageHeight, imageWidth, -1])
    image_cropped = tf.image.per_image_standardization(image_cropped)
    return image_cropped, parsed_features["label"][0]
# Validation input pipeline, pinned to the CPU.
with tf.device('/cpu:0'):
    # List and open the TFRecord shards from the same directory. Listing a
    # relative 'valid_tf/' while opening files under an absolute prefix only
    # works when the current working directory happens to match that prefix.
    valid_dir = '/home/roy/AI/valid_tf/'
    valid_files_names = os.listdir(valid_dir)
    valid_files = [os.path.join(valid_dir, item) for item in valid_files_names]
    dataset_valid = tf.data.TFRecordDataset(valid_files)
    dataset_valid = dataset_valid.map(_parse_test_function, num_parallel_calls=4)
    dataset_valid = dataset_valid.batch(batch_size)
    # prefetch() after batch() counts whole batches, not examples, so a small
    # buffer is enough; prefetch(batch_size) could hold up to 112 batches in memory.
    dataset_valid = dataset_valid.prefetch(2)
    iterator_valid = tf.data.Iterator.from_structure(dataset_valid.output_types, dataset_valid.output_shapes)
    next_valid_images, next_valid_labels = iterator_valid.get_next()
    valid_init_op = iterator_valid.make_initializer(dataset_valid)
# Training script: builds the training and validation graphs, then runs the
# optimisation loop with a checkpoint and a full validation pass every 5000 steps.
# NOTE(review): `yolonet_model` is used below but no import for it is visible
# in this file; presumably the network definition is saved as
# yolonet_model.py -- confirm and add `import yolonet_model`.
global_step = tf.Variable(0, trainable=False)
epoch_steps = int(1281167/batch_size)
#boundaries = [epoch_steps*7, epoch_steps*11]
boundaries = [60000, 80000]
values = [0.01, 0.001, 0.0001]
# Learning rate 0.01 up to step 60000, 0.001 up to step 80000, then 0.0001.
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
lr_summary = tf.summary.scalar('learning_rate', learning_rate)

result = yolonet_model.inference(next_images, pretrain=True, wd=0.0005, training=True)
output_result_scores = tf.nn.softmax(result)
output_result = tf.argmax(output_result_scores, 1)

# Calculate the cross entropy loss.
# The tf.nn op is used on purpose: tf.losses.sparse_softmax_cross_entropy
# registers its result in the 'losses' collection (tf.GraphKeys.LOSSES) by
# itself, so adding the mean to that collection again would count the
# cross-entropy twice in the total loss.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=next_labels, logits=result)
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)

# Total loss = cross entropy + the L2 weight term registered by inference().
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
loss_summary = tf.summary.scalar('loss', loss)

# Define the optimizer. The batch-norm moving statistics are updated through
# UPDATE_OPS, so the train op is made to depend on them.
optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimize_op = optimizer.minimize(loss, global_step=global_step)

# Get the inference logits by the model for the validation images
result_valid = yolonet_model.inference(next_valid_images, pretrain=True, wd=0.0005, training=False)
output_valid_scores = tf.nn.softmax(result_valid)
output_valid_result = tf.argmax(output_valid_scores, 1)
accuracy_valid_batch = tf.reduce_mean(tf.cast(tf.equal(next_valid_labels, tf.argmax(output_valid_scores, 1)), tf.float32))
accuracy_valid_top_5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(output_valid_scores, next_valid_labels, k=5), tf.float32))
acc_1_summary = tf.summary.scalar('accuracy_valid_top_1', accuracy_valid_batch)
acc_2_summary = tf.summary.scalar('accuracy_valid_top_5', accuracy_valid_top_5)

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

with tf.Session() as sess:
    #saver.restore(sess, "model_bn_loss1/model.ckpt-15000")
    sess.run(tf.global_variables_initializer())
    sess.run([train_init_op, valid_init_op])
    total_loss = 0.0
    starttime = time.time()
    while True:
        try:
            loss_t, output_result_t, lr, step, _ = sess.run([loss, output_result, learning_rate, global_step, optimize_op])
            total_loss += loss_t
            if step % 100 == 0:
                print("step: %i, Learning_rate: %f, Time: %is Loss: %f"%(step, lr, int(time.time()-starttime), total_loss/100))
                total_loss = 0.0
                starttime = time.time()
            if step % 5000 == 0:
                saver.save(sess, "model_bn_loss1/model.ckpt", global_step=global_step)
                truepredict = 0.0
                truepredict_top5 = 0.0
                valid_count = 0
                # Run the validation iterator until it is exhausted.
                while True:
                    try:
                        acc_valid_1, acc_valid_5, valid_result_t = sess.run([accuracy_valid_batch, accuracy_valid_top_5, output_valid_result])
                    except tf.errors.OutOfRangeError:
                        break
                    # Weight each batch by its size so the shorter final batch
                    # does not count as much as a full one.
                    examples = len(valid_result_t)
                    truepredict += acc_valid_1 * examples
                    truepredict_top5 += acc_valid_5 * examples
                    valid_count += examples
                if valid_count:
                    print("valid accuracy of top 1: %f" % (truepredict/valid_count))
                    print("valid accuracy of top 5: %f" % (truepredict_top5/valid_count))
                starttime = time.time()
                # Rewind the validation iterator for the next evaluation.
                sess.run([valid_init_op])
        except tf.errors.OutOfRangeError:
            # The training dataset is exhausted after its configured repeats.
            break
The test conclusions are as follows:
1. Adding batch normalization after the output of each convolutional layer speeds up network convergence and improves network performance. Batch normalization should be added after the activation function. Once batch normalization has been added, you need to make sure that its moving mean and variance are updated on every training step, so the optimizer step must depend on the update ops. This requires the following code:
# Make the training step depend on the pending batch-norm update ops.
update_ops = tf.get_collection(key=tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(control_inputs=update_ops):
    optimize_op = optimizer.minimize(loss=loss, global_step=global_step)
In addition, the `training` argument of tf.layers.batch_normalization should be set to True during training and to False during prediction.
2. For image preprocessing, several different methods were tested:
a. Scale the pixel values of the image to the range 0-1
b. Standardize the pixel values of each image to zero mean and unit variance
c. Randomly change the contrast, saturation, brightness and hue of the image
In testing, standardizing each image to zero mean and unit variance (method b) gave the best results.
3. For the L2 weight decay parameter test, the values of 0, 0.005, 0.0005, 0.00005 were tested, and it was found that the value of 0.00005 works best
The final training result: after 10 epochs of training, the top-5 accuracy is 83.3% and the top-1 accuracy is 61.5%.