TensorFlow 2.0 was officially released in October, and I switched to the new version right away. After spending some time researching it, my conclusion is that version 2.0 is really easy to use, but it also has a shortcoming: it is encapsulated so heavily that it is hard to understand the mechanisms implemented inside it. For example, after I tried the Keras model and training method recommended by 2.0, I found that training did not converge as fast as with the previous 1.x version using the low-level API directly, and it did not seem to reach the training accuracy of 1.x either. As another example, when Keras layers with Batch Normalization are used and the model is trained and validated directly with the Keras fit method, the official documentation does not mention how to distinguish between training and prediction. In actual tests I found that the model converges very slowly. Some posts on the Internet mention similar problems, and the suggested solution is to call tf.keras.backend.set_learning_phase. However, I found no big difference whether I called it or not; it may be that those posts were based on a pre-release version of TF 2.0 that differs from the official release. Later I also switched to a custom training loop for comparison and found that it seems to converge faster and more stably than calling the model's fit method directly, but it still does not seem to reach the accuracy of 1.x. Although I have not mastered TF 2.0 yet, I feel that its ease of use is greatly improved and that it is worth continuing to study in depth. Below I record the process of training on ImageNet with TF 2.0. The ImageNet files are still converted into the training set and the validation set using the method described in my previous blog post, which will not be repeated here.
Model definition
I use the MobileNet V2 model, defined by the following code:
import os
import time

import tensorflow as tf
# Short alias for the Keras layers namespace used by the layer builders below.
l = tf.keras.layers
# Width and height, in pixels, of the images fed to the network.
imageWidth, imageHeight = 224, 224
def _conv(inputs, filters, kernel_size, strides, padding, bias=False, normalize=True, activation='relu'):
    """Apply a 2-D convolution with optional batch normalization and activation.

    When `padding` is positive the input is first zero-padded by that amount
    and the convolution runs with 'valid' padding; otherwise the convolution
    uses 'same' padding. `activation` selects 'relu', 'relu6' or 'leaky_relu';
    any other value leaves the output without an activation layer.
    """
    explicit_pad = padding > 0
    net = l.ZeroPadding2D(padding=padding)(inputs) if explicit_pad else inputs
    conv_layer = l.Conv2D(
        filters,
        kernel_size,
        strides,
        'valid' if explicit_pad else 'same',
        use_bias=bias,
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.l2(l=5e-4),
    )
    net = conv_layer(net)
    if normalize:
        net = l.BatchNormalization(axis=3)(net)
    # The activation names are mutually exclusive, so at most one branch runs.
    if activation == 'relu':
        net = l.ReLU()(net)
    elif activation == 'relu6':
        net = l.ReLU(max_value=6)(net)
    elif activation == 'leaky_relu':
        net = l.LeakyReLU(alpha=0.1)(net)
    return net
def _dwconv(inputs, filters, kernel_size, strides, padding, bias=False, activation='relu'):
    """Apply a depthwise 2-D convolution, batch normalization and activation.

    When `padding` is positive the input is first zero-padded by that amount
    on both spatial axes and the convolution runs with 'valid' padding;
    otherwise 'same' padding is used. `activation` selects 'relu', 'relu6' or
    'leaky_relu'; any other value leaves the output without an activation.
    The `filters` argument is accepted but not used by this function.
    """
    explicit_pad = padding > 0
    net = inputs
    if explicit_pad:
        net = l.ZeroPadding2D(padding=(padding, padding))(net)
    depthwise_layer = l.DepthwiseConv2D(
        kernel_size,
        strides,
        'valid' if explicit_pad else 'same',
        use_bias=bias,
        depthwise_initializer='he_uniform',
        depthwise_regularizer=tf.keras.regularizers.l2(l=5e-4),
    )
    net = l.BatchNormalization(axis=3)(depthwise_layer(net))
    # The activation names are mutually exclusive, so at most one branch runs.
    if activation == 'relu':
        net = l.ReLU()(net)
    elif activation == 'relu6':
        net = l.ReLU(max_value=6)(net)
    elif activation == 'leaky_relu':
        net = l.LeakyReLU(alpha=0.1)(net)
    return net
def _bottleneck(inputs, in_filters, out_filters, kernel_size, strides, bias=False, activation='relu6', t=1):
    """Build one bottleneck block: 1x1 expansion, depthwise conv, 1x1 projection.

    Args:
        inputs: 4-D feature tensor whose last axis holds the channels.
        in_filters: channel count used (multiplied by `t`) for the expansion.
        out_filters: channel count produced by the projection convolution.
        kernel_size: kernel size of the depthwise convolution.
        strides: stride of the depthwise convolution; a stride of 2 uses an
            explicit zero padding of 1.
        bias: whether the convolutions use a bias term.
        activation: activation name for the expansion and depthwise layers.
        t: expansion factor applied to `in_filters`.

    Returns:
        The block output, with the input added back as a residual when the
        stride is 1 and the input and output channel counts match.
    """
    # The activation must be passed by keyword: the seventh positional
    # parameter of _conv is `normalize`, so a positional string is consumed as
    # a truthy normalize flag and the layer silently falls back to plain ReLU.
    output = _conv(inputs, in_filters*t, 1, 1, 0, bias, activation=activation)
    padding = 1 if strides == 2 else 0
    output = _dwconv(output, in_filters*t, kernel_size, strides, padding, bias=bias, activation=activation)
    # 'linear' matches none of _conv's activation names, so the projection
    # ends with batch normalization only and no non-linearity.
    output = _conv(output, out_filters, 1, 1, 0, bias, activation='linear')
    if strides == 1 and inputs.get_shape().as_list()[3] == output.get_shape().as_list()[3]:
        output = l.add([output, inputs])
    return output
def mobilenet_model_v2():
    """Build the MobileNet V2 classifier as a Keras functional model.

    Returns:
        A tf.keras.Model mapping an (imageHeight, imageWidth, 3) image to
        1000 unnormalized class logits.
    """
    # (in_filters, out_filters, strides) of every expansion-6 bottleneck,
    # listed in the order the blocks are stacked.
    expanded_blocks = [
        (16, 24, 2), (24, 24, 1),                               # 56*56*24
        (24, 32, 2), (32, 32, 1), (32, 32, 1),                  # 28*28*32
        (32, 64, 2), (64, 64, 1), (64, 64, 1), (64, 64, 1),     # 14*14*64
        (64, 96, 1), (96, 96, 1), (96, 96, 1), (96, 96, 1),     # 14*14*96
        (96, 160, 2), (160, 160, 1), (160, 160, 1),             # 7*7*160
        (160, 320, 1),                                          # 7*7*320
    ]

    # Input Layer
    image = tf.keras.Input(shape=(imageHeight, imageWidth, 3))  # 224*224*3
    # The activation is passed by keyword: the seventh positional parameter
    # of _conv is `normalize`, so a positional 'relu6' would be taken as the
    # normalize flag and the layer would use plain ReLU instead.
    net = _conv(image, 32, 3, 2, 1, False, activation='relu6')  # 112*112*32
    net = _bottleneck(net, 32, 16, 3, 1, False, 'relu6', 1)     # 112*112*16
    for in_filters, out_filters, strides in expanded_blocks:
        net = _bottleneck(net, in_filters, out_filters, 3, strides, False, 'relu6', 6)
    # NOTE(review): this head convolution uses a 3x3 kernel; the MobileNet V2
    # paper uses a 1x1 convolution here - confirm which one is intended.
    net = _conv(net, 1280, 3, 1, 0, False, activation='relu6')  # 7*7*1280
    net = l.AveragePooling2D(7)(net)
    net = l.Flatten()(net)
    logits = l.Dense(1000, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1/1000))(net)
    model = tf.keras.Model(inputs=image, outputs=logits)
    return model
Build training set and validation set
# Number of colour channels per image.
imageDepth = 3
# Number of examples per batch in the training and validation datasets.
batch_size = 64
# Length, in pixels, that the shorter image side is resized to before cropping.
resize_min = 256

# Full paths of every TFRecord file in the training directory.
train_files_names = os.listdir('/AI/train_tf/')
train_files = [os.path.join('/AI/train_tf/', name) for name in train_files_names]
# Full paths of every TFRecord file in the validation directory.
valid_files_names = os.listdir('/AI/valid_tf/')
valid_files = [os.path.join('/AI/valid_tf/', name) for name in valid_files_names]
# Parse TFRECORD and distort the image for train
def _parse_function(example_proto):
    """Parse one serialized training example into an augmented image and label.

    The JPEG is decoded, resized so that its shorter side equals `resize_min`
    (keeping the aspect ratio), randomly cropped to imageHeight x imageWidth,
    randomly flipped left-right and standardized per image.

    Args:
        example_proto: a scalar string tensor holding one serialized example.

    Returns:
        A tuple `(features, label)` where `features` is `{'input_1': image}`
        with a float32 image in height-width-channels layout, and `label` is
        the scalar int64 class label.
    """
    features = {"image": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.io.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.io.VarLenFeature(tf.float32),
                "bbox_xmax": tf.io.VarLenFeature(tf.float32),
                "bbox_ymin": tf.io.VarLenFeature(tf.float32),
                "bbox_ymax": tf.io.VarLenFeature(tf.float32),
                "text": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.io.FixedLenFeature([], tf.string, default_value="")
                }
    parsed_features = tf.io.parse_single_example(example_proto, features)
    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    # Resize so the shorter side becomes resize_min, preserving aspect ratio
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    resized_height, resized_width = tf.cond(
        height < width,
        lambda: (resize_min, tf.cast(tf.multiply(tf.cast(width, tf.float64), tf.divide(resize_min, height)), tf.int32)),
        lambda: (tf.cast(tf.multiply(tf.cast(height, tf.float64), tf.divide(resize_min, width)), tf.int32), resize_min))
    image_float = tf.image.convert_image_dtype(image_decoded, tf.float32)
    resized = tf.image.resize(image_float, [resized_height, resized_width])
    # Random crop from the resized image
    cropped = tf.image.random_crop(resized, [imageHeight, imageWidth, 3])
    # Flip to add a little more random distortion in.
    flipped = tf.image.random_flip_left_right(cropped)
    # Standardize the image
    image_train = tf.image.per_image_standardization(flipped)
    # Keep the height-width-channels layout: mobilenet_model_v2 declares its
    # input as (imageHeight, imageWidth, 3), so the image must not be
    # transposed to channels-first before it is fed to the model.
    features = {'input_1': image_train}
    return features, parsed_features["label"][0]
def train_input_fn():
    """Build the training dataset of batched, augmented images and labels.

    Returns:
        A tf.data.Dataset reading `train_files`, shuffled, parsed with
        `_parse_function`, repeated 10 times and batched by `batch_size`.
    """
    dataset_train = tf.data.TFRecordDataset(train_files)
    # Shuffle the serialized records before decoding them, so the
    # 10000-element buffer holds encoded examples rather than decoded
    # float32 images.
    dataset_train = dataset_train.shuffle(10000)
    dataset_train = dataset_train.map(_parse_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_train = dataset_train.repeat(10)
    dataset_train = dataset_train.batch(batch_size)
    # After batching, prefetch counts whole batches; let tf.data choose the
    # buffer size instead of holding batch_size batches in memory.
    dataset_train = dataset_train.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset_train
def _parse_test_function(example_proto):
    """Parse one serialized validation example into a center-cropped image and label.

    The JPEG is decoded, resized so that its shorter side equals `resize_min`
    (keeping the aspect ratio), center-cropped to imageHeight x imageWidth and
    standardized per image. No random distortion is applied.

    Args:
        example_proto: a scalar string tensor holding one serialized example.

    Returns:
        A tuple `(features, label)` where `features` is `{'input_1': image}`
        with a float32 image in height-width-channels layout, and `label` is
        the scalar int64 class label.
    """
    features = {"image": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.io.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.io.VarLenFeature(tf.float32),
                "bbox_xmax": tf.io.VarLenFeature(tf.float32),
                "bbox_ymin": tf.io.VarLenFeature(tf.float32),
                "bbox_ymax": tf.io.VarLenFeature(tf.float32),
                "text": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.io.FixedLenFeature([], tf.string, default_value="")
                }
    parsed_features = tf.io.parse_single_example(example_proto, features)
    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    # Resize so the shorter side becomes resize_min, preserving aspect ratio
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    resized_height, resized_width = tf.cond(
        height < width,
        lambda: (resize_min, tf.cast(tf.multiply(tf.cast(width, tf.float64), tf.divide(resize_min, height)), tf.int32)),
        lambda: (tf.cast(tf.multiply(tf.cast(height, tf.float64), tf.divide(resize_min, width)), tf.int32), resize_min))
    image_float = tf.image.convert_image_dtype(image_decoded, tf.float32)
    image_resized = tf.image.resize(image_float, [resized_height, resized_width])
    # calculate how many to be center crop
    shape = tf.shape(image_resized)
    height, width = shape[0], shape[1]
    amount_to_be_cropped_h = (height - imageHeight)
    crop_top = amount_to_be_cropped_h // 2
    amount_to_be_cropped_w = (width - imageWidth)
    crop_left = amount_to_be_cropped_w // 2
    image_cropped = tf.slice(image_resized, [crop_top, crop_left, 0], [imageHeight, imageWidth, -1])
    image_valid = tf.image.per_image_standardization(image_cropped)
    # Keep the height-width-channels layout: mobilenet_model_v2 declares its
    # input as (imageHeight, imageWidth, 3), so the image must not be
    # transposed to channels-first before it is fed to the model.
    features = {'input_1': image_valid}
    return features, parsed_features["label"][0]
def val_input_fn():
    """Build the validation dataset of batched, center-cropped images and labels.

    Returns:
        A tf.data.Dataset reading `valid_files`, parsed with
        `_parse_test_function`, batched by `batch_size` and prefetched.
    """
    return (
        tf.data.TFRecordDataset(valid_files)
        .map(_parse_test_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(batch_size)
        .prefetch(batch_size)
    )
Define the callback function of the model
The callback's main job is to adjust the optimizer's learning rate according to the number of training steps, and to print the validation-set metrics after each training epoch completes, as in the following code:
# Step counts at which the learning rate changes.
boundaries = [1000, 5000, 60000, 80000]
# Learning rate for each interval delimited by `boundaries` (one more value
# than there are boundaries).
values = [0.001, 0.1, 0.01, 0.001, 0.0001]
learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=boundaries,
    values=values,
)
class LRCallback(tf.keras.callbacks.Callback):
    """Keras callback that applies the learning-rate schedule and prints progress.

    Every 100 optimizer iterations it prints the step, learning rate, loss and
    elapsed time, then sets the optimizer learning rate from
    `learning_rate_fn`. At the end of each epoch it prints the validation
    top-1 and top-5 accuracy. It also switches the Keras learning phase on at
    the start of each epoch and off at the start of each evaluation.
    """

    def __init__(self, starttime):
        """Remember `starttime` as the start of the first epoch and first batch window."""
        super().__init__()
        self.epoch_starttime = starttime
        self.batch_starttime = starttime

    def on_train_batch_end(self, batch, logs):
        """Report progress and refresh the learning rate every 100 steps."""
        step = tf.keras.backend.get_value(self.model.optimizer.iterations)
        if step % 100 != 0:
            return
        now = time.time()
        elasp_time = now - self.batch_starttime
        self.batch_starttime = time.time()
        # Read the current rate before overwriting it, so the printed value is
        # the rate that was in effect for the steps just completed.
        lr = tf.keras.backend.get_value(self.model.optimizer.lr)
        tf.keras.backend.set_value(self.model.optimizer.lr, learning_rate_fn(step))
        message = "Steps:{}, LR:{:6.4f}, Loss:{:4.2f}, Time:{:4.1f}s"
        print(message.format(step, lr, logs['loss'], elasp_time))

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation accuracies and the time the epoch took."""
        epoch_elasp_time = time.time() - self.epoch_starttime
        message = "Epoch:{}, Top-1 Accuracy:{:5.3f}, Top-5 Accuracy:{:5.3f}, Time:{:5.1f}s"
        top_1 = logs['val_top_1_accuracy']
        top_5 = logs['val_top_5_accuracy']
        print(message.format(epoch, top_1, top_5, epoch_elasp_time))

    def on_epoch_begin(self, epoch, logs=None):
        """Switch to the training learning phase and restart the epoch timer."""
        tf.keras.backend.set_learning_phase(True)
        self.epoch_starttime = time.time()

    def on_test_begin(self, logs=None):
        """Switch to the inference learning phase before evaluation."""
        tf.keras.backend.set_learning_phase(False)
# Writes TensorBoard logs under mobilenet/logs.
tensorboard_cbk = tf.keras.callbacks.TensorBoard(log_dir='mobilenet/logs')
# Saves the model to an .h5 file whose name contains the epoch number.
checkpoint_cbk = tf.keras.callbacks.ModelCheckpoint(
    filepath='mobilenet/test_{epoch}.h5',
    verbose=1,
)
Compile the model
Compile the model: define the loss function, select the optimizer, and select the validation metrics.
model = mobilenet_model_v2()
# The model outputs raw logits, so the loss applies the softmax itself.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[
        tf.keras.metrics.SparseCategoricalAccuracy(name='top_1_accuracy'),
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top_5_accuracy'),
    ],
)
Train and validate the model
Finally, start training and validation. Note that the callbacks argument is filled in with the callbacks we defined earlier, which adjust the learning rate, print the validation results, and save the model. Afterwards, if you need to load the model, you only need to call tf.keras.models.load_model; you do not need to compile the model again.
train_data = train_input_fn()
val_data = val_input_fn()
# Progress is printed by LRCallback, so the built-in Keras output is disabled.
training_callbacks = [LRCallback(time.time()), tensorboard_cbk, checkpoint_cbk]
_ = model.fit(
    train_data,
    epochs=2,
    steps_per_epoch=5000,
    validation_data=val_data,
    callbacks=training_callbacks,
    verbose=0,
)
Custom Training Loop
As the code above shows, the Keras model's compile and fit methods make it very convenient to train a model. The only flaw I found is that the process is too much of a black box, and some of the details inside are hidden from view. That can be inconvenient if you need additional control over the training process (although in theory this should be done in callback functions). For me, the biggest problem is that the model seems to converge too slowly during training and the final accuracy is not very satisfactory, and I am not sure of the specific reasons. For this reason I also wrote a custom training loop for comparison. With this method, the code above, starting from the compilation of the model, is replaced by the following code. The amount of code is slightly larger, but judging from my actual training results it seems to work better:
# Epoch bookkeeping for the custom loop.
START_EPOCH = 0   # epoch number added to the saved model file name
NUM_EPOCH = 1     # number of epochs to run
STEPS_EPOCH = 0   # step budget compared against the per-epoch step count
STEPS_OFFSET = 0  # offset added to the optimizer iteration count

train_data = train_input_fn()
val_data = val_input_fn()

with tf.device('/GPU:0'):
    model = mobilenet_model_v2()
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
    #model = tf.keras.models.load_model('model/darknet53_custom_training_12.h5')
@tf.function
def train_step(inputs, labels):
    """Run one optimizer update on a batch and return the total loss.

    The total loss is the sparse categorical cross-entropy of the model's
    logits plus the sum of the model's regularization losses.
    """
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    with tf.GradientTape() as tape:
        logits = model(inputs, training=True)
        weight_penalty = tf.math.add_n(model.losses)
        total_loss = loss_fn(labels, logits) + weight_penalty
    variables = model.trainable_variables
    grads = tape.gradient(total_loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return total_loss
# Step counts at which the learning rate changes, and the rate used in each
# interval they delimit.
boundaries = [1000, 5000, 65000, 100000]
values = [0.001, 0.1, 0.01, 0.001, 0.0001]
learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries, values)

for epoch in range(NUM_EPOCH):
    # ---- Training ----
    start_step = tf.keras.backend.get_value(optimizer.iterations)+STEPS_OFFSET
    steps = start_step
    loss_sum = 0
    start_time = time.time()
    for inputs, labels in train_data:
        if (steps-start_step) > STEPS_EPOCH:
            break
        loss_sum += train_step(inputs, labels)
        steps = tf.keras.backend.get_value(optimizer.iterations)+STEPS_OFFSET
        if steps % 100 == 0:
            # Report the mean loss over the last 100 steps, then apply the
            # scheduled learning rate for the current step.
            elasp_time = time.time()-start_time
            lr = tf.keras.backend.get_value(optimizer.lr)
            print("Step:{}, Loss:{:4.2f}, LR:{:5f}, Time:{:3.1f}s".format(steps, loss_sum/100, lr, elasp_time))
            loss_sum = 0
            tf.keras.backend.set_value(optimizer.lr, learning_rate_fn(steps))
            start_time = time.time()
        # Only affects the break test above; `steps` is re-read from the
        # optimizer after the next training step.
        steps += 1
    model.save('model/darknet53_custom_training_'+str(START_EPOCH+epoch)+'.h5')

    # ---- Validation ----
    m1 = tf.keras.metrics.SparseCategoricalAccuracy()
    # k is spelled out so the metric and the printed label stay in agreement.
    m2 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5)
    for inputs, labels in val_data:
        val_predict_logits = model(inputs, training=False)
        val_predict = tf.keras.activations.softmax(val_predict_logits)
        m1.update_state(labels, val_predict)
        m2.update_state(labels, val_predict)
    print("Top-1 Accuracy:%f, Top-5 Accuracy:%f"%(m1.result().numpy(),m2.result().numpy()))
    m1.reset_states()
    m2.reset_states()