Implement Imagenet training with Tensorflow 2.0

TensorFlow 2.0 was officially released in October, and this was my first time switching to the new version. After spending some time with it, my conclusion is that 2.0 is really easy to use, but it has one shortcoming: it is so heavily encapsulated that it is hard to understand the mechanisms underneath. For example, after trying the Keras model and training workflow recommended for 2.0, I found that training did not converge as fast as it did with the low-level API in 1.x, and it did not seem to reach the accuracy I got with 1.x. Another example: when a Keras model contains Batch Normalization layers and is trained and validated directly with Keras's fit method, the official documentation does not explain how training and prediction modes are distinguished. In my tests the model converged very slowly. Some posts online mention similar problems and suggest calling tf.keras.backend.set_learning_phase as the solution. However, I found no significant difference whether I called it or not; perhaps those posts were based on a pre-release version of TF 2.0 that behaves differently from the official release. Later I also switched to a custom training loop for comparison and found that it seems to converge faster and more stably than calling model.fit directly, although it still does not seem to reach the accuracy of 1.x. Although I have not yet mastered TF 2.0, I feel that its ease of use is greatly improved and that it is worth studying in more depth. Below I record the process of training on ImageNet with TF 2.0. The ImageNet files are still converted into a training set and a validation set using the method described in my previous blog post, which I will not repeat here.

Model definition

I use the MobileNet V2 model, defined in the following code:

import os
import time

import tensorflow as tf
# Short alias for the Keras layers namespace; used by every model-building helper below.
l = tf.keras.layers
# Spatial size, in pixels, of the images fed to the network.
imageWidth = 224
imageHeight = 224

def _conv(inputs, filters, kernel_size, strides, padding, bias=False, normalize=True, activation='relu'):
    """Apply Conv2D, optionally followed by batch normalization and an activation.

    Args:
        inputs: Input tensor; batch normalization is applied on axis 3.
        filters: Number of output filters of the convolution.
        kernel_size: Convolution kernel size.
        strides: Convolution strides.
        padding: If greater than 0, the input is explicitly zero-padded by
            this amount and the convolution uses 'valid' padding; otherwise
            the convolution uses 'same' padding.
        bias: Whether the convolution has a bias term.
        normalize: Whether to add a BatchNormalization layer after the conv.
        activation: One of 'relu', 'relu6' or 'leaky_relu'; any other value
            (e.g. 'linear') adds no activation layer.

    Returns:
        The output tensor of the last layer that was applied.

    Note:
        `normalize` comes before `activation` in the signature, so pass
        `activation` by keyword when calling this helper.
    """
    x = inputs
    if padding > 0:
        # Explicit padding replaces the convolution's own 'same' padding.
        x = l.ZeroPadding2D(padding=padding)(x)
        conv_padding = 'valid'
    else:
        conv_padding = 'same'
    conv_layer = l.Conv2D(filters, kernel_size, strides, conv_padding, use_bias=bias,
                          kernel_initializer='he_normal',
                          kernel_regularizer=tf.keras.regularizers.l2(l=5e-4))
    x = conv_layer(x)
    if normalize:
        x = l.BatchNormalization(axis=3)(x)
    if activation == 'relu':
        x = l.ReLU()(x)
    elif activation == 'relu6':
        x = l.ReLU(max_value=6)(x)
    elif activation == 'leaky_relu':
        x = l.LeakyReLU(alpha=0.1)(x)
    return x
 
def _dwconv(inputs, filters, kernel_size, strides, padding, bias=False, activation='relu'):
    """Apply DepthwiseConv2D followed by batch normalization and an activation.

    Args:
        inputs: Input tensor; batch normalization is applied on axis 3.
        filters: Accepted for signature symmetry with `_conv`; not used,
            because a depthwise convolution keeps the input channel count.
        kernel_size: Depthwise kernel size.
        strides: Convolution strides.
        padding: If greater than 0, the input is explicitly zero-padded by
            this amount on height and width and the convolution uses 'valid'
            padding; otherwise the convolution uses 'same' padding.
        bias: Whether the convolution has a bias term.
        activation: One of 'relu', 'relu6' or 'leaky_relu'; any other value
            adds no activation layer.

    Returns:
        The output tensor of the last layer that was applied.
    """
    x = inputs
    if padding > 0:
        # Explicit padding replaces the convolution's own 'same' padding.
        x = l.ZeroPadding2D(padding=(padding, padding))(x)
        conv_padding = 'valid'
    else:
        conv_padding = 'same'
    depthwise_layer = l.DepthwiseConv2D(kernel_size, strides, conv_padding, use_bias=bias,
                                        depthwise_initializer='he_uniform',
                                        depthwise_regularizer=tf.keras.regularizers.l2(l=5e-4))
    x = depthwise_layer(x)
    x = l.BatchNormalization(axis=3)(x)
    if activation == 'relu':
        x = l.ReLU()(x)
    elif activation == 'relu6':
        x = l.ReLU(max_value=6)(x)
    elif activation == 'leaky_relu':
        x = l.LeakyReLU(alpha=0.1)(x)
    return x
 
def _bottleneck(inputs, in_filters, out_filters, kernel_size, strides, bias=False, activation='relu6', t=1):
    """Build one inverted-residual bottleneck: 1x1 expand, depthwise, 1x1 project.

    Args:
        inputs: Input tensor with channels on axis 3.
        in_filters: Channel count used as the base of the expansion.
        out_filters: Number of channels produced by the projection conv.
        kernel_size: Kernel size of the depthwise convolution.
        strides: Stride of the depthwise convolution. For a stride of 2 the
            input is explicitly zero-padded by 1 pixel before the depthwise
            convolution.
        bias: Accepted for interface compatibility; every convolution in the
            block is created without a bias term regardless of this value.
        activation: Activation used after the expansion and depthwise convs.
        t: Expansion factor; the inner layers use `in_filters * t` channels.

    Returns:
        The block output. When `strides` is 1 and the input and output have
        the same number of channels, the input is added to it (residual).
    """
    expanded_filters = in_filters * t
    # `activation` must be passed by keyword: the 7th positional parameter of
    # `_conv` is `normalize`, so passing it positionally would leave the
    # activation at its default ('relu').
    output = _conv(inputs, expanded_filters, 1, 1, 0, bias=False, activation=activation)
    padding = 1 if strides == 2 else 0
    output = _dwconv(output, expanded_filters, kernel_size, strides, padding, bias=False, activation=activation)
    # Projection layer: batch-normalized but with no non-linearity.
    output = _conv(output, out_filters, 1, 1, 0, bias=False, activation='linear')
    if strides == 1 and inputs.get_shape().as_list()[3] == output.get_shape().as_list()[3]:
        output = l.add([output, inputs])
    return output

def mobilenet_model_v2():
    """Build the MobileNet-V2-style classifier used for ImageNet training.

    The network takes channels-last images of shape
    (imageHeight, imageWidth, 3) and outputs 1000 raw logits (no softmax).

    Returns:
        A `tf.keras.Model` mapping the image input to the logits.
    """
    # Input layer
    image = tf.keras.Input(shape=(imageHeight, imageWidth, 3))          # 224*224*3
    # `activation` is passed by keyword: the 7th positional parameter of
    # `_conv` is `normalize`, so a positional 'relu6' would be taken as the
    # normalize flag and the layer would silently use the default 'relu'.
    net = _conv(image, 32, 3, 2, 1, bias=False, activation='relu6')     # 112*112*32

    # (in_filters, out_filters, strides, t) of each bottleneck, in order.
    bottlenecks = [
        (32, 16, 1, 1),      # 112*112*16
        (16, 24, 2, 6),      # 56*56*24
        (24, 24, 1, 6),      # 56*56*24
        (24, 32, 2, 6),      # 28*28*32
        (32, 32, 1, 6),      # 28*28*32
        (32, 32, 1, 6),      # 28*28*32
        (32, 64, 2, 6),      # 14*14*64
        (64, 64, 1, 6),      # 14*14*64
        (64, 64, 1, 6),      # 14*14*64
        (64, 64, 1, 6),      # 14*14*64
        (64, 96, 1, 6),      # 14*14*96
        (96, 96, 1, 6),      # 14*14*96
        (96, 96, 1, 6),      # 14*14*96
        (96, 96, 1, 6),      # 14*14*96
        (96, 160, 2, 6),     # 7*7*160
        (160, 160, 1, 6),    # 7*7*160
        (160, 160, 1, 6),    # 7*7*160
        (160, 320, 1, 6),    # 7*7*320
    ]
    for in_filters, out_filters, strides, t in bottlenecks:
        net = _bottleneck(net, in_filters, out_filters, 3, strides, False, 'relu6', t)

    # NOTE(review): this head convolution uses a 3x3 kernel; confirm whether
    # a 1x1 kernel was intended.
    net = _conv(net, 1280, 3, 1, 0, bias=False, activation='relu6')     # 7*7*1280
    # A 7x7 average pool over the 7x7 feature map leaves one value per channel.
    net = l.AveragePooling2D(7)(net)
    net = l.Flatten()(net)
    logits = l.Dense(1000, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=1/1000))(net)
    model = tf.keras.Model(inputs=image, outputs=logits)
    return model

Build training set and validation set

# Number of channels per image.
imageDepth = 3
# Number of examples per batch.
batch_size = 64
# Target length, in pixels, of the shorter image side after resizing.
resize_min = 256

# Full paths of every TFRecord file in the training directory.
train_files_names = os.listdir('/AI/train_tf/')
train_files = [os.path.join('/AI/train_tf/', name) for name in train_files_names]
# Full paths of every TFRecord file in the validation directory.
valid_files_names = os.listdir('/AI/valid_tf/')
valid_files = [os.path.join('/AI/valid_tf/', name) for name in valid_files_names]

# Parse a TFRecord example and distort the image for training
def _parse_function(example_proto):
    """Decode one serialized training example into (features, label).

    The JPEG is decoded, resized so that its shorter side equals
    `resize_min` (keeping the aspect ratio), randomly cropped to
    imageHeight x imageWidth, randomly flipped left/right and standardized.

    Args:
        example_proto: A scalar string tensor holding a serialized example.

    Returns:
        A tuple `({'input_1': image}, label)` where `image` is a float32
        tensor in height x width x channels layout and `label` is the first
        element of the example's "label" feature.
    """
    features = {"image": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.io.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.io.VarLenFeature(tf.float32),
                "bbox_xmax": tf.io.VarLenFeature(tf.float32),
                "bbox_ymin": tf.io.VarLenFeature(tf.float32),
                "bbox_ymax": tf.io.VarLenFeature(tf.float32),
                "text": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.io.FixedLenFeature([], tf.string, default_value="")
               }
    parsed_features = tf.io.parse_single_example(example_proto, features)
    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    # Resize so that the shorter side becomes resize_min, preserving the aspect ratio
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    resized_height, resized_width = tf.cond(height<width,
        lambda: (resize_min, tf.cast(tf.multiply(tf.cast(width, tf.float64),tf.divide(resize_min,height)), tf.int32)),
        lambda: (tf.cast(tf.multiply(tf.cast(height, tf.float64),tf.divide(resize_min,width)), tf.int32), resize_min))
    image_float = tf.image.convert_image_dtype(image_decoded, tf.float32)
    resized = tf.image.resize(image_float, [resized_height, resized_width])
    # Random crop from the resized image
    cropped = tf.image.random_crop(resized, [imageHeight, imageWidth, 3])
    # Flip to add a little more random distortion in.
    flipped = tf.image.random_flip_left_right(cropped)
    # Standardize the image
    image_train = tf.image.per_image_standardization(flipped)
    # The image stays in height x width x channels order: the model input is
    # declared as (imageHeight, imageWidth, 3), so it must not be transposed
    # to channels-first here.
    features = {'input_1': image_train}
    return features, parsed_features["label"][0]
 
def train_input_fn():
    """Build the training input pipeline.

    Returns:
        A `tf.data.Dataset` of batches produced by `_parse_function`, read
        from `train_files`, shuffled, repeated 10 times and batched by
        `batch_size`.
    """
    dataset_train = tf.data.TFRecordDataset(train_files)
    # Shuffle the serialized records *before* decoding them: the 10000-element
    # buffer then holds compressed JPEG records instead of decoded float32
    # crops (224*224*3*4 bytes each, i.e. about 6 GB for the whole buffer).
    dataset_train = dataset_train.shuffle(10000)
    dataset_train = dataset_train.map(_parse_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_train = dataset_train.repeat(10)
    dataset_train = dataset_train.batch(batch_size)
    # After batch() the prefetch buffer counts whole batches, so let tf.data
    # pick the size instead of buffering `batch_size` batches.
    dataset_train = dataset_train.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset_train

def _parse_test_function(example_proto):
    """Decode one serialized validation example into (features, label).

    The JPEG is decoded, resized so that its shorter side equals
    `resize_min` (keeping the aspect ratio), center-cropped to
    imageHeight x imageWidth and standardized. No random distortion is
    applied.

    Args:
        example_proto: A scalar string tensor holding a serialized example.

    Returns:
        A tuple `({'input_1': image}, label)` where `image` is a float32
        tensor in height x width x channels layout and `label` is the first
        element of the example's "label" feature.
    """
    features = {"image": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.io.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.io.VarLenFeature(tf.float32),
                "bbox_xmax": tf.io.VarLenFeature(tf.float32),
                "bbox_ymin": tf.io.VarLenFeature(tf.float32),
                "bbox_ymax": tf.io.VarLenFeature(tf.float32),
                "text": tf.io.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.io.FixedLenFeature([], tf.string, default_value="")
               }
    parsed_features = tf.io.parse_single_example(example_proto, features)
    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    # Resize so that the shorter side becomes resize_min, preserving the aspect ratio
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    resized_height, resized_width = tf.cond(height<width,
        lambda: (resize_min, tf.cast(tf.multiply(tf.cast(width, tf.float64),tf.divide(resize_min,height)), tf.int32)),
        lambda: (tf.cast(tf.multiply(tf.cast(height, tf.float64),tf.divide(resize_min,width)), tf.int32), resize_min))
    image_float = tf.image.convert_image_dtype(image_decoded, tf.float32)
    image_resized = tf.image.resize(image_float, [resized_height, resized_width])

    # Compute the top-left corner of the center crop
    shape = tf.shape(image_resized)
    height, width = shape[0], shape[1]
    amount_to_be_cropped_h = (height - imageHeight)
    crop_top = amount_to_be_cropped_h // 2
    amount_to_be_cropped_w = (width - imageWidth)
    crop_left = amount_to_be_cropped_w // 2
    image_cropped = tf.slice(image_resized, [crop_top, crop_left, 0], [imageHeight, imageWidth, -1])
    image_valid = tf.image.per_image_standardization(image_cropped)
    # The image stays in height x width x channels order: the model input is
    # declared as (imageHeight, imageWidth, 3), so it must not be transposed
    # to channels-first here.
    features = {'input_1': image_valid}
    return features, parsed_features["label"][0]
 
def val_input_fn():
    """Build the validation input pipeline.

    Returns:
        A `tf.data.Dataset` of batches produced by `_parse_test_function`,
        read from `valid_files` in a single pass and batched by `batch_size`.
    """
    dataset_valid = tf.data.TFRecordDataset(valid_files)
    dataset_valid = dataset_valid.map(_parse_test_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_valid = dataset_valid.batch(batch_size)
    # After batch() the prefetch buffer counts whole batches, so let tf.data
    # pick the size instead of buffering `batch_size` batches.
    dataset_valid = dataset_valid.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset_valid

Define the callback function of the model

The main purpose of the callback is to adjust the optimizer's learning rate according to the number of training steps, and to print the validation metrics after each training epoch completes, as in the following code:

# Piecewise-constant learning-rate schedule keyed on the optimizer step:
# `boundaries` are the step counts at which the rate changes and `values`
# holds one rate per resulting interval (one more entry than `boundaries`).
# The first interval uses a small rate (0.001) before the jump to 0.1.
boundaries = [1000, 5000, 60000, 80000]
values = [0.001, 0.1, 0.01, 0.001, 0.0001]
learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries, values)

class LRCallback(tf.keras.callbacks.Callback):
    """Keras callback that applies the learning-rate schedule and logs progress.

    Every 100 optimizer steps it prints the step, learning rate, loss and
    elapsed time, then sets the optimizer's learning rate from
    `learning_rate_fn`. At the end of each epoch it prints the validation
    top-1 / top-5 accuracy and the epoch duration.

    Args:
        starttime: Timestamp (as returned by `time.time()`) used as the
            initial reference for both the batch and the epoch timers.
    """

    def __init__(self, starttime):
        super().__init__()
        self.epoch_starttime = starttime
        self.batch_starttime = starttime

    def on_train_batch_end(self, batch, logs=None):
        """Every 100 steps, log progress and refresh the learning rate."""
        logs = logs or {}
        step = tf.keras.backend.get_value(self.model.optimizer.iterations)
        if step % 100 == 0:
            now = time.time()
            elasp_time = now - self.batch_starttime
            self.batch_starttime = now
            # Read the rate before updating it, so the printed value is the
            # one that was in effect for the steps just completed.
            lr = tf.keras.backend.get_value(self.model.optimizer.lr)
            tf.keras.backend.set_value(self.model.optimizer.lr, learning_rate_fn(step))
            print("Steps:{}, LR:{:6.4f}, Loss:{:4.2f}, Time:{:4.1f}s"
                  .format(step, lr, logs.get('loss', float('nan')), elasp_time))

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation accuracies and the duration of the epoch."""
        logs = logs or {}
        epoch_elasp_time = time.time() - self.epoch_starttime
        # The validation metrics are absent when fit() runs without
        # validation data; print NaN instead of raising KeyError.
        missing = float('nan')
        print("Epoch:{}, Top-1 Accuracy:{:5.3f}, Top-5 Accuracy:{:5.3f}, Time:{:5.1f}s"
              .format(epoch,
                      logs.get('val_top_1_accuracy', missing),
                      logs.get('val_top_5_accuracy', missing),
                      epoch_elasp_time))

    def on_epoch_begin(self, epoch, logs=None):
        """Switch to training mode and restart the epoch timer."""
        # NOTE(review): the accompanying article reports no measurable effect
        # from forcing the learning phase here; confirm whether these calls
        # are still needed.
        tf.keras.backend.set_learning_phase(True)
        self.epoch_starttime = time.time()

    def on_test_begin(self, logs=None):
        """Switch to inference mode before validation starts."""
        tf.keras.backend.set_learning_phase(False)

# Writes TensorBoard logs under mobilenet/logs.
tensorboard_cbk = tf.keras.callbacks.TensorBoard(log_dir='mobilenet/logs')
# Saves the model to an HDF5 file whose name is formatted with the epoch number.
checkpoint_cbk = tf.keras.callbacks.ModelCheckpoint(filepath='mobilenet/test_{epoch}.h5', verbose=1)

Compile the model

Compile the model: define the loss function, and select the optimizer and the validation metrics.

model = mobilenet_model_v2()
# The model outputs raw logits, hence from_logits=True for the loss.
# The metric names chosen here determine the 'val_top_1_accuracy' and
# 'val_top_5_accuracy' log keys that LRCallback.on_epoch_end reads.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='top_1_accuracy'),
                       tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top_5_accuracy')])

Train and validate the model

Finally, start training and validation. Note that the callbacks argument is filled with the callbacks we defined earlier, which adjust the learning rate, print the validation results, and save the model. Afterwards, if you need to load the model, you only need to call tf.keras.models.load_model; there is no need to compile the model again.

train_data = train_input_fn()
val_data = val_input_fn()
# verbose=0 silences the built-in Keras progress output; progress is printed
# by LRCallback instead. Each epoch runs 5000 training steps and is followed
# by an evaluation on val_data.
_ = model.fit(train_data,
              validation_data=val_data,
              epochs=2,
              verbose=0,
              callbacks=[LRCallback(time.time()), tensorboard_cbk, checkpoint_cbk],
              steps_per_epoch=5000)

Custom Training Loop

As the code above shows, Keras's compile and fit methods make training a model very convenient. The drawback I found is that the process is too much of a black box: some of the details it hides should not be hidden, and it can be inconvenient if you need extra control over the training process (although in theory that should be done in callbacks). For me, though, the biggest problem is that the model seems to converge too slowly during training and the final accuracy is not very satisfactory, and I am not sure of the specific reasons. For this reason I also wrote a custom training loop for comparison. With this approach, the code above, from the model compilation step onward, is replaced by the following code. The amount of code is slightly larger, but judging from my actual training results, it seems to work better:

train_data = train_input_fn()
val_data = val_input_fn()
# Offset added to the epoch index when naming the saved model files.
START_EPOCH = 0
# Number of train-then-validate rounds to run.
NUM_EPOCH = 1
# Training steps per epoch. As the loop below is written, at least one step
# is always run, even when this is 0.
STEPS_EPOCH = 0
# Offset added to the optimizer's step counter, e.g. when resuming training.
STEPS_OFFSET = 0
with tf.device('/GPU:0'):
    model = mobilenet_model_v2()
    # To resume from a saved model instead of starting from scratch:
    #model = tf.keras.models.load_model('model/darknet53_custom_training_12.h5')

    boundaries = [1000, 5000, 65000, 100000]
    values = [0.001, 0.1, 0.01, 0.001, 0.0001]
    learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries, values)

    # Start at the rate the schedule prescribes for the first step, so the
    # steps before the first periodic update below do not run at a rate that
    # disagrees with the schedule.
    initial_lr = float(tf.keras.backend.get_value(learning_rate_fn(STEPS_OFFSET)))
    optimizer = tf.keras.optimizers.SGD(learning_rate=initial_lr, momentum=0.9)
    # Built once and reused by every training step.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    @tf.function
    def train_step(inputs, labels):
        """Run one optimization step and return the total (data + L2) loss."""
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            # model.losses holds the kernel-regularizer penalties of the layers.
            regularization_loss = tf.math.add_n(model.losses)
            pred_loss = loss_fn(labels, predictions)
            total_loss = pred_loss + regularization_loss
        gradients = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return total_loss

    # One iterator shared by all epochs: re-iterating the dataset in every
    # epoch would restart from its first records each time, so later records
    # would never be seen whenever an epoch is shorter than the dataset.
    train_iterator = iter(train_data)

    for epoch in range(NUM_EPOCH):
        start_step = tf.keras.backend.get_value(optimizer.iterations)+STEPS_OFFSET
        steps = start_step
        loss_sum = 0
        start_time = time.time()
        while (steps-start_step) <= STEPS_EPOCH:
            try:
                inputs, labels = next(train_iterator)
            except StopIteration:
                # The training data is exhausted.
                break
            loss_sum += train_step(inputs, labels)
            steps = tf.keras.backend.get_value(optimizer.iterations)+STEPS_OFFSET
            if steps%100 == 0:
                elasp_time = time.time()-start_time
                lr = tf.keras.backend.get_value(optimizer.lr)
                print("Step:{}, Loss:{:4.2f}, LR:{:5f}, Time:{:3.1f}s".format(steps, loss_sum/100, lr, elasp_time))
                loss_sum = 0
                tf.keras.backend.set_value(optimizer.lr, learning_rate_fn(steps))
                start_time = time.time()
            steps += 1
        # NOTE(review): the file name says "darknet53" although the model
        # built above is MobileNet V2; kept unchanged so existing paths still
        # match.
        model.save('model/darknet53_custom_training_'+str(START_EPOCH+epoch)+'.h5')

        # Fresh metric objects for every epoch, so no state carries over.
        m1 = tf.keras.metrics.SparseCategoricalAccuracy()
        m2 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5)
        for inputs, labels in val_data:
            val_predict_logits = model(inputs, training=False)
            val_predict = tf.keras.activations.softmax(val_predict_logits)
            m1.update_state(labels, val_predict)
            m2.update_state(labels, val_predict)
        print("Top-1 Accuracy:%f, Top-5 Accuracy:%f"%(m1.result().numpy(),m2.result().numpy()))

Implement Imagenet training with Tensorflow 2.0

Guess you like