The main training trim gradient avoid problems gradient explosion, in general, the use of the Batch Normalization is not necessary to use a gradient trim, but still have to achieve the necessary understanding under

In TensorFlow, the optimizer’s minimize() function takes care of both computing the gradients and applying them, so you must instead call the optimizer’s compute_gradients() method first, then create an operation to clip the gradients using the clip_by_value() function, and finally create an operation to apply the clipped gradients using the optimizer’s apply_gradients() method:

threshold = 1.0
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
　　　　　　　　 for grad, var in grads_and_vars]
training_op = optimizer.apply_gradients(capped_gvs)

example

import tensorflow as tf

def Swish(features):
    return features*tf.nn.sigmoid(features)

# 1. create data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)

X = tf.placeholder(tf.float32, shape=(None, 784), name='X')
y = tf.placeholder(tf.int32, shape=(None), name='y')
is_training = tf.placeholder(tf.bool, None, name='is_training')

# 2. define network
he_init = tf.contrib.layers.variance_scaling_initializer()
with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, 300, kernel_initializer=he_init, name='hidden1')
    # hidden1 = tf.layers.batch_normalization(hidden1, momentum=0.9)
    hidden1 = tf.nn.relu(hidden1)
    hidden2 = tf.layers.dense(hidden1, 100, kernel_initializer=he_init, name='hidden2')
    # hidden2 = tf.layers.batch_normalization(hidden2, training=is_training, momentum=0.9)
    hidden2 = tf.nn.relu(hidden2)
    logits = tf.layers.dense(hidden2, 10, kernel_initializer=he_init, name='output')
    # prob = tf.layers.dense(hidden2, 10, tf.nn.softmax, name='prob')

# 3. define loss
with tf.name_scope('loss'):
    # tf.losses.sparse_softmax_cross_entropy() label is not one_hot and dtype is int*
    # xentropy = tf.losses.sparse_softmax_cross_entropy(labels=tf.argmax(y, axis=1), logits=logits)
    # tf.nn.sparse_softmax_cross_entropy_with_logits() label is not one_hot and dtype is int*
    # xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.argmax(y, axis=1), logits=logits)
    # loss = tf.reduce_mean(xentropy)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits) # label is one_hot

# 4. define optimizer
learning_rate = 0.01
with tf.name_scope('train'):
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # for batch normalization
    with tf.control_dependencies(update_ops):
        # optimizer_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
        threshold = 1.0
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                      for grad, var in grads_and_vars]
        optimizer_op = optimizer.apply_gradients(capped_gvs)



with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, tf.argmax(y, axis=1), 1) # 目标是否在前K个预测中, label's dtype is int*
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# 5. initialize
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
saver = tf.train.Saver()
# =================
print([v.name for v in tf.trainable_variables()])
print([v.name for v in tf.global_variables()])
# =================
# 5. train & test
n_epochs = 20
n_batches = 50
batch_size = 50

with tf.Session() as sess:
    sess.run(init_op)
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(optimizer_op, feed_dict={X: X_batch, y: y_batch, is_training:True})
            # =================
            # for grad, var in grads_and_vars:
            #     grad = grad.eval(feed_dict={X: X_batch, y: y_batch, is_training:True})
            #     var = var.eval()
            # =================
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch, is_training:False}) # 最后一个 batch 的 accuracy
        acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
        loss_test = loss.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test, "Test loss:", loss_test)
    save_path = saver.save(sess, "./my_model_final.ckpt")

with tf.Session() as sess:
    sess.run(init_op)
    saver.restore(sess, "./my_model_final.ckpt")
    acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
    loss_test = loss.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels, is_training:False})
    print("Test accuracy:", acc_test, ", Test loss:", loss_test)

View Code

Let us look at some of the things involved in the above example

compute_gradients

compute_gradients any optimizer has a method:

compute_gradients(
    loss,
    var_list=None,
    gate_gradients=GATE_OP,
    aggregation_method=None,
    colocate_gradients_with_ops=False,
    grad_loss=None
)

Loss gradient in trainable var_list calculation.
The first step is equivalent to minimize () returns (gradient, variable) list.

After we get the range of gradient you can manually cut the gradient, the following sentence is to limit the gradient to [-threshold, threshold] is:

capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                      for grad, var in grads_and_vars]

apply_gradients

apply_gradients also have any of a method of optimization is:

apply_gradients(
    grads_and_vars,
    global_step=None,
    name=None
)

minimize () a second portion, the gradient returns an update ops performed.

TensorFlow use record (eight): gradient trim

example

compute_gradients

apply_gradients

Guess you like