- Loss is NaN (a problem I ran into recently while implementing ArcFace, later renamed InsightFace; the network structure is …)
Consider exploding/vanishing gradients:
- Reduce the learning rate;
- Clip the gradients:

```python
optimizer = tf.train.AdamOptimizer(learning_rate)
grads = optimizer.compute_gradients(AMS_loss)  # list of (gradient, variable) pairs
for i, (g, v) in enumerate(grads):             # gradients can be inspected here
    if g is not None:
        grads[i] = (tf.clip_by_norm(g, 5), v)  # gradient clipping; threshold set to 5 here
train_op = optimizer.apply_gradients(grads)
```
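If clipping each tensor's norm separately isn't enough, a common alternative (a standard TF 1.x pattern, not from the original post) is to clip by the global norm across all gradients at once; `optimizer` and `AMS_loss` below are the same objects as above:

```python
# Joint clipping: rescales ALL gradients by one common factor whenever their
# combined L2 norm exceeds the threshold, preserving their relative directions.
grads_and_vars = optimizer.compute_gradients(AMS_loss)
grads, variables = zip(*grads_and_vars)
clipped_grads, _ = tf.clip_by_global_norm(grads, 5.0)  # None entries pass through unchanged
train_op = optimizer.apply_gradients(zip(clipped_grads, variables))
```

Unlike `tf.clip_by_norm`, this keeps the overall update direction intact and only shrinks its magnitude.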
- An improperly written loss function. (If the loss is hand-written, this is by far the most likely cause.)
My own AM-Softmax loss is a case in point; it is pretty crude, but I am posting it here for the record. If anyone can point out improvements, I will post the fixed version later.

```python
def AM_Softmax_Loss(f, s=30, m=0.35):
    '''
    :param f: feature embeddings, shape (batch_size, 2048)
    :param s: scale factor
    :param m: additive margin
    :return: loss and raw class scores
    '''
    def clip_score(tensor, min_value=-1.79769313e+30, max_value=1.79769313e+30):
        return tf.clip_by_value(tensor, min_value, max_value)

    W = tf.Variable(tf.truncated_normal(
        [f.get_shape().as_list()[1], num_classes], stddev=0.1))  # (2048, num_classes)
    # ground_truth = np.argmax(inputs_Y, axis=1)  # (batch_size,)
    ground_truth = tf.argmax(Y_label, axis=1)  # (batch_size,)
    AMS_loss = 0
    cnt = 0
    for i in range(batch_size):
        try:
            fi = f[i, :]  # (2048,)
            cnt += 1
        except:
            break
        Wyi = W[:, ground_truth[i]]  # (2048,)
        correct_score = tf.exp(s * (tf.reduce_sum(tf.multiply(fi, Wyi)) - m))
        clip_other_score = clip_score(tf.exp(s * tf.matmul(fi[tf.newaxis, :], W)))
        other_score = tf.reduce_sum(clip_other_score, axis=1)
        other_score -= tf.exp(s * tf.reduce_sum(tf.multiply(fi, Wyi)))
        AMS_loss += tf.log(correct_score) - tf.log(correct_score + other_score)
    AMS_loss *= -(1 / cnt)
    Y_pred = tf.matmul(f, W)
    return AMS_loss[0], Y_pred
```
The scale factor s here is what caused the gradient explosion: since neither f nor W is L2-normalized, the inner products are not cosines bounded in [-1, 1], so `tf.exp(s * ...)` easily overflows to inf, and the loss degenerates into inf − inf = NaN. (The paper's setting is s = 10, m = 0.35.)
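For reference, here is a minimal sketch of a numerically stable, vectorized version (my own suggested rewrite, not code from the paper): L2-normalize both f and the columns of W so the logits are true cosines in [-1, 1], and let the cross-entropy op handle the log-sum-exp instead of calling `tf.exp` by hand. It assumes a recent TF 1.x, and that `Y_label` is the same one-hot (batch_size, num_classes) tensor used above.

```python
import tensorflow as tf

def am_softmax_loss_stable(f, Y_label, num_classes, s=30.0, m=0.35):
    # f: (batch_size, feat_dim) embeddings; Y_label: (batch_size, num_classes) one-hot.
    feat_dim = f.get_shape().as_list()[1]
    W = tf.get_variable('am_softmax_W', shape=[feat_dim, num_classes],
                        initializer=tf.truncated_normal_initializer(stddev=0.1))
    # Normalizing both operands turns the logits into cosine similarities in [-1, 1],
    # so s * (cos - m) is bounded and the softmax's internal exp cannot overflow.
    f_norm = tf.nn.l2_normalize(f, axis=1)
    W_norm = tf.nn.l2_normalize(W, axis=0)
    cos_theta = tf.matmul(f_norm, W_norm)  # (batch_size, num_classes)
    # Subtract the margin m from the ground-truth class logit only.
    logits = s * (cos_theta - m * Y_label)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_label, logits=logits))
    return loss, cos_theta
```

Because no `tf.exp` is evaluated explicitly and every logit is at most s, neither inf nor inf − inf = NaN can appear, which removes the failure mode described above.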