tensorflow 中关于batch normalization的函数主要有三个:
- tf.nn.batch_normalization
- tf.layers.batch_normalization
- tf.contrib.layers.batch_norm
一、tf.nn.batch_normalization
1、tf.nn.moments函数
def moments(x, axes, name=None, keep_dims=False)
参数解释如下:
输入:
x:输入数据。形如[batchsize,height,width,kernels]
axes:指定在哪些维度上求均值和方差,是一个由维度下标组成的list(例如[0]或[0, 1, 2])。
keep_dims:是否保持维度。
输出:
两个tensor:mean 均值和 variance 方差
例1:计算一个2×3矩阵的mean和variance
import tensorflow as tf
# A random 2x3 matrix held in a TensorFlow variable.
img = tf.Variable(tf.random_normal([2, 3]))
# Reduce over every dimension except the last one; for this rank-2 tensor
# that leaves only dimension 0, i.e. axis == [0].
rank = len(img.get_shape())
axis = [dim for dim in range(rank - 1)]
mean, variance = tf.nn.moments(img, axes=axis)
输出结果如下:
img = array([[ 0.7691303 , -0.35025588, -0.09380586],
[-1.4653573 , 0.29895827, 0.41032326]], dtype=float32),
mean = array([[-0.3481135, -0.0256488, 0.1582587]], dtype=float32),
variance = array([[1.2482337 , 0.10536975, 0.06353654]], dtype=float32)
moments函数就是在第0维上求了均值和方差。
对axis的说明:
axis=0,那么输出矩阵是1行,求每一列的平均;axis=1,输出矩阵是1列,求每一行的平均。
如果是图像数据, 可以传入 [0, 1, 2], 相当于求[batch, height, width] 的均值/方差, 注意不要加入 channel 维度
例2:计算卷积神经网络某层的mean和variance
这里假设输入数据的格式是NHWC的。
import tensorflow as tf
# Random activations of shape [128, 32, 32, 64], read as NHWC
# (batch, height, width, channels).
img = tf.Variable(tf.random_normal([128, 32, 32, 64]))
# Reduce over all dimensions but the last (channel) one: axis == [0, 1, 2].
n_dims = len(img.get_shape())
axis = [dim for dim in range(n_dims - 1)]
mean, variance = tf.nn.moments(img, axes=axis)
输出结果如下:
mean = array([[[[ 4.0830136e-04, -4.2963726e-03, -5.0003931e-04,
-4.4543482e-04, 6.0103042e-04, -4.0140026e-04,
2.5256963e-03, -1.0819699e-03, -1.4404759e-03,
-1.4326994e-03, -8.2220486e-04, -1.4163775e-03,
9.7719464e-04, 1.0412441e-03, -1.3563948e-03,
-2.5035394e-03, 7.8339566e-05, -1.4030328e-03,
-7.6795375e-04, 7.8183822e-03, -2.0574513e-03,
6.8343728e-04, 4.1835662e-04, -6.1633415e-03,
-1.1482568e-03, 6.4310152e-03, 1.9873765e-03,
-9.3293225e-04, 3.0720932e-03, -1.4636834e-03,
4.3379748e-04, 2.7630590e-03, -2.0989170e-03,
-7.2691259e-03, -6.3739987e-03, -1.8997930e-03,
-4.7454494e-04, 2.1465109e-03, -3.9908607e-03,
1.4424872e-03, -2.4142924e-03, -6.0538348e-04,
2.5435248e-03, -4.0083809e-04, -3.1555234e-03,
2.5182338e-03, -2.6306501e-03, 8.3392172e-04,
2.5035494e-03, 4.7882642e-03, -2.0719678e-03,
-2.3144923e-04, -2.8795146e-03, 3.9198864e-03,
-5.2687079e-03, -1.1409470e-04, 1.3856608e-03,
3.4842882e-03, -1.0945165e-03, 6.8958546e-03,
2.9155985e-05, 1.3611093e-03, 2.2281366e-03,
2.7138158e-03]]]], dtype=float32),
variance = array([[[[1.0005758 , 1.0044069 , 0.9994525 , 0.9997337 , 1.0071163 ,
1.0019692 , 0.9954032 , 1.0064473 , 1.001137 , 0.9969884 ,
0.9947835 , 0.9986869 , 0.99899316, 1.0043697 , 1.0033542 ,
1.0046039 , 0.99368966, 0.9923917 , 0.99647164, 1.0045955 ,
1.0020585 , 0.9950892 , 1.0052316 , 1.002666 , 1.0090908 ,
1.008016 , 0.9980576 , 0.9993979 , 0.99848366, 0.99986047,
0.99899065, 1.006967 , 1.003453 , 1.0008634 , 1.0005352 ,
0.996747 , 0.99737716, 0.9945858 , 1.0005856 , 0.99736226,
1.0054593 , 1.0028933 , 0.9965185 , 1.000582 , 1.0035369 ,
0.99765056, 0.9987483 , 0.99487376, 0.99546564, 0.99410796,
1.0013032 , 0.99769133, 0.99917245, 1.0011214 , 1.0029986 ,
0.99958813, 0.9991367 , 0.9963123 , 0.9997672 , 1.0027118 ,
0.99747765, 1.0014238 , 1.0001569 , 0.99354565]]]],
dtype=float32)
解释一下对于[128,32,32,64]这样的4维矩阵。
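上述结果的形状:mean=(1,1,1,64);variance=(1,1,1,64)。注意这是keep_dims=True时的形状;按上面代码使用默认的keep_dims=False时,被求值的维度会被去掉,形状为(64,)。
如下图所示,一个batch里有128个图,经过一个64kernels的卷积层处理,得到了128*64张特征图,每张特征图的大小是32*32的。每一个kernel对应128张特征图,对这128张特征图的所有像素求mean和variance。因为总共有64个kernels,所以一共得到64个均值和64个方差(keep_dims=False时即为一个长度为64的一维数组)。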
2、tf.nn.batch_normalization函数
def batch_normalization(x,mean,variance,
offset,scale,variance_epsilon,
name=None)
参数解释如下:
x:输入任意维度的tensor
mean:tensor均值
variance:tensor方差
offset:平移量,公式中的β。需要训练的参数,一般初始化为0。
scale:放缩量,公式中的γ。需要训练的参数,一般初始化为1。
variance_epsilon:一个很小的浮点数,加在方差上以避免除以0。
要求:offset、scale的shape与mean相同。
例3:一个BN的完整例子
reference:莫烦tensorflow教程之BN
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
ACTIVATION = tf.nn.tanh # activation function used by every hidden layer
N_LAYERS = 7 # number of hidden layers
N_HIDDEN_UNITS = 30 # number of neurons in each hidden layer
def fix_seed(seed=1):
    """Seed the NumPy and TensorFlow random generators for reproducible runs.

    Args:
        seed: Seed value handed to both libraries. Defaults to 1.
    """
    tf.set_random_seed(seed)
    np.random.seed(seed)
def plot_his(inputs, inputs_norm):
    """Draw histograms of every layer's input for both networks.

    The figure is a two-row grid with one column per layer: the top row
    shows ``inputs`` (titled "Without normalizing"), the bottom row shows
    ``inputs_norm`` (titled "With normalizing").

    Args:
        inputs: Sequence of per-layer values for the network without BN;
            each element must provide ``.ravel()`` (e.g. a NumPy array).
        inputs_norm: Same, for the network with BN.
    """
    for row, layer_values in enumerate((inputs, inputs_norm)):
        n_cols = len(layer_values)
        for col, values in enumerate(layer_values):
            plt.subplot(2, n_cols, row * n_cols + col + 1)
            plt.cla()
            # Column 0 is binned over (-7, 10); all other columns over (-1, 1).
            hist_range = (-7, 10) if col == 0 else (-1, 1)
            plt.hist(values.ravel(), bins=15, range=hist_range, color='#FF5733')
            plt.yticks(())
            # Only the bottom row keeps x ticks (at the two range end points).
            plt.xticks(hist_range if row == 1 else ())
            axes = plt.gca()
            for side in ('right', 'top'):
                axes.spines[side].set_color('none')
        # The title goes on whichever subplot is current when the row ends.
        plt.title("%s normalizing" % ("Without" if row == 0 else "With"))
    plt.draw()
    plt.pause(0.01)
# Build the neural network
def _batch_normalize(inputs, n_features):
    """Batch-normalize ``inputs`` over axis 0 with a trainable scale and shift.

    Args:
        inputs: Tensor to normalize; moments are taken over axis 0.
        n_features: Length of the trainable ``scale`` and ``shift`` vectors.

    Returns:
        The tensor produced by ``tf.nn.batch_normalization``.
    """
    # axes=[0] reduces over the batch dimension. For image data one would
    # pass [0, 1, 2] (batch, height, width) and leave the channel axis out.
    fc_mean, fc_var = tf.nn.moments(
        inputs,
        axes=[0],
    )
    scale = tf.Variable(tf.ones([n_features]))
    shift = tf.Variable(tf.zeros([n_features]))
    epsilon = 0.001

    # Each batch has a different mean/variance, so an exponential moving
    # average of them is updated every time the normalized tensor is run.
    # NOTE: the values returned below are the *current batch* moments (wrapped
    # in tf.identity), not the averaged ones, so evaluation also normalizes
    # with batch statistics. Using the averages at test time would require
    # reading ema.average(fc_mean) / ema.average(fc_var) instead.
    ema = tf.train.ExponentialMovingAverage(decay=0.5)

    def mean_var_with_update():
        ema_apply_op = ema.apply([fc_mean, fc_var])
        # Force the moving-average update to run before the moments are used.
        with tf.control_dependencies([ema_apply_op]):
            return tf.identity(fc_mean), tf.identity(fc_var)

    mean, var = mean_var_with_update()
    # The call below computes:
    #   normalized = (inputs - mean) / tf.sqrt(var + epsilon)
    #   result = normalized * scale + shift
    return tf.nn.batch_normalization(inputs, mean, var, shift, scale, epsilon)


def built_net(xs, ys, norm):
    """Build a fully connected regression network, optionally with BN.

    The network has ``N_LAYERS`` hidden layers of ``N_HIDDEN_UNITS`` units
    using ``ACTIVATION``, followed by a linear single-unit output layer that
    is never batch-normalized.

    Args:
        xs: Input tensor; its static second dimension is used as the first
            layer's input size. When ``norm`` is true the input
            normalization creates scale/shift vectors of length 1.
        ys: Target tensor compared against the prediction.
        norm: If true, batch-normalize the inputs and every hidden layer's
            pre-activation values.

    Returns:
        ``[train_op, cost, layers_inputs]``, where ``layers_inputs`` is the
        (possibly normalized) network input followed by the output of each
        hidden layer.
    """
    def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
        """Add one dense layer: matmul + bias, optional BN, optional activation."""
        Weights = tf.Variable(tf.random_normal([in_size, out_size], mean=0., stddev=1.))
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)

        # fully connected product
        Wx_plus_b = tf.matmul(inputs, Weights) + biases

        # BN is applied after Wx + b and before the activation function, so
        # the activation receives already-normalized values.
        if norm:
            Wx_plus_b = _batch_normalize(Wx_plus_b, out_size)

        # activation
        if activation_function is None:
            outputs = Wx_plus_b
        else:
            outputs = activation_function(Wx_plus_b)
        return outputs

    fix_seed(1)

    # Batch-normalize the input data itself.
    if norm:
        xs = _batch_normalize(xs, 1)

    # Record the input of every layer.
    layers_inputs = [xs]

    # Build all hidden layers.
    for l_n in range(N_LAYERS):
        layer_input = layers_inputs[l_n]
        in_size = layers_inputs[l_n].get_shape()[1].value
        output = add_layer(
            layer_input,     # input
            in_size,         # input size
            N_HIDDEN_UNITS,  # output size
            ACTIVATION,      # activation function
            norm,            # normalize before activation
        )
        layers_inputs.append(output)  # record this layer's output

    # Build the output layer. Its input size is read from the last recorded
    # tensor rather than hard-coded, so it stays correct when N_HIDDEN_UNITS
    # or N_LAYERS changes.
    last_size = layers_inputs[-1].get_shape()[1].value
    prediction = add_layer(layers_inputs[-1], last_size, 1, activation_function=None)

    # Mean over the batch of the per-sample summed squared error.
    cost = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    return [train_op, cost, layers_inputs]
# Create the data set and visualize it
fix_seed(1)
x_data = np.linspace(-7, 10, 2500)[:, np.newaxis]
np.random.shuffle(x_data)
noise = np.random.normal(0, 8, x_data.shape)
y_data = np.square(x_data) - 5 + noise

# Visualize the input data
plt.scatter(x_data, y_data)
plt.show()

xs = tf.placeholder(tf.float32, [None, 1])  # [num_samples, num_features]
ys = tf.placeholder(tf.float32, [None, 1])

train_op, cost, layers_inputs = built_net(xs, ys, norm=False)  # without BN
train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True)  # with BN

sess = tf.Session()
try:
    # Pick the variable initializer that exists in the installed version.
    if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
        init = tf.initialize_all_variables()
    else:
        init = tf.global_variables_initializer()
    sess.run(init)

    # Cost history of the two networks
    cost_his = []
    cost_his_norm = []
    record_step = 5
    batch_size = 10

    plt.ion()
    plt.figure(figsize=(7, 3))
    for i in range(250):
        if i % 50 == 0:
            # Histogram the recorded input of every layer for both networks
            all_inputs, all_inputs_norm = sess.run([layers_inputs, layers_inputs_norm], feed_dict={xs: x_data, ys: y_data})
            plot_his(all_inputs, all_inputs_norm)

        # train on batch
        start = i * batch_size
        stop = start + batch_size
        sess.run([train_op, train_op_norm], feed_dict={xs: x_data[start:stop], ys: y_data[start:stop]})

        if i % record_step == 0:
            # Record the loss on the full data set
            cost_his.append(sess.run(cost, feed_dict={xs: x_data, ys: y_data}))
            cost_his_norm.append(sess.run(cost_norm, feed_dict={xs: x_data, ys: y_data}))
finally:
    # Release the session's resources even if training fails part-way.
    sess.close()

plt.ioff()
plt.figure()
plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his), label='no BN')  # no norm
# Each curve derives its x axis from its own history length.
plt.plot(np.arange(len(cost_his_norm))*record_step, np.array(cost_his_norm), label='BN')  # norm
plt.legend()
plt.show()
输入数据分布:
每一层的输入值(即上一层经过激活函数后的输出)的分布:
可以看出, 没有 BN 时, 每层的值很快就全部饱和, 都跑到了 -1/1 这个饱和区间; 有 BN 时, 即使前一层变得相对饱和, 后面几层的值也都被 normalize 到有效的不饱和区间内计算, 确保了一个活的神经网络.
tanh误差对比:
如果把激活函数换成relu
每一层的输入值(即上一层经过激活函数后的输出)的分布:
可以看出, 没有用 BN 的时候, 每层的值迅速全部都变为 0, 也可以说, 所有的神经元都已经死了. 而有 BN, relu 过后, 每层的值都能有一个比较好的分布效果, 大部分神经元都还活着.
relu误差对比:
因为没有使用 BN 的网络, 大部分神经元都死了, 所以连误差曲线都没了.