Preface
This section continues our study of deep convolutional neural networks:
- Batch normalization
- ResNet
- DenseNet

1. Batch Normalization
- For deep neural networks, even when the input data has been standardized, updates to the model parameters during training can still easily cause drastic changes in the outputs of layers close to the output layer.
- Batch normalization uses the mean and standard deviation computed over each mini-batch to continually adjust the intermediate outputs of the network, making the numerical values of the intermediate outputs at every layer more stable.
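In formulas (the standard batch normalization transform, stated here for reference): for a mini-batch $\mathcal{B}$,

$$\mu_\mathcal{B} = \frac{1}{|\mathcal{B}|}\sum_{x \in \mathcal{B}} x, \qquad \sigma_\mathcal{B}^2 = \frac{1}{|\mathcal{B}|}\sum_{x \in \mathcal{B}} (x - \mu_\mathcal{B})^2, \qquad \mathrm{BN}(x) = \gamma \odot \frac{x - \mu_\mathcal{B}}{\sqrt{\sigma_\mathcal{B}^2 + \epsilon}} + \beta$$

where $\gamma$ (scale) and $\beta$ (shift) are learned parameters and $\epsilon$ is a small constant that keeps the division numerically stable. These correspond to the gamma, beta, and eps arguments in the code below.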
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import nn
"""实现批量归一化"""
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
# 通过autograd来判断当前模式是训练模式还是预测模式
if not autograd.is_training():
# 如果是在预测模式下,直接使用传入的移动平均所得的均值和方差
X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
else:
assert len(X.shape) in (2, 4)
if len(X.shape) == 2:
# 使用全连接层的情况,计算特征维上的均值和方差
mean = X.mean(axis=0)
var = ((X - mean) ** 2).mean(axis=0)
else:
# 使用二维卷积层的情况,计算通道维上(axis=1)的均值和方差
# 这里保持X的形状以便后面可以做广播运算
mean = X.mean(axis=(0, 2, 3), keepdims=True)
var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
# 训练模式下用当前的均值和方差做标准化
X_hat = (X - mean) / nd.sqrt(var + eps)
# 更新移动平均的均值和方差
moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
moving_var = momentum * moving_var + (1.0 - momentum) * var
Y = gamma * X_hat + beta # 拉伸和偏移
return Y, moving_mean, moving_var
# The BatchNorm layer keeps the scale parameter gamma and the shift parameter
# beta, both of which participate in gradient computation and iteration.
# It also maintains the moving-average mean and variance so that they can be
# used at prediction time
class BatchNorm(nn.Block):
    def __init__(self, num_features, num_dims, **kwargs):
        super(BatchNorm, self).__init__(**kwargs)
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # The scale and shift parameters involved in gradient computation and
        # iteration, initialized to 1 and 0 respectively
        self.gamma = self.params.get('gamma', shape=shape, init=init.One())
        self.beta = self.params.get('beta', shape=shape, init=init.Zero())
        # Variables that do not participate in gradient computation or
        # iteration, initialized to 0 in main memory
        self.moving_mean = nd.zeros(shape)
        self.moving_var = nd.zeros(shape)

    def forward(self, X):
        # If X is not in main memory, copy moving_mean and moving_var to the
        # device memory where X lives
        if self.moving_mean.context != X.context:
            self.moving_mean = self.moving_mean.copyto(X.context)
            self.moving_var = self.moving_var.copyto(X.context)
        # Save the updated moving_mean and moving_var
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma.data(), self.beta.data(), self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y
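To sanity-check the layer, here is a minimal sketch (the LeNet-style network and the name lenet are illustrative, not from the original notes) showing that the custom BatchNorm drops into a Gluon model like any built-in block:

lenet = nn.Sequential()
lenet.add(nn.Conv2D(6, kernel_size=5),           # -> (batch, 6, 24, 24)
          BatchNorm(6, num_dims=4),              # the custom layer defined above
          nn.Activation('sigmoid'),
          nn.MaxPool2D(pool_size=2, strides=2),  # -> (batch, 6, 12, 12)
          nn.Dense(10))                          # Dense flattens its input by default
lenet.initialize()
X = nd.random.uniform(shape=(2, 1, 28, 28))
print(lenet(X).shape)  # (2, 10)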
2. ResNet
The residual network. Its basic building block is the residual block.

Residual block
As shown in the figure:
- It has two 3×3 convolutional layers with the same number of output channels.
- Each convolutional layer is followed by a batch normalization layer and a ReLU activation function.
- The input skips over these two convolutional layers and is added to the output just before the final ReLU, so the block only needs to learn the residual on top of the identity mapping.
- If we want to change the number of channels, we need to introduce an extra 1×1 convolutional layer to transform the input into the required shape before the addition.
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
# Residual block (this class is saved in the d2lzh package for later use)
class Residual(nn.Block):
    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super(Residual, self).__init__(**kwargs)
        self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1,
                               strides=strides)
        self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2D(num_channels, kernel_size=1,
                                   strides=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm()
        self.bn2 = nn.BatchNorm()

    def forward(self, X):
        Y = nd.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return nd.relu(Y + X)
# Check the case where the input and output shapes are the same
blk = Residual(3)
blk.initialize()
X = nd.random.uniform(shape=(4, 3, 6, 6))
blk(X).shape
# We can also halve the output height and width while increasing the number
# of output channels
blk = Residual(6, use_1x1conv=True, strides=2)
blk.initialize()
blk(X).shape
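For reference, the first block keeps the shape at (4, 3, 6, 6), while the strided block with the 1×1 convolution yields (4, 6, 3, 3): the channels grow from 3 to 6 and the height and width are halved.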
ResNet model
- The first two layers: a 7×7 convolutional layer with 64 output channels and a stride of 2, followed by a 3×3 max pooling layer with a stride of 2; a batch normalization layer is added after each convolutional layer.
- These are followed by four stages made up of residual blocks. The first stage keeps the number of channels equal to its input; each later stage doubles the number of channels of the previous stage in its first residual block and halves the height and width.
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
"""实现ResNet"""
# ResNet模型
net = nn.Sequential()
# 前两层
net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3),
nn.BatchNorm(), nn.Activation('relu'),
nn.MaxPool2D(pool_size=3, strides=2, padding=1))
# 后面的模块
def resnet_block(num_channels, num_residuals, first_block=False):
blk = nn.Sequential()
for i in range(num_residuals):
if i == 0 and not first_block:
blk.add(Residual(num_channels, use_1x1conv=True, strides=2))
else:
blk.add(Residual(num_channels))
return blk
net.add(resnet_block(64, 2, first_block=True),
resnet_block(128, 2),
resnet_block(256, 2),
resnet_block(512, 2))
# 全局平均池化层后接上全连接层输出
net.add(nn.GlobalAvgPool2D(), nn.Dense(10))
# 观察⼀下输⼊形状
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
X = layer(X)
print(layer.name, 'output shape:\t', X.shape)
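For reference, with the (1, 1, 224, 224) input the shapes should work out to (1, 64, 112, 112) after the 7×7 convolution, (1, 64, 56, 56) after pooling, then (1, 64, 56, 56), (1, 128, 28, 28), (1, 256, 14, 14), and (1, 512, 7, 7) after the four residual stages, (1, 512, 1, 1) after global average pooling, and finally (1, 10).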
# Data and training
lr, num_epochs, batch_size, ctx = 0.05, 5, 256, d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
3. DenseNet
The densely connected network.
- An extension of ResNet.
- The main difference, as shown in the figure: instead of adding a module's input and output, DenseNet concatenates them along the channel dimension.

Dense block
The basic building block of DenseNet; the dense connections it creates are where the network gets its name.
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.gluon import nn
# The improved "batch normalization, activation, convolution" structure
def conv_block(num_channels):
    blk = nn.Sequential()
    blk.add(nn.BatchNorm(), nn.Activation('relu'),
            nn.Conv2D(num_channels, kernel_size=3, padding=1))
    return blk
# Dense block
class DenseBlock(nn.Block):
    def __init__(self, num_convs, num_channels, **kwargs):
        super(DenseBlock, self).__init__(**kwargs)
        self.net = nn.Sequential()
        for _ in range(num_convs):
            self.net.add(conv_block(num_channels))  # Made up of multiple conv_blocks

    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            X = nd.concat(X, Y, dim=1)  # Concatenate input and output along the channel dimension
        return X
# Observe
# Two convolution blocks with 10 output channels each. The number of channels
# of each conv block controls how much the number of output channels grows
# relative to the input, so it is also called the growth rate
blk = DenseBlock(2, 10)
blk.initialize()
X = nd.random.uniform(shape=(4, 3, 8, 8))
Y = blk(X)
print(Y.shape)
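The output shape is (4, 23, 8, 8): each of the 2 conv blocks adds 10 channels on top of the 3 input channels, so 3 + 2 × 10 = 23.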
Transition layer
- Controls model complexity.
- Uses a 1×1 convolutional layer to reduce the number of channels.
- Uses an average pooling layer with a stride of 2 to halve the height and width, further reducing model complexity.
def transition_block(num_channels):
    blk = nn.Sequential()
    blk.add(nn.BatchNorm(), nn.Activation('relu'),
            nn.Conv2D(num_channels, kernel_size=1),  # Reduce the number of channels
            nn.AvgPool2D(pool_size=2, strides=2))    # Halve the height and width
    return blk

# Observe
blk = transition_block(10)
blk.initialize()
print(blk(Y).shape)
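The output shape is (4, 10, 4, 4): the 1×1 convolution cuts the 23 channels down to 10, and the average pooling halves the height and width from 8 to 4.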
DenseNet model
- The same single convolutional layer and max pooling layer as in ResNet.
- Then multiple dense blocks and transition layers, which control the number of convolutional layers and channels.
- Finally, a global pooling layer and a fully connected output layer.
# The DenseNet model
net = nn.Sequential()
# The same single convolutional layer and max pooling layer as in ResNet
net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3),
        nn.BatchNorm(), nn.Activation('relu'),
        nn.MaxPool2D(pool_size=3, strides=2, padding=1))
# Four dense blocks
num_channels, growth_rate = 64, 32  # num_channels is the current number of channels
num_convs_in_dense_blocks = [4, 4, 4, 4]  # Number of conv layers per dense block, set to 4 here
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    net.add(DenseBlock(num_convs, growth_rate))
    # The number of output channels of the previous dense block
    num_channels += num_convs * growth_rate
    # Between dense blocks, add a transition layer that halves the height and
    # width and halves the number of channels
    if i != len(num_convs_in_dense_blocks) - 1:
        num_channels //= 2
        net.add(transition_block(num_channels))
# As in ResNet, finish with a global pooling layer and a fully connected
# layer for the output
net.add(nn.BatchNorm(), nn.Activation('relu'), nn.GlobalAvgPool2D(),
        nn.Dense(10))
# Data and training
lr, num_epochs, batch_size, ctx = 0.1, 5, 256, d2l.try_gpu()
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
Closing remarks
We took a brief look at batch normalization, which stabilizes a model's intermediate numerical values, and at ResNet and DenseNet, which grew out of these ideas.