import torch
from torch import nn
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)
tensor([[-0.3771],
        [-0.3822]], grad_fn=<AddmmBackward0>)
5.2.1 Parameter access
A model defined by the Sequential class can access any of its layers by index, just like a list, with the parameters for each layer in its properties. The parameters of the second fully connected layer are shown below. The parameter name uniquely identifies the parameter.
print(net[2].state_dict())
OrderedDict([('weight', tensor([[ 0.0867, 0.1007, 0.2371, 0.1944, -0.2581, -0.2854, -0.0813, -0.1310]])), ('bias', tensor([-0.2911]))])
- Targeted parameters
Each parameter is represented as an instance of the Parameter class, a composite object that holds the value, the gradient, and additional information.
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)
net[2].weight.grad == None  # no backward pass has run yet, so the gradient is None
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2911], requires_grad=True)
tensor([-0.2911])
True
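Since no backward pass has run, the gradient is still in its initial None state. A minimal sketch (using an arbitrary scalar loss, chosen purely for illustration) shows that .grad is populated after backpropagation:
loss = net(X).sum()  # arbitrary scalar loss, for illustration only
loss.backward()
print(net[2].weight.grad.shape)  # now a tensor with the same shape as the weight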
-
Access all parameters at once
When a block is more complex, we need to recurse through the entire tree to extract the parameters of every sub-block.
print(*[(name, param.shape) for name, param in net[0].named_parameters()])  # elegant: list comprehension plus argument unpacking
print(*[(name, param.shape) for name, param in net.named_parameters()])
net.state_dict()['2.bias'].data  # parameters can also be accessed by their key in the state dict
('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([-0.2911])
- Collect parameters from nested blocks
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f'block {i}', block1())
    return net
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)
print(rgnet)  # inspect the network structure
rgnet[0][1][0].bias.data  # nested blocks support nested indexing
Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)
tensor([-0.2230, 0.3445, -0.0584, -0.4562, 0.3161, -0.4832, 0.2733, 0.1244])
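Since the blocks are nested hierarchically, we can also walk the module tree explicitly. A minimal sketch using named_modules, which yields every submodule together with its dotted path:
for name, module in rgnet.named_modules():
    if isinstance(module, nn.Linear):
        print(name, module.weight.shape)  # e.g. '0.block 0.0' torch.Size([8, 4])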
5.2.2 Parameter initialization
- Built-in initialization
The framework provides default random initialization and also lets us define custom initialization methods.
By default, PyTorch initializes weight and bias matrices uniformly from a range derived from the layer's dimensions. PyTorch's nn.init module provides many built-in initialization methods.
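As a quick sanity check, the following sketch (assuming the current default, a Kaiming-uniform scheme whose range is derived from the fan-in; this may differ across PyTorch versions) verifies that a freshly constructed layer's weights fall inside that range:
import math
layer = nn.Linear(4, 8)
bound = 1 / math.sqrt(layer.in_features)  # assumed half-width of the default uniform range
print(layer.weight.abs().max().item() <= bound,
      layer.bias.abs().max().item() <= bound)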
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)  # mean-0, std-0.01 normal initialization
        nn.init.zeros_(m.bias)  # biases set to zero
net.apply(init_normal)  # applied recursively to every submodule
net[0].weight.data[0], net[0].bias.data[0]
(tensor([-0.0041, 0.0007, 0.0194, 0.0155]), tensor(0.))
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)  # initialize all weights to the constant 1
        nn.init.zeros_(m.bias)
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]
(tensor([1., 1., 1., 1.]), tensor(0.))
def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)

def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(init_xavier)  # Xavier initialization for the first layer
net[2].apply(init_42)      # constant 42 for the third layer
print(net[0].weight.data[0])
print(net[2].weight.data)
tensor([ 0.2770, -0.0892, 0.1333, 0.0069])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])
- Custom initialization
The following example implements initialization from the custom distribution
$$
w \sim
\begin{cases}
U(5, 10) & \text{with probability } \frac{1}{4} \\
0 & \text{with probability } \frac{1}{2} \\
U(-10, -5) & \text{with probability } \frac{1}{4}
\end{cases}
$$
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in m.named_parameters()][0])
        # by symmetry: draw from the full range U(-10, 10), then zero out
        # every entry with |w| < 5, which yields exactly the mixture above
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5
net.apply(my_init)
net[0].weight[:2]
Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])
tensor([[-5.8878, -5.7943,  8.7435, -7.0228],
        [-0.0000, -0.0000,  5.7496, -8.9288]], grad_fn=<SliceBackward0>)
# we can always set parameters directly
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]
tensor([42.0000, -4.7943, 9.7435, -6.0228])
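Writing through .data bypasses autograd tracking. An equivalent, arguably more modern idiom performs the in-place updates under torch.no_grad():
with torch.no_grad():
    net[0].weight[:] += 1
    net[0].weight[0, 0] = 42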
5.2.3 Parameter binding
Sometimes we want to share parameters across multiple layers: define a layer once, then reuse it, so the tied layers always remain identical.
shared = nn.Linear(8, 8)  # shared layer, reused twice below
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100  # check that the two layers are the same object, not merely equal values
print(net[2].weight.data[0] == net[4].weight.data[0])
tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
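Because the third and fifth layers are literally the same object, backpropagation accumulates the gradient contributions from both positions into a single tensor. A minimal sketch with an arbitrary scalar loss:
net(X).sum().backward()
print(net[2].weight.grad is net[4].weight.grad)  # True: one shared gradient tensor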
Exercises
(1) Use the FancyMLP model to access the parameters of each layer.
What is FancyMLP?
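FancyMLP is presumably the model defined in Section 5.1 of the book (newer English editions call it FixedHiddenMLP). A sketch assuming that definition:
import torch.nn.functional as F

class FancyMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # constant weight: a plain tensor, never registered as a parameter
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        X = self.linear(X)  # the same Linear layer is applied twice
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

fancy = FancyMLP()
# only self.linear contributes parameters; rand_weight does not appear
print(*[(name, param.shape) for name, param in fancy.named_parameters()])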
(2) Check the initialization module documentation to learn about the different initialization methods.
dir(nn.init)
['Tensor',
'__builtins__',
'__cached__',
'__doc__',
'__file__',
'__loader__',
'__name__',
'__package__',
'__spec__',
'_calculate_correct_fan',
'_calculate_fan_in_and_fan_out',
'_make_deprecate',
'_no_grad_fill_',
'_no_grad_normal_',
'_no_grad_trunc_normal_',
'_no_grad_uniform_',
'_no_grad_zero_',
'calculate_gain',
'constant',
'constant_',
'dirac',
'dirac_',
'eye',
'eye_',
'kaiming_normal',
'kaiming_normal_',
'kaiming_uniform',
'kaiming_uniform_',
'math',
'normal',
'normal_',
'ones_',
'orthogonal',
'orthogonal_',
'sparse',
'sparse_',
'torch',
'trunc_normal_',
'uniform',
'uniform_',
'warnings',
'xavier_normal',
'xavier_normal_',
'xavier_uniform',
'xavier_uniform_',
'zeros_']
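The functions with a trailing underscore modify a tensor in place. For example, a quick sketch applying a few of the initializers listed above:
w = torch.empty(3, 5)
nn.init.xavier_normal_(w)         # Glorot normal
nn.init.kaiming_uniform_(w)       # He uniform
nn.init.orthogonal_(w)            # rows/columns form a (semi-)orthogonal matrix
nn.init.sparse_(w, sparsity=0.5)  # about half the entries in each column become zero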
(3) Construct a multilayer perceptron containing a shared-parameter layer and train it. During training, observe the parameters and gradients of each layer of the model.
Omitted in the original notes; a minimal sketch follows.
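A minimal sketch, assuming a toy random-regression setup (data, sizes, and learning rate are all hypothetical choices for illustration):
shared = nn.Linear(8, 8)
model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                      shared, nn.ReLU(),
                      shared, nn.ReLU(),
                      nn.Linear(8, 1))
X_train = torch.rand(16, 4)   # hypothetical inputs
y_train = torch.rand(16, 1)   # hypothetical targets
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.MSELoss()

for epoch in range(3):
    optimizer.zero_grad()
    loss = loss_fn(model(X_train), y_train)
    loss.backward()
    # observe every layer's parameter and gradient norms before the update;
    # the shared layer appears once because duplicate parameters are filtered out
    for name, param in model.named_parameters():
        print(epoch, name, param.data.norm().item(), param.grad.norm().item())
    optimizer.step()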
(4) Why is sharing parameters a good idea?
Sharing parameters reduces the number of parameters to learn and can encode useful inductive biases such as translation invariance, as in convolutional neural networks.