Table of contents
1. Access to model parameters
All parameters (returned as an iterator) can be accessed through the parameters()
or named_parameters() method of a Module; the latter returns each parameter's name in addition to the parameter Tensor.
from torch import nn

# PyTorch has already applied its default parameter initialization at this point.
net = nn.Sequential(
    nn.Linear(4, 3),
    nn.ReLU(),
    nn.Linear(3, 1),
)
print(net)
# Sequential(
# (0): Linear(in_features=4, out_features=3, bias=True)
# (1): ReLU()
# (2): Linear(in_features=3, out_features=1, bias=True)
# )

# named_parameters() hands back a generator rather than a list.
print(type(net.named_parameters()))
# <class 'generator'>
Use the named_parameters method to access all parameters together with their names:
# Walk every parameter of the network together with its name and print its shape.
for name, param in net.named_parameters():
    print(name, param.size())
# 0.weight torch.Size([3, 4])
# 0.bias torch.Size([3])
# 2.weight torch.Size([1, 3])
# 2.bias torch.Size([1])
Use the parameters method to access all parameters (without their names):
# Iterate over the bare parameter tensors of the whole network (no names are returned).
for param in net.parameters():
    print(param, param.size())
    # Separator printed after each parameter, as in the sample output below.
    print('*' * 20)
# Parameter containing: tensor([[ 0.2202, -0.2954, 0.4630, -0.1012],
# [ 0.2209, -0.4296, -0.3343, -0.4902],
# [ 0.2739, 0.2316, 0.4456, -0.2660]], requires_grad=True)
# torch.Size([3, 4])
# ********************
# Parameter containing: tensor([ 0.0928, 0.4861, -0.1114], requires_grad=True)
# torch.Size([3])
# ********************
# Parameter containing: tensor([[ 0.2844, -0.2298, 0.2219]], requires_grad=True)
# torch.Size([1, 3])
# ********************
# Parameter containing:tensor([0.4926], requires_grad=True)
# torch.Size([1])
# ********************
As can be seen above, the names returned by the named_parameters method
are automatically prefixed with the index of the layer.
The following uses the index to access a certain layer of the network and obtain the parameters of this layer
# Index into the network to reach any single layer, then read that layer's parameters.
# Names obtained from the layer itself carry no layer-index prefix.
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))
# weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'>
# bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'>
The type of the returned param is torch.nn.parameter.Parameter, which is a subclass of Tensor; its relevant characteristics are described in detail below.
2. Model parameter torch.nn.parameter
torch.nn.parameter.Parameter is a subclass of Tensor. The difference from a plain Tensor is that when a Parameter is assigned as an attribute of a Module, it is automatically added to the parameter list of the model, as the following example shows:
import torch
from torch import nn
class MyModel(nn.Module):
    """Toy module contrasting an ``nn.Parameter`` attribute with a plain tensor.

    ``weight1`` is wrapped in ``nn.Parameter`` and therefore appears in
    ``named_parameters()``; ``weight2`` is an ordinary tensor attribute and
    does not.
    """

    def __init__(self, **kwargs):
        """Create both 20x20 random tensors; ``kwargs`` is accepted but unused."""
        super().__init__()
        # Registered automatically because it is an nn.Parameter.
        self.weight1 = nn.Parameter(torch.rand(20, 20))
        # Plain tensor: kept as a normal attribute, not listed as a parameter.
        self.weight2 = torch.rand(20, 20)

    def forward(self, x):
        """Placeholder forward pass; performs no computation and returns None."""
        pass
n = MyModel()
# Only the attribute wrapped in nn.Parameter is reported.
for name, param in n.named_parameters():
    print(name)  # weight1
Because Parameter is a subclass of Tensor, it has all the properties of a Tensor; for example, use data to access the parameter's value and grad to access the parameter's gradient.
import torch
from torch import nn

# PyTorch has already applied its default parameter initialization at this point.
net = nn.Sequential(
    nn.Linear(4, 3),
    nn.ReLU(),
    nn.Linear(3, 1),
)
print(net)
print('*' * 20)

# Take the first parameter yielded by the first layer.
weight_0 = next(net[0].parameters())
print(weight_0.data)
print(weight_0.grad)  # the gradient is None before backpropagation
print('*' * 20)

X = torch.rand(2, 4)
Y = net(X).sum()
Y.backward()
print(weight_0.grad)  # after backpropagation the parameter gradient has been computed
# Sequential(
# (0): Linear(in_features=4, out_features=3, bias=True)
# (1): ReLU()
# (2): Linear(in_features=3, out_features=1, bias=True)
# )
# ********************
# tensor([[ 0.1833, 0.1013, 0.4618, -0.2482],
# [-0.4296, 0.2273, -0.1163, 0.0901],
# [-0.0253, 0.3190, 0.3539, -0.0818]])
# None
# ********************
# tensor([[ 0.5202, 0.3802, 0.3237, 0.0833],
# [ 0.0155, 0.0242, 0.0096, 0.0029],
# [-0.0048, -0.0075, -0.0030, -0.0009]])
3. Initialization of model parameters
PyTorch's torch.nn.init module provides a variety of preset initialization methods:
nn.init.uniform_(tensor, a= 0., b= 1.)
nn.init.normal_(tensor, mean= 0., std= 1.)
nn.init.constant_(tensor, val)
nn.init.ones_(tensor)
nn.init.zeros_(tensor)
nn.init.eye_(tensor)
nn.init.dirac_(tensor, groups=1)
In the following example, we initialize the weight parameter to a normally distributed random number with a mean of 0 and a standard deviation of 0.01, and still set the bias parameter to 0.
import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))

# Re-initialize every parameter according to its role, identified by its name:
# weights are drawn from N(0, 0.01^2) and biases are set to the constant 0.
for name, param in net.named_parameters():
    if 'weight' in name:
        nn.init.normal_(param, mean=0, std=0.01)
        print(name, param.data)
    elif 'bias' in name:
        nn.init.constant_(param, val=0)
        print(name, param.data)
# 0.weight tensor([[ 1.0030e-03, 9.9017e-03, 1.5393e-03, -1.9146e-02],
# [-1.7850e-02, -9.5327e-03, -9.7842e-03, 2.5997e-02],
# [ 4.6419e-03, -8.4267e-03, -4.2336e-03, 9.3962e-05]])
# 0.bias tensor([0., 0., 0.])
# 2.weight tensor([[-0.0026, -0.0046, 0.0094]])
# 2.bias tensor([0.])
Sometimes the initialization method we need is not provided in the init module.
At this point, we need to implement an initialization method ourselves, so that we can use it like other initialization methods.
4. Custom parameter initialization method
Before customizing parameter initialization methods, let's take a look at how PyTorch implements these initialization methods, such as torch.nn.init.normal_:
# As you can see, this is simply a function that changes the Tensor's values
# in place, and the operation is not recorded for gradient computation.
def normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with samples from a normal distribution.

    Args:
        tensor: The tensor to overwrite.
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.

    Returns:
        The result of ``tensor.normal_(mean, std)``.
    """
    # no_grad keeps the in-place fill out of the autograd graph.
    with torch.no_grad():
        return tensor.normal_(mean, std)
Following the same pattern, we can write our own function for initializing parameters:
import torch
from torch import nn
def rewrite_normal_(tensor, mean=0., std=1.):
    """Hand-written normal initializer: fill ``tensor`` in place from N(mean, std^2).

    Args:
        tensor: The tensor (or parameter) to overwrite.
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.

    Returns:
        The result of ``tensor.normal_(mean, std)``.
    """
    # The in-place fill must not be tracked by autograd, hence no_grad.
    with torch.no_grad():
        return tensor.normal_(mean, std)
# PyTorch has already applied its default parameter initialization at this point.
net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))

# Re-initialize only the weights with the custom initializer; biases are left untouched.
for name, param in net.named_parameters():
    if 'weight' in name:
        rewrite_normal_(param, mean=0, std=0.01)
        # grad is still None and requires_grad is still True after the in-place fill.
        print(name, param.grad, param.requires_grad)
        print(param.data)
        print('*' * 20)
# 0.weight None True
# tensor([[ 7.3672e-03, 1.9321e-02, -8.8349e-05, -1.9147e-03],
# [ 2.8177e-02, 3.1844e-03, -1.1902e-03, 1.2436e-02],
# [ 6.2070e-04, -3.4600e-03, 2.7835e-03, -1.2652e-02]])
# ********************
# 2.weight None True
# tensor([[-0.0160, -0.0034, 0.0134]])
# ********************
In the following example, we initialize the weights so that about half of them are 0 and the other half are random numbers uniformly distributed over the two intervals $[-10, -5]$ and $[5, 10]$.
import torch
from torch import nn
def init_weight_(tensor):
    """Fill ``tensor`` in place, zeroing every small-magnitude entry.

    Each entry is first drawn uniformly between -10 and 10; entries whose
    absolute value is below 5 are then multiplied by zero, so the surviving
    values lie in [-10, -5] or [5, 10].

    Args:
        tensor: The tensor (or parameter) to overwrite.

    Returns:
        The same ``tensor`` object, so the call can be chained like the
        other in-place initializers.
    """
    # no_grad keeps both in-place operations out of the autograd graph.
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # Multiply by a 0/1 mask: keeps |x| >= 5, zeroes everything else.
        tensor *= (tensor.abs() >= 5).float()
    return tensor
net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))

# Apply the custom initializer to the weights only; biases keep their default values.
for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, param.data)
# 0.weight tensor([[-8.2931, 0.0000, 6.2811, -0.0000],
# [-0.0000, 5.2139, -6.8847, -0.0000],
# [ 0.0000, -9.9403, 0.0000, -6.1358]])
# 2.weight tensor([[-5.6402, -7.9804, -9.5935]])
5. Sharing model parameters
In some cases, we want to share model parameters between multiple layers.
- Method 1: Call the same layer multiple times in the forward function
- Method 2: The module passed into Sequential is the same Module instance, and the parameters are also shared
import torch
from torch import nn

# The very same Linear instance is placed in both slots of the Sequential.
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(linear, linear)
print(net)
# Sequential(
# (0): Linear(in_features=1, out_features=1, bias=False)
# (1): Linear(in_features=1, out_features=1, bias=False)
# )

# The shared weight is reported only once (see the single output line below).
for name, param in net.named_parameters():
    nn.init.constant_(param, val=3)
    print(name, param.data)
# 0.weight tensor([[3.]])

# In memory, these two linear layers are actually one and the same object:
print(id(net[0]) == id(net[1]))  # True
print(id(net[0].weight) == id(net[1].weight))  # True

x = torch.ones(1, 1)
y = net(x).sum()
print(y)  # tensor(9., grad_fn=<SumBackward0>)
y.backward()
# Each use of the shared weight contributes a gradient of 3; the two uses accumulate to 6.
print(net[0].weight.grad)  # tensor([[6.]])