1. Access to model parameters

All parameters of a Module can be accessed (as an iterator) through parameters() or named_parameters(); the latter returns each parameter's name in addition to the parameter Tensor.

from torch import nn

# Build a small two-layer network used by the following snippets.
net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))  # PyTorch has already applied its default parameter initialization

print(net)
# Sequential(
#   (0): Linear(in_features=4, out_features=3, bias=True)
#   (1): ReLU()
#   (2): Linear(in_features=3, out_features=1, bias=True)
# )

# named_parameters() returns a generator, so the parameters are obtained by iterating over it.
print(type(net.named_parameters()))
# <class 'generator'>

Use the named_parameters method to access all parameters:

# Iterate over (name, parameter) pairs and print each parameter's name and shape.
for name, param in net.named_parameters():
    print(name, param.size())

# 0.weight torch.Size([3, 4])
# 0.bias torch.Size([3])
# 2.weight torch.Size([1, 3])
# 2.bias torch.Size([1])

Use the parameters method to access all parameters:

# Iterate over the parameters only (no names) and print each one with its shape,
# followed by a separator line.
for param in net.parameters():
    print(param, param.size())
    print('*'*20)

# Parameter containing: tensor([[ 0.2202, -0.2954,  0.4630, -0.1012],
#                               [ 0.2209, -0.4296, -0.3343, -0.4902],
#                               [ 0.2739,  0.2316,  0.4456, -0.2660]], requires_grad=True)
# torch.Size([3, 4])
# ********************
# Parameter containing: tensor([ 0.0928,  0.4861, -0.1114], requires_grad=True)
# torch.Size([3])
# ********************
# Parameter containing: tensor([[ 0.2844, -0.2298,  0.2219]], requires_grad=True)
# torch.Size([1, 3])
# ********************
# Parameter containing: tensor([0.4926], requires_grad=True)
# torch.Size([1])
# ********************

As can be seen above, the names returned by the named_parameters method are automatically prefixed with the index of the layer.
The following uses the index to access a certain layer of the network and obtain the parameters of this layer

# Access any layer of the network by index and get the parameters of that layer.
# The names printed here are not prefixed with the layer index.
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))

# weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'>
# bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'>

The type of the returned param is torch.nn.parameter.Parameter, which is a subclass of Tensor, and the relevant characteristics are described in detail below.

2. Model parameter torch.nn.parameter

torch.nn.parameter.Parameter is a subclass of Tensor. The difference from a plain Tensor is that a Parameter assigned as an attribute of a model is automatically added to the model's parameter list, whereas a plain Tensor attribute is not. See the following example:

import torch
from torch import nn

class MyModel(nn.Module):
    """Toy module contrasting an ``nn.Parameter`` attribute with a plain Tensor attribute.

    ``weight1`` is wrapped in ``nn.Parameter``; ``weight2`` is an ordinary
    tensor created with ``torch.rand``.
    """

    def __init__(self, **kwargs):
        """Create the two 20x20 attributes; keyword arguments are accepted but unused."""
        super().__init__()
        # Wrapped in nn.Parameter.
        self.weight1 = nn.Parameter(torch.rand(20, 20))
        # Plain tensor, deliberately not wrapped in nn.Parameter.
        self.weight2 = torch.rand(20, 20)

    def forward(self, x):
        """Placeholder forward pass: performs no computation and returns None."""
        return None

n = MyModel()
# Only weight1 is listed: it is an nn.Parameter, while weight2 is a plain
# tensor and therefore does not appear among the model's parameters.
for name, param in n.named_parameters():
    print(name)   # weight1

Because a Parameter is a Tensor, it has all the properties of a Tensor; for example, use data to access the parameter values and grad to access the parameter gradient.

import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))  # PyTorch has already applied its default parameter initialization
print(net)

print('*'*20)

# Address the first layer's weight explicitly by attribute instead of
# materializing the whole parameter list and relying on its order.
weight_0 = net[0].weight
print(weight_0.data)
print(weight_0.grad)  # the gradient is None before backpropagation

print('*'*20)

# Run a forward pass on a random batch and backpropagate from a scalar.
X = torch.rand(2, 4)
Y = net(X).sum()
Y.backward()
print(weight_0.grad)   # after backpropagation the parameter gradient has been computed


# Sequential(
#   (0): Linear(in_features=4, out_features=3, bias=True)
#   (1): ReLU()
#   (2): Linear(in_features=3, out_features=1, bias=True)
# )
# ********************
# tensor([[ 0.1833,  0.1013,  0.4618, -0.2482],
#         [-0.4296,  0.2273, -0.1163,  0.0901],
#         [-0.0253,  0.3190,  0.3539, -0.0818]])
# None
# ********************
# tensor([[ 0.5202,  0.3802,  0.3237,  0.0833],
#         [ 0.0155,  0.0242,  0.0096,  0.0029],
#         [-0.0048, -0.0075, -0.0030, -0.0009]])

3. Initialization of model parameters

PyTorch's torch.nn.init module provides a variety of preset initialization methods:

nn.init.uniform_(tensor, a= 0., b= 1.)
nn.init.normal_(tensor, mean= 0., std= 1.)
nn.init.constant_(tensor, val)
nn.init.ones_(tensor)
nn.init.zeros_(tensor)
nn.init.eye_(tensor)
nn.init.dirac_(tensor, groups=1)

In the following example, we initialize the weight parameter to a normally distributed random number with a mean of 0 and a standard deviation of 0.01, and still set the bias parameter to 0.

import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))

# Pick the initializer from the parameter name, then print the new values.
for name, param in net.named_parameters():
    if 'weight' in name:
        # Weights: normal distribution with mean 0 and standard deviation 0.01.
        nn.init.normal_(param, mean=0, std=0.01)
    elif 'bias' in name:
        # Biases: constant 0.
        nn.init.constant_(param, val=0)
    else:
        continue
    print(name, param.data)

# 0.weight tensor([[ 1.0030e-03,  9.9017e-03,  1.5393e-03, -1.9146e-02],
#         [-1.7850e-02, -9.5327e-03, -9.7842e-03,  2.5997e-02],
#         [ 4.6419e-03, -8.4267e-03, -4.2336e-03,  9.3962e-05]])
# 0.bias tensor([0., 0., 0.])
# 2.weight tensor([[-0.0026, -0.0046,  0.0094]])
# 2.bias tensor([0.])

Sometimes the initialization method we need is not provided in the init module.
At this point, we need to implement an initialization method ourselves, so that we can use it like other initialization methods.

4. Custom parameter initialization method

Before customizing parameter initialization methods, let's take a look at how PyTorch implements these initialization methods, such as torch.nn.init.normal_:

# As this shows, an initializer is simply a function that changes a Tensor's
# values in place, and this process does not record gradients.
def normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with normally distributed values and return it.

    Args:
        tensor: the Tensor (or Parameter) to overwrite.
        mean: mean of the normal distribution.
        std: standard deviation of the normal distribution.

    Returns:
        The same ``tensor`` object, now holding the sampled values.
    """
    with torch.no_grad():
        tensor.normal_(mean, std)
    return tensor

Following the same pattern, we can write our own function for initializing parameters:

import torch
from torch import nn


@torch.no_grad()
def rewrite_normal_(tensor, mean=0., std=1.):
    """Overwrite ``tensor`` in place with normally distributed values.

    Gradient recording is disabled for the whole call by the decorator.

    Args:
        tensor: the Tensor (or Parameter) to overwrite.
        mean: mean of the normal distribution.
        std: standard deviation of the normal distribution.

    Returns:
        The same ``tensor`` object, as returned by ``Tensor.normal_``.
    """
    return tensor.normal_(mean, std)


net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))  # PyTorch has already applied its default parameter initialization

# Re-initialize every weight with the custom initializer and show that the
# parameter still requires grad and has no gradient yet.
for name, param in net.named_parameters():
    if 'weight' not in name:
        continue  # biases are left untouched
    rewrite_normal_(param, mean=0, std=0.01)
    print(name, param.grad, param.requires_grad)
    print(param.data)
    print('*'*20)


# 0.weight None True
# tensor([[ 7.3672e-03,  1.9321e-02, -8.8349e-05, -1.9147e-03],
#         [ 2.8177e-02,  3.1844e-03, -1.1902e-03,  1.2436e-02],
#         [ 6.2070e-04, -3.4600e-03,  2.7835e-03, -1.2652e-02]])
# ********************
# 2.weight None True
# tensor([[-0.0160, -0.0034,  0.0134]])
# ********************

In the following example, we initialize the weights so that each weight has probability one half of being 0 and probability one half of being a random number uniformly distributed over the two intervals $[-10, -5]$ and $[5, 10]$.

import torch
from torch import nn

def init_weight_(tensor):
    """Initialize ``tensor`` in place: about half zeros, the rest uniform in [-10, -5] or [5, 10).

    Values are first drawn uniformly from [-10, 10); every entry whose
    absolute value is below 5 is then zeroed. The update runs under
    ``torch.no_grad()`` so it is not recorded for gradient computation.

    Args:
        tensor: the Tensor (or Parameter) to overwrite.

    Returns:
        The same ``tensor`` object, so the function can be used like the
        other initializers here, which also return the tensor.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # Multiply by a 0/1 mask: entries with |value| >= 5 are kept,
        # all others become (signed) zero.
        tensor.mul_((tensor.abs() >= 5).float())
    return tensor


net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))
# Apply the custom initializer to every weight and print the result.
for name, param in net.named_parameters():
    if 'weight' not in name:
        continue  # biases are left untouched
    init_weight_(param)
    print(name, param.data)


# 0.weight tensor([[-8.2931,  0.0000,  6.2811, -0.0000],
#         [-0.0000,  5.2139, -6.8847, -0.0000],
#         [ 0.0000, -9.9403,  0.0000, -6.1358]])
# 2.weight tensor([[-5.6402, -7.9804, -9.5935]])

5. Sharing model parameters

In some cases, we want to share model parameters between multiple layers.

Method 1: Call the same layer multiple times in the forward function
Method 2: The module passed into Sequential is the same Module instance, and the parameters are also shared

import torch
from torch import nn

linear = nn.Linear(1, 1, bias=False)

# Passing the same Module instance twice makes both positions share one set of parameters.
net = nn.Sequential(linear, linear)
print(net)
# Sequential(
#   (0): Linear(in_features=1, out_features=1, bias=False)
#   (1): Linear(in_features=1, out_features=1, bias=False)
# )


# The shared weight is listed only once, under the name of its first occurrence.
for name, param in net.named_parameters():
    nn.init.constant_(param, val=3)
    print(name, param.data)
# 0.weight tensor([[3.]])


# In memory, the two linear layers are actually the same object:
print(net[0] is net[1])   # True
print(net[0].weight is net[1].weight)   # True


x = torch.ones(1, 1)
y = net(x).sum()
print(y)   # tensor(9., grad_fn=<SumBackward0>)
y.backward()
print(net[0].weight.grad) # tensor([[6.]])    each use of the shared weight contributes a gradient of 3, and the two uses accumulate to 6

[pytorch] access and initialization of model parameters

Table of contents

1. Access to model parameters

2. Model parameter torch.nn.parameter

3. Initialization of model parameters

4. Custom parameter initialization method

5. Sharing model parameters

Guess you like