A General Framework for Uncertainty Estimation in Deep Learning: Source Code Reading (2)

continued

ResNet definition:

Model construction entry point:

def ResNet18ADF(noise_variance=1e-3, min_variance=1e-3):
    return ResNet(BasicBlock, [2,2,2,2], num_classes=10, noise_variance=noise_variance, min_variance=min_variance, initialize_msra=False)
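
A minimal usage sketch (not from the original repo; it assumes the adf package and the classes defined below are importable, and CIFAR-10 sized inputs):

import torch

model = ResNet18ADF(noise_variance=1e-3, min_variance=1e-3)
model.eval()

images = torch.randn(2, 3, 32, 32)                 # dummy CIFAR-10 sized batch
with torch.no_grad():
    outputs_mean, outputs_variance = model(images)

print(outputs_mean.shape, outputs_variance.shape)  # torch.Size([2, 10]) each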

This builds the model, where the ResNet class is defined as:

class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10, noise_variance=1e-3, min_variance=1e-3, initialize_msra=False):
        super(ResNet, self).__init__()

        self.keep_variance_fn = lambda x: keep_variance(x, min_variance=min_variance)
        self._noise_variance = noise_variance

        self.in_planes = 64

        self.conv1 = adf.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False, keep_variance_fn=self.keep_variance_fn)
        self.bn1 = adf.BatchNorm2d(64, keep_variance_fn=self.keep_variance_fn)
        self.ReLU = adf.ReLU(keep_variance_fn=self.keep_variance_fn)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1, keep_variance_fn=self.keep_variance_fn)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2, keep_variance_fn=self.keep_variance_fn)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2, keep_variance_fn=self.keep_variance_fn)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2, keep_variance_fn=self.keep_variance_fn)
        self.linear = adf.Linear(512*block.expansion, num_classes, keep_variance_fn=self.keep_variance_fn)
        self.AvgPool2d = adf.AvgPool2d(keep_variance_fn=self.keep_variance_fn)

    def _make_layer(self, block, planes, num_blocks, stride, keep_variance_fn=None):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride, keep_variance_fn=self.keep_variance_fn))
            self.in_planes = planes * block.expansion
        return adf.Sequential(*layers)

    def forward(self, x):
 
        inputs_mean = x
        inputs_variance = torch.zeros_like(inputs_mean) + self._noise_variance
        x = inputs_mean, inputs_variance

        out = self.ReLU(*self.bn1(*self.conv1(*x)))
        out = self.layer1(*out)
        out = self.layer2(*out)
        out = self.layer3(*out)
        out = self.layer4(*out)
        out = self.AvgPool2d(*out, 4)
        out_mean = out[0].view(out[0].size(0), -1) # Flatten
        out_var = out[1].view(out[1].size(0), -1)
        out = out_mean, out_var
        out = self.linear(*out)
        return out

The role of * here:

In Python, an asterisk (*) in a function call tells Python to unpack an iterable into individual values before passing them to the function as separate positional arguments.
When an asterisk appears before a parameter name in a function definition, it collects a variable number of arguments into a tuple. This is called a variable-length (or arbitrary) argument list, and it lets a function accept an indeterminate number of arguments.
Here is an example showing how to define a variable-length argument list with an asterisk:

def my_func(*args):
    for arg in args:
        print(arg)

my_func(1, 2, 3, 4, 5)

This function accepts any number of arguments and prints each one. Inside the function, the parameter args is a tuple containing all the arguments passed to the call.
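
Applied to the code above, it is the first usage that matters: x and out are (mean, variance) tuples, and *x unpacks such a tuple into the two positional arguments that each adf module's forward() expects. A tiny illustration with a hypothetical two-argument function:

def takes_mean_and_variance(mean, variance):
    return mean + variance

pair = (1.0, 0.01)                                # stands in for (mean, variance)
print(takes_mean_and_variance(*pair))             # unpacked call
print(takes_mean_and_variance(pair[0], pair[1]))  # equivalent explicit call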

(Note that the output of a model layer such as self.layer1 is therefore a tuple with two parts: a mean tensor and a variance tensor.)

adf.Sequential()

The key piece of ResNet is _make_layer, which assembles each stage with adf.Sequential:

class Sequential(nn.Module):
    def __init__(self, *args):
        super(Sequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def _get_item_by_idx(self, iterator, idx):
        """Get the idx-th item of the iterator"""
        size = len(self)
        idx = operator.index(idx)
        if not -size <= idx < size:
            raise IndexError('index {} is out of range'.format(idx))
        idx %= size
        return next(islice(iterator, idx, None))

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            return Sequential(OrderedDict(list(self._modules.items())[idx]))
        else:
            return self._get_item_by_idx(self._modules.values(), idx)

    def __setitem__(self, idx, module):
        key = self._get_item_by_idx(self._modules.keys(), idx)
        return setattr(self, key, module)

    def __delitem__(self, idx):
        if isinstance(idx, slice):
            for key in list(self._modules.keys())[idx]:
                delattr(self, key)
        else:
            key = self._get_item_by_idx(self._modules.keys(), idx)
            delattr(self, key)

    def __len__(self):
        return len(self._modules)

    def __dir__(self):
        keys = super(Sequential, self).__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def forward(self, inputs, inputs_variance):
        for module in self._modules.values():
            inputs, inputs_variance = module(inputs, inputs_variance)

        return inputs, inputs_variance

The role of add_module():

add_module() is a method of PyTorch's nn.Module class used to add a submodule to the current module. It accepts two parameters:

name: the name under which the submodule is registered.
module: the submodule to add.

Here is an example showing how to add a submodule to a module using the add_module() method:

import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.fc1 = nn.Linear(32 * 28 * 28, 1024)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(1024, 10)
        self.softmax = nn.Softmax(dim=1)
        # Add a BatchNorm2d module using the add_module() method
        self.bn = nn.BatchNorm2d(32)
        self.add_module('batch_norm', self.bn)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.bn(x)
        x = self.relu2(x)
        x = x.view(-1, 32 * 28 * 28)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In this example, we create a custom model MyModel and use the add_module() method to add a BatchNorm2d module to the model. This method makes it easy to manage the submodules in the model and access and modify them when needed.

In the code above, the BatchNorm2d module could be used directly through self.bn without calling add_module(), since assigning it to an attribute already registers it (under the name 'bn'). Using add_module(), however, binds a name of your choice to the module instance, making the module accessible and modifiable by that name elsewhere in the model. This is useful when the model is complex.

For example, if we want to access all BatchNorm2d modules in the model, we can use the named_modules() method to get all modules together with their names and filter out the BatchNorm2d instances:

for name, module in my_model.named_modules():
    if isinstance(module, nn.BatchNorm2d):
        print(f'{name}: {module}')

This prints every BatchNorm2d module in the model along with its name.

In addition, add_module() can be used together with register_parameter() and related methods to conveniently manage a model's parameters and other attributes. When designing complex models, using add_module() can therefore improve the readability and maintainability of the code.
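
For instance, a small sketch (the Scaler module below is hypothetical, not part of the paper's code) that registers both a submodule and a parameter under explicit names:

import torch
import torch.nn as nn

class Scaler(nn.Module):
    def __init__(self):
        super(Scaler, self).__init__()
        # Register a submodule and a parameter by name
        self.add_module('norm', nn.BatchNorm2d(32))
        self.register_parameter('gain', nn.Parameter(torch.ones(1)))

    def forward(self, x):
        return self.norm(x) * self.gain

m = Scaler()
print([name for name, _ in m.named_parameters()])
# ['gain', 'norm.weight', 'norm.bias']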

In forward(), the registered submodules are read back in order and applied one after another:

    def forward(self, inputs, inputs_variance):
        for module in self._modules.values():
            inputs, inputs_variance = module(inputs, inputs_variance)

        return inputs, inputs_variance
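
As a minimal sketch (assuming the adf layers shown in this post are importable), an adf.Sequential is called with a mean tensor and a variance tensor, and the pair is threaded through every submodule:

import torch
# `adf` is the repo's ADF layers module (exact import path depends on the repo layout)

seq = adf.Sequential(
    adf.Conv2d(3, 8, kernel_size=3, padding=1, bias=False),
    adf.ReLU(),
)
mean = torch.randn(1, 3, 32, 32)
variance = torch.full_like(mean, 1e-3)     # small initial input variance
out_mean, out_variance = seq(mean, variance)
print(out_mean.shape, out_variance.shape)  # both torch.Size([1, 8, 32, 32])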

BasicBlock

_make_layer instantiates block() for each stride, and block here refers to BasicBlock:

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, keep_variance_fn=None):
        super(BasicBlock, self).__init__()
        
        self.keep_variance_fn = keep_variance_fn
        
        self.conv1 = adf.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False, keep_variance_fn=self.keep_variance_fn)
        self.bn1 = adf.BatchNorm2d(planes, keep_variance_fn=self.keep_variance_fn)
        self.conv2 = adf.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False, keep_variance_fn=self.keep_variance_fn)
        self.bn2 = adf.BatchNorm2d(planes, keep_variance_fn=self.keep_variance_fn)
        self.ReLU = adf.ReLU(keep_variance_fn=self.keep_variance_fn)

        self.shortcut = adf.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = adf.Sequential(
                adf.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False, keep_variance_fn=self.keep_variance_fn),
                adf.BatchNorm2d(self.expansion*planes, keep_variance_fn=self.keep_variance_fn)
            )

    def forward(self, inputs_mean, inputs_variance):
        x = inputs_mean, inputs_variance
        
        out = self.ReLU(*self.bn1(*self.conv1(*x)))
        out_mean, out_var = self.bn2(*self.conv2(*out))
        shortcut_mean, shortcut_var = self.shortcut(*x)
        out_mean, out_var = out_mean + shortcut_mean, out_var + shortcut_var
        out = out_mean, out_var 
        out = self.ReLU(*out)
        return out

In BasicBlock, the mean and variance are packed into a single tuple out and unpacked with *, while Sequential's forward passes the mean and variance as two separate arguments. Is there a reason for the difference? (The two forms are equivalent: calling module(*out) with out = (mean, var) is exactly calling module(mean, var), so the difference appears to be purely stylistic.)

adf.Conv2d()

You can see that BasicBlock uses adf.Conv2d():

class Conv2d(_ConvNd):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True,
                 keep_variance_fn=None, padding_mode='zeros'):
        self._keep_variance_fn = keep_variance_fn
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        super(Conv2d, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding, dilation,
            False, _pair(0), groups, bias, padding_mode)

    def forward(self, inputs_mean, inputs_variance):
        outputs_mean = F.conv2d(
            inputs_mean, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
        outputs_variance = F.conv2d(
            inputs_variance, self.weight ** 2, None, self.stride, self.padding, self.dilation, self.groups)
        if self._keep_variance_fn is not None:
            outputs_variance = self._keep_variance_fn(outputs_variance)
        return outputs_mean, outputs_variance

Here the weights are shared between the mean path and the variance path:

Do the two convolutions producing outputs_mean and outputs_variance share weights?
Yes. In the class's __init__() method, the convolution kernel weights and bias are initialized through the parent class _ConvNd's constructor, and those same parameters are used in the forward() method.
Specifically, the mean path calls F.conv2d() on inputs_mean with self.weight as the kernel, while the variance path calls F.conv2d() on inputs_variance with self.weight ** 2, the elementwise square of the very same parameter tensor, and with no bias. There is therefore only one set of learnable weights; the variance path does not have a separate kernel, it simply uses the squares of the shared one.
Note that in convolutional neural networks, weight sharing is a common technique to reduce the number of parameters that need to be trained, thereby mitigating the risk of overfitting. However, shared weights can also limit the expressiveness of the model, so the trade-off depends on the specific problem and dataset.

Why does this code use weight ** 2 as the kernel for the variance? The reason: for a linear operation y = w1*x1 + ... + wn*xn with (approximately) independent inputs, Var(y) = w1^2*Var(x1) + ... + wn^2*Var(xn). A convolution is exactly such a weighted sum at every output location, so propagating the variance amounts to convolving the input variance with the squared weights; the bias, being a constant, adds nothing to the variance.
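
A small sanity check (not from the repo; the shapes are made up) that the squared-weight convolution reproduces the output variance under this independence assumption:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
mean = torch.randn(1, 3, 8, 8)
variance = torch.rand(1, 3, 8, 8) + 0.1  # elementwise input variances
weight = torch.randn(4, 3, 3, 3)         # a random 3x3 convolution kernel

# ADF-style analytic propagation: Var(sum_i w_i * x_i) = sum_i w_i^2 * Var(x_i)
var_adf = F.conv2d(variance, weight ** 2, padding=1)

# Monte Carlo reference: sample independent Gaussian inputs and measure the
# empirical variance of the convolution outputs over the sample dimension
samples = mean + variance.sqrt() * torch.randn(20000, 3, 8, 8)
outputs = F.conv2d(samples, weight, padding=1)
var_mc = outputs.var(dim=0, unbiased=False, keepdim=True)

print((var_mc - var_adf).abs().max() / var_adf.max())  # small, up to MC noise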

adf.BatchNorm2d()

You can see that adf.BatchNorm2d() is used in BasicBlock:

class BatchNorm2d(nn.Module):
    _version = 2
    __constants__ = ['track_running_stats', 'momentum', 'eps', 'weight', 'bias',
                     'running_mean', 'running_var', 'num_batches_tracked']

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True,
                 track_running_stats=True, keep_variance_fn=None):
        super(BatchNorm2d, self).__init__()
        self._keep_variance_fn = keep_variance_fn
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = Parameter(torch.Tensor(num_features))
            self.bias = Parameter(torch.Tensor(num_features))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        if self.track_running_stats:
            self.register_buffer('running_mean', torch.zeros(num_features))
            self.register_buffer('running_var', torch.ones(num_features))
            self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
        else:
            self.register_parameter('running_mean', None)
            self.register_parameter('running_var', None)
            self.register_parameter('num_batches_tracked', None)
        self.reset_parameters()

    def reset_running_stats(self):
        if self.track_running_stats:
            self.running_mean.zero_()
            self.running_var.fill_(1)
            self.num_batches_tracked.zero_()

    def reset_parameters(self):
        self.reset_running_stats()
        if self.affine:
            nn.init.uniform_(self.weight)
            nn.init.zeros_(self.bias)

    def _check_input_dim(self, input):
        raise NotImplementedError

    def forward(self, inputs_mean, inputs_variance):

        # exponential_average_factor is set to self.momentum
        # (when it is available) only so that it gets updated
        # in ONNX graph when this node is exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            if self.num_batches_tracked is not None:
                self.num_batches_tracked += 1
                if self.momentum is None:  # use cumulative moving average
                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)
                else:  # use exponential moving average
                    exponential_average_factor = self.momentum

        outputs_mean = F.batch_norm(
            inputs_mean, self.running_mean, self.running_var, self.weight, self.bias,
            self.training or not self.track_running_stats,
            exponential_average_factor, self.eps)
        outputs_variance = inputs_variance
        weight = ((self.weight.unsqueeze(0)).unsqueeze(2)).unsqueeze(3)
        outputs_variance = outputs_variance*weight**2
        """
        for i in range(outputs_variance.size(1)):
            outputs_variance[:,i,:,:]=outputs_variance[:,i,:,:].clone()*self.weight[i]**2
        """
        if self._keep_variance_fn is not None:
            outputs_variance = self._keep_variance_fn(outputs_variance)
        return outputs_mean, outputs_variance

For the (mean, variance) pair, only the mean is batch-normalized; the variance is not normalized, it is only scaled by the square of the affine weight.
This layer also introduces two learnable parameters:

In the constructor of the BatchNorm2d class, when the affine parameter is True, two learnable parameters, self.weight and self.bias, are initialized. self.weight is a 1D tensor of shape (num_features,) used to scale the normalized data, and self.bias is a 1D tensor of the same shape used to shift it.
These two parameters are applied to the output in the forward() method, which improves the expressiveness and flexibility of the model. self.weight scales the normalized data: data with mean 0 and variance 1 becomes data with mean 0 and variance self.weight ** 2 per channel, which is exactly why the propagated variance above is multiplied by weight ** 2. self.bias then shifts the scaled data by a constant offset so the model can fit more varied data distributions; a constant shift does not change the variance, so it does not appear in the variance path.
Note that if affine is set to False in the constructor, self.weight and self.bias are not created and no scaling or shifting is applied. In that case BatchNorm2d only normalizes the input's mean and variance without introducing extra learnable parameters.
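
A quick sketch (made-up shapes) of how the variance path above broadcasts the per-channel weight over an (N, C, H, W) tensor and scales by its square:

import torch

num_features = 4
weight = torch.rand(num_features)                    # affine scale, shape (C,)
inputs_variance = torch.rand(2, num_features, 8, 8)  # (N, C, H, W)

w = weight.unsqueeze(0).unsqueeze(2).unsqueeze(3)    # reshape to (1, C, 1, 1)
outputs_variance = inputs_variance * w ** 2          # scale each channel's variance
print(outputs_variance.shape)                        # torch.Size([2, 4, 8, 8])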

adf.ReLU()

You can see that adf.ReLU() is used in BasicBlock:

class ReLU(nn.Module):
    def __init__(self, keep_variance_fn=None):
        super(ReLU, self).__init__()
        self._keep_variance_fn = keep_variance_fn

    def forward(self, features_mean, features_variance):
        features_stddev = torch.sqrt(features_variance)
        div = features_mean / features_stddev
        pdf = normpdf(div)
        cdf = normcdf(div)
        outputs_mean = features_mean * cdf + features_stddev * pdf
        outputs_variance = (features_mean ** 2 + features_variance) * cdf \
                           + features_mean * features_stddev * pdf - outputs_mean ** 2
        if self._keep_variance_fn is not None:
            outputs_variance = self._keep_variance_fn(outputs_variance)
        return outputs_mean, outputs_variance

In the forward() method, the input has two parts: features_mean, the mean of the input features, and features_variance, their variance. The standard deviation features_stddev is computed first, then pdf and cdf, the probability density function and cumulative distribution function of the standard normal distribution evaluated at features_mean / features_stddev. The output mean outputs_mean and variance outputs_variance are then the closed-form first and second central moments of a rectified Gaussian, i.e. of max(0, X) with X ~ N(features_mean, features_variance), which is exactly what applying ReLU to a Gaussian-distributed input produces.
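
A small numerical check (not from the repo) that these formulas really are the moments of ReLU applied to a Gaussian; normpdf and normcdf are assumed to be the standard normal pdf and cdf helpers used by the repo:

import math
import torch

def normpdf(x):
    return torch.exp(-0.5 * x ** 2) / math.sqrt(2 * math.pi)

def normcdf(x):
    return 0.5 * (1 + torch.erf(x / math.sqrt(2)))

mu = torch.tensor(0.3)
var = torch.tensor(0.5)
std = var.sqrt()
div = mu / std

# Closed-form mean and variance, as in adf.ReLU.forward
mean_adf = mu * normcdf(div) + std * normpdf(div)
var_adf = (mu ** 2 + var) * normcdf(div) + mu * std * normpdf(div) - mean_adf ** 2

# Monte Carlo reference: apply ReLU to samples of N(mu, var)
x = mu + std * torch.randn(1_000_000)
y = torch.relu(x)
print(mean_adf.item(), y.mean().item())  # should agree closely
print(var_adf.item(), y.var().item())    # should agree closely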

Origin blog.csdn.net/weixin_44907625/article/details/129885048