Paper reading notes: SqueezeNet

1. SqueezeNet

Iandola, Forrest N., et al. "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size." arXiv preprint arXiv:1602.07360 (2016).

This paper proposes a lightweight image classification network, SqueezeNet, which, as the title says, has 50x fewer parameters than AlexNet at comparable accuracy. A model this small can be deployed on embedded devices and reduces communication overhead during distributed training.

The idea of the article comes from three strategies for designing the network architecture:

  1. Replace 3x3 filters with 1x1 filters wherever possible.
  2. Decrease the number of input channels to the remaining 3x3 filters.
  3. Downsample late in the network, so that convolution layers have large activation maps.

Strategies 1 and 2 reduce the parameter count, while strategy 3 keeps the intermediate feature maps of the network large, which helps preserve model accuracy; the sketch after this list gives a feel for the savings.
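As a rough illustration of strategies 1 and 2, consider the weight count of a single convolution layer, k x k x C_in x C_out (a minimal sketch; the channel sizes below are made up for illustration):

def conv_params(k, c_in, c_out):
    # Weights of a k x k convolution layer, ignoring biases
    return k * k * c_in * c_out

print(conv_params(3, 64, 64))  # 36864: a plain 3x3 convolution
print(conv_params(1, 64, 64))  # 4096:  strategy 1, 1x1 instead of 3x3 (9x fewer)
print(conv_params(3, 16, 64))  # 9216:  strategy 2, fewer input channels to the 3x3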
[Figure: the Fire Module]

Based on these three strategies, the authors designed the module shown in the figure above, called the Fire Module. The input first passes through the 1x1 convolutions of the squeeze layer, which reduce the channel dimension; following strategy 2, this keeps the number of input channels to the subsequent 3x3 convolutions small. The expand layer is a mixture of 1x1 and 3x3 convolutions, and the outputs of the two convolutions are concatenated along the channel dimension. There is no downsampling anywhere in the module, so the output has the same spatial size as the input, while the number of output channels is the sum of the expand layer's 1x1 and 3x3 output channels.

The code for the Fire Module is given below:

import torch
import torch.nn as nn


class Fire(nn.Module):

    def __init__(self, inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes):
        # Output shape: [batch_size, expand1x1_planes + expand3x3_planes, H, W]
        super(Fire, self).__init__()
        self.squeeze = nn.Conv2d(in_channels=inplanes,
                                 out_channels=squeeze_planes,
                                 kernel_size=1)  # 1x1 convolution to reduce the channel count
        self.squeeze_activation = nn.ReLU(inplace=True)
        # Mixed 1x1/3x3 expand convolutions, no downsampling
        self.expand1x1 = nn.Conv2d(in_channels=squeeze_planes,
                                   out_channels=expand1x1_planes,
                                   kernel_size=1)
        self.expand1x1_activation = nn.ReLU(inplace=True)
        self.expand3x3 = nn.Conv2d(in_channels=squeeze_planes,
                                   out_channels=expand3x3_planes,
                                   kernel_size=3,
                                   padding=1)
        self.expand3x3_activation = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.squeeze(x)
        x = self.squeeze_activation(x)
        y1 = self.expand1x1_activation(self.expand1x1(x))
        y2 = self.expand3x3_activation(self.expand3x3(x))
        return torch.cat((y1, y2), dim=1)  # concatenate along the channel dimension
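A quick shape check (a minimal sketch with arbitrary sizes) confirms that the spatial dimensions are preserved while the output channels are the sum of the two expand branches:

fire = Fire(inplanes=96, squeeze_planes=16,
            expand1x1_planes=64, expand3x3_planes=64)
x = torch.randn(1, 96, 54, 54)  # dummy input [N, C, H, W]
print(fire(x).shape)            # torch.Size([1, 128, 54, 54])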

Stack several Fire Modules, using max-pooling layers for downsampling:
[Figure: SqueezeNet macroarchitecture]

class SqueezeNet(nn.Module):

    def __init__(self, version="1.0", num_classes=10):
        super(SqueezeNet, self).__init__()
        self.num_classes = num_classes
        if version == "1.0":
            self.features = nn.Sequential(
                nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(96, 16, 64, 64),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(512, 64, 256, 256),
            )
        elif version == "1.1":
            self.features = nn.Sequential(
                nn.Conv2d(1, 64, kernel_size=3, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(64, 16, 64, 64),
                Fire(128, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 32, 128, 128),
                Fire(256, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        else:
            raise ValueError(f"Unsupported SqueezeNet version: {version}")
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Conv2d(512, self.num_classes, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((1, 1))
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return torch.flatten(x, 1)
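A minimal smoke test (assuming single-channel 224x224 inputs, matching the in_channels=1 above):

model = SqueezeNet(version="1.0", num_classes=10)
x = torch.randn(2, 1, 224, 224)  # dummy grayscale batch
logits = model(x)
print(logits.shape)                                 # torch.Size([2, 10])
print(sum(p.numel() for p in model.parameters()))   # total parameter count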

Finally, here is the variant with residual (bypass) connections:

class Squeeze(nn.Module):

    def __init__(self, num_classes=10):
        super(Squeeze, self).__init__()
        self.conv1 = nn.Conv2d(1, 96, kernel_size=3, stride=2)
        self.relu = nn.ReLU(inplace=True)
        self.max1 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

        self.fire2 = Fire(96, 16, 64, 64)
        self.fire3 = Fire(128, 16, 64, 64)
        self.fire4 = Fire(128, 32, 128, 128)
        self.max2 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

        self.fire5 = Fire(256, 32, 128, 128)
        self.fire6 = Fire(256, 48, 192, 192)
        self.fire7 = Fire(384, 48, 192, 192)
        self.fire8 = Fire(384, 64, 256, 256)
        self.max3 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

        self.fire9 = Fire(512, 64, 256, 256)
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Conv2d(512, num_classes, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.max1(x)

        x = self.fire2(x)
        x = x + self.fire3(x)
        x = self.fire4(x)

        x = self.max2(x)
        x = x + self.fire5(x)
        x = self.fire6(x)
        x = x + self.fire7(x)
        x = self.fire8(x)

        x = self.max3(x)
        x = x + self.fire9(x)

        x = self.classifier(x)
        return torch.flatten(x, 1)
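Note that a bypass is only added around the Fire modules whose input and output channel counts match (fire3, fire5, fire7, fire9), since element-wise addition requires identical shapes; this mirrors the "simple bypass" variant in the paper. A quick smoke test, under the same single-channel input assumption as above:

model = Squeeze(num_classes=10)
x = torch.randn(2, 1, 224, 224)
print(model(x).shape)  # torch.Size([2, 10])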

Original post: https://blog.csdn.net/loki2018/article/details/124863810