Simple reproduction of ResNet, GoogLeNet, MobileNet, SqueezeNet, ShuffleNet, and DenseNet

1. Residual network

1) Network structure

For a residual block y = F(x) + x, the derivative with respect to x is dF/dx + 1, so even when the partial derivative of F(x) with respect to x is very small, the overall gradient stays close to 1.

This mitigates the vanishing gradient problem, so layers close to the input still receive useful updates.

Note that F(x) must have the same tensor shape as x, so the two can be added.
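A quick autograd check (a toy sketch of my own, not from the original post) makes this concrete: when F is nearly zero, the gradient flowing through F(x) + x is still about 1.

import torch

x = torch.randn(4, requires_grad=True)
w = torch.full((4,), 1e-6)     # stand-in for a tiny F: F(x) = w * x
y = (w * x + x).sum()          # residual form F(x) + x
y.backward()
print(x.grad)                  # ~1.000001 everywhere, not ~1e-6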

2) Code 

import torch
from torchvision import transforms   # tools for preprocessing the raw images
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim 
batch_size = 64
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,),(0.3081,))
])
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)
class ResidualBlock(torch.nn.Module):
    def __init__(self,channels):
        super(ResidualBlock,self).__init__()
        self.channels = channels
        # both convs keep the channel count and (with padding=1) the spatial size,
        # so the skip connection x + y is well defined
        self.conv1 = torch.nn.Conv2d(channels,channels,kernel_size=3,padding=1)
        self.conv2 = torch.nn.Conv2d(channels,channels,kernel_size=3,padding=1)

    def forward(self,x):
        y = F.relu(self.conv1(x))
        y = self.conv2(y)
        return F.relu(x+y)   # add the identity shortcut, then apply ReLU
class Net(torch.nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = torch.nn.Conv2d(1,16,kernel_size=5)
        self.conv2 = torch.nn.Conv2d(16,32,kernel_size=5)
        self.pooling = torch.nn.MaxPool2d(2)
        
        self.rblock1 = ResidualBlock(16)
        self.rblock2 = ResidualBlock(32)
        
        
        self.fc = torch.nn.Linear(512,10)   # 32 channels * 4 * 4 after two conv+pool stages
    
    def forward(self,x):
        in_size = x.size(0)
        x = self.pooling(F.relu(self.conv1(x)))
        x = self.rblock1(x)
        x = self.pooling(F.relu(self.conv2(x)))
        x = self.rblock2(x)
        x = x.view(in_size,-1)   #flatten
        x = self.fc(x)
        return x
model = Net()
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=0.01,momentum=0.5)
def train(epoch):
    running_loss = 0.0
    model.train()
    for batch_idx,data in enumerate(train_loader,0):
        inputs,target = data
        #print(inputs.shape)
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(inputs)
        loss = criterion(outputs,target)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        if batch_idx % 300 == 299:
            print('[%d,%5d] loss:%.3f' %(epoch +1 ,batch_idx+1,running_loss/300))
            running_loss = 0.0
for epoch in range(10):
    train(epoch)

2. GoogLeNet

1) Network structure

The starting point of the model: it is not obvious which convolution kernel size works best, so the Inception block applies several kernel sizes in parallel to obtain feature maps, and then concatenates those feature maps.

Note: the feature maps produced by the parallel branches may differ in channel count c, but their width w and height h must be the same so they can be concatenated.
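As a toy illustration (my own, not from the original post), torch.cat along the channel dimension works precisely because the spatial sizes agree:

import torch

a = torch.randn(1, 16, 12, 12)   # 16 channels, 12x12
b = torch.randn(1, 24, 12, 12)   # 24 channels, same 12x12
print(torch.cat([a, b], dim=1).shape)   # torch.Size([1, 40, 12, 12])
# torch.cat would raise a RuntimeError if w or h differed between branches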

Now take a look at the 1x1 convolution kernel.

It is essentially information fusion in the form of a weighted sum across channels: each output pixel combines the channel values of the corresponding input pixel, without mixing in information from neighboring pixels.
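In Inception the 1x1 convolutions also reduce the channel count before the expensive 3x3 and 5x5 convolutions. A sketch of the parameter saving (my own example; the 192/16/32 channel counts are illustrative):

import torch.nn as nn

direct  = nn.Conv2d(192, 32, kernel_size=5, padding=2)   # 192*32*5*5 + 32 = 153,632 params
reduced = nn.Sequential(
    nn.Conv2d(192, 16, kernel_size=1),                   # 1x1 bottleneck first
    nn.Conv2d(16, 32, kernel_size=5, padding=2),
)                                                        # 15,920 params in total
print(sum(p.numel() for p in direct.parameters()))
print(sum(p.numel() for p in reduced.parameters()))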

Core network structure:

2) Core code

import torch.nn as nn   # alias used in this and the following sections

class InceptionA(nn.Module):
    def __init__(self,in_channels):
        super(InceptionA,self).__init__()
        self.branch1x1 = nn.Conv2d(in_channels,16,kernel_size=1)
        
        self.branch5x5_1 = nn.Conv2d(in_channels,16,kernel_size=1)
        self.branch5x5_2 = nn.Conv2d(16,24,kernel_size=5,padding=2)
        
        self.branch3x3_1 = nn.Conv2d(in_channels,16,kernel_size=1)
        self.branch3x3_2 = nn.Conv2d(16,24,kernel_size=3,padding=1)
        self.branch3x3_3 = nn.Conv2d(24,24,kernel_size=3,padding=1)
        
        self.branch_pool = nn.Conv2d(in_channels,24,kernel_size=1)
    
    def forward(self,x):
        branch1x1 = self.branch1x1(x)
        
        branch5x5 = self.branch5x5_1(x)
        branch5x5 = self.branch5x5_2(branch5x5)
        
        branch3x3 = self.branch3x3_1(x)
        branch3x3 = self.branch3x3_2(branch3x3)
        branch3x3 = self.branch3x3_3(branch3x3)
        
        branch_pool = F.avg_pool2d(x,kernel_size=3,stride=1,padding=1)
        branch_pool = self.branch_pool(branch_pool)
        
        outputs = [branch1x1,branch5x5,branch3x3,branch_pool]   # 16 + 24 + 24 + 24 = 88 channels
        
        return torch.cat(outputs,dim=1)
        

class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = nn.Conv2d(1,10,kernel_size=5)
        self.conv2 = nn.Conv2d(88,20,kernel_size=5)   # 88 = output channels of InceptionA

        self.incep1 = InceptionA(in_channels = 10)
        self.incep2 = InceptionA(in_channels = 20)

        self.mp = nn.MaxPool2d(2)
        self.fc = nn.Linear(1408,10)
    def forward(self,x):
        in_size = x.size(0)
        x = F.relu(self.mp(self.conv1(x)))
        x = self.incep1(x)
        x = F.relu(self.mp(self.conv2(x)))
        x = self.incep2(x)
        x = x.view(in_size,-1)   # flatten: 88 * 4 * 4 = 1408
        x = self.fc(x)
        return x
model = Net()
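A quick shape sanity check (my own addition) shows where the 1408 in nn.Linear(1408,10) comes from for a 28x28 MNIST input:

import torch

x = torch.randn(1, 1, 28, 28)
# conv1 (k=5): 24x24 -> maxpool: 12x12 -> InceptionA: 88 channels
# conv2 (k=5): 8x8   -> maxpool: 4x4  -> InceptionA: 88 channels -> 88*4*4 = 1408
print(model(x).shape)   # torch.Size([1, 10])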

3. MobileNet

1) Network structure

The model uses depthwise separable convolution (DSC), which can be divided into two steps: depthwise convolution and pointwise convolution.

The calculation of depthwise convolution is very simple: each channel of the input feature map gets its own single convolution kernel, and the single-channel outputs of all kernels are concatenated to form the final output. Since the number of output channels of a convolution equals the number of kernels, and depthwise convolution uses exactly one kernel per channel, each channel's output has one channel. So for an input feature map with N channels, applying one kernel to each of the N channels yields N single-channel feature maps, which are concatenated in order into an output feature map with N channels.

Pointwise convolution is simply a 1x1 convolution, and it plays two roles in DSC. First, it lets DSC freely change the number of output channels; second, it performs channel fusion on the feature map produced by the depthwise convolution.
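To see the saving, compare the parameter counts of a standard convolution and its depthwise-separable counterpart (a sketch of my own; the 64/128 channel counts are illustrative):

import torch.nn as nn

standard = nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=False)      # 64*128*3*3 = 73,728
dsc = nn.Sequential(
    nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=64, bias=False),  # depthwise: 64*3*3 = 576
    nn.Conv2d(64, 128, kernel_size=1, bias=False),                       # pointwise: 64*128 = 8,192
)
print(sum(p.numel() for p in standard.parameters()))   # 73728
print(sum(p.numel() for p in dsc.parameters()))        # 8768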


2) Core code

class MobileNet(nn.Module):   # one depthwise-separable convolution block
    def __init__(self,inp,oup,stride):
        super(MobileNet,self).__init__()
        # depthwise: groups=inp gives each input channel its own 3x3 kernel
        self.dw = torch.nn.Conv2d(inp,inp,kernel_size=3,stride=stride,padding=1,groups=inp,bias=False)
        self.bn1 = torch.nn.BatchNorm2d(inp)
        # pointwise: a 1x1 conv fuses channels and sets the output channel count
        self.pw = torch.nn.Conv2d(inp,oup,kernel_size=1,stride=1,padding=0,bias=False)
        self.bn2 = torch.nn.BatchNorm2d(oup)
    def forward(self,x):
        x = F.relu(self.bn1(self.dw(x)))
        x = F.relu(self.bn2(self.pw(x)))
        return x
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = nn.Conv2d(1,32,kernel_size=3,stride=2,padding=1,bias=False)
        self.bn1 = torch.nn.BatchNorm2d(32)

        self.m1 = MobileNet(32,64,1)
        self.m2 = MobileNet(64,128,2)
        self.m3 = MobileNet(128,128,1)

        self.mp = nn.AdaptiveAvgPool2d(1)   # global average pooling down to 1x1
        self.fc = nn.Linear(128,10)
    def forward(self,x):
        in_size = x.size(0)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.m1(x)
        x = self.m2(x)
        x = self.m3(x)
        #print(x.shape)
        x = self.mp(x)
        x = x.view(in_size,-1)
        x = self.fc(x)
        return x
model = Net()

4. SqueezeNet

1) Network structure

SqueezeNet is built from Fire modules: a 1x1 "squeeze" convolution first reduces the channel count, then parallel 1x1 and 3x3 "expand" convolutions run on the squeezed output and their results are concatenated along the channel dimension (see the Fire class below).

2) Core code

class Fire(torch.nn.Module):
    def __init__(self,inp,squ_outp,e1x1_outp,e3x3_outp):
        super(Fire,self).__init__()
        # squeeze: 1x1 conv reduces the channel count before the expand convs
        self.squeeze = torch.nn.Conv2d(inp,squ_outp,kernel_size=1)
        # expand: parallel 1x1 and 3x3 convs whose outputs are concatenated
        self.conve1x1 = torch.nn.Conv2d(squ_outp,e1x1_outp,kernel_size=1)
        self.conve3x3 = torch.nn.Conv2d(squ_outp,e3x3_outp,kernel_size=3,padding=1)
    def forward(self,x):
        x = F.relu(self.squeeze(x))
        x1 = F.relu(self.conve1x1(x))
        x3 = F.relu(self.conve3x3(x))

        return torch.cat([x1,x3],1)   # e1x1_outp + e3x3_outp output channels
        
class Net(torch.nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = torch.nn.Conv2d(1,32,kernel_size=3,stride=2)
        self.fire1 = Fire(32,16,64,64)
        self.fire2 = Fire(128,16,64,64)
        self.fire3 = Fire(128,16,64,64)
        self.final_conv = torch.nn.Conv2d(128,10,kernel_size=1)
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(p=0.5),
            self.final_conv,
            torch.nn.ReLU(inplace=True),
            torch.nn.AdaptiveAvgPool2d((1, 1))
        )
        self.pooling = torch.nn.MaxPool2d(kernel_size=3,stride=2,ceil_mode=True)
    
    def forward(self,x):
        batch_size = x.size(0)
        x = self.pooling(F.relu(self.conv1(x)))
        x = self.fire1(x)
        x = self.pooling(self.fire2(x))
        x = self.fire3(x)
        x = self.classifier(x)
        return torch.flatten(x, 1)   

model = Net()

5. ShuffleNet

1) Shuffle (channel rearrangement) via matrix transposition

First look at a simple example (a toy sketch follows below). Channel rearrangement reshapes the channel dimension into (groups, channels_per_group), transposes those two axes, and flattens back; the shuffle_channels function in the core code below implements exactly this.

The effect is to interleave the groups' feature maps (in the original illustration, each group is three feature maps), so that the next group convolution sees channels from every group.
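Here is that toy sketch (my own example): nine channel indices in three groups of three, shuffled by reshape and transpose.

import torch

x = torch.arange(9)                  # channels [0..8]; groups are (0,1,2), (3,4,5), (6,7,8)
x = x.view(3, 3).t().contiguous()    # reshape to (groups, channels_per_group), then transpose
print(x.reshape(-1))                 # tensor([0, 3, 6, 1, 4, 7, 2, 5, 8])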

 2) Group convolution
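In brief: a group convolution with g groups splits the input channels into g independent slices and convolves each slice separately, which divides the parameter count by g. A minimal sketch (my own, with illustrative channel counts):

import torch.nn as nn

full    = nn.Conv2d(24, 24, kernel_size=1, bias=False)             # 24*24 = 576 weights
grouped = nn.Conv2d(24, 24, kernel_size=1, groups=3, bias=False)   # 3 groups of 8->8: 3*8*8 = 192
print(sum(p.numel() for p in full.parameters()))     # 576
print(sum(p.numel() for p in grouped.parameters()))  # 192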

3) Core unit: the ShuffleNet unit

The basic unit of ShuffleNet is an improved residual unit. The starting point, shown in figure (a) of the paper, is a residual unit with three layers: first a 1x1 convolution, then a 3x3 depthwise convolution (DWConv, mainly to reduce computation) acting as the bottleneck layer, then another 1x1 convolution, and finally a shortcut connection that adds the input directly to the output.

Now, make the following improvements: replace the dense 1x1 convolutions with 1x1 group convolutions, and add a channel shuffle operation after the first 1x1 group convolution. It is worth noting that no channel shuffle is added after the 3x3 convolution; according to the paper, one channel shuffle per unit is enough. Also, no ReLU activation follows the 3x3 depthwise convolution. The improved unit is shown in figure (b).

For the residual unit with stride=1, the input and output shapes match and can be added directly. With stride=2 the channel count increases and the feature map shrinks, so input and output no longer match. The usual remedy is a 1x1 convolution that maps the input to the output shape, but ShuffleNet adopts a different strategy, shown in figure (c): apply a 3x3 avg pool with stride=2 to the original input to obtain a feature map of the same spatial size as the output, then concatenate (rather than add) it to the output. The purpose is mainly to reduce computation and parameter size.


To sum up: unit A outputs the same shape as its input x, while unit B increases the channel count and halves the spatial size of the feature map.

4) Core code

def shuffle_channels(x, groups):
    """shuffle channels of a 4-D Tensor"""
    batch_size, channels, height, width = x.size()
    assert channels % groups == 0
    channels_per_group = channels // groups
    # split into groups
    x = x.view(batch_size, groups, channels_per_group,
               height, width)
    # transpose the group and channel axes
    x = x.transpose(1, 2).contiguous()
    # reshape back into the original layout
    x = x.view(batch_size, channels, height, width)
    return x
class ShuffleUnitA(torch.nn.Module):
    def __init__(self,in_channels, out_channels, groups=3):
        super(ShuffleUnitA,self).__init__()
        assert in_channels == out_channels
        assert out_channels % 4 == 0
        bottleneck_channels = out_channels // 4
        self.groups = groups
        self.group_conv1x1_1 = nn.Conv2d(in_channels, bottleneck_channels,kernel_size=1, groups=groups, stride=1)
        self.bn1 = nn.BatchNorm2d(bottleneck_channels)
        self.depthwise_conv = nn.Conv2d(bottleneck_channels,bottleneck_channels,kernel_size=3, padding=1, stride=1,groups=bottleneck_channels)
        self.bn2 = nn.BatchNorm2d(bottleneck_channels)
        self.group_conv1x1_2 = nn.Conv2d(bottleneck_channels, out_channels,kernel_size=1, groups=groups, stride=1)
        self.bn3 = nn.BatchNorm2d(out_channels)
    def forward(self, x):
        out = self.group_conv1x1_1(x)
        out = F.relu(self.bn1(out))
        out = shuffle_channels(out,groups=self.groups)
        out = self.depthwise_conv(out)
        out = self.bn2(out)
        out = self.group_conv1x1_2(out)
        out = self.bn3(out)
        out = F.relu(x+out)
        return out
class ShuffleUnitB(torch.nn.Module):
    def __init__(self,in_channels, out_channels, groups=3):
        super(ShuffleUnitB,self).__init__()
        out_channels -= in_channels   # the pooled shortcut contributes in_channels via the concat
        assert out_channels % 4 == 0
        bottleneck_channels = out_channels // 4
        self.groups = groups
        self.group_conv1x1_1 = nn.Conv2d(in_channels, bottleneck_channels,kernel_size=1, groups=groups, stride=1)
        self.bn1 = nn.BatchNorm2d(bottleneck_channels)
        self.depthwise_conv = nn.Conv2d(bottleneck_channels,bottleneck_channels,kernel_size=3, padding=1, stride=2,groups=bottleneck_channels)
        self.bn2 = nn.BatchNorm2d(bottleneck_channels)
        self.group_conv1x1_2 = nn.Conv2d(bottleneck_channels, out_channels,kernel_size=1, groups=groups, stride=1)
        self.bn3 = nn.BatchNorm2d(out_channels)
    def forward(self, x):
        out = self.group_conv1x1_1(x)
        out = F.relu(self.bn1(out))
        out = shuffle_channels(out,groups=self.groups)
        out = self.depthwise_conv(out)
        out = self.bn2(out)
        out = self.group_conv1x1_2(out)
        out = self.bn3(out)
        x = F.avg_pool2d(x, 3, stride=2, padding=1)
        out = F.relu(torch.cat([x,out],1))
        return out
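A quick shape check (my own addition) confirming the summary above: unit A preserves the input shape, while unit B grows the channels and halves the spatial size.

x = torch.randn(1, 240, 14, 14)
print(ShuffleUnitA(240, 240)(x).shape)   # torch.Size([1, 240, 14, 14])
print(ShuffleUnitB(240, 480)(x).shape)   # torch.Size([1, 480, 7, 7])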
class Net(torch.nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = torch.nn.Conv2d(1,24,kernel_size=3,stride=2,padding=1)
        stage2_seq = [ShuffleUnitB(24, 240, groups=3)] + [ShuffleUnitA(240, 240, groups=3) for i in range(3)]
        self.stage2 = nn.Sequential(*stage2_seq)
        stage3_seq = [ShuffleUnitB(240, 480, groups=3)] + [ShuffleUnitA(480, 480, groups=3) for i in range(7)]
        self.stage3 = nn.Sequential(*stage3_seq)
        stage4_seq = [ShuffleUnitB(480, 960, groups=3)] + [ShuffleUnitA(960, 960, groups=3) for i in range(3)]
        self.stage4 = nn.Sequential(*stage4_seq)
        self.fc = torch.nn.Linear(960,10)
    
    def forward(self,x):
        batch_size = x.size(0)
        x = self.conv1(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = F.avg_pool2d(x, 1)   # the feature map is already 1x1 for a 28x28 input
        x = x.view(batch_size,-1)   #flatten
        x = self.fc(x)
        return x

model = Net()

6. DenseNet

(1) Network core

Dense block: the channel count grows (each layer appends growth_rate channels), while the spatial size of the feature map stays unchanged.

Transition layer: reduces the channel count and halves the spatial size of the feature map.
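With the defaults used in the code below (a 64-channel stem, growth_rate=32, block_layers=[6, 12]), the channel arithmetic works out as:

dense block 1: 64 + 6 * 32 = 256 channels (spatial size unchanged)
transition: 256 -> 128 channels, spatial size halved
dense block 2: 128 + 12 * 32 = 512 channels, which is what the final classifier sees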


(2) Code

def conv_block(in_channel, out_channel):
    layer = nn.Sequential(
        nn.BatchNorm2d(in_channel),
        nn.ReLU(True),
        nn.Conv2d(in_channel, out_channel, 3, padding=1, bias=False)
    )
    return layer
class dense_block(nn.Module):
    def __init__(self, in_channel, growth_rate, num_layers):
        super(dense_block, self).__init__()
        block = []
        channel = in_channel
        for i in range(num_layers):
            block.append(conv_block(channel, growth_rate))
            channel += growth_rate
        self.net = nn.Sequential(*block)
    def forward(self, x):
        for layer in self.net:
            out = layer(x)
            x = torch.cat((out, x), dim=1)   # append this layer's output to all previous feature maps
        return x
def transition(in_channel, out_channel):
    trans_layer = nn.Sequential(
        nn.BatchNorm2d(in_channel),
        nn.ReLU(True),
        nn.Conv2d(in_channel, out_channel, 1),
        nn.AvgPool2d(kernel_size = 2, stride = 2)
    )
    return trans_layer
class Net(nn.Module):
    def __init__(self, in_channel=1, num_classes=10, growth_rate=32, block_layers=[6, 12]):
        super(Net, self).__init__()
        # initial convolution + pooling stem
        self.block1 = nn.Sequential(
            nn.Conv2d(in_channels=in_channel,out_channels=64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(3, 2, padding=1)  
            )
        self.channels = 64
        block = []
        # add dense_block modules in a loop; after every block except the last, append a transition layer
        for i, layers in enumerate(block_layers):
            block.append(dense_block(self.channels, growth_rate, layers))
            self.channels += layers * growth_rate
            
            if i != len(block_layers) - 1:
                # after each dense_block, add a transition module that halves the channel count (channels // 2)
                block.append(transition(self.channels, self.channels // 2))
                self.channels = self.channels // 2
        self.block2 = nn.Sequential(*block) # unpack the block list into block2
        # final BN / ReLU / pooling layers
        self.block2.add_module('bn', nn.BatchNorm2d(self.channels))
        self.block2.add_module('relu', nn.ReLU(True))
        self.block2.add_module('avg_pool', nn.AvgPool2d(3))
        self.classifier = nn.Linear(self.channels, num_classes)   # 512 with the defaults above
    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = x.view(x.shape[0], -1)
        x = self.classifier(x)
        return x
model = Net()
batch_size = 64
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,),(0.3081,))
])
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=0.01,momentum=0.5)
def train(epoch):
    running_loss = 0.0
    for batch_idx,data in enumerate(train_loader,0):
        inputs,target = data
        #print(inputs.shape)
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(inputs)
        loss = criterion(outputs,target)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        if batch_idx % 300 == 299:
            print('[%d,%5d] loss:%.3f' %(epoch +1 ,batch_idx+1,running_loss/300))
            running_loss = 0.0
for epoch in range(10):
    train(epoch)

Origin blog.csdn.net/zhang2362167998/article/details/128769883