The attention mechanism originated in the study of human vision. In cognitive science, a bottleneck in information processing forces humans to concentrate selectively on one part of the available information while ignoring the rest of what is visible; this selection process is commonly called the attention mechanism. Different regions of the human retina process information with different ability, i.e. visual acuity, and only the fovea has the highest acuity. To make rational use of its limited visual processing resources, the visual system must select a specific region of the visual field and focus on it; when reading, for example, only a few of the words to be read are attended to and processed at any given moment. In summary, the attention mechanism has two main aspects: deciding which part of the input to attend to, and allocating the limited information-processing resources to the important parts. (Summarized from an encyclopedia entry.)
Attention has many applications in vision tasks, and every year the top conferences accept plenty of papers on it. These are my notes on how to add attention to a backbone. ResNet is used as the example here, but other backbones can incorporate attention in the same way. Without further ado, straight to the code:
import torch.nn as nn
import math
import torch
# A 3x3 convolution changes the feature-map size when stride != 1; otherwise it preserves it.
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)

# conv1x1 only changes the number of output channels.
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
                     bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    # groups, base_width, dilation and norm_layer are accepted here for
    # compatibility with ResNet._make_layer below, which passes them in.
    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        # The stride of the first conv3x3 is variable; with stride=2 the
        # feature-map size is halved.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        # The second conv3x3 has stride=1: it can only change the number of
        # channels, never the feature-map size.
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride
        # Squeeze step of the SE branch; the fixed window sizes assume a
        # 224x224 input (stage feature maps of 56/28/14/7).
        if planes == 64:
            self.globalAvgPool = nn.AvgPool2d(56, stride=1)
        elif planes == 128:
            self.globalAvgPool = nn.AvgPool2d(28, stride=1)
        elif planes == 256:
            self.globalAvgPool = nn.AvgPool2d(14, stride=1)
        elif planes == 512:
            self.globalAvgPool = nn.AvgPool2d(7, stride=1)
        self.fc1 = nn.Linear(in_features=planes, out_features=round(planes / 16))
        self.fc2 = nn.Linear(in_features=round(planes / 16), out_features=planes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        # SE branch: squeeze with global average pooling, excite with two FC
        # layers, then rescale each channel of the original activation.
        original_out = out
        out = self.globalAvgPool(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        out = out.view(out.size(0), out.size(1), 1, 1)
        out = out * original_out

        out += residual
        out = self.relu(out)
        return out
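# A quick, hypothetical sanity check of the SE-augmented BasicBlock. The
# 56x56 input size is an assumption matching the fixed pooling window chosen
# above for planes=64 (i.e. an ImageNet-style 224x224 image).
demo_block = BasicBlock(inplanes=64, planes=64)
demo_out = demo_block(torch.randn(2, 64, 56, 56))
print(demo_out.shape)  # torch.Size([2, 64, 56, 56])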
class Bottleneck(nn.Module):
    expansion = 4

    # As in BasicBlock, the extra keyword arguments are accepted for
    # compatibility with ResNet._make_layer below.
    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = norm_layer(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = norm_layer(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = norm_layer(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        # Squeeze step of the SE branch; the fixed window sizes again assume a
        # 224x224 input.
        if planes == 64:
            self.globalAvgPool = nn.AvgPool2d(56, stride=1)
        elif planes == 128:
            self.globalAvgPool = nn.AvgPool2d(28, stride=1)
        elif planes == 256:
            self.globalAvgPool = nn.AvgPool2d(14, stride=1)
        elif planes == 512:
            self.globalAvgPool = nn.AvgPool2d(7, stride=1)
        self.fc1 = nn.Linear(in_features=planes * 4, out_features=round(planes / 4))
        self.fc2 = nn.Linear(in_features=round(planes / 4), out_features=planes * 4)
        self.sigmoid = nn.Sigmoid()
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        # SE branch, structurally identical to the one in BasicBlock but
        # operating on planes * 4 channels.
        original_out = out
        out = self.globalAvgPool(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        out = out.view(out.size(0), out.size(1), 1, 1)
        out = out * original_out

        out += residual
        out = self.relu(out)
        return out
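# A Bottleneck outputs planes * expansion channels, so whenever the input
# channel count differs, the identity branch needs a 1x1 projection. A
# hypothetical sketch of the first block of stage 1 (64 -> 256 channels):
demo_downsample = nn.Sequential(
    conv1x1(64, 64 * Bottleneck.expansion),
    nn.BatchNorm2d(64 * Bottleneck.expansion),
)
demo_bottleneck = Bottleneck(inplanes=64, planes=64, downsample=demo_downsample)
print(demo_bottleneck(torch.randn(2, 64, 56, 56)).shape)  # torch.Size([2, 256, 56, 56])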
# Channel attention module (as in CBAM)
class ChannelAttention(nn.Module):
    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        # Shared MLP: reduce the channel count by `ratio`, then restore it.
        self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x))))
        max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))
        out = avg_out + max_out
        return self.sigmoid(out)

# Spatial attention module (as in CBAM)
class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()
        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        padding = 3 if kernel_size == 7 else 1
        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Aggregate over the channel dimension with mean and max, then learn a
        # single-channel spatial weight map from the 2-channel result.
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        x = torch.cat([avg_out, max_out], dim=1)
        x = self.conv1(x)
        return self.sigmoid(x)
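# As in CBAM, the two modules are applied sequentially: each produces a weight
# map that multiplicatively rescales the features. A hypothetical stand-alone
# example (shapes chosen for illustration only):
demo_ca = ChannelAttention(in_planes=64)
demo_sa = SpatialAttention(kernel_size=7)
demo_feat = torch.randn(2, 64, 56, 56)
demo_feat = demo_ca(demo_feat) * demo_feat  # (2, 64, 1, 1) weights, broadcast over H and W
demo_feat = demo_sa(demo_feat) * demo_feat  # (2, 1, 56, 56) weights, broadcast over channels
print(demo_feat.shape)  # torch.Size([2, 64, 56, 56])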
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # Each element in the tuple indicates whether to replace the
            # 2x2 stride with a dilated convolution instead.
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        # Attention added after the network's first convolutional layer.
        self.ca = ChannelAttention(self.inplanes)
        self.sa = SpatialAttention()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        # Attention added after the network's last convolutional stage.
        self.ca1 = ChannelAttention(self.inplanes)
        self.sa1 = SpatialAttention()
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-init the last BN in each residual branch so the block starts
        # out as an identity mapping.
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    # _make_layer builds one stage (layer1..layer4) out of `blocks` blocks.
    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        # The block argument selects the basic module: BasicBlock or Bottleneck.
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        # The remaining blocks of the stage use stride 1 and no downsample.
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.ca(x) * x
        x = self.sa(x) * x
        x = self.maxpool(x)

        # layer1-layer4 each consist of several basic blocks (BasicBlock or
        # Bottleneck); the block argument decides which, and layers is a
        # four-element list giving the number of blocks per stage.
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.ca1(x) * x
        x = self.sa1(x) * x

        x = self.avgpool(x)
        x = x.reshape(x.size(0), -1)
        x = self.fc(x)
        return x
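# A minimal usage sketch, assuming a 224x224 input (required by the fixed SE
# pooling windows above). The layer configurations mirror torchvision's
# resnet18 / resnet50.
model = ResNet(BasicBlock, [2, 2, 2, 2], num_classes=1000)    # resnet18-style
# model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=1000)  # resnet50-style
logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000])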