The author of the paper said that the effect on yolo has improved, but it has not been tested yet, and the details are not clear, so let's roll out the code first. The code structure of yolov7 is similar, and can be modified by referring to tiny's yaml
First upload the configuration file yolov7-tiny-AFPN.yaml (hight-level is done twice)
# parameters
nc: 80 # number of classes
depth_multiple: 1.0 # model depth multiple
width_multiple: 1.0 # layer channel multiple
activation: nn.LeakyReLU(0.1)
# anchors
anchors:
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# yolov7-tiny backbone
backbone:
# [from, number, module, args] c2, k=1, s=1, p=None, g=1, act=True
[[-1, 1, Conv, [32, 3, 2, None, 1]], # 0-P1/2
[-1, 1, Conv, [64, 3, 2, None, 1]], # 1-P2/4
[-1, 1, Conv, [32, 1, 1, None, 1]],
[-2, 1, Conv, [32, 1, 1, None, 1]],
[-1, 1, Conv, [32, 3, 1, None, 1]],
[-1, 1, Conv, [32, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [64, 1, 1, None, 1]], # 7
[-1, 1, MP, []], # 8-P3/8
[-1, 1, Conv, [64, 1, 1, None, 1]],
[-2, 1, Conv, [64, 1, 1, None, 1]],
[-1, 1, Conv, [64, 3, 1, None, 1]],
[-1, 1, Conv, [64, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [128, 1, 1, None, 1]], # 14
[-1, 1, MP, []], # 15-P4/16
[-1, 1, Conv, [128, 1, 1, None, 1]],
[-2, 1, Conv, [128, 1, 1, None, 1]],
[-1, 1, Conv, [128, 3, 1, None, 1]],
[-1, 1, Conv, [128, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [256, 1, 1, None, 1]], # 21
[-1, 1, MP, []], # 22-P5/32
[-1, 1, Conv, [256, 1, 1, None, 1]],
[-2, 1, Conv, [256, 1, 1, None, 1]],
[-1, 1, Conv, [256, 3, 1, None, 1]],
[-1, 1, Conv, [256, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [512, 1, 1, None, 1]], # 28
]
# yolov7-tiny head
head:
[[14, 1, Conv, [64, 1, 1, None, 1]], # 29
[21, 1, Conv, [128, 1, 1, None, 1]], # 30
[28, 1, Conv, [256, 1, 1, None, 1]], # 31
[ 29, 1, Conv, [ 64, 1, 1, None, 1 ] ], # 32
[ 30, 1, Conv, [ 128, 1, 1, None, 1 ] ], # 33
[ 31, 1, Conv, [ 256, 1, 1, None, 1 ] ], # 34
[ [ 33, 34 ], 1, ASFF_2, [ 128, 0 ] ], # 35
[ [ 33, 34 ], 1, ASFF_2, [ 256, 1 ] ], # 36
[35, 1, Conv, [64, 1, 1, None, 1]],
[35, 1, Conv, [64, 1, 1, None, 1]],
[-1, 1, Conv, [64, 3, 1, None, 1]],
[-1, 1, Conv, [64, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [128, 1, 1, None, 1]], # 42
[ 36, 1, Conv, [ 128, 1, 1, None, 1 ] ],
[ 36, 1, Conv, [ 128, 1, 1, None, 1 ] ],
[ -1, 1, Conv, [ 128, 3, 1, None, 1 ] ],
[ -1, 1, Conv, [ 128, 3, 1, None, 1 ] ],
[ [ -1, -2, -3, -4 ], 1, Concat, [ 1 ] ],
[ -1, 1, Conv, [ 256, 1, 1, None, 1 ] ], # 48
[[32, 42, 48], 1, ASFF_3, [64, 0]], # 49
[[32, 42, 48], 1, ASFF_3, [128, 1]], # 50
[[32, 42, 48], 1, ASFF_3, [256, 2]], # 51
[ 49, 1, Conv, [ 32, 1, 1, None, 1 ] ],
[ 49, 1, Conv, [ 32, 1, 1, None, 1 ] ],
[ -1, 1, Conv, [ 32, 3, 1, None, 1 ] ],
[ -1, 1, Conv, [ 32, 3, 1, None, 1 ] ],
[ [ -1, -2, -3, -4 ], 1, Concat, [ 1 ] ],
[ -1, 1, Conv, [ 64, 1, 1, None, 1 ] ], # 57
[ 50, 1, Conv, [ 64, 1, 1, None, 1 ] ],
[ 50, 1, Conv, [ 64, 1, 1, None, 1 ] ],
[ -1, 1, Conv, [ 64, 3, 1, None, 1 ] ],
[ -1, 1, Conv, [ 64, 3, 1, None, 1 ] ],
[ [ -1, -2, -3, -4 ], 1, Concat, [ 1 ] ],
[ -1, 1, Conv, [ 128, 1, 1, None, 1 ] ], # 63
[51, 1, Conv, [128, 1, 1, None, 1]],
[51, 1, Conv, [128, 1, 1, None, 1]],
[-1, 1, Conv, [128, 3, 1, None, 1]],
[-1, 1, Conv, [128, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [256, 1, 1, None, 1]], # 69
[57, 1, Conv, [64, 3, 1, None, 1]],
[63, 1, Conv, [128, 3, 1, None, 1]],
[69, 1, Conv, [256, 3, 1, None, 1]],
[[70,71,72], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
]
yolov7-tiny-AFPN.yaml (do it twice at low-level, the method of the paper)
# parameters
nc: 80 # number of classes
depth_multiple: 1.0 # model depth multiple
width_multiple: 1.0 # layer channel multiple
activation: nn.LeakyReLU(0.1)
# anchors
anchors:
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# yolov7-tiny backbone
backbone:
# [from, number, module, args] c2, k=1, s=1, p=None, g=1, act=True
[[-1, 1, Conv, [32, 3, 2, None, 1]], # 0-P1/2
[-1, 1, Conv, [64, 3, 2, None, 1]], # 1-P2/4
[-1, 1, Conv, [32, 1, 1, None, 1]],
[-2, 1, Conv, [32, 1, 1, None, 1]],
[-1, 1, Conv, [32, 3, 1, None, 1]],
[-1, 1, Conv, [32, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [64, 1, 1, None, 1]], # 7
[-1, 1, MP, []], # 8-P3/8
[-1, 1, Conv, [64, 1, 1, None, 1]],
[-2, 1, Conv, [64, 1, 1, None, 1]],
[-1, 1, Conv, [64, 3, 1, None, 1]],
[-1, 1, Conv, [64, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [128, 1, 1, None, 1]], # 14
[-1, 1, MP, []], # 15-P4/16
[-1, 1, Conv, [128, 1, 1, None, 1]],
[-2, 1, Conv, [128, 1, 1, None, 1]],
[-1, 1, Conv, [128, 3, 1, None, 1]],
[-1, 1, Conv, [128, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [256, 1, 1, None, 1]], # 21
[-1, 1, MP, []], # 22-P5/32
[-1, 1, Conv, [256, 1, 1, None, 1]],
[-2, 1, Conv, [256, 1, 1, None, 1]],
[-1, 1, Conv, [256, 3, 1, None, 1]],
[-1, 1, Conv, [256, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [512, 1, 1, None, 1]], # 28
]
# yolov7-tiny head
head:
[[14, 1, Conv, [64, 1, 1, None, 1]], # 29
[21, 1, Conv, [128, 1, 1, None, 1]], # 30
[28, 1, Conv, [256, 1, 1, None, 1]], # 31
[ 29, 1, Conv, [ 64, 1, 1, None, 1 ] ], # 32
[ 30, 1, Conv, [ 128, 1, 1, None, 1 ] ], # 33
[ 31, 1, Conv, [ 256, 1, 1, None, 1 ] ], # 34
[ [ 32, 33 ], 1, ASFF_2, [ 64, 0 ] ], # 35
[ [ 32, 33 ], 1, ASFF_2, [ 128, 1 ] ], # 36
[35, 1, Conv, [32, 1, 1, None, 1]],
[35, 1, Conv, [32, 1, 1, None, 1]],
[-1, 1, Conv, [32, 3, 1, None, 1]],
[-1, 1, Conv, [32, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [64, 1, 1, None, 1]], # 42
[ 36, 1, Conv, [ 64, 1, 1, None, 1 ] ],
[ 36, 1, Conv, [ 64, 1, 1, None, 1 ] ],
[ -1, 1, Conv, [ 64, 3, 1, None, 1 ] ],
[ -1, 1, Conv, [ 64, 3, 1, None, 1 ] ],
[ [ -1, -2, -3, -4 ], 1, Concat, [ 1 ] ],
[ -1, 1, Conv, [ 128, 1, 1, None, 1 ] ], # 48
[[42, 48, 34], 1, ASFF_3, [64, 0]], # 49
[[42, 48, 34], 1, ASFF_3, [128, 1]], # 50
[[42, 48, 34], 1, ASFF_3, [256, 2]], # 51
[ 49, 1, Conv, [ 32, 1, 1, None, 1 ] ],
[ 49, 1, Conv, [ 32, 1, 1, None, 1 ] ],
[ -1, 1, Conv, [ 32, 3, 1, None, 1 ] ],
[ -1, 1, Conv, [ 32, 3, 1, None, 1 ] ],
[ [ -1, -2, -3, -4 ], 1, Concat, [ 1 ] ],
[ -1, 1, Conv, [ 64, 1, 1, None, 1 ] ], # 57
[ 50, 1, Conv, [ 64, 1, 1, None, 1 ] ],
[ 50, 1, Conv, [ 64, 1, 1, None, 1 ] ],
[ -1, 1, Conv, [ 64, 3, 1, None, 1 ] ],
[ -1, 1, Conv, [ 64, 3, 1, None, 1 ] ],
[ [ -1, -2, -3, -4 ], 1, Concat, [ 1 ] ],
[ -1, 1, Conv, [ 128, 1, 1, None, 1 ] ], # 63
[51, 1, Conv, [128, 1, 1, None, 1]],
[51, 1, Conv, [128, 1, 1, None, 1]],
[-1, 1, Conv, [128, 3, 1, None, 1]],
[-1, 1, Conv, [128, 3, 1, None, 1]],
[[-1, -2, -3, -4], 1, Concat, [1]],
[-1, 1, Conv, [256, 1, 1, None, 1]], # 69
[57, 1, Conv, [64, 3, 1, None, 1]],
[63, 1, Conv, [128, 3, 1, None, 1]],
[69, 1, Conv, [256, 3, 1, None, 1]],
[[70,71,72], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
]
The difference between the two configuration files is that one is to do ASFF twice on the output of the last layer in the backbone, and the other is to do ASFF twice on the output of the penultimate layer, which is the practice of the paper, but the parameters of the paper will be measured A little less, a little more than 5.9M in total
Modify in models/common.py to ensure that parameters can be passed in from yaml
class Conv(nn.Module):
# Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, activation)
default_act = nn.SiLU() # default activation
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
super().__init__()
self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
self.bn = nn.BatchNorm2d(c2)
self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
def forward(self, x):
x = self.act(self.bn(self.conv(x)))
return x
def forward_fuse(self, x):
return self.act(self.conv(x))
Add in models/common.py
class Upsample(nn.Module):
def __init__(self, in_channels, out_channels, scale_factor=2):
super(Upsample, self).__init__()
self.upsample = nn.Sequential(
Conv(in_channels, out_channels, 1),
nn.Upsample(scale_factor=scale_factor, mode='bilinear')
)
# carafe
# from mmcv.ops import CARAFEPack
# self.upsample = nn.Sequential(
# BasicConv(in_channels, out_channels, 1),
# CARAFEPack(out_channels, scale_factor=scale_factor)
# )
def forward(self, x):
x = self.upsample(x)
return x
class Downsample(nn.Module):
def __init__(self, in_channels, out_channels, scale_factor=2):
super(Downsample, self).__init__()
self.downsample = nn.Sequential(
Conv(in_channels, out_channels, scale_factor, scale_factor, 0)
)
def forward(self, x):
x = self.downsample(x)
return x
class ASFF_2(nn.Module):
def __init__(self, inter_dim=512, level=0, channel=[64, 128]):
super(ASFF_2, self).__init__()
self.inter_dim = inter_dim
compress_c = 8
self.weight_level_1 = Conv(self.inter_dim, compress_c, 1, 1)
self.weight_level_2 = Conv(self.inter_dim, compress_c, 1, 1)
self.weight_levels = nn.Conv2d(compress_c * 2, 2, kernel_size=1, stride=1, padding=0)
self.conv = Conv(self.inter_dim, self.inter_dim, 3, 1)
self.upsample = Upsample(channel[1], channel[0])
self.downsample = Downsample(channel[0], channel[1])
self.level = level
def forward(self, x):
input1, input2 = x
if self.level == 0:
input2 = self.upsample(input2)
elif self.level == 1:
input1 = self.downsample(input1)
level_1_weight_v = self.weight_level_1(input1)
level_2_weight_v = self.weight_level_2(input2)
levels_weight_v = torch.cat((level_1_weight_v, level_2_weight_v), 1)
levels_weight = self.weight_levels(levels_weight_v)
levels_weight = F.softmax(levels_weight, dim=1)
fused_out_reduced = input1 * levels_weight[:, 0:1, :, :] + \
input2 * levels_weight[:, 1:2, :, :]
out = self.conv(fused_out_reduced)
return out
class ASFF_3(nn.Module):
def __init__(self, inter_dim=512, level=0, channel=[64, 128, 256]):
super(ASFF_3, self).__init__()
self.inter_dim = inter_dim
compress_c = 8
self.weight_level_1 = Conv(self.inter_dim, compress_c, 1, 1)
self.weight_level_2 = Conv(self.inter_dim, compress_c, 1, 1)
self.weight_level_3 = Conv(self.inter_dim, compress_c, 1, 1)
self.weight_levels = nn.Conv2d(compress_c * 3, 3, kernel_size=1, stride=1, padding=0)
self.conv = Conv(self.inter_dim, self.inter_dim, 3, 1)
self.level = level
if self.level == 0:
self.upsample4x = Upsample(channel[2], channel[0], scale_factor=4)
self.upsample2x = Upsample(channel[1], channel[0], scale_factor=2)
elif self.level == 1:
self.upsample2x1 = Upsample(channel[2], channel[1], scale_factor=2)
self.downsample2x1 = Downsample(channel[0], channel[1], scale_factor=2)
elif self.level == 2:
self.downsample2x = Downsample(channel[1], channel[2], scale_factor=2)
self.downsample4x = Downsample(channel[0], channel[2], scale_factor=4)
def forward(self, x):
input1, input2, input3 = x
if self.level == 0:
input2 = self.upsample2x(input2)
input3 = self.upsample4x(input3)
elif self.level == 1:
input3 = self.upsample2x1(input3)
input1 = self.downsample2x1(input1)
elif self.level == 2:
input1 = self.downsample4x(input1)
input2 = self.downsample2x(input2)
level_1_weight_v = self.weight_level_1(input1)
level_2_weight_v = self.weight_level_2(input2)
level_3_weight_v = self.weight_level_3(input3)
levels_weight_v = torch.cat((level_1_weight_v, level_2_weight_v, level_3_weight_v), 1)
levels_weight = self.weight_levels(levels_weight_v)
levels_weight = F.softmax(levels_weight, dim=1)
fused_out_reduced = input1 * levels_weight[:, 0:1, :, :] + \
input2 * levels_weight[:, 1:2, :, :] + \
input3 * levels_weight[:, 2:, :, :]
out = self.conv(fused_out_reduced)
return out
Modify in models/yolo.py:
def parse_model(d, ch): # model_dict, input_channels(3)
logger.info('\n%3s%18s%3s%10s %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
# anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
anchors, nc, gd, gw, act = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'], d.get('activation')
if act:
Conv.default_act = eval(act) # redefine default activation, i.e. Conv.default_act = nn.SiLU()
na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors # number of anchors
no = na * (nc + 5) # number of outputs = anchors * (classes + 5)
layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out
for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args
m = eval(m) if isinstance(m, str) else m # eval strings
for j, a in enumerate(args):
try:
args[j] = eval(a) if isinstance(a, str) else a # eval strings
except:
pass
n = max(round(n * gd), 1) if n > 1 else n # depth gain
if m in [nn.Conv2d, Conv, RobustConv, RobustConv2, DWConv, GhostConv, RepConv, RepConv_OREPA, DownC,
SPP, SPPF, SPPCSPC, GhostSPPCSPC, MixConv2d, Focus, Stem, GhostStem, CrossConv,
Bottleneck, BottleneckCSPA, BottleneckCSPB, BottleneckCSPC,
RepBottleneck, RepBottleneckCSPA, RepBottleneckCSPB, RepBottleneckCSPC,
Res, ResCSPA, ResCSPB, ResCSPC,
RepRes, RepResCSPA, RepResCSPB, RepResCSPC,
ResX, ResXCSPA, ResXCSPB, ResXCSPC,
RepResX, RepResXCSPA, RepResXCSPB, RepResXCSPC,
Ghost, GhostCSPA, GhostCSPB, GhostCSPC,
SwinTransformerBlock, STCSPA, STCSPB, STCSPC,
SwinTransformer2Block, ST2CSPA, ST2CSPB, ST2CSPC]:
c1, c2 = ch[f], args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args = [c1, c2, *args[1:]]
if m in [DownC, SPPCSPC, GhostSPPCSPC,
BottleneckCSPA, BottleneckCSPB, BottleneckCSPC,
RepBottleneckCSPA, RepBottleneckCSPB, RepBottleneckCSPC,
ResCSPA, ResCSPB, ResCSPC,
RepResCSPA, RepResCSPB, RepResCSPC,
ResXCSPA, ResXCSPB, ResXCSPC,
RepResXCSPA, RepResXCSPB, RepResXCSPC,
GhostCSPA, GhostCSPB, GhostCSPC,
STCSPA, STCSPB, STCSPC,
ST2CSPA, ST2CSPB, ST2CSPC]:
args.insert(2, n) # number of repeats
n = 1
elif m is nn.BatchNorm2d:
args = [ch[f]]
elif m is Concat:
c2 = sum([ch[x] for x in f])
elif m is Chuncat:
c2 = sum([ch[x] for x in f])
elif m is Shortcut:
c2 = ch[f[0]]
elif m is Foldcut:
c2 = ch[f] // 2
elif m in [Detect, IDetect, IAuxDetect, IBin, IKeypoint]:
args.append([ch[x] for x in f])
if isinstance(args[1], int): # number of anchors
args[1] = [list(range(args[1] * 2))] * len(f)
elif m is ReOrg:
c2 = ch[f] * 4
elif m is Contract:
c2 = ch[f] * args[0] ** 2
elif m is Expand:
c2 = ch[f] // args[0] ** 2
elif m in {ASFF_2, ASFF_3}:
c2 = args[0]
if c2 != no: # if not output
c2 = make_divisible(c2 * gw, 8)
args[0] = c2
args.append([ch[x] for x in f])
else:
c2 = ch[f]
m_ = nn.Sequential(*[m(*args) for _ in range(n)]) if n > 1 else m(*args) # module
t = str(m)[8:-2].replace('__main__.', '') # module type
np = sum([x.numel() for x in m_.parameters()]) # number params
m_.i, m_.f, m_.type, m_.np = i, f, t, np # attach index, 'from' index, type, number params
logger.info('%3s%18s%3s%10.0f %-40s%-30s' % (i, f, n, np, t, args)) # print
save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist
layers.append(m_)
if i == 0:
ch = []
ch.append(c2)
return nn.Sequential(*layers), sorted(save)
Configure --cfg in yolo.py to yolov7-tiny-AFPN.yaml, click to run, and you can see the following figure:
PS: However, this 14.5GFLOPs is the result of the yolov5 code framework. It is calculated to be 315.8GFLOPs on v7, which is a bit outrageous. After looking around, I didn't find the problem. . . It is said in the comment area that it is normal, it should be a problem with my v7