CondLaneNet: a Top-to-down Lane Detection Framework Based on Conditional Convolution
Paper: https://arxiv.org/pdf/2105.05003.pdf
Code: https://github.com/aliyun/conditional-lane-detection
Interpretation of the paper:
1. Summary
This work addresses the lane line detection task; its main novelty is the detection head. Unlike conventional bbox-based object detection, it detects key points and uses them to construct a mask, so the output form is similar to instance segmentation.
2. Network structure
- The backbone uses ordinary CNN, such as ResNet;
- The neck uses TransformerFPN. Because a lane line is relatively long and therefore needs global attention, a Transformer-style self-attention operation is applied to the features output by the backbone before the basic FPN constructs the pyramid.
- The head is divided into two parts:
- Proposal head is used to detect lane line instances and generate dynamic convolution kernel parameters for each instance;
- The Conditional shape head takes the dynamic convolution kernel parameters generated by the Proposal head and applies conditional convolution to determine the point set of each lane line. The points in each set are then connected to obtain the final lane line result.
Code analysis:
The code is developed on top of the mmdetection framework (v2.0.0). In config/condlanenet/ there are three folders, corresponding to the authors' configurations for the three datasets CurveLanes, CULane, and TuSimple. The biggest difference between them is that RIM (the Recurrent Instance Module) is designed for CurveLanes. Below I focus on the modules they have in common:
backbone
ResNet is used; depending on the desired model size, you can choose anything from ResNet-18 to ResNet-101.
neck
TransConvFPN is used here, in mmdet/models/necks/trans_fpn.py
The main difference from FPN is that there is an additional transformer operation. The motivation is that the lane line is relatively slender and needs a non-local structure such as self-attention.
That is, there is an extra transformer module between resnet and FPN.
# TransConvFPN (unimportant parts of the code are omitted)
def forward(self, src):
    """Apply the transformer step to one level, then build the FPN laterals.

    Args:
        src: sequence of feature maps coming from the backbone; it must hold
            at least ``len(self.in_channels)`` entries.

    Note:
        This is an excerpt. Everything after the lateral convolutions is
        omitted, so the snippet as shown has no ``return`` statement.
    """
    assert len(src) >= len(self.in_channels)
    src = list(src)

    # Optionally run self-attention on the level selected by trans_idx.
    selected = src[self.trans_idx]
    trans_feat = self.trans_head(selected) if self.attention else selected

    # The last level is replaced by the (possibly attended) feature.
    inputs = src[:-1]
    inputs.append(trans_feat)

    # Drop leading levels so that exactly len(self.in_channels) remain.
    extra = len(inputs) - len(self.in_channels)
    if extra > 0:
        del inputs[:extra]

    # From here on the code is the same as in FPN.
    # build laterals
    laterals = [
        lateral_conv(inputs[i + self.start_level])
        for i, lateral_conv in enumerate(self.lateral_convs)
    ]
    # ... (rest of forward omitted)
# Inside TransConvFPN.__init__: the transformer module is only built when
# attention is enabled.
if self.attention:
    self.trans_head = TransConvEncoderModule(**trans_cfg)
class TransConvEncoderModule(nn.Module):
    """Stack of ``AttentionLayer`` blocks with optional position embeddings.

    Args:
        in_dim: kept for interface compatibility; unused by the active code.
        attn_in_dims: input channel count of each attention layer.
        attn_out_dims: output channel count of each attention layer.
        strides: stride handed to each attention layer.
        ratios: ratio handed to each attention layer.
        downscale: kept for interface compatibility; it only influenced a
            convolution that was disabled in the original code.
        pos_shape: shape passed to ``build_position_encoding``. When ``None``
            no position embedding is created and the layers run as a plain
            ``nn.Sequential``.
    """

    def __init__(self, in_dim, attn_in_dims, attn_out_dims, strides, ratios,
                 downscale=True, pos_shape=None):
        super(TransConvEncoderModule, self).__init__()
        attn_layers = [
            AttentionLayer(dim_in, dim_out, ratio, stride)
            for dim_in, dim_out, stride, ratio in zip(
                attn_in_dims, attn_out_dims, strides, ratios)
        ]
        # With position embeddings each layer needs an extra argument, so the
        # layers are iterated manually instead of being chained by Sequential.
        if pos_shape is not None:
            self.attn_layers = nn.ModuleList(attn_layers)
        else:
            self.attn_layers = nn.Sequential(*attn_layers)

        self.pos_shape = pos_shape
        self.pos_embeds = []
        if pos_shape is not None:
            # Only move to the GPU when one exists; an unconditional .cuda()
            # makes the module unusable on CPU-only machines. forward() moves
            # the embedding to the input's device anyway.
            use_cuda = torch.cuda.is_available()
            for dim in attn_out_dims:
                pos_embed = build_position_encoding(dim, pos_shape)
                if use_cuda:
                    pos_embed = pos_embed.cuda()
                self.pos_embeds.append(pos_embed)

    def forward(self, src):
        """Run ``src`` through every attention layer and return the result."""
        if self.pos_shape is None:
            return self.attn_layers(src)
        for layer, pos in zip(self.attn_layers, self.pos_embeds):
            src = layer(src, pos.to(src.device))
        return src
class AttentionLayer(nn.Module):
    """Position attention module: conv -> spatial self-attention -> conv.

    Args:
        in_dim: number of input channels.
        out_dim: number of channels after ``pre_conv`` (and of the output).
        ratio: query/key projections use ``out_dim // ratio`` channels.
        stride: stride of ``pre_conv``.
    """

    def __init__(self, in_dim, out_dim, ratio=4, stride=1):
        super(AttentionLayer, self).__init__()
        # Attribute name (sic) kept unchanged for backward compatibility.
        self.chanel_in = in_dim
        norm_cfg = dict(type='BN', requires_grad=True)
        act_cfg = dict(type='ReLU')
        self.pre_conv = ConvModule(
            in_dim,
            out_dim,
            kernel_size=3,
            stride=stride,
            padding=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            inplace=False)
        self.query_conv = nn.Conv2d(
            in_channels=out_dim, out_channels=out_dim // ratio, kernel_size=1)
        self.key_conv = nn.Conv2d(
            in_channels=out_dim, out_channels=out_dim // ratio, kernel_size=1)
        self.value_conv = nn.Conv2d(
            in_channels=out_dim, out_channels=out_dim, kernel_size=1)
        self.final_conv = ConvModule(
            out_dim,
            out_dim,
            kernel_size=3,
            padding=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.softmax = nn.Softmax(dim=-1)
        # Initialised to zero, so the attention term contributes nothing
        # until gamma is learned.
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x, pos=None):
        """Apply spatial self-attention to a feature map.

        Args:
            x: input feature maps (B x C x H x W).
            pos: optional position embedding added to the features after
                ``pre_conv``; it must broadcast against them.

        Returns:
            ``final_conv(gamma * attention_output + features)``. Only this
            tensor is returned; the attention map itself is not.
        """
        x = self.pre_conv(x)
        m_batchsize, _, height, width = x.size()
        if pos is not None:
            # Out-of-place add: an in-place `+=` would overwrite the output of
            # pre_conv, which may alias the caller's tensor or be needed by
            # autograd for the backward pass.
            x = x + pos

        num_positions = width * height
        # (B, HW, C') and (B, C', HW) -> pairwise energies (B, HW, HW)
        proj_query = self.query_conv(x).view(
            m_batchsize, -1, num_positions).permute(0, 2, 1)
        proj_key = self.key_conv(x).view(m_batchsize, -1, num_positions)
        attention = self.softmax(torch.bmm(proj_query, proj_key))

        # Aggregate the values with the transposed attention weights.
        proj_value = self.value_conv(x).view(m_batchsize, -1, num_positions)
        out = torch.bmm(proj_value, attention.permute(0, 2, 1))
        out = out.view(m_batchsize, -1, height, width)

        # Residual connection scaled by the learnable gamma.
        return self.final_conv(self.gamma * out + x)
head
CondLaneHead is used, in mmdet/models/dense_heads/condlanenet_head.py
This part deserves a close look, because it is very different from the heads used in general detection tasks:
First of all, the forward method of the CondLaneHead class directly calls forward_test, so it is necessary to see what function of the head is called after the output of the neck from the model.
# mmdet/models/detectors/condlanenet.py
def forward(self, img, img_metas=None, return_loss=True, **kwargs):
    """Dispatch to inference, training or testing.

    Without ``img_metas`` the call goes to ``test_inference``; otherwise
    ``return_loss`` selects ``forward_train`` or ``forward_test``.
    """
    ...  # (omitted in this excerpt)
    if img_metas is None:
        return self.test_inference(img)
    if return_loss:
        return self.forward_train(img, img_metas, **kwargs)
    return self.forward_test(img, img_metas, **kwargs)
def forward_train(self, img, img_metas, **kwargs):
    """Training path of the detector (excerpt).

    The code that produces ``output``, ``poses`` and ``num_ins`` is omitted;
    what matters here is that the head's ``forward_train`` is called directly.
    """
    ...  # (omitted in this excerpt)
    if self.head:
        outputs = self.bbox_head.forward_train(output, poses, num_ins)
    ...  # (omitted in this excerpt)
def forward_test(self,
                 img,
                 img_metas,
                 benchmark=False,
                 hack_seeds=None,
                 **kwargs):
    """Testing path of the detector (excerpt).

    The code that produces ``output`` is omitted; what matters here is that
    the head's ``forward_test`` is called directly, with the heat-map
    threshold taken from ``kwargs['thr']``.
    """
    ...  # (omitted in this excerpt)
    if self.head:
        seeds, hm = self.bbox_head.forward_test(output, hack_seeds,
                                                kwargs['thr'])
    ...  # (omitted in this excerpt)
So in fact the head's own forward is never used by the detector; it calls the head's forward_train and forward_test directly, so those are the two methods to read.
forward_train
# mmdet/models/dense_heads/condlanenet_head.py
def forward_train(self, inputs, pos, num_ins):
    """Training forward pass of the head.

    Args:
        inputs: multi-level feature maps output by backbone + neck.
        pos: flat indices of the positive positions; each one selects a row
            of the per-location generated parameters.
        num_ins: number of instances, passed on to the mask and regression
            heads; ``sum(num_ins)`` is used to reshape the masks.

    Returns:
        ``(hm, regs, masks, feat_range, [mask_branch, reg_branch])``.
        ``regs``, ``masks`` and ``feat_range`` are ``None`` when ``pos`` is
        empty.
    """
    # x_list is the multi-level feature map output by backbone + neck.
    x_list = list(inputs)
    # hm_idx selects the level used to generate the heat map;
    # mask_idx does the same for the mask.
    f_hm = x_list[self.hm_idx]
    f_mask = x_list[self.mask_idx]
    m_batchsize = f_hm.size()[0]

    z = self.ctnet_head(f_hm)
    hm, params = z['hm'], z['params']
    h_hm, w_hm = hm.size()[2:]
    h_mask, w_mask = f_mask.size()[2:]

    # The regression branch shares the mask branch's features.
    mask_branch = self.mask_branch(f_mask)
    reg_branch = mask_branch

    # Rearrange to one row of num_gen_params per (batch, class, y, x).
    params = params.view(m_batchsize, self.num_classes, -1, h_hm, w_hm)
    params = params.permute(0, 1, 3, 4,
                            2).contiguous().view(-1, self.num_gen_params)

    # Repeat every position index across the parameter columns so it can be
    # used as a gather index along dim 0.
    pos_tensor = torch.from_numpy(np.array(pos)).long().to(
        params.device).unsqueeze(1)
    pos_tensor = pos_tensor.expand(-1, self.num_gen_params)
    mask_pos_tensor = pos_tensor[:, :self.num_mask_params]
    reg_pos_tensor = pos_tensor[:, self.num_mask_params:]

    if pos_tensor.size()[0] == 0:
        # No positive positions: nothing to predict. `regs` must be set here
        # too, otherwise the return statement raises UnboundLocalError.
        return hm, None, None, None, [mask_branch, reg_branch]

    mask_params = params[:, :self.num_mask_params].gather(0, mask_pos_tensor)
    masks = self.mask_head(mask_branch, mask_params, num_ins)
    if self.regression:
        reg_params = params[:, self.num_mask_params:].gather(
            0, reg_pos_tensor)
        regs = self.reg_head(reg_branch, reg_params, num_ins)
    else:
        regs = masks

    # Swap H and W before handing the masks to the MLP.
    feat_range = masks.permute(0, 1, 3,
                               2).reshape(sum(num_ins), w_mask, h_mask)
    feat_range = self.mlp(feat_range)
    return hm, regs, masks, feat_range, [mask_branch, reg_branch]
forward_test
# mmdet/models/dense_heads/condlanenet_head.py
def forward_test(
        self,
        inputs,
        hack_seeds=None,
        hm_thr=0.3,
):
    """Inference forward pass of the head.

    Args:
        inputs: multi-level feature maps output by backbone + neck.
        hack_seeds: optional list of seeds that replaces the ones decoded
            from the heat map.
        hm_thr: threshold passed to ``ctdet_decode``.

    Returns:
        ``(seeds, hm)``. Every seed dict gains the keys ``'reg'``, ``'mask'``
        and ``'range'``. When there are no seeds, ``([], hm)`` is returned.

    Note:
        The flat position index has no batch term and results are read from
        index 0 of the first dimension, so this appears to assume a batch of
        one at test time (to be confirmed against the callers).
    """

    def parse_pos(seeds, h, w, device):
        # Flat (class, row, col) index of every seed into the parameter rows.
        poses = []
        for seed in seeds:
            c, r = seed['coord']
            label = seed['id_class'] - 1
            poses.append(label * h * w + r * w + c)
        # np.int64 instead of np.long: that alias was removed in NumPy 1.24.
        return torch.from_numpy(np.array(
            poses, dtype=np.int64)).to(device).unsqueeze(1)

    x_list = list(inputs)
    f_hm = x_list[self.hm_idx]
    f_mask = x_list[self.mask_idx]
    # The batch size is taken from the mask-level feature map.
    m_batchsize = f_mask.size()[0]

    z = self.ctnet_head(f_hm)
    h_hm, w_hm = f_hm.size()[2:]
    h_mask, w_mask = f_mask.size()[2:]
    hm, params = z['hm'], z['params']
    hm = torch.clamp(hm.sigmoid(), min=1e-4, max=1 - 1e-4)

    # The regression branch shares the mask branch's features.
    mask_branch = self.mask_branch(f_mask)
    reg_branch = mask_branch

    # Rearrange to one row of num_gen_params per (batch, class, y, x).
    params = params.view(m_batchsize, self.num_classes, -1, h_hm, w_hm)
    params = params.permute(0, 1, 3, 4,
                            2).contiguous().view(-1, self.num_gen_params)

    _, _, h, w = hm.size()
    # Externally supplied seeds take precedence, so decoding is only done
    # when none are given.
    if hack_seeds is not None:
        seeds = hack_seeds
    else:
        seeds = self.ctdet_decode(hm, thr=hm_thr)

    pos_tensor = parse_pos(seeds, h, w, hm.device)
    pos_tensor = pos_tensor.expand(-1, self.num_gen_params)
    num_ins = [pos_tensor.size()[0]]
    if num_ins[0] == 0:
        return [], hm

    mask_pos_tensor = pos_tensor[:, :self.num_mask_params]
    mask_params = params[:, :self.num_mask_params].gather(0, mask_pos_tensor)
    masks = self.mask_head(mask_branch, mask_params, num_ins)
    if self.regression:
        reg_pos_tensor = pos_tensor[:, self.num_mask_params:]
        reg_params = params[:, self.num_mask_params:].gather(
            0, reg_pos_tensor)
        regs = self.reg_head(reg_branch, reg_params, num_ins)
    else:
        regs = masks

    # Swap H and W before handing the masks to the MLP.
    feat_range = masks.permute(0, 1, 3,
                               2).reshape(sum(num_ins), w_mask, h_mask)
    feat_range = self.mlp(feat_range)

    # Attach the per-instance outputs to their seeds.
    for i, seed in enumerate(seeds):
        seed['reg'] = regs[0, i:i + 1, :, :]
        seed['mask'] = masks[0, i:i + 1, :, :]
        seed['range'] = feat_range[i:i + 1]
    return seeds, hm
It can be found that the operation of this part is similar to that described in the paper.
(I'll take a look at it when I have time, I'm very busy recently)