1 Introduction
Many papers cite NCNet. This note walks through its method by reading the released code alongside the paper.
2. Method
2.1 Process
Fig 1: A CNN extracts the feature maps $f^A, f^B$ from the images $I_A, I_B$. Every pair of features $f^A_{ij}, f^B_{kl}$ is a candidate match in the 4D matching space $(i,j,k,l)$, and the matching scores are stored in the 4D tensor $c$. These matches are processed with soft mutual nearest neighbor filtering and the neighborhood consensus network to obtain the final set of correspondences.
The method is divided into 5 parts:
(1) dense feature extraction and matching
(2) the neighborhood consensus network
(3) soft mutual nearest neighbor filtering
(4) extraction of correspondences from the output 4D filtered match tensor
(5) weakly supervised training loss
class ImMatchNet(nn.Module):
    # Only the forward pass is shown here; it is used at eval time and for
    # training with strong supervision.
    def forward(self, tnf_batch):
        """Return the filtered 4D correlation between the source and target images.

        ``tnf_batch`` is indexed with ``'source_image'`` and ``'target_image'``.
        When ``self.relocalization_k_size > 1`` the result is the tuple
        ``(corr4d, (max_i, max_j, max_k, max_l))``; otherwise it is ``corr4d`` alone.
        """
        # dense features of both images
        feature_A = self.FeatureExtraction(tnf_batch['source_image'])
        feature_B = self.FeatureExtraction(tnf_batch['target_image'])
        if self.half_precision:
            feature_A, feature_B = feature_A.half(), feature_B.half()

        # all-pairs feature correlation
        corr4d = self.FeatureCorrelation(feature_A, feature_B)

        # optional 4D max-pooling for relocalization (default k_size is 0)
        relocalize = self.relocalization_k_size > 1
        if relocalize:
            pooled = maxpool4d(corr4d, k_size=self.relocalization_k_size)
            corr4d, max_i, max_j, max_k, max_l = pooled

        # match processing: mutual filtering -> neighbourhood consensus -> mutual filtering
        corr4d = MutualMatching(self.NeighConsensus(MutualMatching(corr4d)))

        if not relocalize:
            return corr4d
        return (corr4d, (max_i, max_j, max_k, max_l))
Look directly at the forward part of the network:
(1) Feature extraction: the extracted features are L2-normalized at the end, which is the routine step when similarities are computed afterwards.
(2) Feature correlation: self.FeatureCorrelation(feature_A,feature_B) computes the similarity between the features of the two images.
(3) Match processing: MutualMatching(corr4d) and self.NeighConsensus(corr4d).
2.2 FeatureCorrelation
# Correlation layer as configured by the network: '4D' output shape, and no normalization of the correlation tensor.
self.FeatureCorrelation = FeatureCorrelation(shape='4D',normalization=False)
class FeatureCorrelation(torch.nn.Module):
# Correlates two feature maps by taking the dot product between every pair of spatial positions.
# NOTE(review): this excerpt has lost its indentation and the body of the '3D' branch is omitted,
# so the snippet is not runnable as shown; only the '4D' path is visible here.
def __init__(self,shape='3D',normalization=True):
# shape: selects the branch taken in forward ('3D' or '4D').
# normalization: when true, forward applies ReLU followed by featureL2Norm to the result.
super(FeatureCorrelation, self).__init__()
self.normalization = normalization
self.shape=shape
self.ReLU = nn.ReLU()
def forward(self, feature_A, feature_B):
if self.shape=='3D':
# NOTE(review): '3D' branch body not shown in this excerpt.
elif self.shape=='4D':
b,c,hA,wA = feature_A.size()
# b and c are rebound from feature_B here; the view calls below use these values for both maps
b,c,hB,wB = feature_B.size()
# reshape features for matrix multiplication
feature_A = feature_A.view(b,c,hA*wA).transpose(1,2) # size [b,hA*wA,c]
feature_B = feature_B.view(b,c,hB*wB) # size [b,c,hB*wB]
# perform matrix mult.
feature_mul = torch.bmm(feature_A,feature_B) # size [b,hA*wA,hB*wB]
# indexed [batch,row_A,col_A,row_B,col_B]
correlation_tensor = feature_mul.view(b,hA,wA,hB,wB).unsqueeze(1) # size [b,1,hA,wA,hB,wB]
if self.normalization:
# featureL2Norm is defined elsewhere; presumably L2-normalizes the tensor - TODO confirm
correlation_tensor = featureL2Norm(self.ReLU(correlation_tensor))
return correlation_tensor
The similarity is computed between every feature of one image and every feature of the other, i.e. the dot product of the feature vectors, which finally gives a tensor of size $(b, 1, h_A, w_A, h_B, w_B)$.
2.3 MutualMatching
def MutualMatching(corr4d):
    """Soft mutual nearest-neighbour filtering of a correlation tensor.

    ``corr4d`` is a 6D tensor ``[batch, 1, fs1, fs2, fs3, fs4]`` (the views
    below require the second dimension to be 1). Each score is multiplied by
    its ratio to the best score of its A position (max over all B positions)
    and by its ratio to the best score of its B position (max over all A
    positions). The result has the same shape as the input.
    """
    n, _, fs1, fs2, fs3, fs4 = corr4d.size()
    eps = 1e-5

    # best score reached by each B position over all A positions: [n, 1, fs3, fs4]
    best_over_A = corr4d.view(n, fs1 * fs2, fs3, fs4).max(dim=1, keepdim=True)[0]
    # best score reached by each A position over all B positions: [n, fs1, fs2, 1]
    best_over_B = corr4d.view(n, fs1, fs2, fs3 * fs4).max(dim=3, keepdim=True)[0]

    # broadcast the maxima over the 6D tensor instead of reshaping the ratios afterwards
    ratio_B = corr4d / (best_over_A.view(n, 1, 1, 1, fs3, fs4) + eps)
    ratio_A = corr4d / (best_over_B.view(n, 1, fs1, fs2, 1, 1) + eps)

    # the parentheses are important for a symmetric output
    return corr4d * (ratio_A * ratio_B)
This corresponds to the soft mutual nearest neighbor filtering in the paper. The input is the similarity tensor between all features of the two images, of size $(b,1,i,j,k,l)$: each pair of features $(f^A_{ij}, f^B_{kl})$ has a similarity value $c_{ijkl}$. The role of MutualMatching is to multiply each similarity by two coefficients, $c_{ijkl} \times s_A \times s_B$, where $s_A = c_{ijkl} / c_{ij_{max}}$ and $c_{ij_{max}}$ is the highest similarity between $f^A_{ij}$ and all $f^B$; likewise $s_B = c_{ijkl} / c_{kl_{max}}$, where $c_{kl_{max}}$ is the highest similarity between $f^B_{kl}$ and all $f^A$ (the code adds a small eps to both denominators). This set of operations is not complicated, but it is hard to describe concisely. In layman's terms, each match score is multiplied by its ratio to the best competing match in each direction. In matching, two points $p_A, p_B$ are usually considered successfully matched only if, among all points in B, the best match of $p_A$ is $p_B$, and among all points in A, the best match of $p_B$ is $p_A$. MutualMatching is a bit like a differentiable approximation of this operation.
2.4 NeighConsensus
# Neighbourhood-consensus module as constructed by the network; the trailing comments give the values noted by the author.
self.NeighConsensus = NeighConsensus(use_cuda=self.use_cuda,
kernel_sizes=ncons_kernel_sizes, # [3,3,3]
channels=ncons_channels) # [10,10,1]
class NeighConsensus(torch.nn.Module):
    """Stack of ``Conv4d`` + ``ReLU`` layers applied to a correlation tensor.

    Args:
        use_cuda: if true, the convolution stack is moved to the GPU.
        kernel_sizes: kernel size of each ``Conv4d`` layer; ``None`` means
            ``[3, 3, 3]``.
        channels: output channels of each layer; ``None`` means
            ``[10, 10, 1]``. The first layer always takes 1 input channel, and
            each later layer takes the previous layer's output channels.
        symmetric_mode: if true, ``forward`` also runs the stack on the
            A-B swapped input and adds the swapped-back result.
    """

    def __init__(self, use_cuda=True, kernel_sizes=None, channels=None, symmetric_mode=True):
        super(NeighConsensus, self).__init__()
        # Build the defaults per instance: list defaults in the signature would
        # be shared (and stored on self) across every instance.
        if kernel_sizes is None:
            kernel_sizes = [3, 3, 3]
        if channels is None:
            channels = [10, 10, 1]
        self.symmetric_mode = symmetric_mode
        self.kernel_sizes = kernel_sizes
        self.channels = channels

        layers = []
        ch_in = 1  # the correlation tensor enters with a single channel
        for i, k_size in enumerate(kernel_sizes):
            ch_out = channels[i]
            layers.append(Conv4d(in_channels=ch_in, out_channels=ch_out, kernel_size=k_size, bias=True))
            layers.append(nn.ReLU(inplace=True))
            ch_in = ch_out
        self.conv = nn.Sequential(*layers)
        if use_cuda:
            self.conv.cuda()

    def forward(self, x):
        """Apply the convolution stack, symmetrically when ``symmetric_mode`` is set."""
        if not self.symmetric_mode:
            return self.conv(x)
        # Apply the network to the input and to its "transpose" (A-B swapped to
        # B-A ordering), then transpose the second result back so the two can
        # be added. Because of the ReLU layers between the linear layers this
        # differs from convolving once with filters + filters^T.
        swapped = x.permute(0, 1, 4, 5, 2, 3)
        return self.conv(x) + self.conv(swapped).permute(0, 1, 4, 5, 2, 3)
This corresponds to the neighborhood consensus network in the paper, i.e. the NC-Net part. The network structure is Conv4d(1,10,3) + ReLU + Conv4d(10,10,3) + ReLU + Conv4d(10,1,3) + ReLU;
the 4D convolution here is a custom convolution operation.
Take a look at what the paper says here. The role of NC-Net is to further process and filter the obtained correlation map (similarity scores). The difficulty in finding the correct matches in this correlation map is that only about $hw$ correct matches have to be found among $(hw)^2$ scores, so most of the entries are incorrect matches. The paper only explains the reasons behind the network design: why convolution is used, the general function of each layer, why the matches obtained after swapping the order of the two images should be the same, etc. Although it does not say how the 4D convolution is computed, it should work in the same way as a 2D convolution.
The role of NC-Net is to filter matches based on local information, whereas soft mutual nearest neighbor filtering filters based on global information. In effect, this filtering amounts to multiplying the original matching score by a weight, and the weight is derived from either global or local information.
2.5 loss
def weak_loss(model, batch, normalization="softmax", alpha=30):
    """Weakly supervised loss: negative-pair score minus positive-pair score.

    The positive score is computed from ``model(batch)``. The negative score is
    computed from the same batch with ``"source_image"`` rolled by one along
    the batch dimension, so every source is paired with another sample's target.

    Args:
        model: callable taking the batch mapping and returning a correlation
            tensor ``[batch, 1, fs1, fs2, fs3, fs4]``.
        batch: mapping containing ``"source_image"``, plus whatever else
            ``model`` reads. It is not modified; the negative pass receives a
            shallow copy.
        normalization: ``"softmax"``, ``"l1"`` or ``None``; applied over the
            candidate-match dimension before taking the best score.
        alpha: unused; kept for interface compatibility.

    Returns:
        Scalar tensor ``score_neg - score_pos``.

    Raises:
        ValueError: if ``normalization`` is not one of the supported values.
    """
    normalize = _select_normalizer(normalization)
    b = batch["source_image"].size(0)

    # positive pairs
    score_pos = _mean_match_score(model(batch), normalize)

    # negative pairs: roll the source images by one within the batch. A shallow
    # copy is used so the caller's batch keeps its original source images.
    rolled_source = batch["source_image"][np.roll(np.arange(b), -1), :]
    negative_batch = {**batch, "source_image": rolled_source}
    score_neg = _mean_match_score(model(negative_batch), normalize)

    return score_neg - score_pos


def _select_normalizer(normalization):
    """Map the ``normalization`` option to a callable acting on dim 1."""
    if normalization is None:
        return lambda x: x
    if normalization == "softmax":
        return lambda x: torch.nn.functional.softmax(x, 1)
    if normalization == "l1":
        return lambda x: x / (torch.sum(x, dim=1, keepdim=True) + 0.0001)
    raise ValueError(
        "normalization must be None, 'softmax' or 'l1', got {!r}".format(normalization)
    )


def _mean_match_score(corr4d, normalize):
    """Average, over both matching directions, of the best normalized score per position."""
    batch_size, _, fs1, fs2, fs3, fs4 = corr4d.size()
    # candidates are the A positions, for each B position: [batch_idx, k_A, i_B, j_B]
    nc_B_Avec = corr4d.view(batch_size, fs1 * fs2, fs3, fs4)
    # candidates are the B positions, for each A position: [batch_idx, k_B, i_A, j_A]
    nc_A_Bvec = corr4d.view(batch_size, fs1, fs2, fs3 * fs4).permute(0, 3, 1, 2)

    scores_B, _ = torch.max(normalize(nc_B_Avec), dim=1)
    scores_A, _ = torch.max(normalize(nc_A_Bvec), dim=1)
    # Averaged separately so the A and B feature maps may differ in size.
    return (torch.mean(scores_A) + torch.mean(scores_B)) / 2
This is a weakly supervised loss designed by the authors that requires no correspondence labels. When a pair of images can be matched, softmax gives each feature a classification-like score over its candidate matches, and the best score is maximized (that is, the feature can be classified very well: there is a clear match); when a pair of images cannot be matched, the score is minimized (that is, the feature cannot be classified: there is no match).