[FCOS] FCOS: Fully Convolutional One-Stage Object Detection (Code Notes)

FCOS: Fully Convolutional One-Stage Object Detection

If this post helps you, please consider giving it a like~

FCOS network structure and key points from the paper

FCOS network structure

Note that the "share weight" mentioned in the paper means that the feature maps output by the 5 FPN levels all pass through the same shared head. The shared head consists of 4 convs for regression and 4 convs for classification; "shared weights" means the 5 per-level heads share the regression conv weights and the classification conv weights across levels, while within the same FPN level the regression branch and the classification branch do not share weights with each other.
See the code section below for details.

			feature = self.share_tower(feature) # torch.Size([1, 256, 52, 76])  torch.Size([1, 256, 26, 38])
            cls_tower = self.cls_tower(feature) # torch.Size([1, 256, 52, 76])
            bbox_tower = self.bbox_tower(feature) # torch.Size([1, 256, 52, 76])

self.cls_tower and self.bbox_tower use the same classification convs and regression convs for every level, i.e. the weights are shared across FPN levels.
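As a minimal sketch of what "shared across levels" means (my own toy code, not from the repo): the same module object, and therefore the same weights, is applied to every level's feature map.

import torch
from torch import nn

# one classification tower, reused for all 5 FPN levels (toy channel/spatial sizes assumed)
shared_cls_tower = nn.Sequential(
    nn.Conv2d(256, 256, kernel_size=3, padding=1),
    nn.ReLU(),
)
fpn_feats = [torch.randn(1, 256, s, s) for s in (52, 26, 13, 7, 4)]
cls_feats = [shared_cls_tower(f) for f in fpn_feats]  # the same weights process every level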
(Figure: FCOS network architecture)

Formula for mapping each point on the 5 FPN levels back to the input image: (s/2 + xs, s/2 + ys).

Constraint for selecting positive examples on each FPN level, based on max(l*, t*, r*, b*): a location is assigned to level i only if m_{i-1} <= max(l*, t*, r*, b*) <= m_i; otherwise it is treated as a negative sample on that level.

Centerness formula: centerness* = sqrt( min(l*, r*)/max(l*, r*) * min(t*, b*)/max(t*, b*) ).
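As a quick worked example of the mapping: with stride s = 8, the feature-map location (x, y) = (3, 2) maps back to image coordinates (8/2 + 3*8, 8/2 + 2*8) = (28, 20), i.e. the center of the stride-8 cell that this feature-map position covers.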

1. AdelaiDet/adet/modeling/fcos/fcos.py

import math
from typing import List, Dict
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.layers import ShapeSpec, NaiveSyncBatchNorm
from detectron2.modeling.proposal_generator.build import PROPOSAL_GENERATOR_REGISTRY

from adet.layers import DFConv2d, NaiveGroupNorm
from adet.utils.comm import compute_locations
from .fcos_outputs import FCOSOutputs
import pdb

__all__ = ["FCOS"]

INF = 100000000


class Scale(nn.Module):
    def __init__(self, init_value=1.0):
        super(Scale, self).__init__()
        self.scale = nn.Parameter(torch.FloatTensor([init_value]))

    def forward(self, input):
        return input * self.scale
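# Note: Scale is just a learnable scalar multiplier. FCOSHead creates one Scale per FPN level
# (self.scales below), so the shared bbox_pred conv can still produce regression magnitudes
# suited to each level's stride.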


class ModuleListDial(nn.ModuleList):
    def __init__(self, modules=None):
        super(ModuleListDial, self).__init__(modules)
        self.cur_position = 0

    def forward(self, x):
        result = self[self.cur_position](x)
        self.cur_position += 1
        if self.cur_position >= len(self):
            self.cur_position = 0
        return result
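# Note: ModuleListDial cycles through its submodules one call at a time: the first forward() uses
# self[0], the next self[1], ..., wrapping around after the last. With norm == "BN" or "SyncBN",
# FCOSHead wraps 5 per-level BatchNorm layers in a ModuleListDial, so the convs stay shared across
# FPN levels while each level keeps its own BN statistics (the head is run once per level, in order).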

# Instantiated from detectron2/detectron2/modeling/proposal_generator/build.py via PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape)
@PROPOSAL_GENERATOR_REGISTRY.register()
class FCOS(nn.Module):
    """
    Implement FCOS (https://arxiv.org/abs/1904.01355).
    """
    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        super().__init__()
        self.in_features = cfg.MODEL.FCOS.IN_FEATURES # ["p3", "p4", "p5", "p6", "p7"]
        self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES # [8, 16, 32, 64, 128]
        self.yield_proposal = cfg.MODEL.FCOS.YIELD_PROPOSAL # False
        # build the FCOSHead
        self.fcos_head = FCOSHead(cfg, [input_shape[f] for f in self.in_features])
        pdb.set_trace()
        self.in_channels_to_top_module = self.fcos_head.in_channels_to_top_module # 256
        # build FCOSOutputs(cfg)
        self.fcos_outputs = FCOSOutputs(cfg) #FCOSOutputs((loc_loss_func): IOULoss())
        pdb.set_trace()

    def forward_head(self, features, top_module=None):
        features = [features[f] for f in self.in_features]
        pred_class_logits, pred_deltas, pred_centerness, top_feats, bbox_towers = self.fcos_head(
            features, top_module, self.yield_proposal)
        pdb.set_trace()
        return pred_class_logits, pred_deltas, pred_centerness, top_feats, bbox_towers

    def forward(self, images, features, gt_instances=None, top_module=None):
        """
        Arguments:
            images (list[Tensor] or ImageList): images to be processed
            targets (list[BoxList]): ground-truth boxes present in the image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the losses.
                During testing, it returns list[BoxList] contains additional fields
                like `scores`, `labels` and `mask` (for Mask R-CNN models).

        """
        pdb.set_trace()
        features = [features[f] for f in self.in_features] #  len(features) = number of FPN levels = 5; see the detailed comments at the bottom of this file
        # locations (x, y) act as the anchor points (candidate box centers) during training
        locations = self.compute_locations(features) # calls compute_locations; len(locations) = 5
        pdb.set_trace()
        logits_pred, reg_pred, ctrness_pred, top_feats, bbox_towers = self.fcos_head( # runs FCOSHead.forward
            features, top_module, self.yield_proposal
        )

        results = {}
        if self.yield_proposal: # self.yield_proposal: false
            results["features"] = {
                f: b for f, b in zip(self.in_features, bbox_towers)
            }

        if self.training:
            results, losses = self.fcos_outputs.losses( # calls losses() in fcos_outputs.py
                logits_pred, reg_pred, ctrness_pred,
                locations, gt_instances, top_feats
            )
            
            if self.yield_proposal:
                with torch.no_grad():
                    results["proposals"] = self.fcos_outputs.predict_proposals(
                        logits_pred, reg_pred, ctrness_pred,
                        locations, images.image_sizes, top_feats
                    )
            pdb.set_trace()
            return results, losses # len(results) = 2 len(losses) =3 
        else:
            results = self.fcos_outputs.predict_proposals(
                logits_pred, reg_pred, ctrness_pred,
                locations, images.image_sizes, top_feats
            )
        pdb.set_trace()
        return results, {}

    def compute_locations(self, features):
        locations = []
        for level, feature in enumerate(features): #  levels 0 - 4
            h, w = feature.size()[-2:]
            locations_per_level = compute_locations( # calls compute_locations in adet/utils/comm.py
                h, w, self.fpn_strides[level],
                feature.device
            )
            locations.append(locations_per_level)
        pdb.set_trace()  # len(locations) = 5
        # locations[i].shape  ==> (torch.Size([3952, 2]), torch.Size([988, 2]), torch.Size([247, 2]), torch.Size([70, 2]), torch.Size([20, 2]))
        # example: locations[0].shape: torch.Size([3952, 2]), from a 52 x 76 feature map --> 3952 = 52 * 76. This is the fully convolutional design: a prediction is made for every feature-map pixel.
        return locations


class FCOSHead(nn.Module):
    def __init__(self, cfg, input_shape: List[ShapeSpec]):
        """
        Arguments:
            in_channels (int): number of channels of the input feature
        """
        super().__init__()
        # TODO: Implement the sigmoid version first.
        self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES # num_classes 80
        self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES # fpn_strides [8, 16, 32, 64, 128]
        head_configs = {"cls": (cfg.MODEL.FCOS.NUM_CLS_CONVS,
                                cfg.MODEL.FCOS.USE_DEFORMABLE),
                        "bbox": (cfg.MODEL.FCOS.NUM_BOX_CONVS,
                                 cfg.MODEL.FCOS.USE_DEFORMABLE),
                        "share": (cfg.MODEL.FCOS.NUM_SHARE_CONVS,
                                  False)}
        # head_configs = {'cls': (4, False), 'bbox': (4, False), 'share': (0, False)}
        norm = None if cfg.MODEL.FCOS.NORM == "none" else cfg.MODEL.FCOS.NORM # GN
        self.num_levels = len(input_shape) # 5
 
        in_channels = [s.channels for s in input_shape] # 256
        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
        in_channels = in_channels[0] # in_channels 256
        
        # input_shape:
        # [
            # ShapeSpec(channels=256, height=None, width=None, stride=8), 
            # ShapeSpec(channels=256, height=None, width=None, stride=16), 
            # ShapeSpec(channels=256, height=None, width=None, stride=32), 
            # ShapeSpec(channels=256, height=None, width=None, stride=64), 
            # ShapeSpec(channels=256, height=None, width=None, stride=128)
        # ]
        self.in_channels_to_top_module = in_channels # 256

        for head in head_configs:
            tower = []
            num_convs, use_deformable = head_configs[head]
            for i in range(num_convs):
                if use_deformable and i == num_convs - 1:
                    conv_func = DFConv2d
                else:
                    conv_func = nn.Conv2d
                tower.append(conv_func(
                    in_channels, in_channels,
                    kernel_size=3, stride=1,
                    padding=1, bias=True
                ))
                if norm == "GN":
                    tower.append(nn.GroupNorm(32, in_channels))
                elif norm == "NaiveGN":
                    tower.append(NaiveGroupNorm(32, in_channels))
                elif norm == "BN":
                    tower.append(ModuleListDial([
                        nn.BatchNorm2d(in_channels) for _ in range(self.num_levels)
                    ]))
                elif norm == "SyncBN":
                    tower.append(ModuleListDial([
                        NaiveSyncBatchNorm(in_channels) for _ in range(self.num_levels)
                    ]))
                tower.append(nn.ReLU())
            self.add_module('{}_tower'.format(head),
                            nn.Sequential(*tower))

        self.cls_logits = nn.Conv2d(
            in_channels, self.num_classes,
            kernel_size=3, stride=1,
            padding=1
        )
        # cls_logits
            # Conv2d(256, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            # 256 --> 3; in this debug run num_classes = 3
        self.bbox_pred = nn.Conv2d(
            in_channels, 4, kernel_size=3,
            stride=1, padding=1
        )
        # bbox_pred 
            # Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            # 256  --> 4  [left, top, right, bottom] 4d-vector
        self.ctrness = nn.Conv2d(
            in_channels, 1, kernel_size=3,
            stride=1, padding=1
        )
        # ctrness 
            # Conv2d(256, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            # 256  --> 1  h * w * 1
        pdb.set_trace()

        if cfg.MODEL.FCOS.USE_SCALE: # True
            self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
        else:
            self.scales = None

        for modules in [
            self.cls_tower, self.bbox_tower,
            self.share_tower, self.cls_logits,
            self.bbox_pred, self.ctrness
        ]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.normal_(l.weight, std=0.01)
                    torch.nn.init.constant_(l.bias, 0)

        # self.add_module:
            # cls_tower:
                # (Sequential(
                # (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                # (1): GroupNorm(32, 256, eps=1e-05, affine=True)
                # (2): ReLU()
                # (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                # (4): GroupNorm(32, 256, eps=1e-05, affine=True)
                # (5): ReLU()
                # (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                # (7): GroupNorm(32, 256, eps=1e-05, affine=True)
                # (8): ReLU()
                # (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                # (10): GroupNorm(32, 256, eps=1e-05, affine=True)
                # (11): ReLU()

            # bbox_tower:
                # Sequential(
                # (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                # (1): GroupNorm(32, 256, eps=1e-05, affine=True)
                # (2): ReLU()
                # (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                # (4): GroupNorm(32, 256, eps=1e-05, affine=True)
                # (5): ReLU()
                # (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                # (7): GroupNorm(32, 256, eps=1e-05, affine=True)
                # (8): ReLU()
                # (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                # (10): GroupNorm(32, 256, eps=1e-05, affine=True)
                # (11): ReLU()

            # share_tower --> empty (NUM_SHARE_CONVS = 0)
                # Sequential()

            # cls_logits
                # (Conv2d(256, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)), Conv2d(256, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))

            # ctrness
                # Conv2d(256, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

            # bbox_pred
                # Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

            # scales: ModuleList(
                # (0): Scale()
                # (1): Scale()
                # (2): Scale()
                # (3): Scale()
                # (4): Scale()
                # ) 
          
        # initialize the bias for focal loss
        prior_prob = cfg.MODEL.FCOS.PRIOR_PROB # 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob) # -log((1 - 0.01) / 0.01) ≈ -4.595
        torch.nn.init.constant_(self.cls_logits.bias, bias_value) # self.cls_logits.bias: tensor([ 0.0070, -0.0133, -0.0184], requires_grad=True)
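        # With prior_prob = 0.01, bias_value = -log((1 - 0.01) / 0.01) ≈ -4.595, so at initialization
        # sigmoid(cls_logits) ≈ 0.01 for every class: nearly everything starts as background, which keeps
        # the focal loss from being dominated by the huge number of easy negatives early in training.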

        pdb.set_trace()

    def forward(self, x, top_module=None, yield_bbox_towers=False):
        logits = []
        bbox_reg = []
        ctrness = []
        top_feats = []
        bbox_towers = []
        for l, feature in enumerate(x):
            # x --> list  x[tensor()] x[0] --> tensor 
            # len(x) = 5, so the loop runs 5 times (once per FPN level)
            # x[0].shape,                    x[1].shape,                   x[2].shape,                   x[3].shape,                  x[4].shape
            # (torch.Size([1, 256, 52, 76]), torch.Size([1, 256, 26, 38]), torch.Size([1, 256, 13, 19]), torch.Size([1, 256, 7, 10]), torch.Size([1, 256, 4, 5]))
            # feature, torch.Size([1, 256, 52, 76])
            # each feature map goes through the cls_tower and the bbox_tower
            feature = self.share_tower(feature) # torch.Size([1, 256, 52, 76])  torch.Size([1, 256, 26, 38])
            cls_tower = self.cls_tower(feature) # torch.Size([1, 256, 52, 76])
            bbox_tower = self.bbox_tower(feature) # torch.Size([1, 256, 52, 76])
            if yield_bbox_towers: # false
                bbox_towers.append(bbox_tower)

            logits.append(self.cls_logits(cls_tower))
            ctrness.append(self.ctrness(bbox_tower))
            reg = self.bbox_pred(bbox_tower)
            if self.scales is not None:
                reg = self.scales[l](reg)
            # Note that we use relu, as in the improved FCOS, instead of exp.
            bbox_reg.append(F.relu(reg))
            if top_module is not None:
                top_feats.append(top_module(bbox_tower))
        pdb.set_trace()    
        return logits, bbox_reg, ctrness, top_feats, bbox_towers
        # logits[i].shape   len(logits) = 5
        # (torch.Size([1, Num_classes, 52, 76]), torch.Size([1, Num_classes, 26, 38]), torch.Size([1, Num_classes, 13, 19]), torch.Size([1, Num_classes, 7, 10]), torch.Size([1, Num_classes, 4, 5]))

        # bbox_reg[i].shape
        # (torch.Size([1, 4, 52, 76]), torch.Size([1, 4, 26, 38]), torch.Size([1, 4, 13, 19]), torch.Size([1, 4, 7, 10]), torch.Size([1, 4, 4, 5]))

        # ctrness[i].shape
        # (torch.Size([1, 1, 52, 76]), torch.Size([1, 1, 26, 38]), torch.Size([1, 1, 13, 19]), torch.Size([1, 1, 7, 10]), torch.Size([1, 1, 4, 5]))
        # when training, top_feats is empty[]
        '''
        (Pdb) top_feats
        []
        (Pdb) bbox_towers
        []

        '''



'''
images:
    len(images) = 4 # batch_size
    (Pdb) images[0].size()
    torch.Size([3, 796, 1333])
    (Pdb) images[1].size()
    torch.Size([3, 800, 1067])
    (Pdb) images[2].size()
    torch.Size([3, 800, 1208])
    (Pdb) images[3].size()
    torch.Size([3, 800, 1199])

features:
    (Pdb) len(features)
    5
    (Pdb) type(features)
    <class 'dict'>
    (Pdb) len(features)
    5
    (Pdb) type(features['p3'])
    <class 'torch.Tensor'>
    (Pdb) features.keys()
    dict_keys(['p3', 'p4', 'p5', 'p6', 'p7'])
    (Pdb) features['p3'].size()
    torch.Size([4, 256, 100, 168])
    (Pdb) features['p4'].size()
    torch.Size([4, 256, 50, 84])
    (Pdb) features['p5'].size()
    torch.Size([4, 256, 25, 42])
    (Pdb) features['p6'].size()
    torch.Size([4, 256, 13, 21])
    (Pdb) features['p7'].size()
    torch.Size([4, 256, 7, 11])

(Pdb) gt_instances
len(gt_instances) = 4
(Pdb) gt_instances[0]
Instances(num_instances=1, image_height=796, image_width=1333, fields=[gt_boxes: Boxes(tensor([[ 495.7760,  499.0487, 1057.7688,  663.6442]], device='cuda:0')), gt_classes: tensor([71], device='cuda:0')])
(Pdb) gt_instances[1]
Instances(num_instances=15, image_height=800, image_width=1067, fields=[gt_boxes: Boxes(tensor([[ 482.0673,   70.1667,  979.0725,  775.1000],
        [ 162.0007,   86.8333,  625.1620,  775.5667],
        [ 540.8690,   67.5500,  606.1227,  165.7000],
        [ 604.8557,   55.1167,  669.3591,  158.6333],
        [ 433.5355,   60.9333,  516.9781,  201.9333],
        [ 224.0534,   19.8500,  308.5297,  222.3000],
        [  98.1139,   55.5333,  177.6555,  216.3833],
        [ 880.6085,  168.0667,  937.5095,  250.1500],
        [  45.6643,  431.8667,  213.9501,  549.9833],
        [ 901.5150,  322.3500, 1002.3632,  604.6333],
        [ 468.1629,  212.1333,  545.4871,  267.8667],
        [ 578.2806,  183.0000,  673.3270,  259.4833],
        [ 685.3808,   13.5333,  764.7556,  137.0500],
        [  47.1648,  227.1667,  116.8532,  276.7333],
        [  34.7275,   96.5833,   96.3801,  225.6333]], device='cuda:0')), gt_classes: tensor([ 0,  0,  0,  0,  0,  0,  0,  0, 34, 34,  0,  0,  0,  0,  0],
       device='cuda:0')])
(Pdb) gt_instances[2]
Instances(num_instances=16, image_height=800, image_width=1208, fields=[gt_boxes: Boxes(tensor([[ 137.3534,  567.5095,  815.9096,  791.2264],
        [ 747.8652,  569.3019, 1208.0000,  789.4340],
        [ 491.8447,   28.4717,  502.6978,   93.0000],
        [ 202.6798,  276.8491,  564.1549,  600.4528],
        [ 558.0204,  110.3208,  637.0690,  201.0566],
        [ 437.8246,   34.3585,  505.6235,  192.4717],
        [ 613.8527,  315.5660,  926.6115,  656.7548],
        [   0.0000,  506.0189,   70.1018,  593.1509],
        [ 570.1760,  499.3774,  645.6572,  586.9246],
        [ 562.0031,  140.1887,  581.4255,  158.5094],
        [   0.0000,  576.9811,   93.5067,  800.0000],
        [ 844.9960,  503.9434,  892.6176,  536.6603],
        [ 473.8947,  478.2641,  527.3109,  511.4529],
        [ 624.1963,   67.9057,  705.8683,  209.0755],
        [ 935.6904,  328.3773, 1201.7712,  577.7170],
        [  81.3512,  517.3962,  172.8006,  644.1509]], device='cuda:0')), gt_classes: tensor([56, 56, 34,  0,  0,  0,  0,  0,  0, 35, 56,  0,  0,  0,  0,  0],
       device='cuda:0')])
(Pdb) gt_instances[3]
Instances(num_instances=6, image_height=800, image_width=1199, fields=[gt_boxes: Boxes(tensor([[ 799.9578,   48.4871, 1190.1012,  701.7330],
        [ 692.1040,   40.8056,  952.7554,  734.7260],
        [  34.6586,  390.4262,  163.6448,  431.9438],
        [ 880.0472,  120.8431,  969.2791,  159.0820],
        [ 884.8995,  379.5972,  934.4894,  396.9087],
        [   2.0795,  367.7939,  536.0841,  704.4122]], device='cuda:0')), gt_classes: tensor([72,  0, 45, 45, 45, 60], device='cuda:0')])

'''


2. AdelaiDet/adet/modeling/fcos/fcos_outputs.py

import logging
import torch
from torch import nn
import torch.nn.functional as F

from detectron2.layers import cat
from detectron2.structures import Instances, Boxes
from detectron2.utils.comm import get_world_size
from fvcore.nn import sigmoid_focal_loss_jit

from adet.utils.comm import reduce_sum
from adet.layers import ml_nms, IOULoss

import pdb
logger = logging.getLogger(__name__)

INF = 100000000

"""
Shape shorthand in this module:

    N: number of images in the minibatch
    L: number of feature maps per image on which RPN is run
    Hi, Wi: height and width of the i-th feature map
    4: size of the box parameterization

Naming convention:

    labels: refers to the ground-truth class of a position.

    reg_targets: refers to the 4-d (left, top, right, bottom) distances that parameterize the ground-truth box.

    logits_pred: predicted classification scores in [-inf, +inf];
    
    reg_pred: the predicted (left, top, right, bottom), corresponding to reg_targets 

    ctrness_pred: predicted centerness scores

"""

def compute_ctrness_targets(reg_targets):
    if len(reg_targets) == 0:
        return reg_targets.new_zeros(len(reg_targets))
    #print("reg_targets.shape: ", reg_targets.shape)  # torch.Size([155, 4]); 155 is the number of positive samples.
    '''
    reg_targets:
        
    tensor([[3.5625, 7.6250, 5.3750, 8.0000],
            [4.5625, 7.6250, 4.3750, 8.0000],
            [5.5625, 7.6250, 3.3750, 8.0000],
            [1.0312, 3.0625, 3.4375, 4.7500],
            [2.0312, 3.0625, 2.4375, 4.7500],
            [3.0312, 3.0625, 1.4375, 4.7500],
            [1.0312, 4.0625, 3.4375, 3.7500],
            [2.0312, 4.0625, 2.4375, 3.7500],
            [3.0312, 4.0625, 1.4375, 3.7500],
            [1.0312, 5.0625, 3.4375, 2.7500],
            [2.0312, 5.0625, 2.4375, 2.7500],
            [3.0312, 5.0625, 1.4375, 2.7500]], device='cuda:0')
    '''
    pdb.set_trace()
    left_right = reg_targets[:, [0, 2]] # torch.Size([12, 2])
    top_bottom = reg_targets[:, [1, 3]] # torch.Size([12, 2])
    '''
    top_bottom:
        tensor([[7.6250, 8.0000],
            [7.6250, 8.0000],
            [7.6250, 8.0000],
            [3.0625, 4.7500],
            [3.0625, 4.7500],
            [3.0625, 4.7500],
            [4.0625, 3.7500],
            [4.0625, 3.7500],
            [4.0625, 3.7500],
            [5.0625, 2.7500],
            [5.0625, 2.7500],
            [5.0625, 2.7500]], device='cuda:0')

    left_right:
        tensor([[3.5625, 5.3750],
            [4.5625, 4.3750],
            [5.5625, 3.3750],
            [1.0312, 3.4375],
            [2.0312, 2.4375],
            [3.0312, 1.4375],
            [1.0312, 3.4375],
            [2.0312, 2.4375],
            [3.0312, 1.4375],
            [1.0312, 3.4375],
            [2.0312, 2.4375],
            [3.0312, 1.4375]], device='cuda:0')
    '''
    ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
                 (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) # torch.size([12])
    '''
    tensor([0.6317, 0.9140, 0.5783, 0.1934, 0.5373, 0.3058, 0.2769, 0.7692, 0.4377,
        0.1630, 0.4527, 0.2576], device='cuda:0')
    '''
                
    return torch.sqrt(ctrness)
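# Sanity check on the first example row above: min(3.5625, 5.3750)/max(...) = 0.6628 and
# min(7.6250, 8.0)/max(...) = 0.9531, whose product ≈ 0.632 (matching the printed tensor);
# after sqrt the centerness is ≈ 0.795. Centerness equals 1.0 only at the exact box center
# and decays towards 0 at the box border.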

#__init__, _transpose, _get_ground_truth, get_sample_region
class FCOSOutputs(nn.Module):
    def __init__(self, cfg):
        super(FCOSOutputs, self).__init__()

        self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA  # 0.25
        self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA # 2.0
        self.center_sample = cfg.MODEL.FCOS.CENTER_SAMPLE # True
        self.radius = cfg.MODEL.FCOS.POS_RADIUS # POS_RADIUS = 1.5
        self.pre_nms_thresh_train = cfg.MODEL.FCOS.INFERENCE_TH_TRAIN # 0.05
        self.pre_nms_topk_train = cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN # 1000
        self.post_nms_topk_train = cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN  # 100
        self.loc_loss_func = IOULoss(cfg.MODEL.FCOS.LOC_LOSS_TYPE) # IOULoss

        self.pre_nms_thresh_test = cfg.MODEL.FCOS.INFERENCE_TH_TEST # 0.05
        self.pre_nms_topk_test = cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST  # 1000
        self.post_nms_topk_test = cfg.MODEL.FCOS.POST_NMS_TOPK_TEST # 100
        self.nms_thresh = cfg.MODEL.FCOS.NMS_TH # 0.6
        self.thresh_with_ctr = cfg.MODEL.FCOS.THRESH_WITH_CTR # False

        self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES # 80
        self.strides = cfg.MODEL.FCOS.FPN_STRIDES # [8, 16, 32, 64, 128]

        # generate sizes of interest
        soi = []
        prev_size = -1
        for s in cfg.MODEL.FCOS.SIZES_OF_INTEREST: # cfg.MODEL.FCOS.SIZES_OF_INTEREST : [64, 128, 256, 512]
            soi.append([prev_size, s])
            prev_size = s
        soi.append([prev_size, INF])
        self.sizes_of_interest = soi # [[-1, 64], [64, 128], [128, 256], [256, 512], [512, 100000000]]
        pdb.set_trace()
    def _transpose(self, training_targets, num_loc_list):
        '''
        This function is used to transpose image first training targets to level first ones
        :return: level first training targets
        '''
        for im_i in range(len(training_targets)):
            training_targets[im_i] = torch.split( 
                training_targets[im_i], num_loc_list, dim=0
            )
        # type(training_targets[im_i]) --> tuple; len(training_targets) = batch size (number of images), len(training_targets[i]) = 5 (FPN levels)
        targets_level_first = []
        for targets_per_level in zip(*training_targets):
            targets_level_first.append(
                torch.cat(targets_per_level, dim=0)
            )
        pdb.set_trace()
        return targets_level_first # transpose the training targets from image-first to level-first: len(targets_level_first) = 5 (FPN levels), and e.g. targets_level_first[0].size(0) = 33600 = 16800 * 2 = locations on FPN level 0 * batch_size
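        # Toy illustration (my own numbers, not repo output): with batch size 2 and num_loc_list = [3, 2],
        # each per-image tensor of length 5 is split into chunks of length 3 and 2; zip(*training_targets)
        # groups the level-0 chunks of both images together and the level-1 chunks together, and torch.cat
        # then gives level-first tensors of length 6 = 3 * 2 and 4 = 2 * 2 (locations per level * batch size).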

    # _get_ground_truth calls compute_targets_for_locations() and _transpose()
    def _get_ground_truth(self, locations, gt_instances):
        # gt_instances:
            # [Instances(num_instances=4, image_height=400, image_width=600, 
            # fields=
            # [gt_boxes: 
            # Boxes(tensor([[192., 210., 233., 240.],[137., 210., 177., 238.],[197., 193., 224., 238.],[143., 197., 170., 237.]], device='cuda:0')), 
            # gt_classes: tensor([0, 0, 2, 2], device='cuda:0')])]

        num_loc_list = [len(loc) for loc in locations] # num_loc_list [16800, 4200, 1050, 273, 77]

        # compute locations to size ranges
        loc_to_size_range = []
        for l, loc_per_level in enumerate(locations):
            # Following the paper, each FPN level i is given a range (m_{i-1}, m_i] that max(l*, t*, r*, b*) must fall into
            # self.sizes_of_interest [[-1, 64], [64, 128], [128, 256], [256, 512], [512, 100000000]]: conceptually, every FPN location is mapped back onto the input image, and the m_i ranges partition those locations by object size; see the paper.
            loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])  # torch.size([2])
            loc_to_size_range.append(
                loc_to_size_range_per_level[None].expand(num_loc_list[l], -1)
            ) # loc_to_size_range_per_level[None].expand(num_loc_list[l], -1) --> torch.Size([2]) --> torch.Size([1, 2]) --> torch.Size([16800, 2])
            pdb.set_trace()
        loc_to_size_range = torch.cat(loc_to_size_range, dim=0) # [22400, 2]; first dim = total number of locations over all levels, second dim = the (m_{i-1}, m_i) size range of that location's level
        locations = torch.cat(locations, dim=0) # [22400 , 2] 
        pdb.set_trace()
        training_targets = self.compute_targets_for_locations( # calls compute_targets_for_locations()
            locations, gt_instances, loc_to_size_range, num_loc_list
        )

        training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))] # [22400, 2]
        training_targets["im_inds"] = [
            locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))
        ] 

        # transpose im first training_targets to level first ones
        training_targets = {
            k: self._transpose(v, num_loc_list) for k, v in training_targets.items()
        }

        training_targets["fpn_levels"] = [ # [3952, 988, 247, 70, 20]
            loc.new_ones(len(loc), dtype=torch.long) * level
            for level, loc in enumerate(training_targets["locations"]) 
        ]
        #print(training_targets["fpn_levels"])
        # we normalize reg_targets by FPN's strides here
        reg_targets = training_targets["reg_targets"] # [3952, 4] [988, 4] [247, 4] [70, 4] [20, 4]
        for l in range(len(reg_targets)):
            reg_targets[l] = reg_targets[l] / float(self.strides[l]) 
        pdb.set_trace()
        return training_targets # dict_keys(['labels', 'reg_targets', 'target_inds', 'locations', 'im_inds', 'fpn_levels'])

    # returns a True/False sampling mask
    def get_sample_region(self, boxes, strides, num_loc_list, loc_xs, loc_ys, bitmasks=None, radius=1):
        if bitmasks is not None:
            _, h, w = bitmasks.size()

            ys = torch.arange(0, h, dtype=torch.float32, device=bitmasks.device)
            xs = torch.arange(0, w, dtype=torch.float32, device=bitmasks.device)

            m00 = bitmasks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6)
            m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1)
            m01 = (bitmasks * ys[:, None]).sum(dim=-1).sum(dim=-1)
            center_x = m10 / m00
            center_y = m01 / m00
        else:
            # boxes [2, 4]; dim=-1 sums over the last dimension. center_x, center_y are the box centers ((x1+x2)/2, (y1+y2)/2)
            center_x = boxes[..., [0, 2]].sum(dim=-1) * 0.5 # boxes[..., [0, 2]] --> [ 2]
            center_y = boxes[..., [1, 3]].sum(dim=-1) * 0.5
            pdb.set_trace()
        num_gts = boxes.shape[0] # 2, the number of gt instances in this image
        K = len(loc_xs)  # 5277
        boxes = boxes[None].expand(K, num_gts, 4) # [5277, 2, 4]; dim=1 --> num_gts = 2
        center_x = center_x[None].expand(K, num_gts)  # [5277, 2]
        center_y = center_y[None].expand(K, num_gts)  # [5277, 2]
        center_gt = boxes.new_zeros(boxes.shape) # [5277, 2, 4]
        # no gt
        if center_x.numel() == 0 or center_x[..., 0].sum() == 0:
            return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8)
        beg = 0
        for level, num_loc in enumerate(num_loc_list): # [3952, 988, 247, 70, 20]
            end = beg + num_loc # 
            stride = strides[level] * radius  # strides [8, 16, 32, 64, 128]
            xmin = center_x[beg:end] - stride 
            ymin = center_y[beg:end] - stride
            xmax = center_x[beg:end] + stride
            ymax = center_y[beg:end] + stride
            # limit sample region in gt
            center_gt[beg:end, :, 0] = torch.where(xmin > boxes[beg:end, :, 0], xmin, boxes[beg:end, :, 0])
            center_gt[beg:end, :, 1] = torch.where(ymin > boxes[beg:end, :, 1], ymin, boxes[beg:end, :, 1])
            center_gt[beg:end, :, 2] = torch.where(xmax > boxes[beg:end, :, 2], boxes[beg:end, :, 2], xmax)
            center_gt[beg:end, :, 3] = torch.where(ymax > boxes[beg:end, :, 3], boxes[beg:end, :, 3], ymax)
            beg = end
            pdb.set_trace()
        #l, r, t, b
        left = loc_xs[:, None] - center_gt[..., 0] # [5277, num_instance]
        right = center_gt[..., 2] - loc_xs[:, None] # [5277, num_instance]
        top = loc_ys[:, None] - center_gt[..., 1] # [5277, num_instance]
        bottom = center_gt[..., 3] - loc_ys[:, None] # [5277, num_instance]
        center_bbox = torch.stack((left, top, right, bottom), -1) # four [5277, num_instance] tensors --> [5277, num_instance, 4]; torch.stack(..., dim=-1) stacks along a new last dimension
        inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 #bool [5277, 2]
        # inside_gt_bbox_mask marks, for every location, whether it lies inside the (shrunk) gt region, i.e. whether it is a positive sample (True)
        pdb.set_trace()
        return inside_gt_bbox_mask
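        # In short: for each gt box, the allowed positive region is the intersection of the gt box with a
        # square of half-size radius * stride centered on the gt center; a location counts as positive for
        # that gt only if it falls strictly inside this shrunk region (this is the "center sampling" trick).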

    # calls get_sample_region()
    def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list): # targets is gt_instances; len(targets) = batch size
        pdb.set_trace() 
        labels = []
        reg_targets = []
        target_inds = []
        xs, ys = locations[:, 0], locations[:, 1] # xs, ys are the coordinates of every location; xs, ys: [22400]

        num_targets = 0
        for im_i in range(len(targets)):
            targets_per_im = targets[im_i] # Instances(num_instances=2, image_height=400, image_width=600, fields=[gt_boxes: Boxes(tensor([[401.,  65., 523., 339.],[336.,  68., 441., 318.]], device='cuda:0')), gt_classes: tensor([2, 2], device='cuda:0')])
            # read the gt fields from the Instances structure
            bboxes = targets_per_im.gt_boxes.tensor   # [2, 4] 4d vector
            labels_per_im = targets_per_im.gt_classes # gt class labels

            # no gt
            if bboxes.numel() == 0: # torch.numel() returns the total number of elements
                labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
                reg_targets.append(locations.new_zeros((locations.size(0), 4)))
                target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
                continue

            area = targets_per_im.gt_boxes.area() # area =  (ymax - ymin) * (xmax - xmin)  torch.size([1])
            
            # bboxes are (x1, y1, x2, y2); compute the (l, t, r, b) distances from each location to the box sides
            # xs[:, None] -->  size(): [22400, 1]
            # a: [10, 1], b: [1, 2], (a - b).size() --> (10, 2); this is broadcasting
            l = xs[:, None] - bboxes[:, 0][None] # tensor[None] always adds a new leading dim, e.g. [2] --> [1, 2]; bboxes[:, 0][None] --> [1, instance]; xs[:, None] --> [22400, 1]; l --> [22400, instance]
            t = ys[:, None] - bboxes[:, 1][None]
            r = bboxes[:, 2][None] - xs[:, None] # [22400, instance]; 22400 = sum over the 5 FPN levels of h_i * w_i, i.e. the total number of locations
            b = bboxes[:, 3][None] - ys[:, None] # [22400, instance]
            reg_targets_per_im = torch.stack([l, t, r, b], dim=2) #  [22400, instance, 4]
            pdb.set_trace()
            if self.center_sample: # True: use center sampling, a tricky point that differs from the plain assignment rule in the paper
                if targets_per_im.has("gt_bitmasks_full"): # false
                    bitmasks = targets_per_im.gt_bitmasks_full
                else:
                    bitmasks = None
                # calls get_sample_region
                # [22400, instances]; is_in_boxes keeps only locations inside the center-sampling region: a square of half-size radius*stride around the gt center (center_x, center_y), intersected with the gt box
                is_in_boxes = self.get_sample_region( 
                    bboxes, self.strides, num_loc_list, xs, ys,
                    bitmasks=bitmasks, radius=self.radius
                )
            else:
                # without center_sample this is the plain rule from the paper: a location is positive if it lies inside the gt box
                # is_in_boxes is a bool: whether the minimum of [l, t, r, b] is greater than 0
                is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0 # if it is <= 0 the location is a negative sample; note torch.min(dim=...) returns (values, indices), and a.min(dim=2)[0] takes the values
            
            # max_reg_targets_per_im is the maximum of [l, t, r, b] for each location
            max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0] 


            # limit the regression range for each location 
            # following the paper, this restricts which FPN level each location (mapped back onto the input image) is allowed to handle
            is_cared_in_the_level = \
                (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
                (max_reg_targets_per_im <= size_ranges[:, [1]])
            # [22400, instances], bool
 
            locations_to_gt_area = area[None].repeat(len(locations), 1) # [22400, instances]
            locations_to_gt_area[is_in_boxes == 0] = INF # 1. first constraint: locations outside the center-sampling region become negatives
            locations_to_gt_area[is_cared_in_the_level == 0] = INF # 2. second constraint: if max(l, t, r, b) falls outside the size range of FPN level i, the location is a negative for that gt; this splits objects of different sizes across levels

            # if there are still more than one objects for a location,
            # we choose the one with minimal area
            locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)
            # reg_targets_per_im [22400, instances, 4]: for each location, pick the regression target of its assigned (minimum-area) gt
            reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds] # [22400, 4]
            target_inds_per_im = locations_to_gt_inds + num_targets # why add num_targets? Answer: it is used in BlendMask

            num_targets += len(targets_per_im)   # len(targets_per_im) is the 23 in Instances(num_instances=23, ...), i.e. the number of gt instances in this image

            labels_per_im = labels_per_im[locations_to_gt_inds]
            labels_per_im[locations_to_min_area == INF] = self.num_classes # if the min area is still INF, the location is a negative sample and its class is set to 80 (background)

            labels.append(labels_per_im) # [22400]
            reg_targets.append(reg_targets_per_im) # [22400, 4]
            target_inds.append(target_inds_per_im) # [22400]
            pdb.set_trace()
        pdb.set_trace()
        return {
            "labels": labels,
            "reg_targets": reg_targets,
            "target_inds": target_inds
        }
    
    
    def losses(self, logits_pred, reg_pred, ctrness_pred, locations, gt_instances, top_feats=None):
        """
        Return the losses from a set of FCOS predictions and their associated ground-truth.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
        """
        # losses() calls _get_ground_truth()
        training_targets = self._get_ground_truth(locations, gt_instances)

        # Collect all logits and regression predictions over feature maps
        # and images to arrive at the same shape as the labels and targets
        # The final ordering is L, N, H, W from slowest to fastest axis.

        instances = Instances((0, 0))
        instances.labels = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.reshape(-1) for x in training_targets["labels"]
        ], dim=0)
        instances.gt_inds = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.reshape(-1) for x in training_targets["target_inds"]
        ], dim=0)
        instances.im_inds = cat([
            x.reshape(-1) for x in training_targets["im_inds"]
        ], dim=0)
        instances.reg_targets = cat([
            # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
            x.reshape(-1, 4) for x in training_targets["reg_targets"]
        ], dim=0,)
        instances.locations = cat([
            x.reshape(-1, 2) for x in training_targets["locations"]
        ], dim=0)
        instances.fpn_levels = cat([
            x.reshape(-1) for x in training_targets["fpn_levels"]
        ], dim=0)

        instances.logits_pred = cat([
            # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
            x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits_pred
        ], dim=0,)
        instances.reg_pred = cat([
            # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
            x.permute(0, 2, 3, 1).reshape(-1, 4) for x in reg_pred
        ], dim=0,)
        instances.ctrness_pred = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.permute(0, 2, 3, 1).reshape(-1) for x in ctrness_pred
        ], dim=0,)

        if len(top_feats) > 0: # blendmask 
            instances.top_feats = cat([
                # Reshape: (N, -1, Hi, Wi) -> (N*Hi*Wi, -1)   [784, -1]
                x.permute(0, 2, 3, 1).reshape(-1, x.size(1)) for x in top_feats
            ], dim=0,)
        '''
        in blendmask:
            top_feats[0].size()
        torch.Size([2, 784, 96, 148])
            top_feats[1].size()
        torch.Size([2, 784, 48, 74])
            top_feats[2].size()
        torch.Size([2, 784, 24, 37])
            top_feats[3].size()
        torch.Size([2, 784, 12, 19])
            top_feats[4].size()
        torch.Size([2, 784, 6, 10])
        '''
        # instances.top_feats.size() = [37872, 784]; fcos_losses(self, instances) below filters this further, down to [num_pos, 784] (positives only).
        # this is the usual flattened-matrix layout (as in attention):
        # each row has 784 features (channels), and 37872 = sum of h*w over levels * batch size.
        # in other words, the 2-D h x w map is flattened into a single hw axis.
        pdb.set_trace()
        return self.fcos_losses(instances)

    def fcos_losses(self, instances):
        num_classes = instances.logits_pred.size(1)
        assert num_classes == self.num_classes

        labels = instances.labels.flatten() # [36268]
        pdb.set_trace()
        # now filter the H*W locations over all 5 levels (e.g. 36268 of them) down to the positive samples only
        pos_inds = torch.nonzero(labels != num_classes).squeeze(1) # [155]; class 80 (= num_classes) marks negatives and positives are 0 ~ 79, so labels != num_classes gives the indices of positive samples
        num_pos_local = pos_inds.numel() # e.g. 155, the total number of positive locations
        num_gpus = get_world_size()
        total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item() # 155
        num_pos_avg = max(total_num_pos / num_gpus, 1.0)

        # prepare one_hot
        class_target = torch.zeros_like(instances.logits_pred) # [36268, 80]
        class_target[pos_inds, labels[pos_inds]] = 1

        class_loss = sigmoid_focal_loss_jit(
            instances.logits_pred, #eg. [36268, 80]
            class_target, # eg. [36268, 80]
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum", # 求和
        ) / num_pos_avg # divide only by the number of foreground locations; reference: https://blog.csdn.net/bhfs9999/article/details/103754077

        instances = instances[pos_inds] # after this step len(instances) = 155: only the positive locations are kept
        instances.pos_inds = pos_inds # keep pos_inds as well
        pdb.set_trace()
        ctrness_targets = compute_ctrness_targets(instances.reg_targets)
        # debug
        pdb.set_trace()
        ctrness_targets_sum = ctrness_targets.sum()
        loss_denorm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
        instances.gt_ctrs = ctrness_targets
        # IoU loss: FCOS first decides which locations are positive samples (and their classes), then regresses boxes only for those positives
        if pos_inds.numel() > 0: # and the centerness targets are used as weights inside the reg_loss
            # IOU loss
            reg_loss = self.loc_loss_func(
                instances.reg_pred, # eg. [155, 4]
                instances.reg_targets, # eg. [155, 4]
                ctrness_targets # eg. [155]
            ) / loss_denorm # loss_denorm is the (GPU-reduced) ctrness_targets.sum()

            ctrness_loss = F.binary_cross_entropy_with_logits(
                instances.ctrness_pred,
                ctrness_targets,
                reduction="sum"
            ) / num_pos_avg # num_pos_avg is the (averaged) number of foreground locations
        else:
            reg_loss = instances.reg_pred.sum() * 0
            ctrness_loss = instances.ctrness_pred.sum() * 0

        losses = {
            "loss_fcos_cls": class_loss,
            "loss_fcos_loc": reg_loss,
            "loss_fcos_ctr": ctrness_loss
        }
        extras = {
            "instances": instances,
            "loss_denorm": loss_denorm
        }
        return extras, losses

    def predict_proposals(
            self, logits_pred, reg_pred, ctrness_pred,
            locations, image_sizes, top_feats=None
    ):
        if self.training:
            self.pre_nms_thresh = self.pre_nms_thresh_train
            self.pre_nms_topk = self.pre_nms_topk_train
            self.post_nms_topk = self.post_nms_topk_train
        else:
            self.pre_nms_thresh = self.pre_nms_thresh_test
            self.pre_nms_topk = self.pre_nms_topk_test
            self.post_nms_topk = self.post_nms_topk_test

        sampled_boxes = []

        bundle = {
            "l": locations, "o": logits_pred,
            "r": reg_pred, "c": ctrness_pred,
            "s": self.strides,
        }

        if len(top_feats) > 0:
            bundle["t"] = top_feats

        for i, per_bundle in enumerate(zip(*bundle.values())):
            # get per-level bundle
            per_bundle = dict(zip(bundle.keys(), per_bundle))
            # recall that during training, we normalize regression targets with FPN's stride.
            # we denormalize them here.
            l = per_bundle["l"]
            o = per_bundle["o"]
            r = per_bundle["r"] * per_bundle["s"]
            c = per_bundle["c"]
            t = per_bundle["t"] if "t" in bundle else None

            sampled_boxes.append(
                self.forward_for_single_feature_map(
                    l, o, r, c, image_sizes, t
                )
            )

            for per_im_sampled_boxes in sampled_boxes[-1]:
                per_im_sampled_boxes.fpn_levels = l.new_ones(
                    len(per_im_sampled_boxes), dtype=torch.long
                ) * i

        boxlists = list(zip(*sampled_boxes))
        boxlists = [Instances.cat(boxlist) for boxlist in boxlists]
        boxlists = self.select_over_all_levels(boxlists)
        pdb.set_trace()
        return boxlists

    def forward_for_single_feature_map(
            self, locations, logits_pred, reg_pred,
            ctrness_pred, image_sizes, top_feat=None
    ):
        pdb.set_trace()
        N, C, H, W = logits_pred.shape

        # put in the same format as locations
        logits_pred = logits_pred.view(N, C, H, W).permute(0, 2, 3, 1)
        logits_pred = logits_pred.reshape(N, -1, C).sigmoid()
        box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
        box_regression = box_regression.reshape(N, -1, 4)
        ctrness_pred = ctrness_pred.view(N, 1, H, W).permute(0, 2, 3, 1)
        ctrness_pred = ctrness_pred.reshape(N, -1).sigmoid()
        if top_feat is not None:
            top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1)
            top_feat = top_feat.reshape(N, H * W, -1)

        # if self.thresh_with_ctr is True, we multiply the classification
        # scores with centerness scores before applying the threshold.
        if self.thresh_with_ctr:
            logits_pred = logits_pred * ctrness_pred[:, :, None]
        candidate_inds = logits_pred > self.pre_nms_thresh
        pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
        pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_topk)

        if not self.thresh_with_ctr:
            logits_pred = logits_pred * ctrness_pred[:, :, None]

        results = []
        for i in range(N):
            per_box_cls = logits_pred[i]
            per_candidate_inds = candidate_inds[i]
            per_box_cls = per_box_cls[per_candidate_inds]

            per_candidate_nonzeros = per_candidate_inds.nonzero()
            per_box_loc = per_candidate_nonzeros[:, 0]
            per_class = per_candidate_nonzeros[:, 1]

            per_box_regression = box_regression[i]
            per_box_regression = per_box_regression[per_box_loc]
            per_locations = locations[per_box_loc]
            if top_feat is not None:
                per_top_feat = top_feat[i]
                per_top_feat = per_top_feat[per_box_loc]

            per_pre_nms_top_n = pre_nms_top_n[i]

            if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
                per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)
                per_class = per_class[top_k_indices]
                per_box_regression = per_box_regression[top_k_indices]
                per_locations = per_locations[top_k_indices]
                if top_feat is not None:
                    per_top_feat = per_top_feat[top_k_indices]

            detections = torch.stack([
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ], dim=1)
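            # decode the predicted (l, t, r, b) distances back into an (x1, y1, x2, y2) box around each location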

            boxlist = Instances(image_sizes[i])
            boxlist.pred_boxes = Boxes(detections)
            boxlist.scores = torch.sqrt(per_box_cls)
            boxlist.pred_classes = per_class
            boxlist.locations = per_locations
            if top_feat is not None:
                boxlist.top_feat = per_top_feat
            results.append(boxlist)
        pdb.set_trace()
        return results

    def select_over_all_levels(self, boxlists):
        num_images = len(boxlists)
        results = []
        for i in range(num_images):
            # multiclass nms
            result = ml_nms(boxlists[i], self.nms_thresh)
            number_of_detections = len(result)

            # Limit to max_per_image detections **over all classes**
            if number_of_detections > self.post_nms_topk > 0:
                cls_scores = result.scores
                image_thresh, _ = torch.kthvalue(
                    cls_scores.cpu(),
                    number_of_detections - self.post_nms_topk + 1
                )
                keep = cls_scores >= image_thresh.item()
                keep = torch.nonzero(keep).squeeze(1)
                result = result[keep]
            results.append(result)
        return results



'''
in blendmask:
(Pdb) top_feats[0].size()
torch.Size([2, 784, 96, 148])
(Pdb) top_feats[1].size()
torch.Size([2, 784, 48, 74])
(Pdb) top_feats[2].size()
torch.Size([2, 784, 24, 37])
(Pdb) top_feats[3].size()
torch.Size([2, 784, 12, 19])
(Pdb) top_feats[4].size()
torch.Size([2, 784, 6, 10])

'''

3. AdelaiDet/adet/layers/iou_loss.py


import torch
from torch import nn


class IOULoss(nn.Module):
    """
    Intersection Over Union (IoU) loss which supports three
    different IoU computations:

    * IoU
    * Linear IoU
    * gIoU
    """
    def __init__(self, loc_loss_type='iou'):
        super(IOULoss, self).__init__()
        self.loc_loss_type = loc_loss_type

    def forward(self, pred, target, weight=None): # !!! for FCOS, weight here is the centerness target
        """
        Args:
            pred: Nx4 predicted bounding boxes
            target: Nx4 target bounding boxes
            weight: N loss weight for each instance
        """
        pred_left = pred[:, 0]
        pred_top = pred[:, 1]
        pred_right = pred[:, 2]
        pred_bottom = pred[:, 3]

        target_left = target[:, 0]
        target_top = target[:, 1]
        target_right = target[:, 2]
        target_bottom = target[:, 3]

        target_aera = (target_left + target_right) * \
                      (target_top + target_bottom)
        pred_aera = (pred_left + pred_right) * \
                    (pred_top + pred_bottom)

        w_intersect = torch.min(pred_left, target_left) + \
                      torch.min(pred_right, target_right)
        h_intersect = torch.min(pred_bottom, target_bottom) + \
                      torch.min(pred_top, target_top)

        g_w_intersect = torch.max(pred_left, target_left) + \
                        torch.max(pred_right, target_right)
        g_h_intersect = torch.max(pred_bottom, target_bottom) + \
                        torch.max(pred_top, target_top)
        ac_uion = g_w_intersect * g_h_intersect

        area_intersect = w_intersect * h_intersect
        area_union = target_aera + pred_aera - area_intersect

        ious = (area_intersect + 1.0) / (area_union + 1.0)
        gious = ious - (ac_uion - area_union) / ac_uion
        if self.loc_loss_type == 'iou':
            losses = -torch.log(ious)
        elif self.loc_loss_type == 'linear_iou':
            losses = 1 - ious
        elif self.loc_loss_type == 'giou':
            losses = 1 - gious
        else:
            raise NotImplementedError

        if weight is not None:
            return (losses * weight).sum() # the per-location IoU loss L_reg is additionally weighted by the centerness target
        else:
            return losses.sum()
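
A quick usage sketch (my own toy numbers, not from the repo), where weight plays the role of the centerness target in FCOS:

import torch

loss_fn = IOULoss('linear_iou')
pred = torch.tensor([[2.0, 2.0, 2.0, 2.0]])    # predicted (l, t, r, b) distances for one location
target = torch.tensor([[2.0, 2.0, 4.0, 4.0]])  # ground-truth (l, t, r, b) distances
ctrness = torch.tensor([0.8])                  # centerness target used as the per-location weight
loss = loss_fn(pred, target, ctrness)          # roughly (1 - IoU) * 0.8, summed over the locations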

4. AdelaiDet/adet/utils/comm.py

import torch
import torch.nn.functional as F
import torch.distributed as dist

from detectron2.utils.comm import get_world_size
import pdb

def reduce_sum(tensor):
    world_size = get_world_size()
    if world_size < 2:
        return tensor
    tensor = tensor.clone()
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    return tensor


def aligned_bilinear(tensor, factor):
    assert tensor.dim() == 4
    assert factor >= 1
    assert int(factor) == factor

    if factor == 1:
        return tensor

    h, w = tensor.size()[2:]
    tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
    oh = factor * h + 1
    ow = factor * w + 1
    tensor = F.interpolate(
        tensor, size=(oh, ow),
        mode='bilinear',
        align_corners=True
    )
    tensor = F.pad(
        tensor, pad=(factor // 2, 0, factor // 2, 0),
        mode="replicate"
    )

    return tensor[:, :, :oh - 1, :ow - 1]

# FCOS self.fpn_strides  [8, 16, 32, 64, 128]
# maps every point on FPN feature_map[i] back onto the input image
def compute_locations(h, w, stride, device):
    shifts_x = torch.arange(
        0, w * stride, step=stride,
        dtype=torch.float32, device=device
    )

    # 76
    shifts_y = torch.arange(
        0, h * stride, step=stride,
        dtype=torch.float32, device=device
    )
    # 52
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)  # level = 1 : [52 * 76] 
    shift_x = shift_x.reshape(-1) #  level = 1 : [52, 76]  --> 3952
    shift_y = shift_y.reshape(-1) #  level = 1 : [52, 76]  --> 3952
    '''
    paper :
    For each location (x, y) on the feature map Fi, we can
    map it back onto the input image as (s/2 + xs, s/2 + ys).
    '''
    locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 # [3952, 2]
    pdb.set_trace()
    return locations
    '''
    (Pdb) locations[0].size()
    torch.Size([16800, 2]) 16800 = 100 * 168
    (Pdb) locations[1].size() 
    torch.Size([4200, 2])  4200 = 50 * 84
    (Pdb) locations[2].size()
    torch.Size([1050, 2])
    (Pdb) locations[3].size()
    torch.Size([273, 2])
    (Pdb) locations[4].size()
    torch.Size([77, 2])

    '''
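    # A quick check of the mapping (my own toy example): compute_locations(h=2, w=2, stride=8, device='cpu')
    # would return [[4., 4.], [12., 4.], [4., 12.], [12., 12.]], i.e. the centers of the four stride-8 cells
    # mapped back onto the input image, matching the (s/2 + xs, s/2 + ys) formula from the paper.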


Reposted from blog.csdn.net/weixin_43823854/article/details/109907325