【pytorch】Mask-RCNN官方源码剖析(Ⅲ)

模型定义(modeling)-关键部分

无论是在前面的训练文件中还是测试文件中，都使用了build_detection_model(cfg)函数来创建模型，该函数可以通过配置文件组合出不同类型的模型，为了了解模型的内部定义细节，需对./maskrcnn_benchmark/modeling/下的文件进行分析：
在这里插入图片描述

detector：模型定义的入口

detectors.py 文件解析：
根据给定的配置信息实例化一个class GeneralizedRCNN的对象

from .generalized_rcnn import GeneralizedRCNN

# Lookup table from meta-architecture name to the class that implements it.
# It holds a single entry today; keeping it a table makes later additions easy.
_DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN}
def build_detection_model(cfg):
    """Create the detection model selected by the configuration.

    The name stored in ``cfg.MODEL.META_ARCHITECTURE`` is looked up in
    ``_DETECTION_META_ARCHITECTURES`` and the class found there is
    instantiated with ``cfg``.  With the single entry currently in that
    table this is the same as ``GeneralizedRCNN(cfg)``.
    """
    arch_name = cfg.MODEL.META_ARCHITECTURE
    model_cls = _DETECTION_META_ARCHITECTURES[arch_name]
    return model_cls(cfg)

上面的代码利用配置信息 cfg 实例化了一个 GeneralizedRCNN 类的对象，该类定义在 ./maskrcnn_benchmark/modeling/detector/generalized_rcnn.py 文件中。

generalized_rcnn.py文件解析：

import torch
from torch import nn

from maskrcnn_benchmark.structures.image_list import to_image_list

from ..backbone import build_backbone
from ..rpn.rpn import build_rpn
from ..roi_heads.roi_heads import build_roi_heads


class GeneralizedRCNN(nn.Module):
    """
    Main class for Generalized R-CNN. Currently supports boxes and masks.

    This is the shared abstraction of the R-CNN style models. It consists of
    three main parts:
    - backbone
    - rpn
    - heads: take the features + the proposals from the RPN and compute
        detections / masks from them.
    """

    def __init__(self, cfg):
        """Build the backbone, the RPN and the RoI heads from ``cfg``."""
        super(GeneralizedRCNN, self).__init__()

        self.backbone = build_backbone(cfg)
        # The RPN and the RoI heads are both sized from the backbone's
        # ``out_channels`` attribute.
        self.rpn = build_rpn(cfg, self.backbone.out_channels)
        self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels)

    def forward(self, images, targets=None):
        """
        Run the forward pass of the model.

        Arguments:
            images (list[Tensor] or ImageList): images to be processed
            targets (list[BoxList]): ground-truth boxes present in the image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the losses.
                During testing, it returns list[BoxList] contains additional fields
                like `scores`, `labels` and `mask` (for Mask R-CNN models).

        Raises:
            ValueError: if the module is in training mode and no targets are given.
        """
        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")

        # Normalise the input container, then extract backbone features.
        images = to_image_list(images)
        features = self.backbone(images.tensors)

        # The RPN returns both the proposals and its own loss dict.
        proposals, proposal_losses = self.rpn(images, features, targets)

        if self.roi_heads:
            x, result, detector_losses = self.roi_heads(features, proposals, targets)
        else:
            # RPN-only models don't have roi_heads
            x, result, detector_losses = features, proposals, {}

        if not self.training:
            # Inference: hand back the predictions.
            return result

        # Training: merge the head losses and the RPN losses into one dict.
        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

可以看出, MaskrcnnBenchmark 模型的创建主要依赖于三个函数, 即 build_backbone(cfg), build_rpn(cfg, in_channels), build_roi_heads(cfg, in_channels), 其中 in_channels 取自 backbone 的 out_channels 属性.

backbone目录下关于模型骨架的定义

backbone.py文件解析：

from collections import OrderedDict # 导入有序字典

from torch import nn

# 注册器，用于管理module的注册，使得可以像使用字典一样使用module
from maskrcnn_benchmark.modeling import registry
from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform
from . import fpn as fpn_module
from . import resnet

# Builder for the plain ResNet backbones. It is registered under the conv-body
# names below; build_backbone() looks builders up in the same registry.
@registry.BACKBONES.register("R-50-C4")
@registry.BACKBONES.register("R-50-C5")
@registry.BACKBONES.register("R-101-C4")
@registry.BACKBONES.register("R-101-C5")
def build_resnet_backbone(cfg):
    """Return an ``nn.Sequential`` wrapping a ResNet under the name ``body``.

    The returned module also carries an ``out_channels`` attribute copied
    from ``cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS``.
    """
    named_layers = OrderedDict()
    named_layers["body"] = resnet.ResNet(cfg)
    backbone = nn.Sequential(named_layers)
    backbone.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS
    return backbone

# Builder for the ResNet + FPN backbones. It is registered under the conv-body
# names below; build_backbone() looks builders up in the same registry.
@registry.BACKBONES.register("R-50-FPN")
@registry.BACKBONES.register("R-101-FPN")
@registry.BACKBONES.register("R-152-FPN")
def build_resnet_fpn_backbone(cfg):
    """Return ``nn.Sequential(body=ResNet, fpn=FPN)`` built from ``cfg``.

    The FPN receives four input channel counts (1x, 2x, 4x and 8x of
    ``cfg.MODEL.RESNETS.RES2_OUT_CHANNELS``) and a ``LastLevelMaxPool`` as
    its top block.  The returned module carries an ``out_channels``
    attribute equal to ``cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS``.
    """
    # The ResNet body is created first, then the FPN that sits on top of it.
    body = resnet.ResNet(cfg)

    in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
    out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS

    # One entry per ResNet stage handed to the FPN.
    stage_channels = [in_channels_stage2 * factor for factor in (1, 2, 4, 8)]
    conv_block = conv_with_kaiming_uniform(
        cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU
    )
    fpn = fpn_module.FPN(
        in_channels_list=stage_channels,
        out_channels=out_channels,
        conv_block=conv_block,
        top_blocks=fpn_module.LastLevelMaxPool(),
    )

    model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)]))
    model.out_channels = out_channels
    return model


@registry.BACKBONES.register("R-50-FPN-RETINANET")
@registry.BACKBONES.register("R-101-FPN-RETINANET")
def build_resnet_fpn_p3p7_backbone(cfg):
    """Return ``nn.Sequential(body=ResNet, fpn=FPN)`` for the RetinaNet configs.

    Compared with ``build_resnet_fpn_backbone`` the first FPN input is given
    a channel count of 0 (the FPN constructor skips such entries) and the
    top block is ``LastLevelP6P7``.  The returned module carries an
    ``out_channels`` attribute equal to
    ``cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS``.
    """
    body = resnet.ResNet(cfg)

    in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
    out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS

    # Input width of the P6/P7 block: the last ResNet stage when USE_C5 is
    # set, otherwise the FPN output width.
    if cfg.MODEL.RETINANET.USE_C5:
        in_channels_p6p7 = in_channels_stage2 * 8
    else:
        in_channels_p6p7 = out_channels

    conv_block = conv_with_kaiming_uniform(
        cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU
    )
    top_blocks = fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels)
    fpn = fpn_module.FPN(
        in_channels_list=[
            0,
            in_channels_stage2 * 2,
            in_channels_stage2 * 4,
            in_channels_stage2 * 8,
        ],
        out_channels=out_channels,
        conv_block=conv_block,
        top_blocks=top_blocks,
    )

    model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)]))
    model.out_channels = out_channels
    return model

# Entry point that creates the backbone using the builders registered above.
def build_backbone(cfg):
    """Build the backbone named by ``cfg.MODEL.BACKBONE.CONV_BODY``.

    The name must be present in ``registry.BACKBONES`` (checked with an
    ``assert``); the registered builder is then called with ``cfg``.
    """
    conv_body = cfg.MODEL.BACKBONE.CONV_BODY
    assert conv_body in registry.BACKBONES, \
        "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format(
            conv_body
        )
    builder = registry.BACKBONES[conv_body]
    return builder(cfg)

resnet.py 文件解析：

from collections import namedtuple

import torch
import torch.nn.functional as F
from torch import nn

from maskrcnn_benchmark.layers import FrozenBatchNorm2d
from maskrcnn_benchmark.layers import Conv2d
from maskrcnn_benchmark.layers import DFConv2d
from maskrcnn_benchmark.modeling.make_layers import group_norm
from maskrcnn_benchmark.utils.registry import Registry


# ResNet stage specification
StageSpec = namedtuple(
    "StageSpec",
    [
        "index",  # Index of the stage, eg 1, 2, ..,. 5
        "block_count",  # Number of residual blocks in the stage
        "return_features",  # True => return the last feature map from this stage
    ],
)

# -----------------------------------------------------------------------------
# Standard ResNet models
# Every table below is a tuple of StageSpec built from
# (index, block_count, return_features) triples.
# -----------------------------------------------------------------------------
# ResNet-50 (including all stages)
ResNet50StagesTo5 = tuple(
    StageSpec._make(spec)
    for spec in ((1, 3, False), (2, 4, False), (3, 6, False), (4, 3, True))
)
# ResNet-50 up to stage 4 (excludes stage 5)
ResNet50StagesTo4 = tuple(
    StageSpec._make(spec)
    for spec in ((1, 3, False), (2, 4, False), (3, 6, True))
)
# ResNet-101 (including all stages)
ResNet101StagesTo5 = tuple(
    StageSpec._make(spec)
    for spec in ((1, 3, False), (2, 4, False), (3, 23, False), (4, 3, True))
)
# ResNet-101 up to stage 4 (excludes stage 5)
ResNet101StagesTo4 = tuple(
    StageSpec._make(spec)
    for spec in ((1, 3, False), (2, 4, False), (3, 23, True))
)
# ResNet-50-FPN (including all stages)
ResNet50FPNStagesTo5 = tuple(
    StageSpec._make(spec)
    for spec in ((1, 3, True), (2, 4, True), (3, 6, True), (4, 3, True))
)
# ResNet-101-FPN (including all stages)
ResNet101FPNStagesTo5 = tuple(
    StageSpec._make(spec)
    for spec in ((1, 3, True), (2, 4, True), (3, 23, True), (4, 3, True))
)
# ResNet-152-FPN (including all stages)
ResNet152FPNStagesTo5 = tuple(
    StageSpec._make(spec)
    for spec in ((1, 3, True), (2, 8, True), (3, 36, True), (4, 3, True))
)

class ResNet(nn.Module):
    """ResNet / ResNeXt trunk: a stem followed by the stages named in the config.

    The stem class, the stage layout and the residual block class are looked
    up by the string names stored in ``cfg`` through the module-level tables
    ``_STEM_MODULES``, ``_STAGE_SPECS`` and ``_TRANSFORMATION_MODULES``.
    """

    def __init__(self, cfg):
        """Assemble the stem and every stage, then freeze the requested part."""
        super(ResNet, self).__init__()
        # cfg is not stored on the module.  If forward() ever needs it, keep
        # a copy (cfg.clone()) on self rather than the caller's object.

        resnets_cfg = cfg.MODEL.RESNETS

        # Translate the string names found in cfg into implementations.
        stem_module = _STEM_MODULES[resnets_cfg.STEM_FUNC]
        stage_specs = _STAGE_SPECS[cfg.MODEL.BACKBONE.CONV_BODY]
        transformation_module = _TRANSFORMATION_MODULES[resnets_cfg.TRANS_FUNC]

        # The stem is the part of the network that runs before the stages.
        self.stem = stem_module(cfg)

        # num_groups / width_per_group set the bottleneck width of the first
        # stage (their product); num_groups is also forwarded to every block.
        num_groups = resnets_cfg.NUM_GROUPS
        width_per_group = resnets_cfg.WIDTH_PER_GROUP
        stage2_bottleneck_channels = num_groups * width_per_group
        # Output width of the first stage; later stages scale from it.
        stage2_out_channels = resnets_cfg.RES2_OUT_CHANNELS
        # The first stage consumes the stem output.
        in_channels = resnets_cfg.STEM_OUT_CHANNELS

        self.stages = []  # stage module names, in execution order
        self.return_features = {}  # stage name -> whether forward() returns its output
        for spec in stage_specs:
            name = "layer" + str(spec.index)
            # Bottleneck and output widths double with every stage.
            width_factor = 2 ** (spec.index - 1)
            bottleneck_channels = stage2_bottleneck_channels * width_factor
            out_channels = stage2_out_channels * width_factor
            dcn_config = {
                "stage_with_dcn": resnets_cfg.STAGE_WITH_DCN[spec.index - 1],
                "with_modulated_dcn": resnets_cfg.WITH_MODULATED_DCN,
                "deformable_groups": resnets_cfg.DEFORMABLE_GROUPS,
            }
            # Every stage except the one with index 1 starts with stride 2.
            first_stride = 2 if spec.index > 1 else 1
            module = _make_stage(
                transformation_module,
                in_channels,
                bottleneck_channels,
                out_channels,
                spec.block_count,
                num_groups,
                resnets_cfg.STRIDE_IN_1X1,
                first_stride=first_stride,
                dcn_config=dcn_config,
            )
            # The next stage consumes what this stage produces.
            in_channels = out_channels
            self.add_module(name, module)
            self.stages.append(name)
            self.return_features[name] = spec.return_features

        # Optionally freeze (requires_grad=False) parts of the backbone
        self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT)

    def _freeze_backbone(self, freeze_at):
        """Set ``requires_grad = False`` on the first ``freeze_at`` parts.

        Part 0 is the stem; part ``i`` (``i >= 1``) is the attribute
        ``layer<i>``.  A negative ``freeze_at`` freezes nothing.
        """
        if freeze_at < 0:
            return
        for part_index in range(freeze_at):
            if part_index == 0:
                part = self.stem  # stage 0 is the stem
            else:
                part = getattr(self, "layer" + str(part_index))
            for param in part.parameters():
                param.requires_grad = False

    def forward(self, x):
        """Run the stem and all stages; return the selected stage outputs.

        Returns:
            list: the output of every stage whose ``return_features`` flag is
            set, in execution order.
        """
        x = self.stem(x)
        outputs = []
        for stage_name in self.stages:
            stage = getattr(self, stage_name)
            x = stage(x)
            if self.return_features[stage_name]:
                outputs.append(x)
        return outputs


class ResNetHead(nn.Module):
    """A run of ResNet stages used as a head, applied one after another.

    Arguments:
        block_module (str): key into ``_TRANSFORMATION_MODULES`` selecting the
            residual block class.
        stages: sequence of stage specs; ``index`` and ``block_count`` are read.
        num_groups (int): forwarded to every block; together with
            ``width_per_group`` it sets the bottleneck width.
        width_per_group (int): see ``num_groups``.
        stride_in_1x1 (bool): forwarded to every block.
        stride_init (int or None): stride of the first block of the first
            stage; when falsy the stride is derived from the stage index.
        res2_out_channels (int): base output width that is scaled by the
            index of the first stage.
        dilation (int): forwarded to every block.
        dcn_config (dict or None): forwarded to every block; ``None`` means
            an empty dict.
    """

    def __init__(
        self,
        block_module,
        stages,
        num_groups=1,
        width_per_group=64,
        stride_in_1x1=True,
        stride_init=None,
        res2_out_channels=256,
        dilation=1,
        dcn_config=None
    ):
        super(ResNetHead, self).__init__()
        # A fresh dict per instance instead of a shared mutable default.
        if dcn_config is None:
            dcn_config = {}

        # All widths are derived from the index of the FIRST stage only.
        stage2_relative_factor = 2 ** (stages[0].index - 1)
        stage2_bottleneck_channels = num_groups * width_per_group
        out_channels = res2_out_channels * stage2_relative_factor
        # The head input is taken to be half as wide as its output.
        in_channels = out_channels // 2
        bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor

        block_module = _TRANSFORMATION_MODULES[block_module]

        self.stages = []
        stride = stride_init
        for stage in stages:
            name = "layer" + str(stage.index)
            if not stride:
                # No explicit stride: 2 for every stage index above 1, else 1.
                stride = int(stage.index > 1) + 1
            module = _make_stage(
                block_module,
                in_channels,
                bottleneck_channels,
                out_channels,
                stage.block_count,
                num_groups,
                stride_in_1x1,
                first_stride=stride,
                dilation=dilation,
                dcn_config=dcn_config
            )
            # stride_init only applies to the first stage.
            stride = None
            self.add_module(name, module)
            self.stages.append(name)
        self.out_channels = out_channels

    def forward(self, x):
        """Apply every stage in order and return the final tensor."""
        for stage in self.stages:
            x = getattr(self, stage)(x)
        return x

# 创建 resnet 的 residual-block
def _make_stage(
    transformation_module,
    in_channels,
    bottleneck_channels,
    out_channels,
    block_count,
    num_groups,
    stride_in_1x1,
    first_stride,
    dilation=1,
    dcn_config={
    
    }
):
    blocks = []
    stride = first_stride
    for _ in range(block_count):
        blocks.append(
            transformation_module(
                in_channels,
                bottleneck_channels,
                out_channels,
                num_groups,
                stride_in_1x1,
                stride,
                dilation=dilation,
                dcn_config=dcn_config
            )
        )
        stride = 1
        in_channels = out_channels
    return nn.Sequential(*blocks)

# Residual bottleneck block, the unit that every ResNet stage is made of.
# The stage tables in this file give the number of blocks per stage
# (3, 4, 6, 3 for ResNet-50), and the stage-building code doubles the channel
# counts from one stage to the next.
class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 convolution block with a shortcut connection.

    Each convolution is followed by a layer built with ``norm_func``.  When
    the input and output channel counts differ, the shortcut is a 1x1
    convolution plus normalization instead of the identity.
    """

    def __init__(
        self,
        in_channels, # number of channels entering the block
        bottleneck_channels, # number of channels of the two inner convolutions
        out_channels, # number of channels leaving the block
        num_groups,
        stride_in_1x1,
        stride,
        dilation,
        norm_func,
        dcn_config
    ):
        super(Bottleneck, self).__init__()
        # downsample is the projection shortcut.  It is only created when the
        # input and output channel counts differ; otherwise the input itself
        # is added to the block output.
        self.downsample = None
        # Projection shortcut: a 1x1 convolution maps in_channels to
        # out_channels, followed by a normalization layer.
        if in_channels != out_channels:
            # With dilation > 1 the shortcut keeps the spatial size (stride 1).
            down_stride = stride if dilation == 1 else 1
            self.downsample = nn.Sequential(
                Conv2d(
                    in_channels, out_channels,
                    kernel_size=1, stride=down_stride, bias=False
                ),
                norm_func(out_channels), # normalization layer built by norm_func
            )
            # Initialise every Conv2d found inside the shortcut.
            for modules in [self.downsample,]:
                for l in modules.modules():
                    if isinstance(l, Conv2d):
                        nn.init.kaiming_uniform_(l.weight, a=1)

        if dilation > 1:
            stride = 1 # reset to be 1

        # The original MSRA ResNet models have stride in the first 1x1 conv
        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
        # stride in the 3x3 conv
        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
        # First convolution: 1x1, in_channels -> bottleneck_channels.
        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
        )
        self.bn1 = norm_func(bottleneck_channels) # normalization after conv1
        # TODO: specify init for the above
        # Second convolution: a deformable convolution when the stage asks
        # for it through dcn_config, otherwise a regular 3x3 convolution.
        with_dcn = dcn_config.get("stage_with_dcn", False)
        if with_dcn:
            deformable_groups = dcn_config.get("deformable_groups", 1)
            with_modulated_dcn = dcn_config.get("with_modulated_dcn", False)
            self.conv2 = DFConv2d(
                bottleneck_channels,
                bottleneck_channels,
                with_modulated_dcn=with_modulated_dcn,
                kernel_size=3,
                stride=stride_3x3,
                groups=num_groups,
                dilation=dilation,
                deformable_groups=deformable_groups,
                bias=False
            )
        else:
            # Regular 3x3 convolution; padding equals the dilation.
            self.conv2 = Conv2d(
                bottleneck_channels,
                bottleneck_channels,
                kernel_size=3,
                stride=stride_3x3,
                padding=dilation,
                bias=False,
                groups=num_groups,
                dilation=dilation
            )
            nn.init.kaiming_uniform_(self.conv2.weight, a=1)

        self.bn2 = norm_func(bottleneck_channels) # normalization after conv2

        # Third convolution: 1x1, bottleneck_channels -> out_channels.
        self.conv3 = Conv2d(
            bottleneck_channels, out_channels, kernel_size=1, bias=False
        )
        self.bn3 = norm_func(out_channels)

        for l in [self.conv1, self.conv3,]:
            nn.init.kaiming_uniform_(l.weight, a=1)

    def forward(self, x):
        # One call runs the whole block: three convolutions, each followed by
        # its normalization layer, plus the shortcut.  ReLU follows the first
        # two normalizations; the last ReLU is applied only after the shortcut
        # has been added.
        identity = x # shortcut input; replaced below when a projection exists
        # conv1, bn1
        out = self.conv1(x)
        out = self.bn1(out)
        out = F.relu_(out)
        # conv2, bn2
        out = self.conv2(out)
        out = self.bn2(out)
        out = F.relu_(out)
        # conv3, bn3
        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            # Channel counts differ, so project the input to the output shape.
            identity = self.downsample(x)

        out += identity
        out = F.relu_(out) # final activation, after the addition

        return out

# The stem is the first part of the ResNet: a 7x7 convolution followed by a
# normalization layer.  For convenience this implementation also applies the
# ReLU and the max pooling inside the stem's forward(); layers without
# parameters are called functionally there instead of being stored as modules.
class BaseStem(nn.Module):
    """7x7 stride-2 convolution + normalization + ReLU + 3x3 stride-2 max pool."""

    def __init__(self, cfg, norm_func):
        """Create the convolution and the normalization layer.

        Arguments:
            cfg: configuration; ``cfg.MODEL.RESNETS.STEM_OUT_CHANNELS`` gives
                the number of output channels.
            norm_func: callable that builds the normalization layer from a
                channel count.
        """
        super(BaseStem, self).__init__()

        out_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS

        # 3 input channels -> out_channels.
        self.conv1 = Conv2d(
            3, out_channels, kernel_size=7, stride=2, padding=3, bias=False
        )
        self.bn1 = norm_func(out_channels)

        # Weight initialisation of the convolution.
        nn.init.kaiming_uniform_(self.conv1.weight, a=1)

    def forward(self, x):
        """Apply conv -> norm -> in-place ReLU -> max pool and return the result."""
        x = F.relu_(self.bn1(self.conv1(x)))
        return F.max_pool2d(x, kernel_size=3, stride=2, padding=1)

# Bottleneck variant whose normalization layers are FrozenBatchNorm2d.
class BottleneckWithFixedBatchNorm(Bottleneck):
    """``Bottleneck`` with ``norm_func`` fixed to ``FrozenBatchNorm2d``.

    All arguments are forwarded unchanged to ``Bottleneck``; ``dcn_config``
    defaults to an empty dict when omitted or ``None``.
    """

    def __init__(
        self,
        in_channels,
        bottleneck_channels,
        out_channels,
        num_groups=1,
        stride_in_1x1=True,
        stride=1,
        dilation=1,
        dcn_config=None
    ):
        # A fresh dict per instance instead of a shared mutable default.
        if dcn_config is None:
            dcn_config = {}
        super(BottleneckWithFixedBatchNorm, self).__init__(
            in_channels=in_channels,
            bottleneck_channels=bottleneck_channels,
            out_channels=out_channels,
            num_groups=num_groups,
            stride_in_1x1=stride_in_1x1,
            stride=stride,
            dilation=dilation,
            norm_func=FrozenBatchNorm2d,
            dcn_config=dcn_config
        )


class StemWithFixedBatchNorm(BaseStem):
    """``BaseStem`` with ``norm_func`` fixed to ``FrozenBatchNorm2d``."""

    def __init__(self, cfg):
        super().__init__(cfg, norm_func=FrozenBatchNorm2d)


class BottleneckWithGN(Bottleneck):
    """``Bottleneck`` with ``norm_func`` fixed to ``group_norm``.

    All arguments are forwarded unchanged to ``Bottleneck``; ``dcn_config``
    defaults to an empty dict when omitted or ``None``.
    """

    def __init__(
        self,
        in_channels,
        bottleneck_channels,
        out_channels,
        num_groups=1,
        stride_in_1x1=True,
        stride=1,
        dilation=1,
        dcn_config=None
    ):
        # A fresh dict per instance instead of a shared mutable default.
        if dcn_config is None:
            dcn_config = {}
        super(BottleneckWithGN, self).__init__(
            in_channels=in_channels,
            bottleneck_channels=bottleneck_channels,
            out_channels=out_channels,
            num_groups=num_groups,
            stride_in_1x1=stride_in_1x1,
            stride=stride,
            dilation=dilation,
            norm_func=group_norm,
            dcn_config=dcn_config
        )


class StemWithGN(BaseStem):
    """``BaseStem`` with ``norm_func`` fixed to ``group_norm``."""

    def __init__(self, cfg):
        super().__init__(cfg, norm_func=group_norm)


# Lookup tables of this file.  The strings found in the configuration are used
# as keys to choose which class or stage layout gets used.

# Residual block implementations, keyed by cfg.MODEL.RESNETS.TRANS_FUNC.
_TRANSFORMATION_MODULES = Registry({
    "BottleneckWithFixedBatchNorm": BottleneckWithFixedBatchNorm,
    "BottleneckWithGN": BottleneckWithGN,
})

# Stem implementations, keyed by cfg.MODEL.RESNETS.STEM_FUNC.
_STEM_MODULES = Registry({
    "StemWithFixedBatchNorm": StemWithFixedBatchNorm,
    "StemWithGN": StemWithGN,
})

# Stage layouts, keyed by cfg.MODEL.BACKBONE.CONV_BODY.
_STAGE_SPECS = Registry({
    "R-50-C4": ResNet50StagesTo4,
    "R-50-C5": ResNet50StagesTo5,
    "R-101-C4": ResNet101StagesTo4,
    "R-101-C5": ResNet101StagesTo5,
    "R-50-FPN": ResNet50FPNStagesTo5,
    "R-50-FPN-RETINANET": ResNet50FPNStagesTo5,
    "R-101-FPN": ResNet101FPNStagesTo5,
    "R-101-FPN-RETINANET": ResNet101FPNStagesTo5,
    "R-152-FPN": ResNet152FPNStagesTo5,
})

fpn.py 文件解析：

在 backbone.py 文件中的 build_resnet_fpn_backbone(cfg) 函数中, 使用了 fpn = fpn_module.FPN(...) 来创建一个 FPN 类的实例对象, 并且利用 nn.Sequential() 将 ResNet 和 FPN 组合在一起形成一个模型, 并将其返回, 下面, 我们就来看看 FPN 网络的具体实现, 实例代码位于 ./maskrcnn_benchmark/modeling/backbone/fpn.py 文件中, 解析如下:

import torch
import torch.nn.functional as F
from torch import nn


class FPN(nn.Module):
    """
    Module that adds FPN on top of a list of feature maps.
    The feature maps are expected to be ordered from the highest resolution
    to the lowest and to come from consecutive stages.
    """

    def __init__(
        self, in_channels_list, out_channels, conv_block, top_blocks=None
    ):
        """
        Arguments:
            in_channels_list (list[int]): number of channels for each feature map that
                will be fed. An entry of 0 means that no FPN layers are created
                for that position.
            out_channels (int): number of channels of the FPN representation;
                every level is converted to this width.
            conv_block (callable): factory for the convolutions; called as
                ``conv_block(in_channels, out_channels, kernel_size[, stride])``.
            top_blocks (nn.Module or None): if provided, an extra operation will
                be performed on the output of the last (smallest resolution)
                FPN output, and the result will extend the result list
        """
        super(FPN, self).__init__()
        # Names (not modules) of the lateral and output convolutions per level.
        self.inner_blocks = []
        self.layer_blocks = []
        # Levels are numbered from 1: fpn_inner1 / fpn_layer1, fpn_inner2 / ...
        for idx, in_channels in enumerate(in_channels_list, 1):
            inner_block = "fpn_inner{}".format(idx)
            layer_block = "fpn_layer{}".format(idx)

            if in_channels == 0:
                continue
            # 1x1 lateral convolution: changes the channel count to out_channels.
            inner_block_module = conv_block(in_channels, out_channels, 1)
            # 3x3 output convolution applied to the merged map; width unchanged.
            layer_block_module = conv_block(out_channels, out_channels, 3, 1)
            self.add_module(inner_block, inner_block_module)
            self.add_module(layer_block, layer_block_module)
            self.inner_blocks.append(inner_block)
            self.layer_blocks.append(layer_block)
        self.top_blocks = top_blocks

    def forward(self, x):
        """
        Arguments:
            x (list[Tensor]): feature maps for each feature level.
        Returns:
            results (tuple[Tensor]): feature maps after FPN layers.
                They are ordered from highest resolution first.
        """
        # Start from the last (lowest resolution) level.
        last_inner = getattr(self, self.inner_blocks[-1])(x[-1])
        results = [getattr(self, self.layer_blocks[-1])(last_inner)]

        # Walk the remaining levels from low to high resolution.  zip() stops
        # at the shortest sequence, so input levels that have no FPN layers
        # (in_channels == 0 at construction) are left out.
        for feature, inner_block, layer_block in zip(
            x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1]
        ):
            inner_lateral = getattr(self, inner_block)(feature)
            # Resize the coarser map to the lateral's spatial size.  For exact
            # 2x pyramids this equals scale_factor=2, and it also works when a
            # level has an odd height or width.
            inner_top_down = F.interpolate(
                last_inner, size=inner_lateral.shape[-2:], mode="nearest"
            )
            # The sum is this level's merged map and the input to the next level.
            last_inner = inner_lateral + inner_top_down
            # Highest resolution goes first, hence insertion at position 0.
            results.insert(0, getattr(self, layer_block)(last_inner))

        if self.top_blocks is None:
            return tuple(results)

        # Extra levels computed from the last FPN output (and, for P6/P7,
        # optionally from the last input feature map).
        if isinstance(self.top_blocks, LastLevelP6P7):
            last_results = self.top_blocks(x[-1], results[-1])
            results.extend(last_results)
        elif isinstance(self.top_blocks, LastLevelMaxPool):
            last_results = self.top_blocks(results[-1])
            results.extend(last_results)

        return tuple(results)

# Max pooling applied on top of the last FPN level.
class LastLevelMaxPool(nn.Module):
    """Produce one extra level by subsampling the input with stride 2."""

    def forward(self, x):
        """Return a one-element list holding the pooled tensor."""
        pooled = F.max_pool2d(x, kernel_size=1, stride=2, padding=0)
        return [pooled]


class LastLevelP6P7(nn.Module):
    """
    This module is used in RetinaNet to generate extra layers, P6 and P7.
    """

    def __init__(self, in_channels, out_channels):
        """Create the two 3x3 stride-2 convolutions that produce P6 and P7."""
        super(LastLevelP6P7, self).__init__()
        self.p6 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=2, padding=1
        )
        self.p7 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=2, padding=1
        )
        for conv in (self.p6, self.p7):
            nn.init.kaiming_uniform_(conv.weight, a=1)
            nn.init.constant_(conv.bias, 0)
        # P5 is used as the source only when the channel counts match.
        self.use_P5 = in_channels == out_channels

    def forward(self, c5, p5):
        """Return ``[p6, p7]`` computed from ``p5`` or ``c5`` (see ``use_P5``)."""
        source = p5 if self.use_P5 else c5
        p6 = self.p6(source)
        p7 = self.p7(F.relu(p6))
        return [p6, p7]

roi_heads

当使用 backbone 和 rpn 搭建好特征图谱的生成结构以后, 我们就需要在特征图谱上划分相应的 RoI, 该模块的定义入口就是 roi_heads/roi_heads.py 中的 build_roi_heads 函数

入口函数build_roi_heads：

def build_roi_heads(cfg, in_channels):
    """Create the RoI heads enabled in ``cfg`` and combine them.

    Returns an empty list when ``cfg.MODEL.RETINANET_ON`` is set or when no
    head is enabled; otherwise a ``CombinedROIHeads`` built from the
    ``(name, head)`` pairs.
    """
    model_cfg = cfg.MODEL
    if model_cfg.RETINANET_ON:
        return []

    # The heads are created individually and combined afterwards.  The three
    # switches are independent of each other.
    heads = []
    if not model_cfg.RPN_ONLY:
        # The box head is present unless the model is RPN-only.
        heads.append(("box", build_roi_box_head(cfg, in_channels)))
    if model_cfg.MASK_ON:
        heads.append(("mask", build_roi_mask_head(cfg, in_channels)))
    if model_cfg.KEYPOINT_ON:
        heads.append(("keypoint", build_roi_keypoint_head(cfg, in_channels)))

    if not heads:
        return heads

    # combine individual heads in a single module
    return CombinedROIHeads(cfg, heads)

roi_heads/box_head/box_head.py 文件：

class ROIBoxHead(torch.nn.Module):
    """
    Generic Box Head class.
    """

    def __init__(self, cfg, in_channels):
        """Create the feature extractor, predictor, post-processor and loss."""
        super(ROIBoxHead, self).__init__()
        # The predictor is sized from the extractor's ``out_channels``.
        self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels)
        self.predictor = make_roi_box_predictor(
            cfg, self.feature_extractor.out_channels
        )
        self.post_processor = make_roi_box_post_processor(cfg)
        self.loss_evaluator = make_roi_box_loss_evaluator(cfg)

    def forward(self, features, proposals, targets=None):
        """
        Arguments:
            features (list[Tensor]): feature-maps from possibly several levels
            proposals (list[BoxList]): proposal boxes
            targets (list[BoxList], optional): the ground-truth targets.

        Returns:
            x (Tensor): the result of the feature extractor
            proposals (list[BoxList]): during training, the subsampled proposals
                are returned. During testing, the predicted boxlists are returned
            losses (dict[Tensor]): During training, returns the losses for the
                head. During testing, returns an empty dict.
        """
        if self.training:
            # Faster R-CNN subsamples during training the proposals with a fixed
            # positive / negative ratio
            with torch.no_grad():
                proposals = self.loss_evaluator.subsample(proposals, targets)

        # extract features that will be fed to the final classifier. The
        # feature_extractor generally corresponds to the pooler + heads
        x = self.feature_extractor(features, proposals)
        # final classifier that converts the features into predictions
        class_logits, box_regression = self.predictor(x)

        if self.training:
            loss_classifier, loss_box_reg = self.loss_evaluator(
                [class_logits], [box_regression]
            )
            losses = {
                "loss_classifier": loss_classifier,
                "loss_box_reg": loss_box_reg,
            }
            return x, proposals, losses

        # Inference: turn the raw predictions into box lists; no losses.
        result = self.post_processor((class_logits, box_regression), proposals)
        return x, result, {}

模型定义(modeling)–RPN网络

在 Faster R-CNN 中, 首次提出了 RPN 网络, 该网络用于生成目标检测任务所需要的候选区域框, 在 MaskrcnnBenchmark 中, 关于 RPN 网络的定义位于 ./maskrcnn_benchmark/modeling/rpn/ 文件夹中, 该文件夹包含以下四个文件: rpn.py、anchor_generator.py、inference.py、loss.py。在 class GeneralizedRCNN(nn.Module) 类中, 会通过 self.rpn = build_rpn(cfg, self.backbone.out_channels) 来创建 RPN 网络, 该函数位于 ./maskrcnn_benchmark/modeling/rpn/rpn.py 文件中。

rpn.py 文件：

def build_rpn(cfg, in_channels):
    """
    Build the RPN module for the detector.

    Arguments:
        cfg: the global configuration node.
        in_channels (int): number of channels of the backbone feature maps;
            forwarded unchanged to RPNModule.
    Returns:
        RPNModule: the region proposal network module.
    """
    # RPNModule.__init__ takes (cfg, in_channels), and GeneralizedRCNN creates
    # the RPN with build_rpn(cfg, self.backbone.out_channels), so both the
    # name and the second argument are required here.
    return RPNModule(cfg, in_channels)

构建 RPN 网络的核心定义在 class RPNModule 中：

class RPNModule(torch.nn.Module):
    """
    Module for RPN computation. Takes feature maps from the backbone and
    outputs RPN proposals and losses. Works for both FPN and non-FPN.
    """

    def __init__(self, cfg, in_channels):
        super(RPNModule, self).__init__()

        self.cfg = cfg.clone()

        # Anchor generator configured from cfg.
        anchors_gen = make_anchor_generator(cfg)

        # Look up the head class in the registry and instantiate it; the
        # anchor count of the first entry is the one handed to the head.
        head_cls = registry.RPN_HEADS[cfg.MODEL.RPN.RPN_HEAD]
        anchors_per_location = anchors_gen.num_anchors_per_location()[0]
        rpn_head = head_cls(cfg, in_channels, anchors_per_location)

        # A single box coder (unit weights) is shared by both proposal
        # selectors and by the loss evaluator.
        coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        # One proposal post-processor per mode (training / testing).
        selector_train = make_rpn_postprocessor(cfg, coder, is_train=True)
        selector_test = make_rpn_postprocessor(cfg, coder, is_train=False)

        # Loss evaluator for the RPN outputs.
        rpn_loss = make_rpn_loss_evaluator(cfg, coder)

        # Expose everything as members.
        self.anchor_generator = anchors_gen
        self.head = rpn_head
        self.box_selector_train = selector_train
        self.box_selector_test = selector_test
        self.loss_evaluator = rpn_loss

    def forward(self, images, features, targets=None):
        """
        Run the RPN on a batch.

        Arguments:
            images (ImageList): images for which we want to compute the predictions
            features (list[Tensor]): features computed from the images that are
                used for computing the predictions. Each tensor in the list
                correspond to different feature levels
            targets (list[BoxList]): ground-truth boxes present in the image (optional)
        Returns:
            boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per
                image.
            losses (dict[Tensor]): the losses for the model during training. During
                testing, it is an empty dict.
        """
        # Head predictions for the given feature maps.
        objectness, rpn_box_regression = self.head(features)
        # Anchors generated for the images / feature maps.
        anchors = self.anchor_generator(images, features)

        # Dispatch on the module mode.
        if not self.training:
            return self._forward_test(anchors, objectness, rpn_box_regression)
        return self._forward_train(anchors, objectness, rpn_box_regression, targets)

    def _forward_train(self, anchors, objectness, rpn_box_regression, targets):
        """Training-mode forward: returns (boxes, dict of the two RPN losses)."""
        if not self.cfg.MODEL.RPN_ONLY:
            # For end-to-end models, anchors must be transformed into boxes and
            # sampled into a training batch. This runs under no_grad, so no
            # gradient flows through the proposal selection.
            with torch.no_grad():
                boxes = self.box_selector_train(
                    anchors, objectness, rpn_box_regression, targets
                )
        else:
            # When training an RPN-only model, the loss is determined by the
            # predicted objectness and rpn_box_regression values and there is
            # no need to transform the anchors into predicted boxes; this is an
            # optimization that avoids the unnecessary transformation.
            boxes = anchors

        loss_objectness, loss_rpn_box_reg = self.loss_evaluator(
            anchors, objectness, rpn_box_regression, targets
        )
        return boxes, {
            "loss_objectness": loss_objectness,
            "loss_rpn_box_reg": loss_rpn_box_reg,
        }

    def _forward_test(self, anchors, objectness, rpn_box_regression):
        """Inference-mode forward: returns (boxes, empty loss dict)."""
        # Turn the anchors into proposal boxes.
        boxes = self.box_selector_test(anchors, objectness, rpn_box_regression)
        if self.cfg.MODEL.RPN_ONLY:
            # For end-to-end models, the RPN proposals are an intermediate state
            # and don't bother to sort them in decreasing score order. For RPN-only
            # models, the proposals are the final output and we return them in
            # high-to-low confidence order.
            ranked = []
            for box in boxes:
                order = box.get_field("objectness").sort(descending=True)[1]
                ranked.append(box[order])
            boxes = ranked
        return boxes, {}

在 class RPNModule 中, 使用了 class RPNHead 作为其头部：

@registry.RPN_HEADS.register("SingleConvRPNHead")
class RPNHead(nn.Module):
    """
    Adds a simple RPN Head with classification and regression heads:
    a shared 3x3 convolution followed by two sibling 1x1 convolutions.
    """

    def __init__(self, cfg, in_channels, num_anchors):
        """
        Arguments:
            cfg              : config (not read by this head)
            in_channels (int): number of channels of the input feature
            num_anchors (int): number of anchors to be predicted
        """
        super(RPNHead, self).__init__()
        # Shared 3x3 conv; stride 1 / padding 1 and equal in/out channels.
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )
        # Objectness branch: one output channel per anchor.
        self.cls_logits = nn.Conv2d(
            in_channels, num_anchors, kernel_size=1, stride=1
        )
        # Box regression branch: four output channels per anchor.
        self.bbox_pred = nn.Conv2d(
            in_channels, num_anchors * 4, kernel_size=1, stride=1
        )

        # Re-initialise all three layers: N(0, 0.01) weights, zero biases.
        for layer in (self.conv, self.cls_logits, self.bbox_pred):
            torch.nn.init.normal_(layer.weight, std=0.01)
            torch.nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """
        Apply the head to every feature map in ``x``.

        Returns:
            (logits, bbox_reg): two lists with one tensor per input feature
            map, holding the objectness logits and the box regression output.
        """
        outputs = []
        for feature in x:
            # Shared conv + ReLU, then both sibling predictions on its output.
            hidden = F.relu(self.conv(feature))
            outputs.append((self.cls_logits(hidden), self.bbox_pred(hidden)))
        logits = [cls_out for cls_out, _ in outputs]
        bbox_reg = [reg_out for _, reg_out in outputs]
        return logits, bbox_reg

在定义 RPNModule 时, 分别使用了 make_anchor_generator(), make_rpn_postprocessor() 和 make_rpn_loss_evaluator() 函数来构建模型的 anchor_generator, box_selector 以及 loss_evaluator, 这三个函数分别定义在其他的三个文件中, 下面我们就根据函数的调用顺序, 对这几个文件展开解析.

anchor_generator.py 生成 anchors:

# ./maskrcnn_benchmark/modeling/rpn/anchor_generator.py

# 包的导入
from maskrcnn_benchmark.structures.bounding_box import BoxList
# ...

class BufferList(nn.Module):
    # Similar to nn.ParameterList, but for buffers (outline only; the
    # elided bodies are marked with "# ...")

    def __init__(self, buffers=None):
        # Constructor
        # ...

    def extend(self, buffers):
        # Extend the list with more buffers
        # ...

    def __len__(self):
        # Number of buffers held
        return len(self._buffers)

    def __iter__(self):
        # Iterator over the stored buffers
        return iter(self._buffers.values())

class AnchorGenerator(nn.Module):
    # For a given set of image sizes and feature maps, compute the
    # corresponding anchors (outline only)

    def __init__(...):
        # Constructor
        # ...

    def num_anchors_per_location(self):
        # Number of anchors per location: one entry per element of
        # self.cell_anchors
        return [len(cell_anchors) for cell_anchors in self.cell_anchors]

    def grid_anchors(self, grid_sizes):
        # Generate the anchors
        # ...

    def add_visibility_to(self, boxlist):
        # Marks whether each anchor is kept or discarded when it extends
        # beyond the image
        # ...

    def forward(self, image_list, feature_maps):
        # Forward pass
        # ...


def make_anchor_generator(config):
    # Create an AnchorGenerator instance from the configuration
    # ...

def generator_anchors(...):
    # Returns a matrix of anchor boxes for the given stride, sizes and
    # aspect_ratios parameters
    # NOTE(review): the full listing later in this file defines and calls
    # this function as generate_anchors; the name here looks like a typo.
    # ...

def _generate_anchors(base_size, scales, aspect_ratios):
    # Returns the anchor (reference) windows obtained by enumerating
    # aspect ratios and scales
    # ...

def _whctrs(anchor):
    # Returns the width, height and center coordinates of an anchor
    # ...

def _mkanchors(ws, hs, x_ctr, y_ctr):
    # Given widths and heights around a center, returns the
    # corresponding anchors
    # ...

make_anchor_generator() 函数：

def make_anchor_generator(config):
    """
    Build an AnchorGenerator from the ``MODEL.RPN`` section of ``config``.

    Reads ANCHOR_SIZES, ASPECT_RATIOS, ANCHOR_STRIDE and STRADDLE_THRESH and
    passes them, in that order, to AnchorGenerator.

    Raises:
        AssertionError: with USE_FPN set, if ANCHOR_STRIDE and ANCHOR_SIZES
            differ in length; without it, if ANCHOR_STRIDE does not have
            exactly one entry.
    """
    rpn_cfg = config.MODEL.RPN

    sizes = rpn_cfg.ANCHOR_SIZES
    ratios = rpn_cfg.ASPECT_RATIOS
    strides = rpn_cfg.ANCHOR_STRIDE
    # Threshold used by AnchorGenerator.add_visibility_to; a negative value
    # disables the visibility test there.
    thresh = rpn_cfg.STRADDLE_THRESH

    if rpn_cfg.USE_FPN:
        # With FPN the RPN strides and sizes must pair up one-to-one.
        assert len(strides) == len(
            sizes
        ), "FPN should have len(ANCHOR_STRIDE) == len(ANCHOR_SIZES)"
    else:
        assert len(strides) == 1, "Non-FPN should have a single ANCHOR_STRIDE"

    return AnchorGenerator(sizes, ratios, strides, thresh)

根据上面的函数我们知道, make_anchor_generator(config) 函数会根据对应的配置文件创建一个 AnchorGenerator 的实例, 因此, 我们下面就对 class AnchorGenerator(nn.Module) 类进行解析, 代码如下:

class AnchorGenerator(nn.Module):
    """
    For a set of image sizes and feature maps, computes a set
    of anchors
    """

    def __init__(
        self,
        sizes=(128, 256, 512),
        aspect_ratios=(0.5, 1.0, 2.0),
        anchor_strides=(8, 16, 32),
        straddle_thresh=0,
    ):
        """
        Precompute the per-cell base anchors.

        With a single stride, all ``sizes`` go to one generate_anchors call.
        Otherwise strides and sizes are paired one-to-one, and a scalar size
        is wrapped into a 1-tuple before the call.

        Raises:
            RuntimeError: several strides whose count differs from ``sizes``.
        """
        super(AnchorGenerator, self).__init__()

        if len(anchor_strides) == 1:
            # Single stride (the non-FPN case in make_anchor_generator).
            only_stride = anchor_strides[0]
            cell_anchors = [
                generate_anchors(only_stride, sizes, aspect_ratios).float()
            ]
        else:
            if len(anchor_strides) != len(sizes):
                raise RuntimeError("FPN should have #anchor_strides == #sizes")
            cell_anchors = []
            for level_stride, level_size in zip(anchor_strides, sizes):
                if not isinstance(level_size, (tuple, list)):
                    level_size = (level_size,)
                cell_anchors.append(
                    generate_anchors(level_stride, level_size, aspect_ratios).float()
                )

        self.strides = anchor_strides
        # Kept in a BufferList rather than a plain Python list.
        self.cell_anchors = BufferList(cell_anchors)
        self.straddle_thresh = straddle_thresh

    def num_anchors_per_location(self):
        """Return the number of base anchors of each entry of cell_anchors."""
        return list(map(len, self.cell_anchors))

    def grid_anchors(self, grid_sizes):
        """
        Tile the base anchors over each feature-map grid.

        Arguments:
            grid_sizes: one (height, width) pair per feature map.
        Returns:
            list[Tensor]: one (num_positions * num_base_anchors, 4) tensor
            per (grid size, stride, base anchors) triple.
        """
        per_level = []
        levels = zip(grid_sizes, self.strides, self.cell_anchors)
        for (rows, cols), stride, cell in levels:
            arange_opts = dict(step=stride, dtype=torch.float32, device=cell.device)
            # Offsets of each grid cell along x and y, spaced by the stride.
            xs = torch.arange(0, cols * stride, **arange_opts)
            ys = torch.arange(0, rows * stride, **arange_opts)
            grid_y, grid_x = torch.meshgrid(ys, xs)
            flat_x = grid_x.reshape(-1)
            flat_y = grid_y.reshape(-1)
            # The same (x, y) shift applies to both corners of an xyxy box.
            offsets = torch.stack((flat_x, flat_y, flat_x, flat_y), dim=1)

            # Broadcast: every offset combined with every base anchor.
            shifted = offsets.view(-1, 1, 4) + cell.view(1, -1, 4)
            per_level.append(shifted.reshape(-1, 4))

        return per_level

    def add_visibility_to(self, boxlist):
        """
        Add a boolean "visibility" field to ``boxlist``.

        With a non-negative straddle_thresh an anchor is visible when it
        lies inside the image enlarged by the threshold on every side;
        with a negative threshold every anchor is marked visible.
        """
        image_width, image_height = boxlist.size
        boxes = boxlist.bbox
        thresh = self.straddle_thresh
        if thresh >= 0:
            left_ok = boxes[..., 0] >= -thresh
            top_ok = boxes[..., 1] >= -thresh
            right_ok = boxes[..., 2] < image_width + thresh
            bottom_ok = boxes[..., 3] < image_height + thresh
            visible = left_ok & top_ok & right_ok & bottom_ok
        else:
            visible = torch.ones(
                boxes.shape[0], dtype=torch.bool, device=boxes.device
            )
        boxlist.add_field("visibility", visible)

    def forward(self, image_list, feature_maps):
        """
        Build the anchors for every image and feature map.

        Returns:
            list[list[BoxList]]: outer list over images, inner list over
            feature maps; each BoxList carries a "visibility" field.
        """
        grid_sizes = [fmap.shape[-2:] for fmap in feature_maps]
        level_anchors = self.grid_anchors(grid_sizes)

        anchors = []
        for image_height, image_width in image_list.image_sizes:
            per_image = []
            for level in level_anchors:
                # image_sizes entries are (height, width); BoxList is given
                # (width, height).
                boxlist = BoxList(level, (image_width, image_height), mode="xyxy")
                self.add_visibility_to(boxlist)
                per_image.append(boxlist)
            anchors.append(per_image)
        return anchors

在 class AnchorGenerator 中, 利用了 generate_anchors() 函数来生成对应的 anchors, 该函数是生成 anchors 的入口函数, 在生成 anchors 时, 需要进行一些计算和转换, 其大致流程和对应的实现函数如下所示:

获取生成 anchors 必要的参数, 包括: stride, sizes, 和 aspect_ratios, 其中, stride 代表特征图谱上的 anchors 的基础尺寸, sizes 代表 anchor 对应在原始图片中的大小(以像素为单位), 因此, 我们容易知道 anchor 在特征图谱上的放缩比例为 sizes/stride, aspect_ratios 代表 anchors 的高宽比, 于是, 最终返回的 anchors 的数量就是 sizes (在特征图谱上固定 base_window 的尺寸, 根据比例的不同来对应不同大小的物体)的数量和 aspect_ratios 数量的乘积;

在获取特征图谱上对应的 base_size(stride)后, 我们将其表示成 [x1, y1, x2, y2](坐标是相对于 anchor 的中心而言的) 的 box 形式. 例如对于 stride=4 的情况, 我们将其表示成 [0, 0, 3, 3], 此部分的实现位于 _generate_anchors(...) 函数中

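然后根据 aspect_ratios 的值来获取不同的 anchor boxes 的尺寸, 例如, 对于 stride=4 的 base_anchor 来说, 如果参数 aspect_ratios 为 [0.5, 1.0, 2.0], 那么它就应该返回面积近似不变, 但是高宽比分别为 [0.5, 1.0, 2.0] 的三个 box 的坐标, 也就是应该返回下面的 box 数组. 注意: 代码按 w = x2 - x1 + 1, h = y2 - y1 + 1 的像素计数约定来计算宽高, 因此这三个 box 的宽×高分别为 6×3, 4×4, 3×6, 高宽比恰好是 [0.5, 1, 2]; 如果直接用坐标差 (x2 - x1) : (y2 - y1) 来看, 得到的才是 [5/2, 1, 2/5]. 另外, 由于宽和高都经过 np.round 四舍五入取整, 三个 box 的面积 (18, 16, 18) 只是近似等于 base anchor 的面积 16. 这部分的实现位于 _ratio_enum() 函数中;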
[[-1. 0.5 4. 2.5] [ 0. 0. 3. 3. ] [ 0.5 -1. 2.5 4. ]]

在获取到不同比例的特征图谱上的 box 坐标以后, 我们就该利用 scales = sizes/stride 来将这些 box 坐标映射到原始图像中, 也就是按照对应的比例将这些 box 放大, 对于我们刚刚举的例子 scales = 32/4 = 8 来说, 最终的 box 的坐标如下所示. 这部分的代码实现位于 _scale_enum() 函数中.
[[-22., -10., 25., 13.], [-14., -14., 17., 17.], [-10., -22., 13., 25.]]

代码解释如下：

def generate_anchors(
    stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)
):
    """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors
    are centered on stride / 2, have (approximate) sqrt areas of the specified
    sizes, and aspect ratios as given.

    The sizes are divided by the stride to obtain the scales applied to the
    base window; for example stride=4, sizes=(32,), aspect_ratios=(0.5, 1, 2)
    yields the single scale 32 / 4 = 8. ``sizes`` must be a sequence (tuple
    or list) even when it holds one value.

    Returns:
        The result of ``_generate_anchors(stride, scales, aspect_ratios)``.
    """
    # np.float64 is spelled out: the ``np.float`` alias (plain Python float,
    # i.e. float64) was removed in NumPy 1.24 and raises AttributeError there.
    return _generate_anchors(
        stride,
        np.array(sizes, dtype=np.float64) / stride,
        np.array(aspect_ratios, dtype=np.float64),
    )


def _generate_anchors(base_size, scales, aspect_ratios):
    """Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, base_size - 1, base_size - 1) window.

    Worked example for base_size=4, scales=[8], aspect_ratios=[0.5, 1, 2]:
    the base window is [0, 0, 3, 3]; after the ratio enumeration the boxes are
        [[-1.   0.5  4.   2.5]
         [ 0.   0.   3.   3. ]
         [ 0.5 -1.   2.5  4. ]]
    and after scaling widths and heights by 8 they become
        [[-22., -10.,  25.,  13.],
         [-14., -14.,  17.,  17.],
         [-10., -22.,  13.,  25.]]

    Returns:
        torch.Tensor: shape (n, 4), one row per (aspect ratio, scale) pair.
    """
    # Base window in inclusive pixel coordinates. np.float64 replaces the
    # ``np.float`` alias, which was removed in NumPy 1.24.
    anchor = np.array([1, 1, base_size, base_size], dtype=np.float64) - 1
    # One box per aspect ratio, all derived from the base window.
    anchors = _ratio_enum(anchor, aspect_ratios)
    # Each ratio box is expanded to one box per scale; vstack merges the
    # per-box results into a single (n, 4) array.
    anchors = np.vstack([_scale_enum(box, scales) for box in anchors])
    return torch.from_numpy(anchors)

在上面的函数上, 分别使用了 _ratio_enum() 和 _scale_enum() 函数来实现高宽比和放缩比的组合, 下面, 我们就先对这两个函数进行解析:

def _ratio_enum(anchor, ratios):
    """Enumerate a set of anchors for each aspect ratio wrt an anchor.

    Example: anchor [0, 0, 3, 3] with ratios [0.5, 1, 2] gives widths
    [6, 4, 3] and heights [3, 4, 6] around the same center.
    """
    width, height, cx, cy = _whctrs(anchor)
    area = width * height
    # widths = round(sqrt(area / ratio)), heights = round(width * ratio), so
    # height / width approximates the ratio while the area stays roughly
    # constant (both are rounded to whole pixels).
    widths = np.round(np.sqrt(area / ratios))
    heights = np.round(widths * ratios)
    # Convert back to (x1, y1, x2, y2) boxes around the original center.
    return _mkanchors(widths, heights, cx, cy)


def _scale_enum(anchor, scales):
    """Enumerate a set of anchors for each scale wrt an anchor.

    Example: anchor [-1, 0.5, 4, 2.5] with scales [8] has its width and
    height multiplied by 8 while the center is kept.
    """
    width, height, cx, cy = _whctrs(anchor)
    # Scale both dimensions, then rebuild (x1, y1, x2, y2) boxes around the
    # unchanged center.
    return _mkanchors(width * scales, height * scales, cx, cy)

在 _ratio_enum() 和 _scale_enum() 函数中, 都使用了 _whctrs() 和 _mkanchors 函数, 前者可以根据 box 的坐标信息得到 box 的宽高以及中心点坐标, 后者则是根据宽高以及中心点坐标得到 box 的 (x1, y1, x2, y2) 形式, 这两个函数的代码解析如下所示：

def _whctrs(anchor):
    """Return width, height, x center, and y center for an anchor (window)."""
    # 根据左上角和右下角的坐标返回该 box 的宽高以及中心点坐标
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    """Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows).
    将给定的宽、高以及中心点坐标转化成(x1,y1,x2,y2)的坐标形式
    """
    # 这里新增加了一个维度，以便有 hstack 将结果叠加
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    # 将结果组合起来返回
    anchors = np.hstack(
        (
            x_ctr - 0.5 * (ws - 1),
            y_ctr - 0.5 * (hs - 1),
            x_ctr + 0.5 * (ws - 1),
            y_ctr + 0.5 * (hs - 1),
        )
    )
    return anchors

inference.py 文件解析

# ./maskrcnn_benchmark/modeling/rpn/inference.py

# 导入各种包及函数
from maskrcnn_benchmark.modeling.box_coder import BoxCoder

class RPNPostProcessor(torch.nn.Module):
    # Post-processes the boxes output by the RPN before the proposals are
    # fed to the heads (outline only; bodies are elided with "# ...")

    def __init__(...):
        # Constructor
        # ...

    def add_gt_proposals(self, proposals, targets):
        # ...

    def forward_for_single_feature_map(self, anchors, objectness, box_regression):
        # ...

    def forward(self, anchors, objectness, box_regression, targets=None):
        # ...

    def select_over_all_levels(self, boxlists):
        # ...

def make_rpn_postprocessor(config, rpn_box_coder, is_train):
    # Factory for RPNPostProcessor (outline only; body elided)
    # ...

make_rpn_postprocessor()入口函数：

def make_rpn_postprocessor(config, rpn_box_coder, is_train):
    """
    Create the RPNPostProcessor for training or for testing.

    Arguments:
        config: configuration node; only ``config.MODEL.RPN`` is read.
        rpn_box_coder: box coder instance handed to the post-processor.
        is_train (bool): selects the *_TRAIN or the *_TEST top-n limits.
    Returns:
        RPNPostProcessor
    """
    rpn_cfg = config.MODEL.RPN

    # Start from the training limits and override them for testing.
    limits = {
        "fpn_post_nms_top_n": rpn_cfg.FPN_POST_NMS_TOP_N_TRAIN,
        "pre_nms_top_n": rpn_cfg.PRE_NMS_TOP_N_TRAIN,
        "post_nms_top_n": rpn_cfg.POST_NMS_TOP_N_TRAIN,
    }
    if not is_train:
        limits.update(
            fpn_post_nms_top_n=rpn_cfg.FPN_POST_NMS_TOP_N_TEST,
            pre_nms_top_n=rpn_cfg.PRE_NMS_TOP_N_TEST,
            post_nms_top_n=rpn_cfg.POST_NMS_TOP_N_TEST,
        )

    # Settings that do not depend on the mode.
    return RPNPostProcessor(
        nms_thresh=rpn_cfg.NMS_THRESH,
        min_size=rpn_cfg.MIN_SIZE,
        box_coder=rpn_box_coder,
        fpn_post_nms_per_batch=rpn_cfg.FPN_POST_NMS_PER_BATCH,
        **limits
    )

RPNPostProcessor 类：
初始化函数：

class RPNPostProcessor(torch.nn.Module):
    """
    Performs post-processing on the outputs of the RPN boxes, before feeding the
    proposals to the heads
    """

    def __init__(
        self,
        pre_nms_top_n,
        post_nms_top_n,
        nms_thresh,
        min_size,
        box_coder=None,
        fpn_post_nms_top_n=None,
        fpn_post_nms_per_batch=True,
    ):
        """
        Arguments:
            pre_nms_top_n (int)
            post_nms_top_n (int)
            nms_thresh (float)
            min_size (int)
            box_coder (BoxCoder): defaults to a BoxCoder with unit weights
            fpn_post_nms_top_n (int): defaults to post_nms_top_n
            fpn_post_nms_per_batch (bool)
        """
        super(RPNPostProcessor, self).__init__()

        # Plain settings are stored as given.
        self.pre_nms_top_n = pre_nms_top_n
        self.post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        self.min_size = min_size

        # Fall back to a unit-weight coder when none is supplied.
        self.box_coder = (
            BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) if box_coder is None else box_coder
        )

        # The FPN limit falls back to the regular post-NMS limit.
        self.fpn_post_nms_top_n = (
            post_nms_top_n if fpn_post_nms_top_n is None else fpn_post_nms_top_n
        )
        self.fpn_post_nms_per_batch = fpn_post_nms_per_batch

添加真实候选框函数：

   def add_gt_proposals(self, proposals, targets):
        """
        Append the ground-truth boxes of each image to its proposals.

        Arguments:
            proposals: list[BoxList]
            targets: list[BoxList]
        Returns:
            list[BoxList]: one concatenated BoxList per (proposal, target) pair.
        """
        # Get the device we're operating on
        device = proposals[0].bbox.device

        gt_boxes = []
        for target in targets:
            # Copy of the target that carries none of its extra fields.
            gt_box = target.copy_with_fields([])
            # later cat of bbox requires all fields to be present for all bbox
            # so we need to add a dummy for objectness that's missing
            # (a vector of ones, one entry per ground-truth box)
            gt_box.add_field("objectness", torch.ones(len(gt_box), device=device))
            gt_boxes.append(gt_box)

        # Each zipped pair is the (proposal, gt_box) tuple cat_boxlist expects.
        return [cat_boxlist(pair) for pair in zip(proposals, gt_boxes)]

在单一的特征图谱上执行前向传播：

    def forward_for_single_feature_map(self, anchors, objectness, box_regression):
        """
        Turn the RPN outputs of one feature level into per-image proposals.

        Arguments:
            anchors: list[BoxList]
            objectness: tensor of size N, A, H, W
            box_regression: tensor of size N, A * 4, H, W
        Returns:
            list[BoxList]: one BoxList per image with an "objectness" field,
            after clipping, small-box removal and NMS.
        """
        device = objectness.device
        N, A, H, W = objectness.shape

        # put in the same format as anchors: flatten to one row of scores per
        # image, then squash the logits with a sigmoid
        scores = permute_and_flatten(objectness, N, A, 1, H, W).view(N, -1).sigmoid()
        # same layout change for the regression output
        # (presumably (N, H*W*A, 4) afterwards, verify against permute_and_flatten)
        deltas = permute_and_flatten(box_regression, N, A, 4, H, W)

        # Never ask topk for more entries than there are anchors.
        keep = min(self.pre_nms_top_n, A * H * W)
        scores, topk_idx = scores.topk(keep, dim=1, sorted=True)

        # Column vector [[0], [1], ..., [N-1]] so that the advanced indexing
        # below picks the top-k rows separately for every image.
        batch_idx = torch.arange(N, device=device)[:, None]
        deltas = deltas[batch_idx, topk_idx]

        # Size stored on each anchor BoxList.
        image_shapes = [box.size for box in anchors]
        # Stack all anchor boxes, regroup them per image and keep the same
        # top-k entries as for the scores.
        all_anchors = torch.cat([a.bbox for a in anchors], dim=0)
        all_anchors = all_anchors.reshape(N, -1, 4)[batch_idx, topk_idx]

        # Decode the regression output against the anchors into boxes.
        proposals = self.box_coder.decode(
            deltas.view(-1, 4), all_anchors.view(-1, 4)
        ).view(N, -1, 4)

        result = []
        for proposal, score, im_shape in zip(proposals, scores, image_shapes):
            boxlist = BoxList(proposal, im_shape, mode="xyxy")
            boxlist.add_field("objectness", score)
            # Clip to the image, drop boxes below min_size, then run NMS
            # keeping at most post_nms_top_n boxes.
            boxlist = boxlist.clip_to_image(remove_empty=False)
            boxlist = remove_small_boxes(boxlist, self.min_size)
            result.append(
                boxlist_nms(
                    boxlist,
                    self.nms_thresh,
                    max_proposals=self.post_nms_top_n,
                    score_field="objectness",
                )
            )
        return result

前向传播函数：

    def forward(self, anchors, objectness, box_regression, targets=None):
        """
        Arguments:
            anchors: list[list[BoxList]]
            objectness: list[tensor]
            box_regression: list[tensor]
        Returns:
            boxlists (list[BoxList]): the post-processed anchors, after
                applying box decoding and NMS
        """
        num_levels = len(objectness)
        # Transpose the nested anchor lists so that each entry lines up with
        # one element of objectness / box_regression.
        anchors_by_level = list(zip(*anchors))

        # Post-process each feature level separately.
        sampled_boxes = [
            self.forward_for_single_feature_map(a, o, b)
            for a, o, b in zip(anchors_by_level, objectness, box_regression)
        ]

        # Transpose back and concatenate the levels of each image.
        boxlists = [cat_boxlist(per_image) for per_image in zip(*sampled_boxes)]

        # With several levels, apply the cross-level top-n selection.
        if num_levels > 1:
            boxlists = self.select_over_all_levels(boxlists)

        # append ground-truth bboxes to proposals
        if self.training and targets is not None:
            boxlists = self.add_gt_proposals(boxlists, targets)

        return boxlists

在所有层次上进行选择：

    def select_over_all_levels(self, boxlists):
    # 在训练阶段和测试阶段的行为不同，在训练阶段，post_nms_top_n 是在所有的proposals 上进行的
    # 而在测试阶段，是在每一个图片的 proposals 上进行的
        num_images = len(boxlists)
        # different behavior during training and during testing:
        # during training, post_nms_top_n is over *all* the proposals combined, while
        # during testing, it is over the proposals for each image
        # NOTE: it should be per image, and not per batch. However, to be consistent 
        # with Detectron, the default is per batch (see Issue #672)
        if self.training and self.fpn_post_nms_per_batch:
        # 连接“objectness”
            objectness = torch.cat(
                [boxlist.get_field("objectness") for boxlist in boxlists], dim=0
            )
            # 获取box数量
            box_sizes = [len(boxlist) for boxlist in boxlists]
            # 防止 post_nms_top_n 超过 anchors 总数，产生错误
            post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness))
            # 获取 topk 的下标
            _, inds_sorted = torch.topk(objectness, post_nms_top_n, dim=0, sorted=True)
            inds_mask = torch.zeros_like(objectness, dtype=torch.bool)
            inds_mask[inds_sorted] = 1
            inds_mask = inds_mask.split(box_sizes)
            # 获取所有满足条件的box
            for i in range(num_images):
                boxlists[i] = boxlists[i][inds_mask[i]]
        else:
            for i in range(num_images):
                objectness = boxlists[i].get_field("objectness")
                post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness))
                _, inds_sorted = torch.topk(
                    objectness, post_nms_top_n, dim=0, sorted=True
                )
                boxlists[i] = boxlists[i][inds_sorted]
        return boxlists

loss.py 文件解析

make_rpn_loss_evaluator() 函数来创建 RPN 网络的损失函数评价器：

def make_rpn_loss_evaluator(cfg, box_coder):
    """
    Assemble the RPN loss evaluator from ``cfg.MODEL.RPN``.

    Arguments:
        cfg: configuration node.
        box_coder: box coder instance passed on to RPNLossComputation.
    Returns:
        RPNLossComputation
    """
    rpn_cfg = cfg.MODEL.RPN

    # Matcher built from the foreground / background IoU thresholds, with
    # low-quality matches allowed.
    anchor_matcher = Matcher(
        rpn_cfg.FG_IOU_THRESHOLD,
        rpn_cfg.BG_IOU_THRESHOLD,
        allow_low_quality_matches=True,
    )
    # Sampler built from the per-image batch size and positive fraction.
    sampler = BalancedPositiveNegativeSampler(
        rpn_cfg.BATCH_SIZE_PER_IMAGE, rpn_cfg.POSITIVE_FRACTION
    )
    return RPNLossComputation(
        anchor_matcher, sampler, box_coder, generate_rpn_labels
    )

RPNLossComputation 类的代码实现：

class RPNLossComputation(object):
    """
    This class computes the RPN loss.

    Calling an instance matches the ground-truth targets to the anchors,
    samples positive and negative anchors, and returns the objectness loss
    together with the box regression loss.
    """

    def __init__(self, proposal_matcher, fg_bg_sampler, box_coder,
                 generate_labels_func):
        """
        Arguments:
            proposal_matcher (Matcher): called on the target/anchor IoU matrix
                to obtain one matched index per anchor
            fg_bg_sampler (BalancedPositiveNegativeSampler): called on the
                per-image labels to select positive and negative anchors
            box_coder (BoxCoder): its ``encode`` method produces the
                regression targets
            generate_labels_func (callable): maps the matched targets of one
                image to the per-anchor labels
        """
        self.proposal_matcher = proposal_matcher
        self.fg_bg_sampler = fg_bg_sampler
        self.box_coder = box_coder
        # Target fields passed on to match_targets_to_anchors (none here).
        self.copied_fields = []
        self.generate_labels_func = generate_labels_func
        # Anchor categories whose label is overwritten with -1 in
        # prepare_targets (presumably ignored by the sampler -- confirm).
        self.discard_cases = ['not_visibility', 'between_thresholds']

    def match_targets_to_anchors(self, anchor, target, copied_fields=None):
        """
        Assign one ground-truth box to every anchor of a single image.

        Arguments:
            anchor (BoxList): anchors of one image
            target (BoxList): ground-truth boxes of the same image
            copied_fields (list[str], optional): target fields to keep on the
                result; ``None`` (the default) keeps no fields

        Returns:
            BoxList: the target indexed per anchor, carrying the extra field
            "matched_idxs" with the raw matcher output
        """
        # A None sentinel avoids sharing one mutable default list across calls.
        if copied_fields is None:
            copied_fields = []
        match_quality_matrix = boxlist_iou(target, anchor)
        matched_idxs = self.proposal_matcher(match_quality_matrix)
        # RPN doesn't need any fields from target
        # for creating the labels, so clear them all
        target = target.copy_with_fields(copied_fields)
        # get the targets corresponding GT for each anchor
        # NB: need to clamp the indices because we can have a single
        # GT in the image, and matched_idxs can be -2, which goes
        # out of bounds
        matched_targets = target[matched_idxs.clamp(min=0)]
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    def prepare_targets(self, anchors, targets):
        """
        Build per-image classification labels and regression targets.

        Arguments:
            anchors (list[BoxList]): anchors, one entry per image
            targets (list[BoxList]): ground-truth boxes, one entry per image

        Returns:
            labels (list[Tensor]): float32 labels per anchor; 0 marks
                background and -1 marks discarded anchors, other values come
                from ``generate_labels_func``
            regression_targets (list[Tensor]): output of ``box_coder.encode``
                for the matched ground-truth boxes and the anchors
        """
        labels = []
        regression_targets = []
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            matched_targets = self.match_targets_to_anchors(
                anchors_per_image, targets_per_image, self.copied_fields
            )

            matched_idxs = matched_targets.get_field("matched_idxs")
            labels_per_image = self.generate_labels_func(matched_targets)
            labels_per_image = labels_per_image.to(dtype=torch.float32)

            # Background (negative examples)
            bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
            labels_per_image[bg_indices] = 0

            # discard anchors that go out of the boundaries of the image
            if "not_visibility" in self.discard_cases:
                labels_per_image[~anchors_per_image.get_field("visibility")] = -1

            # discard indices that are between thresholds
            if "between_thresholds" in self.discard_cases:
                inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS
                labels_per_image[inds_to_discard] = -1

            # compute regression targets
            regression_targets_per_image = self.box_coder.encode(
                matched_targets.bbox, anchors_per_image.bbox
            )

            labels.append(labels_per_image)
            regression_targets.append(regression_targets_per_image)

        return labels, regression_targets

    def __call__(self, anchors, objectness, box_regression, targets):
        """
        Compute the RPN losses for one batch.

        Arguments:
            anchors (list[list[BoxList]])
            objectness (list[Tensor])
            box_regression (list[Tensor])
            targets (list[BoxList])
        Returns:
            objectness_loss (Tensor)
            box_loss (Tensor)
        """
        # Merge the BoxLists of each image into a single BoxList per image.
        anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors]
        labels, regression_targets = self.prepare_targets(anchors, targets)
        # The per-image sampler outputs are concatenated over the batch and
        # turned into flat indices of their non-zero entries.
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
        sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)

        sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)

        objectness, box_regression = \
                concat_box_prediction_layers(objectness, box_regression)

        objectness = objectness.squeeze()

        labels = torch.cat(labels, dim=0)
        regression_targets = torch.cat(regression_targets, dim=0)

        # Regression loss is summed over the positive samples only, then
        # divided by the total number of sampled anchors (positive + negative).
        box_loss = smooth_l1_loss(
            box_regression[sampled_pos_inds],
            regression_targets[sampled_pos_inds],
            beta=1.0 / 9,
            size_average=False,
        ) / (sampled_inds.numel())

        # Objectness loss is evaluated on every sampled anchor.
        objectness_loss = F.binary_cross_entropy_with_logits(
            objectness[sampled_inds], labels[sampled_inds]
        )

        return objectness_loss, box_loss

参考链接：
MaskrcnnBenchmark 源码解析