Computer Vision Series --- Object Detection in Practice + Theory (1): Faster R-CNN

Today I'm starting to learn object detection. All of my posts are just my own study notes; if anything is off, corrections from the experts are welcome.
Object detection is classification plus bounding-box localization. A detection framework is first trained, and the trained model is then used for inference; the training and inference pipelines are not the same, because parts used only during training are dropped at detection time. (The original post embeds a diagram here, drawn by an expert, illustrating the two pipelines; I'm still in the learning stage.)
Every image task is built on learning target features from a large amount of data. The Dataset holds the images the detector is trained on; features are extracted from each input image, and detection predicts a class label plus the coordinates of the enclosing box.

Dataset

The detector is trained on the PASCAL VOC 2007/2012 dataset.
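The dataset class below reads a ./pascal_voc_classes.json file mapping the 20 VOC class names to indices 1-20 (index 0 is reserved for background, which is why the model later uses num_classes=21). The file itself isn't shown in the post; a minimal sketch to generate it might look like this:

import json

# the 20 PASCAL VOC categories; index 0 is implicitly the background class
voc_classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",
               "bus", "car", "cat", "chair", "cow",
               "diningtable", "dog", "horse", "motorbike", "person",
               "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
class_dict = {name: i + 1 for i, name in enumerate(voc_classes)}
with open("pascal_voc_classes.json", "w") as f:
    json.dump(class_dict, f, indent=4)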

from torch.utils.data import Dataset
import os
import json
import torch
from PIL import Image
from lxml import etree


class VOCDataSet(Dataset):
    def __init__(self, voc_root, year="2012", transforms=None, txt_name: str = "train.txt"):
        assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
        self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
        self.img_root = os.path.join(self.root, "JPEGImages")
        self.annotations_root = os.path.join(self.root, "Annotations")

        txt_path = os.path.join(self.root, "ImageSets", "Main", txt_name)
        assert os.path.exists(txt_path), "not found {} file.".format(txt_name)
        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml")
                             for line in read.readlines() if len(line.strip()) > 0]
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)

        json_file = './pascal_voc_classes.json'
        assert os.path.exists(json_file), "{} file not exist.".format(json_file)
        json_file = open(json_file, 'r')
        self.class_dict = json.load(json_file)
        json_file.close()

        self.transforms = transforms

    def __len__(self):
        return len(self.xml_list)

    def __getitem__(self, idx):
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        img_path = os.path.join(self.img_root, data["filename"])
        image = Image.open(img_path)
        if image.format != "JPEG":
            raise ValueError("Image '{}' format not JPEG".format(img_path))

        boxes = []
        labels = []
        iscrowd = []
        assert "object" in data, "{} lack of object information.".format(xml_path)
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])
            # skip degenerate boxes whose width or height is not positive
            if xmax <= xmin or ymax <= ymin:
                print("warning: in '{}' xml, there are some bbox w/h<=0".format(xml_path))
                continue
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            if "difficult" in obj:
                iscrowd.append(int(obj["difficult"]))
            else:
                iscrowd.append(0)

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            image, target = self.transforms(image, target)
        return image, target

    def parse_xml_to_dict(self, xml):
        # leaf node: return its tag and text directly
        if len(xml) == 0:
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                # an image can contain multiple objects, so collect them in a list
                if child.tag not in result:
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}

    def coco_index(self, idx):
        # like __getitem__, but reads (height, width) from the annotation
        # without decoding the image; used by the aspect-ratio grouping code
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        data_height = int(data["size"]["height"])
        data_width = int(data["size"]["width"])

        boxes = []
        labels = []
        iscrowd = []
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            iscrowd.append(int(obj["difficult"]))

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        return (data_height, data_width), target

    @staticmethod
    def collate_fn(batch):
        return tuple(zip(*batch))
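A quick smoke test of the dataset class, assuming the VOCdevkit folder sits in the current directory (paths are illustrative):

from torch.utils.data import DataLoader

dataset = VOCDataSet("./", year="2012", transforms=None, txt_name="train.txt")
print(len(dataset))                      # number of training images, e.g. 5717
image, target = dataset[0]               # a PIL image and its annotation dict
print(target["boxes"], target["labels"])

# collate_fn keeps variable-sized images as tuples instead of stacking them
loader = DataLoader(dataset, batch_size=4, shuffle=True,
                    collate_fn=VOCDataSet.collate_fn)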

After the training images are read, they go through a transform step. We rewrite the transforms ourselves because, for detection, whatever is done to the image must also be done to the box coordinates: not only the image is transformed, the coordinates have to be transformed as well.

import random
from torchvision.transforms import functional as F


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target


class ToTensor(object):
    def __call__(self, image, target):
        image = F.to_tensor(image)
        return image, target


class RandomHorizontalFlip(object):
    def __init__(self, prob=0.5):
        self.prob = prob

    def __call__(self, image, target):
        if random.random() < self.prob:
            height, width = image.shape[-2:]
            image = image.flip(-1)  # flip the image horizontally
            bbox = target["boxes"]
            # mirror the x coordinates: new_xmin = width - old_xmax, new_xmax = width - old_xmin
            bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
            target["boxes"] = bbox
        return image, target
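A tiny sanity check of the box math (the numbers are made up for illustration): a box [50, 10, 150, 90] in a 400-pixel-wide image should become [250, 10, 350, 90] after a horizontal flip.

import torch

boxes = torch.tensor([[50., 10., 150., 90.]])   # [xmin, ymin, xmax, ymax]
width = 400
boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
print(boxes)  # tensor([[250.,  10., 350.,  90.]])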
         

Next comes feature extraction.

MobileNetV2 is used as the feature-extraction network. Faster R-CNN consists of three main parts: backbone, rpn, and roi_heads. The MobileNetV2 used here is a pretrained feature extractor; as a standalone network it is a classifier:

x = self.features(x)      # feature extraction
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.classifier(x)    # classification head (not used for detection)

Only the `features` part is used; its output is a C x M x M feature map. Anchors are generated on the M x M grid: M x M x k anchors, where k covers every scale/ratio combination at each location (9 in the original paper; 15 here, with 5 sizes and 3 ratios). The RPN head slides a 3 x 3 convolution over the feature map to predict, for each anchor, whether it is foreground or background, plus the relative offsets of the four box coordinates. Of all anchors predicted as foreground, the top 2000 are kept as proposals; for each proposal a 7 x 7 feature matrix is pooled and used to predict the class and the box coordinates.
The feature-extraction network is just a classification network, so I won't go through it again here.
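To make the shapes concrete, here is a small numeric sketch (the numbers come from the shapes printed further down in this post: a padded 800 x 1216 input batch and a stride-32 backbone):

# feature map: 1280 x 25 x 38 for an 800 x 1216 padded input (800/32 = 25, 1216/32 = 38)
num_sizes, num_ratios = 5, 3
anchors_per_cell = num_sizes * num_ratios      # 15 anchors at every feature-map cell
grid_h, grid_w = 25, 38
total_anchors = grid_h * grid_w * anchors_per_cell
print(total_anchors)                           # 14250 anchors per image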

Training the whole network

import os
import datetime

import torch
import torchvision

import transforms
from network_file import FasterRCNN, AnchorsGenerator
from backbone import MobileNetV2
from my_dataset import VOCDataSet
from train_utils import GroupedBatchSampler, create_aspect_ratio_groups
from train_utils import train_eval_utils as utils


def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device training.".format(device.type))

    result_file = "result{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    if not os.path.exists("save_weights"):
        os.makedirs("save_weights")

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = "./"
    aspect_ratio_group_factor = 3
    batch_size = 4

    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError("VOCdevkit does not in path:'{}'.".format(VOC_root))

    train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"], "train.txt")
    train_sampler = None
    if aspect_ratio_group_factor >= 0:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        group_ids = create_aspect_ratio_groups(train_dataset, k=aspect_ratio_group_factor)
        # so that all images in a batch come from the same aspect-ratio group
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size)

    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)

    # ... main() continues below (dataloader, model construction, training loop)


if __name__ == "__main__":
    main()
        

This helper buckets the training images by aspect-ratio interval and returns, for each image, the index of its bucket:

import numpy as np


def create_aspect_ratio_groups(dataset, k=0):
    # compute the aspect ratio of every training image (5717 images in VOC2012 train)
    aspect_ratios = compute_aspect_ratios(dataset)
    # split the [0.5, 2] interval into 2k parts (2k + 1 points); for k=3:
    # Using [0, 0.5, 0.6299605249474366, 0.7937005259840997, 1.0, 1.2599210498948732,
    #        1.5874010519681994, 2.0, inf] as bins for aspect ratio quantization
    bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0]
    # for every image, the index of the bin its aspect ratio falls into
    groups = _quantize(aspect_ratios, bins)
    # how many images fall into each bin, e.g.
    # Count of instances per bin: [   5   25  929  117  260 4198  135   48]
    counts = np.unique(groups, return_counts=True)[1]
    return groups
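`compute_aspect_ratios` and `_quantize` are not shown in the post; a minimal sketch of how they can be implemented (using `coco_index` from the dataset class, so no image file has to be decoded):

import bisect


def compute_aspect_ratios(dataset):
    # width/height comes straight from the XML annotation via coco_index, so this is cheap
    ratios = []
    for i in range(len(dataset)):
        (height, width), _ = dataset.coco_index(i)
        ratios.append(float(width) / float(height))
    return ratios


def _quantize(x, bins):
    # map each value to the index of the first bin edge greater than it
    bins = sorted(bins.copy())
    return list(map(lambda y: bisect.bisect_right(bins, y), x))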
    
    

If images are sampled by aspect-ratio group, the DataLoader takes the batch_sampler; otherwise an ordinary shuffled loader is used:

    if train_sampler:
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_sampler=train_batch_sampler,
                                                        num_workers=nw,
                                                        collate_fn=train_dataset.collate_fn)
    else:
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=batch_size,
                                                        shuffle=True,
                                                        pin_memory=True,
                                                        num_workers=nw,
                                                        collate_fn=train_dataset.collate_fn)

    model = create_model(num_classes=21)

With the data pipeline in place, we build the network.

def create_model(num_classes):
    # take only the `features` part of a pretrained MobileNetV2 classifier
    backbone = MobileNetV2(weight_path="./backbone/mobilenet_v2.pth").features
    
Sequential(
  (0): ConvBNReLU(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU6(inplace=True)
  )
  (1): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (2): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(96, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (3): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(144, 144, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=144, bias=False)
        (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(144, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (4): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(144, 144, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=144, bias=False)
        (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(144, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (5): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (6): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
        (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (7): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
        (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (8): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (9): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (10): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (11): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
        (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (12): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576, bias=False)
        (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (13): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576, bias=False)
        (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (14): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(576, 576, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=576, bias=False)
        (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(576, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (15): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(960, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (16): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(960, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (17): InvertedResidual(
    (conv): Sequential(
      (0): ConvBNReLU(
        (0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): ConvBNReLU(
        (0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
        (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (2): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (3): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (18): ConvBNReLU(
    (0): Conv2d(320, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (1): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU6(inplace=True)
  )
)
The `features` module above contains 19 blocks (indices 0 through 18). The final ConvBNReLU uses a 1 x 1 convolution and outputs 1280 channels, so Faster R-CNN is told the backbone's output depth:

backbone.out_channels = 1280
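A quick shape check (illustrative; 800 x 1216 matches the padded batch size printed later in this post):

import torch

with torch.no_grad():
    feat = backbone(torch.rand(1, 3, 800, 1216))
print(feat.shape)  # torch.Size([1, 1280, 25, 38]) -> a stride-32 feature map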


The backbone is now ready.
Next we generate anchors; an anchor is just a set of box coordinates.
The AnchorsGenerator produces anchors at the given scales and aspect ratios on the feature map:

anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
                                    aspect_ratios=((0.5, 1.0, 2.0),))

class AnchorsGenerator(nn.Module):
    __annotations__ = {
        "cell_anchors": Optional[List[torch.Tensor]],
        "_cache": Dict[str, List[torch.Tensor]]
    }

    # generates anchors on the feature map
    def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
        super(AnchorsGenerator, self).__init__()
        if not isinstance(sizes[0], (list, tuple)):
            sizes = tuple((s,) for s in sizes)
        if not isinstance(aspect_ratios[0], (list, tuple)):
            aspect_ratios = (aspect_ratios,) * len(sizes)
        assert len(sizes) == len(aspect_ratios)

        self.sizes = sizes
        self.aspect_ratios = aspect_ratios
        self.cell_anchors = None
        self._cache = {}
        # __init__ only stores the configuration; anchors are built in forward
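The RPN construction further down calls num_anchors_per_location, which the post doesn't show; in the torchvision-style implementation it is simply the per-level product of sizes and ratios (sketch):

    def num_anchors_per_location(self):
        # e.g. 5 sizes x 3 ratios = 15 anchors at every feature-map cell
        return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]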
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=[7, 7],
                                                sampling_ratio=2)
# pools a fixed 7x7 feature matrix for every proposal produced by the RPN
model = FasterRCNN(backbone=backbone,
                   num_classes=num_classes,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)

class FasterRCNN(FasterRCNNBase):
    def __init__(self, backbone, num_classes=None,
                 # transform parameters
                 min_size=800, max_size=1333,
                 image_mean=None, image_std=None,
                 # RPN parameters
                 rpn_anchor_generator=None, rpn_head=None,
                 rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,    # proposals kept before NMS in the RPN (sorted by score)
                 rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,  # proposals kept after NMS in the RPN
                 rpn_nms_thresh=0.7,  # IoU threshold used by NMS inside the RPN
                 rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,  # IoU thresholds for sampling positive/negative anchors in the RPN loss
                 rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,  # anchors sampled per image for the RPN loss, and the positive fraction
                 rpn_score_thresh=0.0,
                 # Box parameters
                 box_roi_pool=None, box_head=None, box_predictor=None,
                 # drop low-score detections; NMS threshold in Fast R-CNN; keep the top 100 detections by score
                 box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
                 box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,   # IoU thresholds for sampling positive/negative proposals in the Fast R-CNN loss
                 box_batch_size_per_image=512, box_positive_fraction=0.25,  # proposals sampled per image for the Fast R-CNN loss, and the positive fraction
                 bbox_reg_weights=None):
        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)"
            )

        assert isinstance(rpn_anchor_generator, (AnchorsGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError("num_classes should be None when box_predictor "
                                 "is specified")
        else:
            if box_predictor is None:
                raise ValueError("num_classes should not be None when box_predictor "
                                 "is not specified")

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            rpn_anchor_generator = AnchorsGenerator(anchor_sizes, aspect_ratios)
        # build the RPN head that slides over the feature map
        if rpn_head is None:
            rpn_head = RPNHead(out_channels, rpn_anchor_generator.num_anchors_per_location()[0])


class RPNHead(nn.Module):
    # the "sliding window" is really a convolution that produces objectness scores and regression parameters
    def __init__(self, in_channels, num_anchors):  # here: 1280, 15
        super(RPNHead, self).__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)

        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)
  
        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)
        # number of anchors kept before NMS, and kept after NMS, per mode:
        # rpn_pre_nms_top_n = {'training': 2000, 'testing': 1000}
        rpn = RegionProposalNetwork(
            rpn_anchor_generator, rpn_head,
            rpn_fg_iou_thresh, rpn_bg_iou_thresh,
            rpn_batch_size_per_image, rpn_positive_fraction,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh,
            score_thresh=rpn_score_thresh)


class RegionProposalNetwork(torch.nn.Module):
    __annotations__ = {
        'box_coder': det_utils.BoxCoder,
        'proposal_matcher': det_utils.Matcher,
        'fg_bg_sampler': det_utils.BalancedPositiveNegativeSampler,
        'pre_nms_top_n': Dict[str, int],
        'post_nms_top_n': Dict[str, int],
    }

    def __init__(self, anchor_generator, head,
                 fg_iou_thresh, bg_iou_thresh,
                 batch_size_per_image, positive_fraction,
                 pre_nms_top_n, post_nms_top_n, nms_thresh, score_thresh=0.0):
        # batch_size_per_image=256, positive_fraction=0.5
        # fg_iou_thresh=0.7, bg_iou_thresh=0.3: an anchor whose IoU with a ground-truth box
        # is above 0.7 counts as a positive sample, below 0.3 as a negative sample
        # pre_nms_top_n / post_nms_top_n: samples kept before / after non-maximum suppression
        super(RegionProposalNetwork, self).__init__()
        self.anchor_generator = anchor_generator
        self.head = head
        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        # computes the IoU between anchors and ground-truth boxes
        self.box_similarity = box_ops.box_iou

        self.proposal_matcher = det_utils.Matcher(
            fg_iou_thresh,  # IoU above fg_iou_thresh (0.7) -> positive sample
            bg_iou_thresh,  # IoU below bg_iou_thresh (0.3) -> negative sample
            allow_low_quality_matches=True
        )

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
            batch_size_per_image, positive_fraction  # 256, 0.5
        )

        # used during testing
        self._pre_nms_top_n = pre_nms_top_n
        self._post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        self.score_thresh = score_thresh
        if box_head is None:
            resolution = box_roi_pool.output_size[0]  # RoI pooling output is 7x7
            # followed by two fully connected layers
            representation_size = 1024
            box_head = TwoMLPHead(out_channels * resolution ** 2, representation_size)
class TwoMLPHead(nn.Module):
    def __init__(self, in_channels, representation_size):
        # in_channels = 1280 * 7 * 7 = 62720, representation_size = 1024
        super(TwoMLPHead, self).__init__()
        self.fc6 = nn.Linear(in_channels, representation_size)
        self.fc7 = nn.Linear(representation_size, representation_size)
        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(representation_size, num_classes)


class FastRCNNPredictor(nn.Module):
    def __init__(self, in_channels, num_classes):
        # in_channels = 1024, num_classes = 21
        super(FastRCNNPredictor, self).__init__()
        self.cls_score = nn.Linear(in_channels, num_classes)
        self.bbox_pred = nn.Linear(in_channels, num_classes * 4)
        roi_heads = RoIHeads(
            # box
            box_roi_pool, box_head, box_predictor,
            box_fg_iou_thresh, box_bg_iou_thresh,  # 0.5  0.5
            box_batch_size_per_image, box_positive_fraction,  # 512  0.25
            bbox_reg_weights,
            box_score_thresh, box_nms_thresh, box_detections_per_img)  # 0.05  0.5  100


class RoIHeads(torch.nn.Module):
    __annotations__ = {
        'box_coder': det_utils.BoxCoder,
        'proposal_matcher': det_utils.Matcher,
        'fg_bg_sampler': det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(self,
                 box_roi_pool,   # Multi-scale RoIAlign pooling
                 box_head,       # TwoMLPHead
                 box_predictor,  # FastRCNNPredictor
                 # Faster R-CNN training
                 fg_iou_thresh, bg_iou_thresh,  # default: 0.5, 0.5
                 batch_size_per_image, positive_fraction,  # default: 512, 0.25
                 bbox_reg_weights,  # None
                 # Faster R-CNN inference
                 score_thresh,        # default: 0.05
                 nms_thresh,          # default: 0.5
                 detection_per_img):  # default: 100
        super(RoIHeads, self).__init__()

        self.box_similarity = box_ops.box_iou
        self.proposal_matcher = det_utils.Matcher(
            fg_iou_thresh,  # default: 0.5
            bg_iou_thresh,  # default: 0.5
            allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
            batch_size_per_image,  # default: 512
            positive_fraction)     # default: 0.25

        if bbox_reg_weights is None:
            bbox_reg_weights = (10., 10., 5., 5.)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = box_roi_pool    # Multi-scale RoIAlign pooling
        self.box_head = box_head            # TwoMLPHead
        self.box_predictor = box_predictor  # FastRCNNPredictor

        self.score_thresh = score_thresh  # default: 0.05
        self.nms_thresh = nms_thresh      # default: 0.5
        self.detection_per_img = detection_per_img  # default: 100

# back in FasterRCNN.__init__:
        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]

        # normalizes, resizes and batches the input images (and adjusts the targets accordingly)
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

        super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
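What GeneralizedRCNNTransform does can be summarized in a short sketch (a simplification, assuming behavior matching torchvision; not the actual class):

import torch


def normalize(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    # per-channel standardization with the ImageNet statistics
    mean = torch.as_tensor(mean).view(-1, 1, 1)
    std = torch.as_tensor(std).view(-1, 1, 1)
    return (image - mean) / std


def resize_scale(h, w, min_size=800, max_size=1333):
    # scale the short side to min_size unless the long side would exceed max_size;
    # the same factor is applied to the target boxes
    scale = min_size / min(h, w)
    if max(h, w) * scale > max_size:
        scale = max_size / max(h, w)
    return scale

After resizing, the images of a batch are padded to a common size and stacked into one tensor, which is why the batch shown later comes out as 800 x 1216.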

The full network structure:


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
          (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(96, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (3): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(144, 144, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=144, bias=False)
          (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(144, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (4): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(144, 144, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=144, bias=False)
          (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(144, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (5): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (6): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
          (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (7): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
          (1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (8): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (9): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (10): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (11): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
          (1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (12): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576, bias=False)
          (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (13): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576, bias=False)
          (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (14): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(576, 576, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=576, bias=False)
          (1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(576, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (15): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
          (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(960, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (16): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
          (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(960, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (17): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): ConvBNReLU(
          (0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
          (1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (2): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (18): ConvBNReLU(
      (0): Conv2d(320, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
  )
  (rpn): RegionProposalNetwork(
    (anchor_generator): AnchorsGenerator()
    (head): RPNHead(
      (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cls_logits): Conv2d(1280, 15, kernel_size=(1, 1), stride=(1, 1))
      (bbox_pred): Conv2d(1280, 60, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (roi_heads): RoIHeads(
    (box_roi_pool): MultiScaleRoIAlign(featmap_names=['0'], output_size=(7, 7), sampling_ratio=2)
    (box_head): TwoMLPHead(
      (fc6): Linear(in_features=62720, out_features=1024, bias=True)
      (fc7): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (box_predictor): FastRCNNPredictor(
      (cls_score): Linear(in_features=1024, out_features=21, bias=True)
      (bbox_pred): Linear(in_features=1024, out_features=84, bias=True)
    )
  )
)
# 21 classes in total (20 VOC categories + background); hence bbox_pred has 21 * 4 = 84 outputs

Now we start training the network.

    model.to(device)

    train_loss = []
    learning_rate = []
    val_map = []

    # first freeze the backbone weights and train only the RPN and the final prediction head
    for param in model.backbone.parameters():
        param.requires_grad = False

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

    init_epochs = 1
    for epoch in range(init_epochs):
        mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                              device, epoch, print_freq=50, warmup=True)
Stepping into the training function:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=50, warmup=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup is True:  # in the first epoch (epoch=0), ramp the learning rate up with warmup training
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    mloss = torch.zeros(1).to(device)  # mean losses
    enable_amp = True if "cuda" in device.type else False
    for i, [images, targets] in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        with torch.cuda.amp.autocast(enabled=enable_amp):
            loss_dict = model(images, targets)
Calling the model steps into FasterRCNNBase.forward, the training-mode forward pass:
class FasterRCNNBase(nn.Module):
    def forward(self, images, targets=None):
        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")
        if self.training:
            assert targets is not None
            for target in targets:
                boxes = target["boxes"]  # e.g. tensor([[  4.,   4., 438., 375.]])
                if isinstance(boxes, torch.Tensor):
                    if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
                        raise ValueError("Expected target boxes to be a tensor "
                                         "of shape [N, 4], got {:}.".format(boxes.shape))

        original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], [])
        for img in images:
            val = img.shape[-2:]
            assert len(val) == 2  # guard against 1-D inputs
            original_image_sizes.append((val[0], val[1]))
            # e.g. [(375, 500), (333, 500), (385, 500), (375, 500)] with batch size 4

        images, targets = self.transform(images, targets)  # normalize, resize, batch
        features = self.backbone(images.tensors)  # run the batch through the backbone to get the feature map
        # torch.Size([4, 1280, 25, 38])
        proposals, proposal_losses = self.rpn(images, features, targets)
# --- stepping into RegionProposalNetwork.forward ---
    def forward(self,
                images,        # type: ImageList
                features,      # type: Dict[str, Tensor]
                targets=None   # type: Optional[List[Dict[str, Tensor]]]
                ):
        features = list(features.values())
        # objectness: torch.Size([4, 15, 25, 38]); pred_bbox_deltas: torch.Size([4, 60, 25, 38])
        objectness, pred_bbox_deltas = self.head(features)
# --- RPNHead, the self.head above ---
    def __init__(self, in_channels, num_anchors):
        super(RPNHead, self).__init__()
        # 3x3 sliding window
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        # predicted objectness score (foreground vs. background only)
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        # predicted bbox regression parameters
        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)

        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        # type: (List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        logits = []
        bbox_reg = []
        for i, feature in enumerate(x):  # only one feature map here, so this loops once
            t = F.relu(self.conv(feature))
            logits.append(self.cls_logits(t))   # objectness scores for every anchor
            bbox_reg.append(self.bbox_pred(t))  # regression parameters for every anchor
        return logits, bbox_reg
# --- back in RegionProposalNetwork.forward: generate the anchors ---
        anchors = self.anchor_generator(images, features)
# --- AnchorsGenerator.forward ---
    def forward(self, image_list, feature_maps):
        # type: (ImageList, List[Tensor]) -> List[Tensor]
        # size (height, width) of every prediction feature map
        grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
        # torch.Size([25, 38])

        # height and width of the padded input batch
        image_size = image_list.tensors.shape[-2:]
        # e.g. torch.Size([800, 1216])

        dtype, device = feature_maps[0].dtype, feature_maps[0].device

        # one step in the feature map equals an n-pixel stride in the original image
        strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
                    torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
        # [[tensor(32), tensor(32)]]
# --- AnchorsGenerator.generate_anchors: build the anchor templates ---
    def generate_anchors(self, scales, aspect_ratios, dtype=torch.float32, device=torch.device("cpu")):
        # type: (List[int], List[float], torch.dtype, torch.device) -> Tensor
        """
        compute anchor sizes
        Arguments:
            scales: sqrt(anchor_area)
            aspect_ratios: h/w ratios
            dtype: float32
            device: cpu/gpu
        """
        scales = torch.as_tensor(scales, dtype=dtype, device=device)
        # tensor([ 32.,  64., 128., 256., 512.])
        aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
        # tensor([0.5000, 1.0000, 2.0000])
        h_ratios = torch.sqrt(aspect_ratios)  # tensor([0.7071, 1.0000, 1.4142])
        w_ratios = 1.0 / h_ratios             # tensor([1.4142, 1.0000, 0.7071])

        # every (ratio, scale) combination, flattened: 3 * 5 = 15 widths and heights
        ws = (w_ratios[:, None] * scales[None, :]).view(-1)
        hs = (h_ratios[:, None] * scales[None, :]).view(-1)
        # ws: tensor([ 45.2548,  90.5097, 181.0193, 362.0387, 724.0773,  32.0000,  64.0000,
        #             128.0000, 256.0000, 512.0000,  22.6274,  45.2548,  90.5097, 181.0193,
        #             362.0387])
        # hs: tensor([ 22.6274,  45.2548,  90.5097, 181.0193, 362.0387,  32.0000,  64.0000,
        #             128.0000, 256.0000, 512.0000,  45.2548,  90.5097, 181.0193, 362.0387,
        #             724.0773])

        # left-top, right-bottom coordinates relative to the anchor center (0, 0)
        # all anchor templates are centered at (0, 0), shape [len(ratios) * len(scales), 4]
        base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
        # tensor([[ -22.6274,  -11.3137,   22.6274,   11.3137],
        #         [ -45.2548,  -22.6274,   45.2548,   22.6274],
        #         [ -90.5097,  -45.2548,   90.5097,   45.2548],
        #         [-181.0193,  -90.5097,  181.0193,   90.5097],
        #         [-362.0387, -181.0193,  362.0387,  181.0193],
        #         [ -16.0000,  -16.0000,   16.0000,   16.0000],
        #         [ -32.0000,  -32.0000,   32.0000,   32.0000],
        #         [ -64.0000,  -64.0000,   64.0000,   64.0000],
        #         [-128.0000, -128.0000,  128.0000,  128.0000],
        #         [-256.0000, -256.0000,  256.0000,  256.0000],
        #         [ -11.3137,  -22.6274,   11.3137,   22.6274],
        #         [ -22.6274,  -45.2548,   22.6274,   45.2548],
        #         [ -45.2548,  -90.5097,   45.2548,   90.5097],
        #         [ -90.5097, -181.0193,   90.5097,  181.0193],
        #         [-181.0193, -362.0387,  181.0193,  362.0387]])
        return base_anchors.round()
# --- AnchorsGenerator.grid_anchors: slide the templates over the grid ---
        # map every feature-map cell back to its coordinates on the original image
        shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
        # tensor([[   0.,    0.,    0.,    0.],
        #         [  32.,    0.,   32.,    0.],
        #         [  64.,    0.,   64.,    0.],
        #         ...,
        #         [1120.,  768., 1120.,  768.],
        #         [1152.,  768., 1152.,  768.],
        #         [1184.,  768., 1184.,  768.]])

        # add every template to every shift: torch.Size([950, 15, 4]); 25 * 38 = 950 cells, 15 anchors each
        shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
        anchors.append(shifts_anchor.reshape(-1, 4))
        # torch.Size([14250, 4])

        return anchors  # List[Tensor(all_num_anchors, 4)]
# --- back in RegionProposalNetwork.forward ---
        num_images = len(anchors)

        # numel() returns the total number of elements in the input tensor
        # count the anchors on every prediction feature map
        num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
        # [torch.Size([15, 25, 38])]
        num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
        # [14250]

        # flatten the per-level predictions: 4 images x 14250 anchors = 57000 scores and 57000 x 4 deltas
        objectness, pred_bbox_deltas = concat_box_prediction_layers(objectness, pred_bbox_deltas)

        # apply the predicted bbox regression parameters to the anchors to get the predicted boxes
        proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
        # torch.Size([57000, 1, 4])

        # filter out tiny boxes, apply NMS, and keep the top post_nms_top_n proposals by predicted score
        boxes, scores = self.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)
        # roughly 2000 proposals are kept per image
---------------------------
    def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
        # type: (Tensor, Tensor, List[Tuple[int, int]], List[int]) -> Tuple[List[Tensor], List[Tensor]]
        """
        Filter out small boxes, apply NMS, and keep the top post_nms_top_n proposals by predicted score.
        Args:
            proposals: predicted bbox coordinates
            objectness: predicted objectness scores (57000 here)
            image_shapes: the size of every image in the batch
            num_anchors_per_level: the number of anchors on each prediction feature map (14250 here)

        Returns:

        """
        num_images = proposals.shape[0]  # proposals: [4, 14250, 4]
        device = proposals.device

        # do not backprop through objectness
        objectness = objectness.detach()
        objectness = objectness.reshape(num_images, -1)

        # torch.full returns a tensor of the given size filled with fill_value;
        # `levels` records which prediction feature map each anchor comes from
        levels = [torch.full((n, ), idx, dtype=torch.int64, device=device)
                  for idx, n in enumerate(num_anchors_per_level)]
        levels = torch.cat(levels, 0)

        # Expand this tensor to the same size as objectness
        levels = levels.reshape(1, -1).expand_as(objectness)

        # select the top_n boxes independently per level before applying NMS
        # indices of the pre_nms_top_n highest-scoring anchors on each prediction feature map
        top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)

        image_range = torch.arange(num_images, device=device)
        batch_idx = image_range[:, None]  # [batch_size, 1]

        # gather the scores of the top-pre_nms_top_n anchors of each prediction feature map
        objectness = objectness[batch_idx, top_n_idx]
        levels = levels[batch_idx, top_n_idx]
        # gather the corresponding bbox coordinates
        proposals = proposals[batch_idx, top_n_idx]

        objectness_prob = torch.sigmoid(objectness)

        final_boxes = []
        final_scores = []
        # loop over the predictions of each image
        for boxes, scores, lvl, img_shape in zip(proposals, objectness_prob, levels, image_shapes):
            # clip predicted boxes that cross the image boundary back onto the image
            boxes = box_ops.clip_boxes_to_image(boxes, img_shape)

            # keep only boxes whose width and height are both larger than min_size
            keep = box_ops.remove_small_boxes(boxes, self.min_size)
            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]

            # drop low-scoring boxes, see
            # https://github.com/pytorch/vision/pull/3205
            keep = torch.where(torch.ge(scores, self.score_thresh))[0]  # ge: >=
            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]

            # non-maximum suppression, independently done per level
            keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)

            # keep only topk scoring predictions
            keep = keep[: self.post_nms_top_n()]
            boxes, scores = boxes[keep], scores[keep]

            final_boxes.append(boxes)
            final_scores.append(scores)
        return final_boxes, final_scores
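`_get_top_n_idx`, used above, is not shown in the post; a sketch along the lines of the torchvision implementation:

    def _get_top_n_idx(self, objectness, num_anchors_per_level):
        # type: (Tensor, List[int]) -> Tensor
        r = []       # top-n indices per level
        offset = 0   # anchors are concatenated across levels, so offset the indices
        for ob in objectness.split(num_anchors_per_level, 1):
            num_anchors = ob.shape[1]
            pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
            _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
            r.append(top_n_idx + offset)
            offset += num_anchors
        return torch.cat(r, dim=1)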
        ---------
        losses = {}
        if self.training:
            assert targets is not None
            # match every anchor with its best ground-truth box and label it foreground, background, or discarded
            labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
            # from the anchors and their matched gt boxes, compute the regression targets
            regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
            loss_objectness, loss_rpn_box_reg = self.compute_loss(
                objectness, pred_bbox_deltas, labels, regression_targets
            )
            losses = {
                "loss_objectness": loss_objectness,
                "loss_rpn_box_reg": loss_rpn_box_reg
            }
        return boxes, losses
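`compute_loss` is not expanded in the post; roughly what it does in the torchvision-style code (binary cross-entropy on the sampled anchors for objectness, smooth-L1 on the positive ones for regression):

    def compute_loss(self, objectness, pred_bbox_deltas, labels, regression_targets):
        # sample 256 anchors per image at a 1:1 positive/negative ratio
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        sampled_pos_inds = torch.where(torch.cat(sampled_pos_inds, dim=0))[0]
        sampled_neg_inds = torch.where(torch.cat(sampled_neg_inds, dim=0))[0]
        sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)

        objectness = objectness.flatten()
        labels = torch.cat(labels, dim=0)
        regression_targets = torch.cat(regression_targets, dim=0)

        # box regression loss, computed on positive samples only
        box_loss = det_utils.smooth_l1_loss(
            pred_bbox_deltas[sampled_pos_inds],
            regression_targets[sampled_pos_inds],
            beta=1 / 9,
            size_average=False,
        ) / (sampled_inds.numel())

        # objectness loss on all sampled anchors
        objectness_loss = F.binary_cross_entropy_with_logits(
            objectness[sampled_inds], labels[sampled_inds]
        )
        return objectness_loss, box_loss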
        -----
# --- back in FasterRCNNBase.forward: the proposals go into the RoI heads ---
        proposals, proposal_losses = self.rpn(images, features, targets)
        detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)

# inside RoIHeads.forward: pool a 7x7 feature matrix for every sampled proposal
        box_features = self.box_roi_pool(features, proposals, image_shapes)
        # torch.Size([2048, 1280, 7, 7]) -> 4 images x 512 sampled proposals

# TwoMLPHead.forward: flatten and run the two fully connected layers
    def forward(self, x):
        x = x.flatten(start_dim=1)

        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))

        return x
        # back in train_one_epoch: sum the loss dict and take an SGD step
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()





To summarize: detection first feeds the training images through the feature-extraction network, producing a feature map (4 x 1280 x 25 x 38 here). Anchors are generated on this feature layer and labeled by matching them against the ground-truth boxes. A 3 x 3 convolution slides over the feature map to produce per-anchor objectness scores and regression offsets, and the top 2000 proposals are kept. Each proposal's region of the feature map is pooled to a fixed size, flattened, and passed through fully connected layers to produce class probabilities and box regressions. Finally, non-maximum suppression (IoU threshold 0.5) removes duplicate boxes, yielding the final output bounding boxes.
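For completeness, a minimal inference sketch for the trained detector (the paths and the checkpoint format are illustrative assumptions, not from the original post):

import torch
from PIL import Image
from torchvision.transforms import functional as F

model = create_model(num_classes=21)
weights = torch.load("./save_weights/model.pth", map_location="cpu")  # hypothetical checkpoint path
model.load_state_dict(weights)
model.eval()  # in eval mode the model returns detections instead of losses

img = F.to_tensor(Image.open("test.jpg"))  # any JPEG test image
with torch.no_grad():
    prediction = model([img])[0]
print(prediction["boxes"], prediction["labels"], prediction["scores"])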

Reposted from blog.csdn.net/free_girl_fang/article/details/122101103