pytorch实现yolo-v3 (源码阅读和复现) -- 002

上一篇已经介绍了yolov3使用到的网络darknet53每一层的结构,现在这里来完成代码解析和模型创建

本章所有代码: https://github.com/wanghao00/pytorch-yolo-v3/tree/master/001

1. 加载并解析配置文件`cfg/yolov3.cfg`

配置文件包含6种不同type, 分别为'convolutional', 'net', 'route', 'shortcut', 'upsample', 'yolo',
其中'net'相当于超参数,定义了网络全局配置的相关参数

darknet.py代码如下

"""
python 3.6
Pytorch 0.4
"""
import torch
import torch.nn as nn

def parse_cfg(cfgfile):
    """
    Parse a Darknet-style configuration file into a list of blocks.

    Args:
        cfgfile: Path of the configuration file.

    Returns:
        A list of dicts, one per ``[section]`` in file order. Each dict
        stores the section name under the key ``"type"`` and every
        ``key=value`` option of that section; all values stay strings.
        A file without any content yields an empty list.

    Raises:
        ValueError: If a non-section line contains no ``=``.
    """
    block = {}
    blocks = []

    with open(cfgfile, 'r') as f:
        for raw_line in f:
            # Strip BEFORE filtering so that whitespace-only lines and
            # indented comments are skipped instead of reaching the parser.
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            if line.startswith("["):  # start of a new section
                if block:  # flush the section collected so far
                    blocks.append(block)
                    block = {}
                block["type"] = line[1:-1].strip()  # section name
            else:
                # Split on the first '=' only, so values may contain '='.
                key, value = line.split("=", 1)
                block[key.rstrip()] = value.lstrip()

    if block:  # the last section is still pending after the loop
        blocks.append(block)

    return blocks

# Run the parser and print the returned list of blocks
cfg = parse_cfg("cfg/yolov3.cfg")
print(cfg)

blocks的内容如下

{'type': 'net', 'batch': '1', 'subdivisions': '1', 'width': '320', 'height': '320', 'channels': '3', 'momentum': '0.9', 'decay': '0.0005', 'angle': '0', 'saturation': '1.5', 'exposure': '1.5', 'hue': '.1', 'learning_rate': '0.001', 'burn_in': '1000', 'max_batches': '500200', 'policy': 'steps', 'steps': '400000,450000', 'scales': '.1,.1'}

{'type': 'convolutional', 'batch_normalize': '1', 'filters': '32', 'size': '3', 'stride': '1', 'pad': '1', 'activation': 'leaky'}

**省略**

{'type': 'convolutional', 'batch_normalize': '1', 'size': '3', 'stride': '1', 'pad': '1', 'filters': '256', 'activation': 'leaky'}

{'type': 'convolutional', 'size': '1', 'stride': '1', 'pad': '1', 'filters': '255', 'activation': 'linear'}

{'type': 'yolo', 'mask': '0,1,2', 'anchors': '10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326', 'classes': '80', 'num': '9', 'jitter': '.3', 'ignore_thresh': '.5', 'truth_thresh': '1', 'random': '1'}

2. 配置项到神经网络层的映射

将上一步的执行结果, 映射成在pytorch中对应的神经网络层

这一步需要我们创建两个层, 其中

EmptyLayer是为了shortcut layer / route layer准备的, 具体功用在Darknet的forward函数中有体现;
DetectionLayer是yolo检测层的具体实现, 在特征图上使用锚点预测目标区域和类别, 功能函数在predict_transform中

class EmptyLayer(nn.Module):
    """
    Placeholder module used for shortcut and route layers.

    It holds no parameters and defines no computation of its own; it only
    occupies a slot in the module list.
    """

    def __init__(self):
        super().__init__()

class DetectionLayer(nn.Module):
    """
    YOLO detection layer.

    Stores the anchors selected for one detection scale; decoding of the
    feature map is delegated to ``predict_transform``.
    """

    def __init__(self, anchors):
        """
        Args:
            anchors: Anchors used by this layer (``create_modules`` passes a
                list of integer pairs).
        """
        super(DetectionLayer, self).__init__()
        self.anchors = anchors

    def forward(self, x, input_dim, num_classes, confidence):
        """
        Decode a feature map with ``predict_transform``.

        Args:
            x: Feature map tensor produced by the preceding layer.
            input_dim: Input dimension forwarded to ``predict_transform``.
            num_classes: Number of classes forwarded to ``predict_transform``.
            confidence: Accepted for interface compatibility only; it is not
                forwarded (see note below).

        Returns:
            Whatever ``predict_transform`` returns.
        """
        # NOTE(review): the call now uses the same five-argument form as
        # Darknet.forward (prediction, input_dim, anchors, num_classes, CUDA).
        # The previous version passed ``confidence`` as an extra positional
        # argument and read an undefined ``global CUDA``; the device flag is
        # now taken from the tensor itself. Confirm against the definition of
        # predict_transform.
        prediction = x.data  # detached from the autograd graph, as before
        return predict_transform(
            prediction, input_dim, self.anchors, num_classes, x.is_cuda
        )

def create_modules(blocks):
    """
    Build the PyTorch modules described by parsed configuration blocks.

    Args:
        blocks: List of dicts as returned by ``parse_cfg``. The first entry
            is returned as the network info. As a side effect, the
            ``"layers"`` value of every ``route`` block is replaced in place
            by a list of strings (``Darknet.forward`` reads that list).

    Returns:
        Tuple ``(net_info, module_list)`` where ``net_info`` is ``blocks[0]``
        and ``module_list`` is an ``nn.ModuleList`` holding one
        ``nn.Sequential`` per non-``net`` block, in order.

    Raises:
        NotImplementedError: If a block has an unsupported ``"type"``.
    """
    # Network input / preprocessing information
    net_info = blocks[0]

    module_list = nn.ModuleList()
    index = 0  # position of the current layer; needed by route layers
    previous_filters = 3  # the input image has 3 channels
    output_filters = []  # output channel count of every layer built so far

    for block in blocks:
        block_type = block["type"]
        if block_type == "net":
            continue

        container = nn.Sequential()
        # Layers that do not change the channel count inherit it from the
        # previous layer; this also keeps `filters` defined when the first
        # layer is not convolutional.
        filters = previous_filters

        if block_type == "convolutional":
            # 1. Convolutional layer: conv [+ batch norm] [+ leaky ReLU]
            activation = block["activation"]
            batch_normalize = int(block.get("batch_normalize", 0))
            # The conv bias is only dropped when batch norm follows; an
            # explicit ``batch_normalize=0`` therefore keeps the bias.
            bias = not batch_normalize
            filters = int(block["filters"])
            padding = int(block["pad"])
            kernel_size = int(block["size"])
            stride = int(block["stride"])

            pad = (kernel_size - 1) // 2 if padding else 0

            conv = nn.Conv2d(previous_filters, filters, kernel_size, stride, pad, bias=bias)
            container.add_module("conv_{0}".format(index), conv)

            if batch_normalize:
                bn = nn.BatchNorm2d(filters)
                container.add_module("batch_norm_{0}".format(index), bn)

            # Activation is either linear (nothing added) or a leaky ReLU
            # with negative slope 0.1.
            if activation == "leaky":
                activn = nn.LeakyReLU(0.1, inplace=True)
                container.add_module("leaky_{0}".format(index), activn)

        elif block_type == "upsample":
            # 2. Upsampling layer: nearest-neighbour, not bilinear
            upsample = nn.Upsample(scale_factor=2, mode="nearest")
            container.add_module("upsample_{}".format(index), upsample)

        elif block_type == "route":
            # Route layer -> EmptyLayer; the real work happens in forward.
            layers = block["layers"]
            if isinstance(layers, str):  # tolerate blocks processed before
                layers = layers.split(',')
            block["layers"] = layers

            # Negative entries are relative to the current layer,
            # non-negative entries are absolute layer indices.
            sources = [int(a) for a in layers]
            sources = [a if a >= 0 else index + a for a in sources]

            route = EmptyLayer()
            container.add_module("route_{0}".format(index), route)

            # The routed maps are concatenated along the channel axis.
            filters = sum(output_filters[a] for a in sources)

        elif block_type == "shortcut":
            # Shortcut corresponds to a skip connection -> EmptyLayer
            int(block["from"])  # fail early on a missing or malformed value
            shortcut = EmptyLayer()
            container.add_module("shortcut_{}".format(index), shortcut)

        elif block_type == "maxpool":
            stride = int(block["stride"])
            size = int(block["size"])
            maxpool = nn.MaxPool2d(size, stride)
            container.add_module("maxpool_{}".format(index), maxpool)

        elif block_type == "yolo":
            # Detection layer: keep only the anchors selected by the mask
            mask = [int(m) for m in block["mask"].split(",")]

            values = [int(a) for a in block["anchors"].split(",")]
            anchors = [(values[i], values[i + 1]) for i in range(0, len(values), 2)]
            anchors = [anchors[i] for i in mask]

            detection = DetectionLayer(anchors)
            container.add_module("Detection_{}".format(index), detection)

        else:
            # An explicit exception cannot be stripped by ``python -O``.
            raise NotImplementedError(
                "unsupported block type: {!r}".format(block_type)
            )

        module_list.append(container)
        previous_filters = filters
        output_filters.append(filters)
        index += 1

    return net_info, module_list

# Parse the configuration, build the modules and print the module list
blocks = parse_cfg('cfg/yolov3.cfg')
x,y = create_modules(blocks)  # x: net info block, y: nn.ModuleList
print(y)

输出节选

ModuleList(
  (0): Sequential(
    (conv_0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (batch_norm_0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_0): LeakyReLU(negative_slope=0.1, inplace)
  )
  ...省略...
  (3): Sequential(
    (conv_3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (batch_norm_3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_3): LeakyReLU(negative_slope=0.1, inplace)
  )
  (4): Sequential(
    (shortcut_4): EmptyLayer()
  )
  ...省略...
  (97): Sequential(
    (upsample_97): Upsample(scale_factor=2, mode=nearest)
  )
  (98): Sequential(
    (route_98): EmptyLayer()
  )
  ...省略...
  (104): Sequential(
    (conv_104): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (batch_norm_104): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_104): LeakyReLU(negative_slope=0.1, inplace)
  )
  (105): Sequential(
    (conv_105): Conv2d(256, 255, kernel_size=(1, 1), stride=(1, 1))
  )
  (106): Sequential(
    (Detection_106): DetectionLayer()
  )
)

3. 构建model

前两步还是未能将所有模块链接到一起形成网络, 下面会创建Darknet类实现,并进行测试

class Darknet(nn.Module):
    """
    Network assembled from a Darknet configuration file.

    The configuration is parsed with ``parse_cfg`` and turned into modules
    with ``create_modules``; ``forward`` wires the modules together.
    """

    def __init__(self, cfgfile):
        """
        Args:
            cfgfile: Path of the configuration file.
        """
        super(Darknet, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list = create_modules(self.blocks)
        # Model version marker
        self.header = torch.IntTensor([0, 0, 0, 0])
        self.seen = 0

    def get_blocks(self):
        """Return the parsed configuration blocks (a list of dicts)."""
        return self.blocks

    def get_module_list(self):
        """Return the ``nn.ModuleList`` built from the configuration."""
        return self.module_list

    def forward(self, x, CUDA=True):
        """
        Run the network and collect the output of every yolo layer.

        Args:
            x: Input tensor.
            CUDA: Flag passed through unchanged to ``predict_transform``.

        Returns:
            The yolo-layer results concatenated along dimension 1, or an
            empty list when no yolo layer produced a result.
        """
        detections = []
        # Every block except the leading net block
        modules = self.blocks[1:]

        # Output of each layer, cached for route / shortcut layers
        outputs = {}

        for i, module in enumerate(modules):
            module_type = module["type"]

            if module_type in ("convolutional", "upsample", "maxpool"):
                x = self.module_list[i](x)
                outputs[i] = x

            elif module_type == "route":
                layers = module["layers"]
                if isinstance(layers, str):  # not yet split into a list
                    layers = layers.split(",")

                # Negative entries are relative to the current layer,
                # non-negative entries are absolute layer indices.
                sources = [int(a) for a in layers]
                sources = [a if a >= 0 else i + a for a in sources]
                maps = [outputs[a] for a in sources]

                # Several sources are concatenated along the channel axis.
                x = maps[0] if len(maps) == 1 else torch.cat(maps, 1)
                outputs[i] = x

            elif module_type == "shortcut":
                from_ = int(module["from"])
                x = outputs[i - 1] + outputs[i + from_]  # element-wise sum
                outputs[i] = x

            elif module_type == 'yolo':
                anchors = self.module_list[i][0].anchors
                # Input dimension of the network
                inp_dim = int(self.net_info["height"])
                # Number of classes
                num_classes = int(module["classes"])

                x = x.data
                x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)

                # Cache the incoming feature map under this index even when
                # the result below is skipped.
                outputs[i] = outputs[i - 1]

                # NOTE(review): an int result is treated as "nothing to
                # collect" -- confirm against predict_transform.
                if isinstance(x, int):
                    continue

                # Results from the different detection scales are gathered
                # here and concatenated once at the end.
                detections.append(x)

        return torch.cat(detections, 1) if detections else []

# Build the model and run one forward pass on random input
model = Darknet("cfg/yolov3.cfg")
input = torch.sigmoid(torch.rand(1, 3, 416, 416).float())
# Network input size: overrides the "height" value read by forward
model.net_info["height"] = 416
predictions = model(input, False)
print(predictions.shape) # torch.Size([1, 10647, 85])

网络会在三个不同level的特征图(?, 255, 13, 13) / (?, 255, 26, 26) / (?, 255, 52, 52)上进行多尺度预测, 在每个位置处有三组不同尺寸的anchor, 最终将三个层次的预测结果合并在一起返回, 所以模型直接的预测结果为Size([?, 10647, 5+cls]),

其中10647= (13*13 + 26*26 + 52*52) * 3 [anchors]

关于在特征图上如何利用给定锚点进行多尺度检测,即DetectionLayer中使用的predict_transform的实现思路,会在下一篇文章中继续阐述.