文章说明

本系列文章旨在对 GitHub 上 malin9402 提供的代码进行说明。本文讲解 YOLOv3 项目中的 dataset.py 文件：该文件中只有一个用于生成数据集的 Dataset 类，因此对代码的解释直接以注释的形式写在代码旁边。
如果只是想运行 GitHub 上的代码，可以参考《对 YOLOv3 代码的说明》一文。
完整代码

import os
import cv2
import random
import numpy as np
import tensorflow as tf
import core.utils as utils
from core.config import cfg

class Dataset(object):
    """implement Dataset here"""
    def __init__(self, dataset_type):
        """Configure the dataset for one split and load its annotations.

        Args:
            dataset_type: ``'train'`` selects the ``cfg.TRAIN`` settings; any
                other value selects ``cfg.TEST``.
        """
        # Pick the config section once instead of repeating the conditional.
        split_cfg = cfg.TRAIN if dataset_type == 'train' else cfg.TEST

        self.annot_path = split_cfg.ANNOT_PATH    # annotation file of this split
        self.input_sizes = split_cfg.INPUT_SIZE   # input size setting of this split
        self.batch_size = split_cfg.BATCH_SIZE    # samples per batch
        self.data_aug = split_cfg.DATA_AUG        # whether parse_annotation augments images

        # __next__ always draws the network input size from the TRAIN setting.
        self.train_input_sizes = cfg.TRAIN.INPUT_SIZE

        yolo_cfg = cfg.YOLO
        # Divisors that turn an input size into the per-scale output grid sizes.
        self.strides = np.array(yolo_cfg.STRIDES)
        # Class table as returned by utils.read_class_names.
        self.classes = utils.read_class_names(yolo_cfg.CLASSES)
        self.num_classes = len(self.classes)
        # Anchor widths/heights as returned by utils.get_anchors.
        self.anchors = np.array(utils.get_anchors(yolo_cfg.ANCHORS))
        self.anchor_per_scale = yolo_cfg.ANCHOR_PER_SCALE  # anchors per output scale
        self.max_bbox_per_scale = 150  # capacity of the per-scale ground-truth box buffer

        self.annotations = self.load_annotations(dataset_type)
        self.num_samples = len(self.annotations)
        # Round up so a final, partially filled batch is still counted.
        batches_per_epoch = np.ceil(self.num_samples / self.batch_size)
        self.num_batchs = int(batches_per_epoch)
        self.batch_count = 0  # batches already served in the current epoch


    def load_annotations(self, dataset_type):
        with open(self.annot_path, 'r') as f:
            txt = f.readlines()
            annotations = [line.strip() for line in txt if len(line.strip().split()[1:]) != 0]
        np.random.shuffle(annotations)
        return annotations

    def __iter__(self):
        """Return the dataset itself; batches are produced by ``__next__``."""
        return self

    def __next__(self):
        """Build and return the next batch, or end the current epoch.

        Returns:
            ``(batch_image, (small, medium, large))`` where each of the three
            targets is a ``(labels, boxes)`` pair of float32 arrays for one
            output scale.

        Raises:
            StopIteration: When ``self.num_batchs`` batches have already been
                served. Before raising, the batch counter is reset and the
                annotations are reshuffled for the next epoch.
        """
        with tf.device('/cpu:0'):
            # The input size is drawn on every call (also on the call that ends
            # the epoch); with a single configured size the draw is constant.
            self.train_input_size = random.choice(self.train_input_sizes)
            # Grid size of each output scale for the chosen input size.
            self.train_output_sizes = self.train_input_size // self.strides

            if self.batch_count >= self.num_batchs:
                # Epoch finished: reset the counter and reshuffle for the next pass.
                self.batch_count = 0
                np.random.shuffle(self.annotations)
                raise StopIteration

            side = self.train_input_size
            batch_image = np.zeros((self.batch_size, side, side, 3), dtype=np.float32)

            # One label tensor per scale:
            # [batch, grid, grid, anchors per scale, 5 + number of classes].
            cell_shape = (self.anchor_per_scale, 5 + self.num_classes)
            batch_labels = []
            for scale in range(3):
                grid = self.train_output_sizes[scale]
                batch_labels.append(
                    np.zeros((self.batch_size, grid, grid) + cell_shape, dtype=np.float32))

            # One ground-truth box buffer per scale: [batch, max boxes, 4].
            batch_boxes = [
                np.zeros((self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32)
                for _ in range(3)
            ]

            first_index = self.batch_count * self.batch_size
            for slot in range(self.batch_size):
                index = first_index + slot
                # The last batch may run past the end of the data; wrap around
                # to the beginning so the batch is still completely filled.
                if index >= self.num_samples:
                    index -= self.num_samples
                annotation = self.annotations[index]
                image, bboxes = self.parse_annotation(annotation)
                # encoded = (label_s, label_m, label_l, boxes_s, boxes_m, boxes_l)
                encoded = self.preprocess_true_boxes(bboxes)

                batch_image[slot] = image
                for scale in range(3):
                    batch_labels[scale][slot] = encoded[scale]
                    batch_boxes[scale][slot] = encoded[3 + scale]

            self.batch_count += 1
            targets = tuple((batch_labels[scale], batch_boxes[scale]) for scale in range(3))
            return batch_image, targets

    def random_horizontal_flip(self, image, bboxes):  # 随机水平翻转图片

        if random.random() < 0.5:  # 执行水平翻转操作的概率为 0.5
            _, w, _ = image.shape
            image = image[:, ::-1, :]
            bboxes[:, [0,2]] = w - bboxes[:, [2,0]]

        return image, bboxes

    def random_crop(self, image, bboxes):  # 随机剪裁图片

        if random.random() < 0.5:  # 执行剪裁图片操作的概率为 0.5
            h, w, _ = image.shape  # 图片的高和宽
            # 取所有真实框的 [xmin, ymin] 中最小的 [xmin, ymin] 和 [xmax, ymax] 中最大的 [xmax, ymax]
            # 所以 max_bbox 现在所对应的框是面积最大的框
            max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

            max_l_trans = max_bbox[0]  # 最大框离图片左边缘的距离
            max_u_trans = max_bbox[1]  # 最大框离图片上边缘的距离
            max_r_trans = w - max_bbox[2]  # 最大框离图片右边缘的距离
            max_d_trans = h - max_bbox[3]  # 最大框离图片下边缘的距离

            crop_xmin = max(0, int(max_bbox[0] - random.uniform(0, max_l_trans)))  # 剪裁后的图片左上角在原始图像中的横坐标
            crop_ymin = max(0, int(max_bbox[1] - random.uniform(0, max_u_trans)))  # 剪裁后的图片左上角在原始图像中的纵坐标
            crop_xmax = max(w, int(max_bbox[2] + random.uniform(0, max_r_trans)))  # 剪裁后的图片右下角在原始图像中的横坐标
            crop_ymax = max(h, int(max_bbox[3] + random.uniform(0, max_d_trans)))  # 剪裁后的图片右下角在原始图像中的纵坐标

            image = image[crop_ymin : crop_ymax, crop_xmin : crop_xmax]  # 剪裁后的图片

            bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin  # 剪裁后的真实框左上角在原始图像中的坐标
            bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin  # 剪裁后的真实框右下角在原始图像中的坐标

        return image, bboxes

    def random_translate(self, image, bboxes):  # 随机平移图片

        if random.random() < 0.5:  # 执行平移图片操作的概率为 0.5
            h, w, _ = image.shape  # 图片的高和宽
            # 取所有真实框的 [xmin, ymin] 中最小的 [xmin, ymin] 和 [xmax, ymax] 中最大的 [xmax, ymax]
            # 所以 max_bbox 现在所对应的框是面积最大的框
            max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

            max_l_trans = max_bbox[0]  # 最大框离图片左边缘的距离
            max_u_trans = max_bbox[1]  # 最大框离图片上边缘的距离
            max_r_trans = w - max_bbox[2]  # 最大框离图片右边缘的距离
            max_d_trans = h - max_bbox[3]  # 最大框离图片下边缘的距离

            tx = random.uniform(-(max_l_trans - 1), (max_r_trans - 1))  # 左右平移的距离
            ty = random.uniform(-(max_u_trans - 1), (max_d_trans - 1))  # 上下平移的距离

            M = np.array([[1, 0, tx], [0, 1, ty]])
            image = cv2.warpAffine(image, M, (w, h))  # 仿射变换

            bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + tx  # 平移后的真实框左上角在原始图像中的坐标
            bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + ty  # 平移后的真实框右下角在原始图像中的坐标

        return image, bboxes

    def parse_annotation(self, annotation):

        line = annotation.split()  # 将样本信息按 ' ' 划分为一个列表中的不同元素
        image_path = line[0]  # 提取样本路径
        if not os.path.exists(image_path):  # 要是这个路径不存在就报错
            raise KeyError("%s does not exist ... " %image_path)
        image = cv2.imread(image_path)  # 加载路径下对应的图片
        # 用 map 函数将真实框信息从字符串类型变为数值类型，然后放到一个列表中（这里可以参考 text.py 中的说明）
        bboxes = np.array([list(map(int, box.split(','))) for box in line[1:]])

        if self.data_aug:  # 是否对图片进行数据增强操作
            image, bboxes = self.random_horizontal_flip(np.copy(image), np.copy(bboxes))  # 随机水平翻转图片
            image, bboxes = self.random_crop(np.copy(image), np.copy(bboxes))  # 随机剪裁图片
            image, bboxes = self.random_translate(np.copy(image), np.copy(bboxes))  # 随机平移图片

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR --> RGB
        # 将图像处理成模型需要输入的格式
        image, bboxes = utils.image_preporcess(np.copy(image), [self.train_input_size, self.train_input_size], np.copy(bboxes))
        return image, bboxes
    
    # 计算两个框之间的 IOU 值
    def bbox_iou(self, boxes1, boxes2):

        boxes1 = np.array(boxes1)  # 第一个检测框的坐标数据
        boxes2 = np.array(boxes2)  # 第二个检测框的坐标数据

        boxes1_area = boxes1[..., 2] * boxes1[..., 3]  # 第一个检测框的面积
        boxes2_area = boxes2[..., 2] * boxes2[..., 3]  # 第二个检测框的面积

        boxes1 = np.concatenate([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                                boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)  # 第一个检测框的左上角坐标+右下角坐标
        boxes2 = np.concatenate([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                                boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)  # 第二个检测框的左上角坐标+右下角坐标

        left_up = np.maximum(boxes1[..., :2], boxes2[..., :2])  # 对上图来说，left_up=[xmin2, ymin2]
        right_down = np.minimum(boxes1[..., 2:], boxes2[..., 2:])  # 对上图来说，right_down=[xmax1, ymax1]

        inter_section = np.maximum(right_down - left_up, 0.0)  # 交集区域
        inter_area = inter_section[..., 0] * inter_section[..., 1]  # 交集面积
        union_area = boxes1_area + boxes2_area - inter_area  # 并集面积

        return inter_area / union_area

    def preprocess_true_boxes(self, bboxes):

        # label 中有3个列表，每个列表形状为 [当前 feature map 尺寸, 当前 feature map 尺寸, 当前 feature map 下的先验框数量, 5 + 种类数量]
        label = [np.zeros((self.train_output_sizes[i], self.train_output_sizes[i], self.anchor_per_scale,
                           5 + self.num_classes)) for i in range(3)]
        # bboxes_xywh 中有3个列表，每个列表形状为 [每个尺度下允许的最大检测物数量, 检测框 4 个坐标]
        bboxes_xywh = [np.zeros((self.max_bbox_per_scale, 4)) for _ in range(3)]
        bbox_count = np.zeros((3,))  # 记录最终在各个尺度下有多少符合条件的先验框

        for bbox in bboxes:  # 对这张图片上所有真实框中的一个真实框操作
            bbox_coor = bbox[:4]  # 记录这个真实框的坐标（左上角 + 右下角）
            bbox_class_ind = bbox[4]  # 记录这个真实框中内容的种类（用索引表示）

            onehot = np.zeros(self.num_classes, dtype=np.float)  # 初始化关于种类的独热编码
            onehot[bbox_class_ind] = 1.0  # 把对应这个真实框内容种类的独热编码位置置 1
            # uniform_distribution 是一个形状为 [种类数,] 的数组，每个元素值都是种类数的倒数
            uniform_distribution = np.full(self.num_classes, 1.0 / self.num_classes)
            deta = 0.01
            smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution  # 标签平滑

            # 将 [xmin, ymin, xmax, ymax] 转换为 [中心横坐标, 中心纵坐标, 宽, 高]
            bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5, bbox_coor[2:] - bbox_coor[:2]], axis=-1)
            # 将真实框的坐标转化到 feature map 上（转化完之后就不一定都是整数了）
            bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / self.strides[:, np.newaxis]  # bbox_xywh shape: [1, 4]；strides shape: [3, ]; bbox_xywh_scaled shape: [1, 3, 4]

            iou = []
            exist_positive = False  # False 表示还没有符合条件的正样本
            for i in range(3):  # 3 个尺度中的某一尺度下
                anchors_xywh = np.zeros((self.anchor_per_scale, 4))  # 初始化先验框位置，shape = [3, 4]，3 表示一个尺度下有三种先验框，4 表示是每个先验框的中心坐标和宽高
                anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5  # 该尺度下三个先验框的中心坐标（让它在真实框中心所在的格子上）
                anchors_xywh[:, 2:4] = self.anchors[i]  # 该尺度下三个先验框的宽度和高度

                iou_scale = self.bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)  # 计算三个先验框与（该尺度下）真实框的 IOU 值
                iou.append(iou_scale)
                iou_mask = iou_scale > 0.3  # IOU 值大于 0.3 时此先验框代表的索引置 1

                if np.any(iou_mask):  # 如果该尺度下有符合条件的先验框
                    xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32)  # 该尺度下真实框的中心坐标

                    # 表示在以 (yind, xind) 这个格子为中心的先验框里有目标
                    label[i][yind, xind, iou_mask, :] = 0
                    label[i][yind, xind, iou_mask, 0:4] = bbox_xywh  # 将真实框的中心坐标和宽高赋给这个符合条件的先验框
                    label[i][yind, xind, iou_mask, 4:5] = 1.0  # 1 表示这个先验框里有目标
                    label[i][yind, xind, iou_mask, 5:] = smooth_onehot  # 将代表种类的独热编码赋给这个符合条件的先验框

                    bbox_ind = int(bbox_count[i] % self.max_bbox_per_scale)  # 记录这是第几个符合条件的先验框
                    bboxes_xywh[i][bbox_ind, :4] = bbox_xywh  # 记录所有符合条件的先验框的中心坐标和宽高
                    bbox_count[i] += 1

                    exist_positive = True  # True 表示有符合条件的正样本了

            if not exist_positive:  # 如果三个尺度下都没有先验框符合条件
                best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)  # 取 IOU 值最大的那个先验框
                best_detect = int(best_anchor_ind / self.anchor_per_scale)  # 哪一个尺度
                best_anchor = int(best_anchor_ind % self.anchor_per_scale)  # 这个尺度下的哪个先验框
                xind, yind = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)  # 该尺度下真实框的中心坐标

                label[best_detect][yind, xind, best_anchor, :] = 0
                label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh  # 将真实框的中心坐标和宽高赋给这个先验框
                label[best_detect][yind, xind, best_anchor, 4:5] = 1.0  # 1 表示这个先验框里有目标
                label[best_detect][yind, xind, best_anchor, 5:] = smooth_onehot  # 将代表种类的独热编码赋给这个先验框

                bbox_ind = int(bbox_count[best_detect] % self.max_bbox_per_scale)  # 记录这是第几个符合条件的先验框
                bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh  # 记录所有符合条件的先验框的中心坐标和宽高
                bbox_count[best_detect] += 1
        label_sbbox, label_mbbox, label_lbbox = label  # 将符合条件的先验框标签（包括位置、置信度和种类信息）按照尺度分到三个列表里
        sbboxes, mbboxes, lbboxes = bboxes_xywh  # 将符合条件的先验框的中心坐标和宽高按照尺度分到三个列表里
        return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes

    def __len__(self):
        """Return ``self.num_batchs``, the number of batches set in ``__init__``."""
        return self.num_batchs
TensorFlow 2.0 实现 YOLOv3（六）：dataset.py

文章目录

文章说明

完整代码

猜你喜欢