A close reading of the YOLOv3 source code (7): YoloDataset

The code mainly follows bubbliiing's YOLOv3 implementation on GitHub: github.com/bubbliiing…

Interpretation of the source code

Training part

dataloader.py file

import cv2
import numpy as np
from PIL import Image
from torch.utils.data import Dataset

# cvtColor and preprocess_input are helper functions from the repo's utils module
from utils.utils import cvtColor, preprocess_input


class YoloDataset(Dataset):
    def __init__(self, annotation_lines, input_shape, num_classes, train):
        super(YoloDataset, self).__init__()
        self.annotation_lines   = annotation_lines
        self.input_shape        = input_shape
        self.num_classes        = num_classes
        self.length             = len(self.annotation_lines)
        self.train              = train

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        index       = index % self.length
        #---------------------------------------------------#
        #   Apply random data augmentation during training;
        #   no random augmentation during validation
        #---------------------------------------------------#
        image, box  = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2], random = self.train)
        image       = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
        box         = np.array(box, dtype=np.float32)
        if len(box) != 0:
            box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1]
            box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]

            box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
            box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2
        return image, box

    def rand(self, a=0, b=1):
        return np.random.rand()*(b-a) + a

    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
        line    = annotation_line.split()
        #------------------------------#
        #   Read the image and convert it to RGB
        #------------------------------#
        image   = Image.open(line[0])
        image   = cvtColor(image)
        #------------------------------#
        #   Get the image width/height and the target width/height
        #------------------------------#
        iw, ih  = image.size
        h, w    = input_shape
        #------------------------------#
        #   Get the ground-truth boxes
        #------------------------------#
        box     = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])

        if not random:
            scale = min(w/iw, h/ih)
            nw = int(iw*scale)
            nh = int(ih*scale)
            dx = (w-nw)//2
            dy = (h-nh)//2

            #---------------------------------#
            #   Pad the leftover area of the image with gray bars
            #---------------------------------#
            image       = image.resize((nw,nh), Image.BICUBIC)
            new_image   = Image.new('RGB', (w,h), (128,128,128))
            new_image.paste(image, (dx, dy))
            image_data  = np.array(new_image, np.float32)

            #---------------------------------#
            #   Adjust the ground-truth boxes
            #---------------------------------#
            if len(box)>0:
                np.random.shuffle(box)
                box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
                box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
                box[:, 0:2][box[:, 0:2]<0] = 0
                box[:, 2][box[:, 2]>w] = w
                box[:, 3][box[:, 3]>h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
                box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box

            return image_data, box
                
        #------------------------------------------#
        #   Resize the image and randomly distort its width and height
        #------------------------------------------#
        new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
            nh = int(scale*h)
            nw = int(nh*new_ar)
        else:
            nw = int(scale*w)
            nh = int(nw/new_ar)
        image = image.resize((nw,nh), Image.BICUBIC)

        #------------------------------------------#
        #   Pad the leftover area of the image with gray bars
        #------------------------------------------#
        dx = int(self.rand(0, w-nw))
        dy = int(self.rand(0, h-nh))
        new_image = Image.new('RGB', (w,h), (128,128,128))
        new_image.paste(image, (dx, dy))
        image = new_image

        #------------------------------------------#
        #   Flip the image horizontally
        #------------------------------------------#
        flip = self.rand()<.5
        if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)

        image_data      = np.array(image, np.uint8)
        #---------------------------------#
        #   Apply color (HSV) jitter to the image:
        #   compute the jitter factors
        #---------------------------------#
        r               = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
        #---------------------------------#
        #   Convert the image to HSV
        #---------------------------------#
        hue, sat, val   = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
        dtype           = image_data.dtype
        #---------------------------------#
        #   Apply the transform via lookup tables
        #---------------------------------#
        x       = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)

        image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)

        #---------------------------------#
        #   Adjust the ground-truth boxes
        #---------------------------------#
        if len(box)>0:
            np.random.shuffle(box)
            box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
            box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
            if flip: box[:, [0,2]] = w - box[:, [2,0]]
            box[:, 0:2][box[:, 0:2]<0] = 0
            box[:, 2][box[:, 2]>w] = w
            box[:, 3][box[:, 3]>h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w>1, box_h>1)] 
        
        return image_data, box
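Because __getitem__ returns a different number of boxes for each image, PyTorch's default collate cannot stack the labels into a single tensor, so the DataLoader must be given a custom collate function. A minimal sketch of such a collate (illustrative only, not necessarily the repo's exact code; collate_boxes and the commented usage names are made up):

import numpy as np
from torch.utils.data import DataLoader

def collate_boxes(batch):
    # stack the images into one array; keep the variable-length boxes as a plain list
    images = np.array([img for img, _ in batch])
    boxes  = [box for _, box in batch]
    return images, boxes

# hypothetical usage:
# train_dataset = YoloDataset(train_lines, (416, 416), 20, train=True)
# gen = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_boxes)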
  • Reading the code in detail
    • The three components of a Dataset: __init__, __len__ and __getitem__
    # Initialization
    def __init__(self, annotation_lines, input_shape, num_classes, train):
        super(YoloDataset, self).__init__()
        # List of annotation lines read from the txt file
        self.annotation_lines   = annotation_lines
        # [416,416]
        self.input_shape        = input_shape
        # voc 20
        self.num_classes        = num_classes
        # Total number of training samples
        self.length             = len(self.annotation_lines)
        # Whether this is train or val; train needs backpropagation, val does not
        self.train              = train
    
    # Return the total number of images (training samples)
    def __len__(self):
        return self.length
    
    # Fetch one sample by index
    def __getitem__(self, index):
        index       = index % self.length
        #---------------------------------------------------#
        #   Apply random data augmentation during training;
        #   no random augmentation during validation
        #---------------------------------------------------#
        # Depending on whether we are in the training or validation phase, decide whether to augment the image and transform the boxes
        image, box  = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2], random = self.train)
        # Convert the image to an ndarray and run preprocess_input; PyTorch expects [channel, h, w] input, so transpose the dimensions
        image       = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
        # Load the boxes as an ndarray
        box         = np.array(box, dtype=np.float32)
        if len(box) != 0:
            # Normalize the ground-truth x1, y1, x2, y2 coordinates to [0, 1]
            box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1]
            box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
            
            # Convert the normalized corner coordinates into [center_x, center_y, w, h] form (a worked example follows this code block)
            box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
            box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2
        # Return the image data and the label data
        return image, box
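    A worked example of the two steps above, using made-up numbers (not taken from the post), for a single box inside a 416x416 input:

    import numpy as np

    input_shape = (416, 416)                        # (h, w)
    box = np.array([[100., 50., 300., 250., 13.]])  # hypothetical x1, y1, x2, y2, class

    # 1. normalize the corner coordinates by the input width / height
    box[:, [0, 2]] = box[:, [0, 2]] / input_shape[1]
    box[:, [1, 3]] = box[:, [1, 3]] / input_shape[0]

    # 2. corners -> [cx, cy, w, h]; the class column stays untouched
    box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
    box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2

    print(box)  # approx [[0.481, 0.361, 0.481, 0.481, 13.]]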
    • Calling the get_random_data method
    # Each annotation line read in looks like: C:\Users\xxx\yolo3-pytorch-master\VOCdevkit/VOC2007/JPEGImages/000072.jpg 40,71,333,473,13
    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
        # Split the annotation line into fields
        line    = annotation_line.split()
        #------------------------------#
        #   Read the image and convert it to RGB
        #------------------------------#
        
        # Read the image and convert it to RGB format
        image   = Image.open(line[0])
        image   = cvtColor(image)
        #------------------------------#
        #   Get the image width/height and the target width/height
        #------------------------------#
        
        # Get the actual image width and height
        iw, ih  = image.size
        # The specified target height and width, [416,416]
        h, w    = input_shape
        #------------------------------#
        #   Get the ground-truth boxes
        #------------------------------#
        
        # Parse the box data into something like [[40,71,333,473,13],[89,31,323,73,11]...]; note the compact nested list comprehension, worth learning
        box     = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
    
        # The random flag is just self.train: True for training, False for validation
        if not random:
            # For validation, skip augmentation and only letterbox the image and adjust the boxes
            scale = min(w/iw, h/ih)
            nw = int(iw*scale)
            nh = int(ih*scale)
            
            # dx, dy are the padding offsets that center the resized image (the gray-bar widths)
            dx = (w-nw)//2
            dy = (h-nh)//2
    
            #---------------------------------#
            #   Pad the leftover area of the image with gray bars
            #---------------------------------#
            
            # Pad the image to obtain an undistorted, padded picture
            image       = image.resize((nw,nh), Image.BICUBIC)
            new_image   = Image.new('RGB', (w,h), (128,128,128))
            new_image.paste(image, (dx, dy))
            
            # Convert the padded image to an ndarray
            image_data  = np.array(new_image, np.float32)
    
            #---------------------------------#
            #   Adjust the ground-truth boxes
            #---------------------------------#
            # If there really are ground-truth boxes
            if len(box)>0:
                # Shuffle the order of the boxes
                np.random.shuffle(box)
                # Map the box coordinates into the resized and padded image
                box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
                box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
                
                box[:, 0:2][box[:, 0:2]<0] = 0
                box[:, 2][box[:, 2]>w] = w
                box[:, 3][box[:, 3]>h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
                box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
    
            return image_data, box
    
        #------------------------------------------#
        #   Resize the image and randomly distort its width and height
        #------------------------------------------#
        
        # For training, apply augmentation (everything below is a standard augmentation step: random scaling, distortion, flipping, RGB-HSV-RGB color jitter, etc.; not analyzed in detail here)
        new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
            nh = int(scale*h)
            nw = int(nh*new_ar)
        else:
            nw = int(scale*w)
            nh = int(nw/new_ar)
        image = image.resize((nw,nh), Image.BICUBIC)
    
        #------------------------------------------#
        #   Pad the leftover area of the image with gray bars
        #------------------------------------------#
        dx = int(self.rand(0, w-nw))
        dy = int(self.rand(0, h-nh))
        new_image = Image.new('RGB', (w,h), (128,128,128))
        new_image.paste(image, (dx, dy))
        image = new_image
    
        #------------------------------------------#
        #   Flip the image horizontally
        #------------------------------------------#
        flip = self.rand()<.5
        if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
    
        image_data      = np.array(image, np.uint8)
        #---------------------------------#
        #   Apply color (HSV) jitter to the image:
        #   compute the jitter factors
        #---------------------------------#
        r               = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
        #---------------------------------#
        #   Convert the image to HSV
        #---------------------------------#
        hue, sat, val   = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
        dtype           = image_data.dtype
        #---------------------------------#
        #   Apply the transform via lookup tables
        #---------------------------------#
        x       = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
    
        image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)
    
        #---------------------------------#
        #   Adjust the ground-truth boxes
        #---------------------------------#
        if len(box)>0:
            np.random.shuffle(box)
            box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
            box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
            if flip: box[:, [0,2]] = w - box[:, [2,0]]
            box[:, 0:2][box[:, 0:2]<0] = 0
            box[:, 2][box[:, 2]>w] = w
            box[:, 3][box[:, 3]>h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w>1, box_h>1)] 
    
        return image_data, box
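    To make the letterbox math in the not-random branch concrete, here is a small sketch with a hypothetical 500x375 image resized into a 416x416 input (numbers are illustrative, not from the post):

    iw, ih = 500, 375                            # hypothetical original image size
    h, w   = 416, 416                            # target input size

    scale  = min(w / iw, h / ih)                 # 0.832: keep the aspect ratio
    nw, nh = int(iw * scale), int(ih * scale)    # 416 x 312 after resizing
    dx, dy = (w - nw) // 2, (h - nh) // 2        # 0, 52: gray bars of 52 px on top and bottom

    x1, y1, x2, y2 = 40, 71, 333, 300            # hypothetical box in the original image
    print(x1 * nw / iw + dx, y1 * nh / ih + dy,
          x2 * nw / iw + dx, y2 * nh / ih + dy)  # ~ 33.3 111.1 277.1 301.6 in the 416x416 canvas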
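    The color jitter is applied through cv2.LUT lookup tables: each 256-entry table maps every possible old channel value to its new value, so the whole image is transformed with a single table lookup per pixel. A tiny self-contained demonstration with made-up values:

    import cv2
    import numpy as np

    channel = np.full((2, 2), 100, dtype=np.uint8)                      # fake single-channel image
    r       = 1.2                                                       # hypothetical jitter factor
    lut     = np.clip(np.arange(0, 256) * r, 0, 255).astype(np.uint8)
    print(cv2.LUT(channel, lut))                                        # every 100 becomes 120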
    The original post then shows the box data before the augmentation and the box data after the augmentation transform (the two screenshots are not reproduced here).
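    A sketch of how such a before/after comparison can be reproduced, assuming an annotation file in the repo's format (e.g. one generated by voc_annotation.py; the file name 2007_train.txt is hypothetical) and the YoloDataset class defined above:

    import numpy as np

    with open('2007_train.txt') as f:            # hypothetical annotation file
        line = f.readline().strip()

    raw_boxes = np.array([list(map(int, b.split(','))) for b in line.split()[1:]])
    print('before:', raw_boxes)                  # pixel coordinates in the original image

    dataset = YoloDataset([line], (416, 416), num_classes=20, train=True)
    _, aug_boxes = dataset.get_random_data(line, (416, 416), random=True)
    print('after :', aug_boxes)                  # coordinates in the augmented 416x416 image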


Source: juejin.im/post/7079621652866629639