foreword

Deep learning models usually expect a square input image, whereas the images in a dataset are generally rectangular. Naively resizing them to a square distorts the content. Letterboxing solves this problem: it scales the image while preserving its aspect ratio, and fills the remaining area with gray padding.
import cv2
import numpy as np
import xml.etree.ElementTree as ET


# Maps the class name read from an annotation's <name> tag to its integer label;
# parse_xml() looks names up here, so an unlisted name raises KeyError.
class_dict = {'aircraft': 1}

def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
    """Resize ``img`` to fit ``new_shape`` and pad the remainder with ``color``.

    The image is scaled by a single factor so that it fits inside
    ``new_shape`` without changing its aspect ratio, then a constant-colour
    border is added around it.

    Args:
        img: Image array; only ``img.shape[:2]`` (height, width) is inspected.
        new_shape: Target ``(height, width)``, or a single int for a square.
        color: Border fill value passed to ``cv2.copyMakeBorder``.
        auto: If true, the padding on each axis is reduced modulo 64, so the
            output is generally smaller than ``new_shape``. Takes precedence
            over ``scaleFill``.
        scaleFill: If true (and ``auto`` is false), stretch the image to
            exactly ``new_shape`` with no padding; the aspect ratio is not kept.
        scaleup: If false, the scale factor is capped at 1.0 so the image is
            only ever shrunk.

    Returns:
        A tuple ``(img, ratio, (dw, dh))`` where ``ratio`` is the
        ``(width, height)`` scale pair and ``(dw, dh)`` is the padding applied
        to each side along the width and height (may end in ``.5``).
    """
    shape = img.shape[:2]  # source (height, width)
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]
    src_h, src_w = shape[0], shape[1]

    # One scale factor for both axes keeps the aspect ratio intact.
    scale = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        scale = min(scale, 1.0)

    ratio = (scale, scale)  # (width, height) ratios
    new_unpad = (int(round(src_w * scale)), int(round(src_h * scale)))  # (width, height)
    pad_w = dst_w - new_unpad[0]
    pad_h = dst_h - new_unpad[1]

    if auto:
        # Keep only the padding left over after removing whole multiples of 64.
        pad_w = np.mod(pad_w, 64)
        pad_h = np.mod(pad_h, 64)
    elif scaleFill:
        # Stretch to the full target: no padding, independent axis ratios.
        pad_w = pad_h = 0.0
        new_unpad = (dst_w, dst_h)
        ratio = (dst_w / src_w, dst_h / src_h)

    # Split the total padding evenly between the two sides of each axis.
    pad_w = pad_w / 2
    pad_h = pad_h / 2

    if (src_w, src_h) != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

    # The -/+ 0.1 nudges make a half-pixel padding round down on one side and
    # up on the other, so the two borders still add up to the total padding.
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return img, ratio, (pad_w, pad_h)

def parse_xml(path, class_map=None):
    """Read class labels and bounding boxes from an annotation XML file.

    Every ``<object>`` element directly under the document root contributes
    one label (looked up from its ``<name>`` text) and one box taken from its
    ``<bndbox>`` child.

    Args:
        path: File name or file object accepted by ``ElementTree.parse``.
        class_map: Optional mapping from class name to integer label.
            Defaults to the module-level ``class_dict``.

    Returns:
        A tuple ``(labels, boxes)``: ``labels`` is a 1-D array with one entry
        per object and ``boxes`` is an ``int32`` array of shape ``(N, 4)``
        holding ``[xmin, ymin, xmax, ymax]`` rows. ``boxes`` has shape
        ``(0, 4)`` when the file contains no objects.

    Raises:
        KeyError: If an object's class name is not in the mapping.
    """
    if class_map is None:
        class_map = class_dict

    labels = []
    boxes = []
    for obj in ET.parse(path).findall('object'):
        bndbox = obj.find('bndbox')
        boxes.append([float(bndbox.find(tag).text)
                      for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
        labels.append(class_map[obj.find('name').text])

    # reshape(-1, 4) keeps the array two-dimensional even with zero objects,
    # so callers can always index columns; the int32 cast truncates fractions.
    box_array = np.array(boxes, dtype=np.float64).reshape(-1, 4).astype(np.int32)
    return np.array(labels), box_array

if __name__=="__main__":
    # Demo: draw the annotated boxes on one image before and after letterboxing.
    import glob
    import os
    import matplotlib.pyplot as plt

    # Sorted so the same image is picked on every run.
    imglist = sorted(glob.glob('./JPEGImages/*.jpg'))
    if not imglist:
        raise SystemExit('No .jpg images found in ./JPEGImages')
    shape = (512, 512)
    imgPath = imglist[0]
    xmlPath = os.path.splitext(imgPath)[0] + '.xml'

    img = cv2.imread(imgPath)
    if img is None:  # cv2.imread signals failure by returning None
        raise SystemExit(f'Could not read image: {imgPath}')
    # OpenCV loads images as BGR while matplotlib displays RGB; convert once
    # up front so both panels show the true colours.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    labels, boxes = parse_xml(xmlPath.replace('JPEGImages', 'Annotation/xml'))
    boxes = boxes.reshape(-1, 4)  # stay 2-D even when there are no objects

    img2, ratio, pad = letterbox(img.copy(), shape, auto=False, scaleup=False)

    # Map boxes into the letterboxed frame: scale, then shift by the padding.
    # The int32 cast truncates, matching the integer pixel coordinates used
    # for drawing.
    scale = np.array([ratio[0], ratio[1], ratio[0], ratio[1]])
    offset = np.array([pad[0], pad[1], pad[0], pad[1]])
    new_boxes = (boxes * scale + offset).astype(np.int32)

    samples = []
    for image, rects in ((img, boxes), (img2, new_boxes)):
        canvas = image.copy()
        # tolist() yields plain Python ints for the OpenCV point arguments.
        for x1, y1, x2, y2 in rects.tolist():
            # Full-intensity red (the image is RGB here) so the box is visible.
            cv2.rectangle(canvas, (x1, y1), (x2, y2), (255, 0, 0), 1)
        samples.append(canvas)
    sample1, sample2 = samples

    plt.subplot(121)
    plt.imshow(sample1)
    plt.subplot(122)
    plt.imshow(sample2)
    plt.show()
Letter Box object detection

foreword

Guess you like