1. PASCAL VOC data format
<?xml version='1.0' encoding='utf-8'?>
<annotation verified="no">
<folder>JPEGImages</folder>
<filename>2018_06_05_09_06_55_065</filename>
<path>F:\receive\VOC2007\JPEGImages\2018_06_05_09_06_55_065.jpg</path>
<source>
<database>Unknown</database>
</source>
<size>
<width>2048</width>
<height>1536</height>
<depth>3</depth>
</size>
<segmented>0</segmented>
<object>
<name>1</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>530</xmin>
<ymin>752</ymin>
<xmax>1498</xmax>
<ymax>1326</ymax>
</bndbox>
</object>
</annotation>
2. yolo data format
Each line has the form `class x_center y_center w h`. The box center coordinates and the box width and height are normalized: x_center and w are divided by the image width, and y_center and h are divided by the image height.
0 0.947510 0.546224 0.049316 0.050781
0 0.434326 0.586263 0.217285 0.194661
3. The file layout is shown in the figure
Images are stored in `images`, the original PASCAL VOC annotations (*.xml) are stored in `labels_voc`, and the converted YOLO labels (*.txt) are stored in `labels`.
classes.names stores the name of each category, one category name per line; the zero-based line number of a name is the class index written to the converted labels.
The contents of pascalVOC2yolov3.py are as follows:
#coding:utf-8
from __future__ import print_function
import os
import random
import glob
import xml.etree.ElementTree as ET
def xml_reader(filename):
    """Parse a PASCAL VOC xml file.

    Args:
        filename: Path to the ``*.xml`` annotation file.

    Returns:
        A tuple ``(width, height, objects)``. ``width`` and ``height`` are
        the integer image dimensions read from the ``<size>`` element.
        ``objects`` is a list with one dict per ``<object>`` element, each
        holding ``'name'`` (the class name with surrounding whitespace
        removed) and ``'bbox'`` (``[xmin, ymin, xmax, ymax]`` as ints).

    Numeric fields written as floats (e.g. ``530.0``) are accepted and
    truncated towards zero.
    """
    def _int_text(node, tag):
        # Go through float() first so values such as "530.0" do not raise
        # ValueError the way a bare int("530.0") would.
        return int(float(node.find(tag).text))

    tree = ET.parse(filename)
    size = tree.find('size')
    width = _int_text(size, 'width')
    height = _int_text(size, 'height')

    objects = []
    for obj in tree.findall('object'):
        name = obj.find('name').text
        if name is not None:
            # Names read from classes.names are stripped, so strip here too;
            # otherwise " dog " would never match "dog".
            name = name.strip()
        bbox = obj.find('bndbox')
        objects.append({
            'name': name,
            'bbox': [_int_text(bbox, tag)
                     for tag in ('xmin', 'ymin', 'xmax', 'ymax')],
        })
    return width, height, objects
def voc2yolo(filename):
    """Convert one PASCAL VOC annotation file into a YOLO label file.

    Class indices come from ``classes.names`` in the current working
    directory: the zero-based line number of a class name is its index.
    Every object in the annotation becomes one output line
    ``<class> <x_center> <y_center> <w> <h>``, where the box center and size
    are divided by the image width and height respectively.

    The output path is the input path with its extension replaced by
    ``.txt`` and every occurrence of ``labels_voc`` replaced by ``labels``.
    The output directory is created if it does not exist.

    Args:
        filename: Path to the ``*.xml`` annotation file.

    Raises:
        KeyError: If an object's class name is not listed in
            ``classes.names``.
    """
    classes_dict = {}
    with open("classes.names") as f:
        for idx, line in enumerate(f):
            classes_dict[line.strip()] = idx

    width, height, objects = xml_reader(filename)

    lines = []
    for obj in objects:
        xmin, ymin, xmax, ymax = obj['bbox']
        label = classes_dict[obj['name']]
        cx = (xmax + xmin) * 0.5 / width
        cy = (ymax + ymin) * 0.5 / height
        w = (xmax - xmin) * 1. / width
        h = (ymax - ymin) * 1. / height
        lines.append("%s %.6f %.6f %.6f %.6f\n" % (label, cx, cy, w, h))

    # Swap only the real extension. A plain str.replace(".xml", ".txt") also
    # rewrites ".xml" inside directory names and misses ".XML", in which case
    # the output path could equal the input and overwrite the annotation.
    stem, _ext = os.path.splitext(filename)
    txt_name = (stem + ".txt").replace("labels_voc", "labels")

    out_dir = os.path.dirname(txt_name)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(txt_name, "w") as f:
        f.writelines(lines)
def get_image_list(image_dir, suffix=('jpg', 'jpeg', 'JPG', 'JPEG', 'png'),
                   prefix="data/custom/"):
    """Collect the paths of all image files below ``image_dir``.

    Args:
        image_dir: Directory that is walked recursively.
        suffix: Accepted file extensions, without the leading dot. They are
            compared case-insensitively, so ``'png'`` also matches ``.PNG``.
        prefix: String prepended to every returned path. The default matches
            the ``data/custom/`` location used in the generated list files.

    Returns:
        A list of strings ``prefix + path + "\\n"``, one per image file,
        ready to be passed to ``file.writelines``. An empty list is returned
        (after printing a message) when ``image_dir`` does not exist.
    """
    if not os.path.exists(image_dir):
        print("PATH:%s not exists" % image_dir)
        return []

    wanted = set(ext.lower() for ext in suffix)
    imglist = []
    for root, _subdirs, files in os.walk(image_dir):
        for filename in files:
            # splitext yields "" for extension-less files, so a file that is
            # literally named "jpg" is not mistaken for an image.
            ext = os.path.splitext(filename)[1][1:]
            if ext.lower() in wanted:
                imglist.append(prefix + os.path.join(root, filename) + "\n")
    return imglist
def imglist2file(imglist, valid_size=100):
    """Randomly split ``imglist`` and write ``train.txt`` and ``valid.txt``.

    Both files are written to the current working directory. The entries are
    written as-is with ``writelines``, so each one is expected to end with a
    newline already.

    Args:
        imglist: List of image path lines. The caller's list is not
            modified; a shuffled copy is split instead.
        valid_size: Number of entries held out for validation. When the
            list has no more than ``valid_size`` entries, one fifth of them
            (rounded down) is held out instead, so the training list is not
            left empty.
    """
    shuffled = list(imglist)
    random.shuffle(shuffled)

    n_valid = valid_size
    if n_valid >= len(shuffled):
        # Small data set: a fixed hold-out would consume every sample and
        # leave train.txt empty.
        n_valid = len(shuffled) // 5

    # Split by index: a negative-slice split breaks for n_valid == 0,
    # because seq[:-0] is the empty list.
    split = len(shuffled) - n_valid
    train_list = shuffled[:split]
    valid_list = shuffled[split:]

    with open("train.txt", "w") as f:
        f.writelines(train_list)
    with open("valid.txt", "w") as f:
        f.writelines(valid_list)
def main():
    """Convert every VOC annotation, then write the train/valid image lists.

    Each ``labels_voc/*.xml`` file is converted with ``voc2yolo``; afterwards
    the images found under ``images`` are split into ``train.txt`` and
    ``valid.txt`` by ``imglist2file``.
    """
    for annotation_path in glob.glob("labels_voc/*.xml"):
        voc2yolo(annotation_path)
    imglist2file(get_image_list("images"))


if __name__ == "__main__":
    main()
Once the data conversion is complete, a detection model can be trained with any of the YOLO family of algorithms.