Splitting the SSDD dataset into training, validation and test sets in a given ratio, and converting the annotations from VOC format to COCO format

The SSDD dataset contains 1160 images in total and is annotated in VOC format. The original data does not come with a train/val/test split, so the split has to be done manually.

I found a script for splitting VOC-format datasets online; after a few modifications it runs successfully. The code is recorded below:

'''
This script randomly splits all the xml annotation files into a training set, a validation set, a trainval set and a test set according to preset ratios; the result is a set of txt files that store the corresponding file names (names only, without the '.xml' suffix), which gives the split of a VOC-format dataset.
Here the dataset is split into a training set (train), a validation set (val) and a test set (test) at a ratio of 7:2:1.
'''
import os
import random
random.seed(0)
xmlfilepath = r'/home/dwt/DataSets/SSDD_train_val_test/newAnnotations'
saveBasePath=r"/home/dwt/DataSets/SSDD_train_val_test/ImageSets/Main/"

#----------------------------------------------------------------------#
#   To enlarge the test set, modify trainval_percent
#   train_percent does not need to be modified
#----------------------------------------------------------------------#
train_percent = 0.7    # proportion of the dataset used as the training set
val_percent = 0.2      # proportion of the dataset used as the validation set
trainval_percent = 0.9 # proportion of the dataset used as training + validation
test_percent = 0.1     # proportion of the dataset used as the test set

temp_xml = os.listdir(xmlfilepath) # os.listdir() returns a list of the names of the files and folders in the given directory
total_xml = []
for xml in temp_xml:
    if xml.endswith(".xml"):
        total_xml.append(xml)

num = len(total_xml) # num is the total number of annotation files
total_list = range(num) # total_list holds the indices 0 to num-1, corresponding one-to-one with total_xml
train_num = int(num*train_percent) # number of annotation files assigned to the training set
val_num = int(num*val_percent) # number of annotation files assigned to the validation set
trainval_num = int(num*trainval_percent) # number of annotation files assigned to training + validation
test_num = int(num*test_percent) # number of annotation files assigned to the test set

trainval_list = random.sample(total_list, trainval_num) # indices of the files used for training + validation; random.sample(seq, k) draws k elements from seq in random order and returns them as a list
train_list = random.sample(trainval_list, train_num) # indices (within total_list) of the files used for the training set

print("train and val size",trainval_num)
print("train size: {} , val size : {} , test size : {}".format(train_num,val_num,test_num))

ftrainval = open(os.path.join(saveBasePath,'trainval.txt'), 'w')
ftest = open(os.path.join(saveBasePath,'test.txt'), 'w')
ftrain = open(os.path.join(saveBasePath,'train.txt'), 'w')
fval = open(os.path.join(saveBasePath,'val.txt'), 'w')

for i  in total_list:
    name=total_xml[i][:-4]+'\n' # the txt split files of a VOC dataset store only file names, so for a file such as "aa.xml" only "aa" is written to the txt file
    if i in trainval_list:
        ftrainval.write(name)
        if i in train_list:
            ftrain.write(name)
        else:
            fval.write(name)
    else:
        ftest.write(name)

ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
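
For reference, with the 1160 annotation files of SSDD and the ratios set above, the script should print the following (812 training, 232 validation and 116 test samples; the exact counts could shift by one because of the int() truncation):

train and val size 1044
train size: 812 , val size : 232 , test size : 116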

The following is the code that converts the VOC-format annotations (xml files) into COCO-format annotation files (json). The parseXmlFiles function converts all the xml annotation files in a given folder into one json file; parseXmlFiles_by_txt does the same thing, but selects which xml files to convert according to the train.txt / val.txt / test.txt files that define the training/validation/test split of the VOC-format dataset.

Note: the code in this part is adapted from a blog post on converting a VOC-format dataset into the COCO format. Comments have been added to the original code for easier understanding, and some details have been modified.
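
For orientation, the json file produced by the script has the standard COCO detection layout built up in the coco dict below. The field names come directly from the code; the concrete values here are only illustrative placeholders:

{
  "images":      [{"id": 1, "file_name": "000001.jpg", "width": 500, "height": 350}, ...],
  "type":        "instances",
  "annotations": [{"id": 1, "image_id": 1, "category_id": 0, "bbox": [xmin, ymin, w, h],
                   "area": w*h, "iscrowd": 0, "ignore": 0, "segmentation": [[...]]}, ...],
  "categories":  [{"id": 0, "name": "ship", "supercategory": "none"}, ...]
}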

import xml.etree.ElementTree as ET
import os
import json

coco = dict()
coco['images'] = []
coco['type'] = 'instances'
coco['annotations'] = []
coco['categories'] = []

category_set = dict() # this dict records the category names that appear in the dataset and their corresponding ids
image_set = set()

category_item_id = -1
image_id = 0
annotation_id = 0


def addCatItem(name): # adds an entry to the 'categories' field of the json file
    global category_item_id # category_item_id is defined outside this function; with the global keyword, assignments to it inside the function change the module-level variable
    category_item = dict()
    category_item['supercategory'] = 'none'
    category_item_id += 1
    category_item['id'] = category_item_id
    category_item['name'] = name
    coco['categories'].append(category_item)
    category_set[name] = category_item_id # record the new category in the category_set dict
    return category_item_id


def addImgItem(file_name, size):
    global image_id
    if file_name is None:
        raise Exception('Could not find filename tag in xml file.')
    if size['width'] is None:
        raise Exception('Could not find width tag in xml file.')
    if size['height'] is None:
        raise Exception('Could not find height tag in xml file.')
    image_id += 1
    image_item = dict()
    image_item['id'] = image_id
    image_item['file_name'] = file_name
    image_item['width'] = size['width']
    image_item['height'] = size['height']
    coco['images'].append(image_item)
    image_set.add(file_name)
    return image_id


def addAnnoItem(object_name, image_id, category_id, bbox):
    global annotation_id
    annotation_item = dict()
    annotation_item['segmentation'] = []
    seg = []
    # bbox[] is x,y,w,h
    # left_top
    seg.append(bbox[0])
    seg.append(bbox[1])
    # left_bottom
    seg.append(bbox[0])
    seg.append(bbox[1] + bbox[3])
    # right_bottom
    seg.append(bbox[0] + bbox[2])
    seg.append(bbox[1] + bbox[3])
    # right_top
    seg.append(bbox[0] + bbox[2])
    seg.append(bbox[1])

    annotation_item['segmentation'].append(seg)

    annotation_item['area'] = bbox[2] * bbox[3]
    annotation_item['iscrowd'] = 0
    annotation_item['ignore'] = 0
    annotation_item['image_id'] = image_id
    annotation_item['bbox'] = bbox
    annotation_item['category_id'] = category_id
    annotation_id += 1
    annotation_item['id'] = annotation_id
    coco['annotations'].append(annotation_item)


def _read_image_ids(image_sets_file):
    ids = []
    with open(image_sets_file) as f:
        for line in f:
            ids.append(line.rstrip())
    return ids


"""通过txt文件生成"""


# split = 'train', 'val', 'trainval' or 'test'
def parseXmlFiles_by_txt(data_dir, json_save_path, split='train'): # same function as parseXmlFiles, except that it converts only the xml files listed in the given txt split file
    labelfile = split + ".txt"
    image_sets_file = data_dir + "/ImageSets/Main/" + labelfile
    ids = _read_image_ids(image_sets_file)

    for _id in ids:
        xml_file = data_dir + f"/Annotations/{_id}.xml"

        bndbox = dict()
        size = dict()
        current_image_id = None
        current_category_id = None
        file_name = None
        size['width'] = None
        size['height'] = None
        size['depth'] = None

        tree = ET.parse(xml_file)
        root = tree.getroot()
        if root.tag != 'annotation':
            raise Exception('pascal voc xml root element should be annotation, rather than {}'.format(root.tag))

        # elem is <folder>, <filename>, <size>, <object>
        for elem in root:
            current_parent = elem.tag
            current_sub = None
            object_name = None

            if elem.tag == 'folder':
                continue

            if elem.tag == 'filename':
                file_name = elem.text
                if file_name in category_set:
                    raise Exception('file_name duplicated')

            # add img item only after parse <size> tag
            elif current_image_id is None and file_name is not None and size['width'] is not None:
                if file_name not in image_set:
                    current_image_id = addImgItem(file_name, size)
                    print('add image with {} and {}'.format(file_name, size))
                else:
                    raise Exception('duplicated image: {}'.format(file_name))
            # subelem is <width>, <height>, <depth>, <name>, <bndbox>
            for subelem in elem:
                bndbox['xmin'] = None
                bndbox['xmax'] = None
                bndbox['ymin'] = None
                bndbox['ymax'] = None

                current_sub = subelem.tag
                if current_parent == 'object' and subelem.tag == 'name': # reached the <name> entry inside an <object> tag (the category name of the target)
                    object_name = subelem.text
                    if object_name not in category_set: # if this category name has not been seen before, register it; category_set records the category names appearing in the dataset and their ids
                        current_category_id = addCatItem(object_name) # adds an entry to the 'categories' field of the json file
                    else:
                        current_category_id = category_set[object_name] # the category has been seen before, so look up its id in category_set

                elif current_parent == 'size': # reached the <size> entry of the xml file (the image dimensions)
                    if size[subelem.tag] is not None:
                        raise Exception('xml structure broken at size tag.')
                    size[subelem.tag] = int(subelem.text) # record the width, height and depth entries under <size>

                # option is <xmin>, <ymin>, <xmax>, <ymax>, when subelem is <bndbox>
                for option in subelem:
                    if current_sub == 'bndbox': # reached the <bndbox> entry inside an <object> tag (the bounding-box coordinates)
                        if bndbox[option.tag] is not None:
                            raise Exception('xml structure corrupted at bndbox tag.')
                        bndbox[option.tag] = int(option.text) # temporarily store the object's xmin, ymin, xmax and ymax in bndbox

                # only after parse the <object> tag
                if bndbox['xmin'] is not None: # once the bounding-box coordinates are available, all the information of the current object has been collected, so record it below
                    if object_name is None:
                        raise Exception('xml structure broken at bndbox tag')
                    if current_image_id is None:
                        raise Exception('xml structure broken at bndbox tag')
                    if current_category_id is None:
                        raise Exception('xml structure broken at bndbox tag')
                    bbox = []
                    # x
                    bbox.append(bndbox['xmin'])
                    # y
                    bbox.append(bndbox['ymin'])
                    # w
                    bbox.append(bndbox['xmax'] - bndbox['xmin'])
                    # h
                    bbox.append(bndbox['ymax'] - bndbox['ymin'])
                    print('add annotation with {},{},{},{}'.format(object_name, current_image_id, current_category_id,
                                                                   bbox))
                    addAnnoItem(object_name, current_image_id, current_category_id, bbox)
    json.dump(coco, open(json_save_path, 'w'))


"""直接从xml文件夹中生成"""


def parseXmlFiles(xml_path, json_save_path):
    for f in os.listdir(xml_path):
        if not f.endswith('.xml'):
            continue

        bndbox = dict()
        size = dict()
        current_image_id = None
        current_category_id = None
        file_name = None
        size['width'] = None
        size['height'] = None
        size['depth'] = None

        xml_file = os.path.join(xml_path, f) # xml_file is the full path of the current xml file
        print(xml_file)

        tree = ET.parse(xml_file)
        root = tree.getroot()
        if root.tag != 'annotation':
            raise Exception('pascal voc xml root element should be annotation, rather than {}'.format(root.tag))

        # elem is <folder>, <filename>, <size>, <object>
        for elem in root:
            current_parent = elem.tag
            current_sub = None
            object_name = None

            if elem.tag == 'folder':
                continue

            if elem.tag == 'filename':
                file_name = elem.text
                if file_name in category_set:
                    raise Exception('file_name duplicated')

            # add img item only after parse <size> tag
            elif current_image_id is None and file_name is not None and size['width'] is not None:
                if file_name not in image_set:
                    current_image_id = addImgItem(file_name, size)
                    print('add image with {} and {}'.format(file_name, size))
                else:
                    raise Exception('duplicated image: {}'.format(file_name))
            # subelem is <width>, <height>, <depth>, <name>, <bndbox>
            for subelem in elem:
                bndbox['xmin'] = None
                bndbox['xmax'] = None
                bndbox['ymin'] = None
                bndbox['ymax'] = None

                current_sub = subelem.tag
                if current_parent == 'object' and subelem.tag == 'name': # only handle the case where elem is <object> and subelem is <name>
                    object_name = subelem.text # the category name of the current object
                    if object_name not in category_set:
                        current_category_id = addCatItem(object_name)
                    else:
                        current_category_id = category_set[object_name]

                elif current_parent == 'size':
                    if size[subelem.tag] is not None:
                        raise Exception('xml structure broken at size tag.')
                    size[subelem.tag] = int(subelem.text)

                # option is <xmin>, <ymin>, <xmax>, <ymax>, when subelem is <bndbox>
                for option in subelem:
                    if current_sub == 'bndbox':
                        if bndbox[option.tag] is not None:
                            raise Exception('xml structure corrupted at bndbox tag.')
                        bndbox[option.tag] = int(option.text)

                # only after parse the <object> tag
                if bndbox['xmin'] is not None:
                    if object_name is None:
                        raise Exception('xml structure broken at bndbox tag')
                    if current_image_id is None:
                        raise Exception('xml structure broken at bndbox tag')
                    if current_category_id is None:
                        raise Exception('xml structure broken at bndbox tag')
                    bbox = []
                    # x
                    bbox.append(bndbox['xmin'])
                    # y
                    bbox.append(bndbox['ymin'])
                    # w
                    bbox.append(bndbox['xmax'] - bndbox['xmin'])
                    # h
                    bbox.append(bndbox['ymax'] - bndbox['ymin'])
                    print('add annotation with {},{},{},{}'.format(object_name, current_image_id, current_category_id,
                                                                   bbox))
                    addAnnoItem(object_name, current_image_id, current_category_id, bbox)
    json.dump(coco, open(json_save_path, 'w'))


if __name__ == '__main__':
    # generate from the txt split files
    voc_data_dir="/home/dwt/MyCode/pycharm_projects/YOLOX_sample/datasets/VOCdevkit-hrsid/VOC2011"
    json_save_path= "data/coco/hrsid_coco_annotations/train.json"
    parseXmlFiles_by_txt(voc_data_dir,json_save_path,"train")

    # # generate from a folder of xml files
    # ann_path = "E:/VOCdevkit/VOC2007/Annotations" # path to the voc annotation files
    # json_save_path = "E:/VOCdevkit/test.json" # path where the json file will be saved
    # parseXmlFiles(ann_path, json_save_path)
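
Note that coco, category_set, image_set and the id counters are module-level globals, so converting train, val and test in a single run would accumulate all splits into the same json content. A simple workaround (this is my own usage sketch, not part of the referenced blog code; the val/test output paths below merely mirror the train path) is to run the script once per split, changing the call in __main__ each time:

    # run 1
    parseXmlFiles_by_txt(voc_data_dir, "data/coco/hrsid_coco_annotations/train.json", "train")
    # run 2
    # parseXmlFiles_by_txt(voc_data_dir, "data/coco/hrsid_coco_annotations/val.json", "val")
    # run 3
    # parseXmlFiles_by_txt(voc_data_dir, "data/coco/hrsid_coco_annotations/test.json", "test")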

The following is the script I wrote after converting SSDD from VOC to COCO format (that is, after converting the VOC xml annotation files into COCO json annotation files). Based on the images listed in the three json files (train, val, test), it copies the corresponding pictures into three folders: image_train, image_val and image_test.

(1) Initial code

#move_rename_image.py
#This script renames each image to its corresponding id (looked up via the image id and file name in the annotation file) and copies it to a separate folder
import os
import json
import argparse
import  shutil

parser = argparse.ArgumentParser()
parser.add_argument('--source_path', default='D:/engineering/intrusion detection/dataset/preliminary_contest_crane_federal',type=str)
parser.add_argument('--save_path', default='D:/engineering/intrusion detection/dataset/diaoche_data', type=str, help="specify where to save the output dir of labels")
arg = parser.parse_args()

c = 0 # this variable counts how many labels have been processed; printing c shows the progress

if __name__ == '__main__':
    json_file = os.path.join(arg.source_path, 'annotations', 'train2.json')
    #image_file_catalog = os.path.join(arg.save_path, 'images')
    data = json.load(open(json_file, encoding="utf-8"))
    images = data['images']

    #in the loop below, rename each image to its corresponding id and copy it to the designated folder
    #when processing the train2 data, the plan is to append the train2 images after train1, so each train2 image is renamed to its annotation id plus 1159
    for img_info in images:
        img_name = img_info['file_name']
        img_id = img_info['id']
        src = os.path.join(arg.source_path,'images','train2',img_name)
        dst = os.path.join(arg.save_path,'images',f'{str(img_id + 1159)}.jpg')
        shutil.copy(src,dst)

        c += 1
        if c % 50 == 0:
            print("当前已经重命名并另存%d个吊机图片" % (c))

(2) The rewritten code, based on the initial code above

#image_tidy.py
'''After converting from VOC to COCO format, the images, which were originally all in one place, need to be separated into three folders: train, val and test'''
import os
import json
import argparse
import shutil


# train_txt = '/home/dwt/DataSets/SSDD_train_val_test/VOCdevkit/VOC2011/ImageSets/Main/train.txt'
# val_txt = '/home/dwt/DataSets/SSDD_train_val_test/VOCdevkit/VOC2011/ImageSets/Main/val.txt'
# test_txt = '/home/dwt/DataSets/SSDD_train_val_test/VOCdevkit/VOC2011/ImageSets/Main/test.txt'

train_json = '/home/dwt/DataSets/SSDD_train_val_test/coco_format/voc2011_train.json'
val_json = '/home/dwt/DataSets/SSDD_train_val_test/coco_format/voc2011_val.json'
test_json = '/home/dwt/DataSets/SSDD_train_val_test/coco_format/voc2011_test.json'

src_front = '/home/dwt/DataSets/SSDD_train_val_test/VOCdevkit'
dst_front = '/home/dwt/DataSets/SSDD_train_val_test/coco_format'

files = ["train","val","test"]
for file in files:
    if file == "train":
        dir_json = train_json
        dst_file = os.path.join(dst_front,"image_train")
    if file == "val":
        dir_json = val_json
        dst_file = os.path.join(dst_front, "image_val")
    if file == "test":
        dir_json = test_json
        dst_file = os.path.join(dst_front, "image_test")
    data = json.load(open(dir_json,encoding="utf-8"))
    images = data['images']

    for img_info in images:
        img_dir = img_info["file_name"] # e.g. "VOC2011/JPEGImages/000078.jpg"
        src = os.path.join(src_front,img_dir)
        dst = os.path.join(dst_file,img_dir[-10:]) # the last 10 characters are the bare file name, e.g. "000078.jpg"
        shutil.copy(src,dst)

    print("done")

print("Done!")

Original post: blog.csdn.net/qq_40641713/article/details/125694280