1.三步法写自定义Torch的DataLoader - 知乎
2.pytorch Dataset, DataLoader产生自定义的训练数据_pan_jinquan的博客-CSDN博客
3.GitHub - JDAI-CV/fast-reid: SOTA Re-identification Methods and Toolbox
4.https://github.com/chenyuntc/simple-faster-rcnn-pytorch
第一个链接,会给出一个大体的框架思路,第二个链接,是一个写得比较好的且完整的例子,第三个链接,是reid的自定义dataloader的实例,第四个链接,是目标检测自定义的VOC dataloader的实例。
1.以三步走框架分析Market1501自定义的dataloader
1)loaddata函数
从知乎链接1上来看,分类任务中这个函数,主要是返回图像的路径、训练所需要的label信息等。
分析下面的代码,由于ReID训练过程中,除了图像路径信息外,还需要对应的id与camera信息。类Market1501中的方法,初始化后,就会返回train、query、gallery等的图像路径信息,对应的id与camera信息。
# encoding: utf-8
"""
@author: sherlock
@contact: [email protected]
"""
import glob
import os.path as osp
import re
import warnings
from .bases import ImageDataset
from ..datasets import DATASET_REGISTRY
@DATASET_REGISTRY.register()
class Market1501(ImageDataset):
    """Market1501 person re-identification dataset.

    Reference:
        Zheng et al. Scalable Person Re-identification: A Benchmark. ICCV 2015.

    URL: `<http://www.liangzheng.org/Project/project_reid.html>`_

    Dataset statistics:
        - identities: 1501 (+1 for background).
        - images: 12936 (train) + 3368 (query) + 15913 (gallery).
    """
    _junk_pids = [0, -1]
    dataset_dir = ''
    dataset_url = 'http://188.138.127.15:81/Datasets/Market-1501-v15.09.15.zip'
    dataset_name = "market1501"

    def __init__(self, root='datasets', market1501_500k=False, **kwargs):
        # self.root = osp.abspath(osp.expanduser(root))
        self.root = root
        self.dataset_dir = osp.join(self.root, self.dataset_dir)

        # Allow the alternative layout where the splits live directly under
        # a "Market-1501-v15.09.15" sub-directory.
        self.data_dir = self.dataset_dir
        nested_dir = osp.join(self.data_dir, 'Market-1501-v15.09.15')
        if osp.isdir(nested_dir):
            self.data_dir = nested_dir
        else:
            warnings.warn('The current data structure is deprecated. Please '
                          'put data folders such as "bounding_box_train" under '
                          '"Market-1501-v15.09.15".')

        self.train_dir = osp.join(self.data_dir, 'bounding_box_train')
        self.query_dir = osp.join(self.data_dir, 'query')
        self.gallery_dir = osp.join(self.data_dir, 'bounding_box_test')
        self.extra_gallery_dir = osp.join(self.data_dir, 'images')
        self.market1501_500k = market1501_500k

        required_files = [
            self.data_dir,
            self.train_dir,
            self.query_dir,
            self.gallery_dir,
        ]
        if self.market1501_500k:
            required_files.append(self.extra_gallery_dir)
        self.check_before_run(required_files)

        # The splits are handed to the base class as zero-argument callables,
        # so the directory scans only run when they are actually invoked.
        train = lambda: self.process_dir(self.train_dir)
        query = lambda: self.process_dir(self.query_dir, is_train=False)

        def gallery():
            items = self.process_dir(self.gallery_dir, is_train=False)
            if self.market1501_500k:
                items += self.process_dir(self.extra_gallery_dir, is_train=False)
            return items

        super().__init__(train, query, gallery, **kwargs)

    def process_dir(self, dir_path, is_train=True):
        """Collect (img_path, pid, camid) triplets from one split directory.

        File names look like "0002_c1s1_000451_03.jpg": the leading number is
        the person id and the digit after "_c" is the camera id.
        """
        name_pattern = re.compile(r'([-\d]+)_c(\d)')
        data = []
        for img_path in glob.glob(osp.join(dir_path, '*.jpg')):
            pid, camid = map(int, name_pattern.search(img_path).groups())
            if pid == -1:
                continue  # junk images are just ignored
            assert 0 <= pid <= 1501  # pid == 0 means background
            assert 1 <= camid <= 6
            camid -= 1  # index starts from 0
            if is_train:
                # Prefix ids with the dataset name so they remain unique
                # when several datasets are combined for training.
                pid = self.dataset_name + "_" + str(pid)
                camid = self.dataset_name + "_" + str(camid)
            data.append((img_path, pid, camid))
        return data
2)继承Dataset,重写初始化函数,定义自己的Dataset
从知乎链接1上来看,可以看到这步会读取图片,归一化,并将图片转化为NCHW(pytorch的默认输入数据结构顺序)。
可能需要分析的是 __getitem__ 函数:在python中__getitem__(self,key)方法被称为魔法方法,这个方法返回所给键对应的值。当对象是序列时,键是整数。当对象是映射时(字典),键是任意值。
ReID中,也类似,这个地方加了一个transformer。img = read_image(img_path)。__getitem__只需要给输入图像的index就可以得到:处理好的图像、id、camera信息、图像路径。
https://github.com/JDAI-CV/fast-reid/blob/master/fastreid/data/common.py
# encoding: utf-8
"""
@author: liaoxingyu
@contact: [email protected]
"""
from torch.utils.data import Dataset
from .data_utils import read_image
class CommDataset(Dataset):
    """Image person re-identification dataset.

    Wraps a list of (img_path, pid, camid) items; when ``relabel`` is True,
    person ids and camera ids are remapped to contiguous 0-based indices.
    """

    def __init__(self, img_items, transform=None, relabel=True):
        self.img_items = img_items
        self.transform = transform
        self.relabel = relabel

        # Collect the distinct person / camera ids in sorted order.
        self.pids = sorted({item[1] for item in img_items})
        self.cams = sorted({item[2] for item in img_items})
        if relabel:
            # Original id -> contiguous 0-based training label.
            self.pid_dict = {p: idx for idx, p in enumerate(self.pids)}
            self.cam_dict = {c: idx for idx, c in enumerate(self.cams)}

    def __len__(self):
        return len(self.img_items)

    def __getitem__(self, index):
        item = self.img_items[index]
        img_path, pid, camid = item[0], item[1], item[2]

        img = read_image(img_path)
        if self.transform is not None:
            img = self.transform(img)
        if self.relabel:
            pid = self.pid_dict[pid]
            camid = self.cam_dict[camid]
        return {
            "images": img,
            "targets": pid,
            "camids": camid,
            "img_paths": img_path,
        }

    @property
    def num_classes(self):
        """Number of distinct person identities."""
        return len(self.pids)

    @property
    def num_cameras(self):
        """Number of distinct cameras."""
        return len(self.cams)
3)DataLoader把自定义Dataset包起来
创建一个CommDataset的实例,使用DataLoader进行调用。此处代码高度集成,所以参考着看看:
# line 41
# Wrap the raw item list into a CommDataset (train_items / transforms are
# presumably built earlier in fast-reid's build code -- not shown here).
train_set = CommDataset(train_items, transforms, relabel=True)
# line 72-
@configurable(from_config=_train_loader_from_config)
def build_reid_train_loader(
        train_set, *, sampler=None, total_batch_size, num_workers=0,
):
    """
    Build a dataloader for object re-identification with some default features.

    This interface is experimental.

    Returns:
        torch.utils.data.DataLoader: a dataloader.
    """
    # Split the global batch evenly across the participating processes.
    per_process_batch = total_batch_size // comm.get_world_size()
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, per_process_batch, True)

    return DataLoaderX(
        comm.get_local_rank(),
        dataset=train_set,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=fast_batch_collator,
        pin_memory=True,
    )
2.VOC自定义的dataloader
参考三步走来说明下。
https://github.com/chenyuntc/simple-faster-rcnn-pytorch/blob/master/train.py
1)loaddata函数、定义自己的Dataset
此处把三步走的前两步都做了。
主要是解析voc的xml文件,然后读图,__getitem__将img, bbox, label, difficult几个结果返回。
import os
import xml.etree.ElementTree as ET
import numpy as np
from .util import read_image
class VOCBboxDataset:
    """Bounding box dataset for PASCAL `VOC`_.

    .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/

    The index corresponds to each image.

    When queried by an index, if :obj:`return_difficult == False`,
    this dataset returns a corresponding
    :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels.
    This is the default behaviour.
    If :obj:`return_difficult == True`, this dataset returns corresponding
    :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array
    that indicates whether bounding boxes are labeled as difficult or not.

    The bounding boxes are packed into a two dimensional tensor of shape
    :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
    the image. The second axis represents attributes of the bounding box.
    They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the
    four attributes are coordinates of the top left and the bottom right
    vertices.

    The labels are packed into a one dimensional tensor of shape :math:`(R,)`.
    :math:`R` is the number of bounding boxes in the image.
    The class name of the label :math:`l` is :math:`l` th element of
    :obj:`VOC_BBOX_LABEL_NAMES`.

    The array :obj:`difficult` is a one dimensional boolean array of shape
    :math:`(R,)`. :math:`R` is the number of bounding boxes in the image.
    If :obj:`use_difficult` is :obj:`False`, this array is
    a boolean array with all :obj:`False`.

    The type of the image, the bounding boxes and the labels are as follows.

    * :obj:`img.dtype == numpy.float32`
    * :obj:`bbox.dtype == numpy.float32`
    * :obj:`label.dtype == numpy.int32`
    * :obj:`difficult.dtype == numpy.uint8` (0/1 flags; see :meth:`get_example`)

    Args:
        data_dir (string): Path to the root of the training data.
            i.e. "/data/image/voc/VOCdevkit/VOC2007/"
        split ({'train', 'val', 'trainval', 'test'}): Select a split of the
            dataset. :obj:`test` split is only available for
            2007 dataset.
        use_difficult (bool): If :obj:`True`, use images that are labeled as
            difficult in the original annotation.
        return_difficult (bool): If :obj:`True`, this dataset returns
            a boolean array
            that indicates whether bounding boxes are labeled as difficult
            or not. The default value is :obj:`False`.
    """

    def __init__(self, data_dir, split='trainval',
                 use_difficult=False, return_difficult=False,
                 ):
        id_list_file = os.path.join(
            data_dir, 'ImageSets/Main/{0}.txt'.format(split))
        # Fix: read the id list inside a context manager -- the original
        # left the file handle open (resource leak).
        with open(id_list_file) as f:
            self.ids = [id_.strip() for id_ in f]
        self.data_dir = data_dir
        self.use_difficult = use_difficult
        self.return_difficult = return_difficult
        self.label_names = VOC_BBOX_LABEL_NAMES

    def __len__(self):
        return len(self.ids)

    def get_example(self, i):
        """Returns the i-th example.

        Returns a color image and bounding boxes. The image is in CHW format.
        The returned image is RGB.

        Args:
            i (int): The index of the example.

        Returns:
            tuple of an image and bounding boxes
        """
        id_ = self.ids[i]
        anno = ET.parse(
            os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
        bbox = list()
        label = list()
        difficult = list()
        for obj in anno.findall('object'):
            # When not using the difficult split, skip objects that are
            # marked difficult in the annotation.
            if not self.use_difficult and int(obj.find('difficult').text) == 1:
                continue

            difficult.append(int(obj.find('difficult').text))
            bndbox_anno = obj.find('bndbox')
            # subtract 1 to make pixel indexes 0-based
            bbox.append([
                int(bndbox_anno.find(tag).text) - 1
                for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
            name = obj.find('name').text.lower().strip()
            label.append(VOC_BBOX_LABEL_NAMES.index(name))
        bbox = np.stack(bbox).astype(np.float32)
        label = np.stack(label).astype(np.int32)
        # When `use_difficult==False`, all elements in `difficult` are False.
        # Fix: `np.bool` was removed in NumPy 1.24 -- use the builtin `bool`.
        # The cast to uint8 is kept because PyTorch rejects bool arrays here.
        difficult = np.array(difficult, dtype=bool).astype(np.uint8)

        # Load the image.
        img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
        img = read_image(img_file, color=True)

        return img, bbox, label, difficult

    __getitem__ = get_example


VOC_BBOX_LABEL_NAMES = (
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor')
2) 封装train_dataset与test_dataset
下面的代码主要是将训练与测试的数据经过不同的transformer封装起来。
from __future__ import absolute_import
from __future__ import division
import torch as t
from data.voc_dataset import VOCBboxDataset
from skimage import transform as sktsf
from torchvision import transforms as tvtsf
from data import util
import numpy as np
from utils.config import opt
def inverse_normalize(img):
    """Undo the training normalization so a CHW image can be visualized.

    For caffe-pretrained weights the BGR mean is added back and the channel
    order reversed (BGR -> RGB, matching the swap done in caffe_normalize);
    otherwise an approximate inverse of the pytorch normalization is applied.
    """
    if opt.caffe_pretrain:
        mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
        return (img + mean)[::-1, :, :]
    # approximate un-normalize for visualize
    return (img * 0.225 + 0.45).clip(min=0, max=1) * 255
def pytorch_normalze(img):
    """Normalize a CHW float image with ImageNet mean/std.

    https://github.com/pytorch/vision/issues/223
    return appr -1~1 RGB
    """
    imagenet_stats = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    return imagenet_stats(t.from_numpy(img)).numpy()
def caffe_normalize(img):
    """Convert a 0-1 CHW RGB image to Caffe-style input.

    return appr -125-125 BGR
    """
    bgr = img[[2, 1, 0], :, :] * 255  # RGB-BGR, then rescale to 0-255
    mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
    return (bgr - mean).astype(np.float32, copy=True)
def preprocess(img, min_size=600, max_size=1000):
    """Preprocess an image for feature extraction.

    The length of the shorter edge is scaled to :obj:`min_size`. After the
    scaling, if the length of the longer edge is longer than :obj:`max_size`,
    the image is scaled to fit the longer edge to :obj:`max_size`. After
    resizing, the image is normalized (mean subtraction / standardization
    depending on the pretrained backbone).

    Args:
        img (~numpy.ndarray): An image. This is in CHW and RGB format.
            The range of its value is :math:`[0, 255]`.
        min_size (int): Target length of the shorter edge.
        max_size (int): Upper bound for the longer edge.

    Returns:
        ~numpy.ndarray: A preprocessed image.
    """
    C, H, W = img.shape
    # Pick the scale so the short side reaches min_size unless that would
    # push the long side past max_size.
    scale = min(min_size / min(H, W), max_size / max(H, W))

    resized = sktsf.resize(img / 255., (C, H * scale, W * scale),
                           mode='reflect', anti_aliasing=False)

    # Both the longer and shorter side are now within max_size / min_size.
    normalize = caffe_normalize if opt.caffe_pretrain else pytorch_normalze
    return normalize(resized)
class Transform(object):
    """Training-time transform: resize, rescale boxes, random horizontal flip."""

    def __init__(self, min_size=600, max_size=1000):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, in_data):
        img, bbox, label = in_data
        _, orig_h, orig_w = img.shape

        img = preprocess(img, self.min_size, self.max_size)
        _, new_h, new_w = img.shape
        scale = new_h / orig_h
        bbox = util.resize_bbox(bbox, (orig_h, orig_w), (new_h, new_w))

        # Horizontally flip image and boxes together (random).
        img, params = util.random_flip(
            img, x_random=True, return_param=True)
        bbox = util.flip_bbox(
            bbox, (new_h, new_w), x_flip=params['x_flip'])

        return img, bbox, label, scale
class Dataset:
    """Training dataset: VOC examples run through the training Transform."""

    def __init__(self, opt):
        self.opt = opt
        self.db = VOCBboxDataset(opt.voc_data_dir)
        self.tsf = Transform(opt.min_size, opt.max_size)

    def __getitem__(self, idx):
        ori_img, bbox, label, difficult = self.db.get_example(idx)
        img, bbox, label, scale = self.tsf((ori_img, bbox, label))
        # TODO: check whose stride is negative to fix this instead copy all
        # Some of the strides of a given numpy array are negative (flip),
        # so return contiguous copies.
        return img.copy(), bbox.copy(), label.copy(), scale

    def __len__(self):
        return len(self.db)
class TestDataset:
    """Evaluation dataset: only preprocessing, no augmentation."""

    def __init__(self, opt, split='test', use_difficult=True):
        self.opt = opt
        self.db = VOCBboxDataset(opt.voc_data_dir, split=split,
                                 use_difficult=use_difficult)

    def __getitem__(self, idx):
        ori_img, bbox, label, difficult = self.db.get_example(idx)
        img = preprocess(ori_img)
        # Also report the original (H, W) so predictions can be mapped back.
        return img, ori_img.shape[1:], bbox, label, difficult

    def __len__(self):
        return len(self.db)
3)DataLoader把自定义Dataset包起来
下面没有什么需要说的了。
# Build the train/test datasets and wrap them in DataLoaders.
dataset = Dataset(opt)
print('load data')
dataloader = data_.DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
    # pin_memory=True,
    num_workers=opt.num_workers,
)

testset = TestDataset(opt)
test_dataloader = data_.DataLoader(
    testset,
    batch_size=1,
    num_workers=opt.test_num_workers,
    shuffle=False,
    pin_memory=True,
)