一个神经网络的开始首先是要对数据进行处理。
目标检测图像分为两个部分,一个是原始图像,另外一个是标志的xml文件。
由于pytorch数据使用迭代器加载,所以处理都是一张一张图像的处理。
1.加载图像和标签。
标签包括boundingbox和其名称标签,由于boundingbox和其标签有多个,所有使用循环读取。返回图像及其标签。
def get_example(self, i):
"""Returns the i-th example.
Returns a color image and bounding boxes. The image is in CHW format.
The returned image is RGB.
Args:
i (int): The index of the example.
Returns:
tuple of an image and bounding boxes
"""
id_ = self.ids[i]
anno = ET.parse(
os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
bbox = list()
label = list()
difficult = list()
for obj in anno.findall('object'):
# when in not using difficult split, and the object is
# difficult, skipt it.
if not self.use_difficult and int(obj.find('difficult').text) == 1:
continue
difficult.append(int(obj.find('difficult').text))
bndbox_anno = obj.find('bndbox')
# subtract 1 to make pixel indexes 0-based
bbox.append([
int(bndbox_anno.find(tag).text) - 1
for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
name = obj.find('name').text.lower().strip()
# print(name)
label.append(VOC_BBOX_LABEL_NAMES.index(name))
bbox = np.stack(bbox).astype(np.float32)
label = np.stack(label).astype(np.int32)
# When `use_difficult==False`, all elements in `difficult` are False.
difficult = np.array(difficult, dtype=np.bool).astype(
np.uint8) # PyTorch don't support np.bool
# Load a image
img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
img = read_image(img_file, color=True)
# if self.return_difficult:
# return img, bbox, label, difficult
return img, bbox, label, difficult
2.将图像转化为pytorch固定的数据格式。
class Transform(object):
def __init__(self, min_size=600, max_size=1000):
self.min_size = min_size
self.max_size = max_size
def __call__(self, in_data):
img, bbox, label = in_data
_, H, W = img.shape
img = preprocess(img, self.min_size, self.max_size)
_, o_H, o_W = img.shape
scale = o_H / H
bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))
# horizontally flip
img, params = util.random_flip(
img, x_random=True, return_param=True)
bbox = util.flip_bbox(
bbox, (o_H, o_W), x_flip=params['x_flip'])
return img, bbox, label, scale
在转化之前,由于输入的图像大小是不确定的,所有将图像大小转化为规定范围内:
def preprocess(img, min_size=600, max_size=1000):
C, H, W = img.shape
scale1 = min_size / min(H, W)
scale2 = max_size / max(H, W)
scale = min(scale1, scale2)
img = img / 255.
img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect')
# both the longer and shorter should be less than
# max_size and min_size
if opt.caffe_pretrain:
normalize = caffe_normalize
else:
normalize = pytorch_normalze
return normalize(img)
由于可以选择预训练模型,所以提供两种数据格式转化
def pytorch_normalze(img):
"""
https://github.com/pytorch/vision/issues/223
return appr -1~1 RGB
"""
normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
img = normalize(t.from_numpy(img))
return img.numpy()
def caffe_normalize(img):
"""
return appr -125-125 BGR
"""
img = img[[2, 1, 0], :, :] # RGB-BGR
img = img * 255
mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
img = (img - mean).astype(np.float32, copy=True)
return img
3.所以封装成pytorch的数据读取类就入下面这样:
class Dataset:
def __init__(self, opt):
self.opt = opt
self.db = VOCBboxDataset(opt.voc_data_dir)
self.tsf = Transform(opt.min_size, opt.max_size)
def __getitem__(self, idx):
ori_img, bbox, label, difficult = self.db.get_example(idx)
img, bbox, label, scale = self.tsf((ori_img, bbox, label))
# TODO: check whose stride is negative to fix this instead copy all
# some of the strides of a given numpy array are negative.
return img.copy(), bbox.copy(), label.copy(), scale
def __len__(self):
return len(self.db)
4.最后在设置batch_size之类的迭代器:
dataloader = data_.DataLoader(dataset,
batch_size=1,
shuffle=True, \
# pin_memory=True,
num_workers=opt.num_workers)
5.测试集类似
扫描二维码关注公众号,回复:
3726384 查看本文章