PyTorch Object Tracking (Part 2)

1. The tracking framework

Having only recently started on object tracking, I tried a fairly simple tracking approach.
A schematic of the idea is shown below:

Figure 1

Training: the inputs are the previous frame's target bounding box (bb for short), the previous frame's image, and the current frame's image; the label is the current frame's bb. Given these inputs, a patch four times the area of the bb (width and height each doubled) is cropped from both the previous and the current frame, centered on the bb, and each patch is resized to (127, 127). At the same time, the current frame's bb is transformed correspondingly to obtain its coordinates in the resized patch, as shown in Figure 2. The two resized patches are then passed through the convolutional layers; the resulting feature maps are flattened into one-dimensional vectors, concatenated, and fed to the fully connected layers, which are trained with the transformed current-frame bb as the label, as shown in Figure 1.
Testing: the inputs are the previous frame's target bounding box, the previous frame's image, and the current frame's image; the output is the current frame's bb. Given these inputs, a patch four times the area of the bb (width and height each doubled) is cropped from both frames, centered on the previous frame's bb, and each patch is resized to (127, 127). The two resized patches are passed through the convolutional layers; the features are flattened, concatenated, and fed to the fully connected layers, which predict the bb in the resized patch. Finally, this prediction is mapped back to the coordinates of the original, uncropped image, which gives the current frame's bb. That bb then serves as the crop region for the next frame.
Figure 1

Figure 2
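
A quick worked example of this coordinate bookkeeping (illustrative numbers only; the crop region and scaling follow the crop_resize function in util.py below, ignoring the clamping to the image border):

# Previous-frame bb in the original image, in [x, y, w, h] form (top-left corner plus size).
prev_bb = [100, 80, 40, 30]

# Crop region: twice the bb width and height, centred on the bb centre,
# i.e. from (x - w/2, y - h/2) to (x + 1.5*w, y + 1.5*h).
x1_s = int(prev_bb[0] - prev_bb[2] / 2)    # 80
y1_s = int(prev_bb[1] - prev_bb[3] / 2)    # 65
x2_s = int(prev_bb[0] + prev_bb[2] * 1.5)  # 160
y2_s = int(prev_bb[1] + prev_bb[3] * 1.5)  # 125
w_s, h_s = x2_s - x1_s, y2_s - y1_s        # crop size: 80 x 60

# Training label: the current-frame bb [x, y, w, h] is converted to corner form
# [x1, y1, x2, y2], shifted into the crop, then scaled to the 127x127 patch.
cur_bb = [104, 82, 40, 30]
corners = [cur_bb[0], cur_bb[1], cur_bb[0] + cur_bb[2], cur_bb[1] + cur_bb[3]]
shifted = [corners[0] - x1_s, corners[1] - y1_s, corners[2] - x1_s, corners[3] - y1_s]
scale = [127 / w_s, 127 / h_s] * 2
label = [c * s for c, s in zip(shifted, scale)]  # regression target in patch coordinates

# At test time the network output is mapped back the other way:
# multiply by (w_s / 127, h_s / 127) and add (x1_s, y1_s).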

The code is adapted from the paper shared via the netdisk link below, modified for this specific setup.
Netdisk link: https://pan.baidu.com/s/1Ys9a3eqpLZ_N9zaMzlU7mA
Extraction code: gc6r

2. Dataset

Each video in the dataset has a JSON label file whose content is a single dictionary, {'exist': list1, 'gt_rect': list2}, with len(list1) == len(list2) == frame_num. list1[frame_id] is 0 or 1 (0 means the target is not present, 1 means the target is in the image); list2[frame_id] is either [x, y, w, h] or [] ([] means the target is not present, [x, y, w, h] gives the target's position). The video data has been split into a sequence of PNG images in frame order, with one folder per video.
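
For example, a label file can be loaded and checked like this (a minimal sketch; the path is a placeholder for one of your own video folders):

import json

label_path = r'C:\Users\youchao\Desktop\test\20190925_101846_1_3\IR_label.json'  # placeholder path
with open(label_path, 'r') as f:
    annotation = json.load(f)  # {'exist': [...], 'gt_rect': [...]}

assert len(annotation['exist']) == len(annotation['gt_rect'])
for frame_id, (exist, rect) in enumerate(zip(annotation['exist'], annotation['gt_rect'])):
    if exist:
        print(frame_id, rect)                        # rect is [x, y, w, h]
    else:
        print(frame_id, 'target not in this frame')  # rect is []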

3. Splitting the videos into frames (get_frame.py)

import cv2
import os
import glob

def save_img():
    video_base_path = r'C:\Users\youchao\Desktop\test'
    video_dirs = glob.glob(os.path.join(video_base_path, '*'))
    for video_dir in video_dirs:
        video_path = os.path.join(video_dir, 'IR.mp4')
        frames_path = os.path.join(video_dir, 'IR')
        if not os.path.exists(frames_path):
            os.makedirs(frames_path)
        print(video_dir)
        cap = cv2.VideoCapture(video_path)  # open the video file
        count = 0
        ret = cap.isOpened()
        while ret:  # read the video frame by frame
            count += 1
            ret, frame = cap.read()
            frame_path = os.path.join(frames_path, '{:0>6}.png'.format(count))
            if ret:
                cv2.imwrite(frame_path, frame)  # save the frame as a zero-padded PNG, e.g. 000001.png
            else:
                break
        cap.release()
        print('save_success')
        print(frames_path)

save_img()
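
After splitting, it is worth checking that the number of extracted frames matches the number of label entries for each video, since the training code later asserts that they are equal (a small standalone sketch, assuming the same folder layout):

import glob
import json
import os

video_base_path = r'C:\Users\youchao\Desktop\test'
for video_dir in sorted(glob.glob(os.path.join(video_base_path, '*'))):
    frames = glob.glob(os.path.join(video_dir, 'IR', '*.png'))
    with open(os.path.join(video_dir, 'IR_label.json'), 'r') as f:
        annotation = json.load(f)
    # util.get_patch_path_and_annotation asserts these two lengths are equal.
    print(os.path.basename(video_dir), len(frames), len(annotation['exist']))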

4. Training and testing (mytest.py)

import glob
import os
import numpy as np
import json
import util
from torchvision import models
import torch
from PIL import Image
from torch.optim import lr_scheduler
from torchvision import transforms

def train(train_base_path):
    batch_size = 16
    n_epoch = 20
    all_video_annotation = util.get_patch_path_and_annotation(train_base_path)
    preprocessing_dataset = util.dataset(all_video_annotation=all_video_annotation)
    dataloader = torch.utils.data.DataLoader(dataset=preprocessing_dataset, batch_size=batch_size, shuffle=True)

    """ model"""
    model = util.SiamFC()
    print(model)
    model.train()
    if torch.cuda.is_available():
        model.cuda()
    loss_f = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    if not os.path.exists('./net'):
        os.makedirs('./net')

    """train"""
    for epoch in range(n_epoch):
        print('train.........')
        loss_batch = 0.0
        num = 0

        for i, sample in enumerate(dataloader, 1):
            # input
            x0 = sample['sample0']
            x1 = sample['sample1']
            y_gt = sample['bb']
            if torch.cuda.is_available():
                x0 = x0.cuda()
                x1 = x1.cuda()
                y_gt = y_gt.cuda()
            optimizer.zero_grad()

            # forward
            f_0 = model.feature(x0)
            f_1 = model.feature(x1)
            num0 = f_0.shape[0]
            num1 = f_1.shape[0]
            assert num0 == num1
            f_0 = f_0.view(num0, 9216)
            f_1 = f_1.view(num1, 9216)
            f = torch.cat((f_0, f_1), 1)
            # with torch.set_grad_enabled(True):
            y_pre = model.classifier(f)

            # loss and backward
            num += num0
            loss = loss_f(y_pre, y_gt)
            loss.backward()
            optimizer.step()
            loss_batch += loss.item() * num0
            if i % 20 == 0:
                print('sample_loss avg over {} batches: {:.4f}'.format(i, loss_batch / num))
        loss_epoch = loss_batch

        # adjust lr in every epoch
        scheduler.step()
        print('sample_loss avg epoch:{:.4f}'.format(loss_epoch / len(preprocessing_dataset)))
        # save model
        if epoch == n_epoch - 1:
            torch.save(model, f'./net/uav_{epoch}.pth')



def test(test_base_path,model_path):
    # test_base_path = r'C:\Users\youchao\Desktop\test'
    all_video_annotation, video_name_list = util.get_test_path_and_annotation(test_base_path)
    results_path = './results/'
    if not os.path.exists(results_path):
        os.makedirs(results_path)
    model = torch.load(model_path)
    model.eval()
    if torch.cuda.is_available():
        model.cuda()
    video_init = {'20190925_101846_1_3': [362, 306, 37, 24]}

    for i, video_name in enumerate(video_name_list):
        video_annotation = all_video_annotation[i]  # list.shape=(frame,4)
        labels = [video_init[video_name]]
        file_path = os.path.join(results_path, f'{video_name}_IR.json')
        for frame, frame_annotation in enumerate(video_annotation, 0):
            image0_path = frame_annotation[0]
            image1_path = frame_annotation[2]
            image0 = Image.open(image0_path)
            image1 = Image.open(image1_path)
            image0_id = int(os.path.basename(image0_path).split('.png')[0])
            image0_bounding_box = labels[image0_id - 1]  # list:[x,y,w,h]
            output_size = (127, 127)
            sample0 = util.crop_resize(image0, image0_bounding_box, output_size=output_size)
            sample1 = util.crop_resize(image1, image0_bounding_box, output_size=output_size)
            x0 = sample0['resize_img'].unsqueeze(0)
            x1 = sample1['resize_img'].unsqueeze(0)
            if torch.cuda.is_available():
                x0 = x0.cuda()
                x1 = x1.cuda()

            # forward
            f_0 = model.feature(x0)
            f_1 = model.feature(x1)
            num0 = f_0.shape[0]
            num1 = f_1.shape[0]
            assert num0 == num1
            f_0 = f_0.view(num0, 9216)
            f_1 = f_1.view(num1, 9216)
            f = torch.cat((f_0, f_1), 1)
            resize_bb = model.classifier(f)  # [x1,y1,x2,y2]
            resize_bb = np.squeeze(resize_bb.data.cpu().numpy())

            # map the predicted bb back to the original image coordinates
            w, h = image0.size
            x1_s = int(image0_bounding_box[0] - image0_bounding_box[2] / 2)
            y1_s = int(image0_bounding_box[1] - image0_bounding_box[3] / 2)
            x2_s = int(image0_bounding_box[0] + image0_bounding_box[2] * 1.5)
            y2_s = int(image0_bounding_box[1] + image0_bounding_box[3] * 1.5)
            x1_s = max(0, x1_s)
            y1_s = max(0, y1_s)
            x2_s = min(w, x2_s)
            y2_s = min(h, y2_s)
            h_s = y2_s - y1_s
            w_s = x2_s - x1_s
            scale = 2 * [w_s / output_size[1], h_s / output_size[0]]
            crop_bb = [float(a) * float(b) for a, b in zip(resize_bb, scale)]
            origin_coordinates = 2 * [x1_s, y1_s]  # top left coordinate of sample
            bb = [a + b for a, b in zip(crop_bb, origin_coordinates)]
            image1_bounding_box = [bb[0], bb[1], bb[2] - bb[0], bb[3] - bb[1]]
            labels.append(image1_bounding_box)
        with open(file_path, "w") as f:
            json.dump(labels, f)

if __name__ == '__main__':
    state = "train"
    if state == "train":
        train_base_path = r'C:\Users\youchao\Desktop\test'
        train(train_base_path)
    if state == "test":
        test_base_path = r'C:\Users\youchao\Desktop\test'
        model_path = 'net/uav_19.pth'
        test(test_base_path, model_path)

5. Helper functions and classes (util.py)

from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
import numpy as np
import torch
import torch.nn as nn
import json
import os
import glob

class dataset(Dataset):

    def __init__(self,all_video_annotation):
        self.all_video_annotation = all_video_annotation  # shape: (video_num * frame_num, 4)

    def __len__(self):
        return len(self.all_video_annotation)

    def __getitem__(self, item):
        image0_path = self.all_video_annotation[item][0] #str
        image0 = Image.open(image0_path)
        image0_bounding_box = self.all_video_annotation[item][1]#list:[x,y,w,h]
        image1_path = self.all_video_annotation[item][2]
        image1 = Image.open(image1_path)
        image1_bounding_box = self.all_video_annotation[item][3]
        sample0 = crop_resize(image0,image0_bounding_box)
        sample1 = crop_resize(image1,image0_bounding_box,image1_bounding_box)
        label = torch.Tensor(sample1['resize_bb'])
        sample = {'sample0':sample0['resize_img'], 'sample1': sample1['resize_img'], 'bb': label}
        return sample


def crop_resize(image,image0_bounding_box,image1_bounding_box=False,output_size=(127,127)):# output_size: tuple (h, w)
    # coordinates of the sample (crop) region
    x1_s = int(image0_bounding_box[0] - image0_bounding_box[2] / 2)
    y1_s = int(image0_bounding_box[1] - image0_bounding_box[3] / 2)
    x2_s = int(image0_bounding_box[0] + image0_bounding_box[2] * 1.5)
    y2_s = int(image0_bounding_box[1] + image0_bounding_box[3] * 1.5)
    x1_s = max(0, x1_s)
    y1_s = max(0, y1_s)
    w, h =image.size
    x2_s = min(w, x2_s)
    y2_s = min(h, y2_s)
    h_s = y2_s - y1_s
    w_s = x2_s - x1_s

    #crop
    data = np.array(image).astype(np.uint8)
    data = data[y1_s:y2_s,x1_s:x2_s,:]

    #resize
    img = Image.fromarray(data)
    transform_resize = transforms.Resize(output_size, interpolation=Image.NEAREST)
    transform_to_tensor = transforms.ToTensor()
    img = transform_to_tensor(transform_resize(img))


    # bb of image1 after crop
    if isinstance(image1_bounding_box, list):
        bb = [image1_bounding_box[0], image1_bounding_box[1],
               image1_bounding_box[0] + image1_bounding_box[2], image1_bounding_box[1] + image1_bounding_box[3]]
        origin_coordinates = 2 * [x1_s, y1_s]  # top left coordinate of sample
        crop_bb = [a - b for a, b in zip(bb, origin_coordinates)]
        scale = 2 * [output_size[1] / w_s, output_size[0] / h_s]
        resize_bb = [float(a) * float(b) for a, b in zip(crop_bb, scale)]
    else:
        resize_bb = []
    sample = {'resize_img': img, 'resize_bb': resize_bb}#resize_bb:[x1,y1,x2,y2]
    return sample

class SiamFC(nn.Module):

    def __init__(self):
        super(SiamFC, self).__init__()
        self.feature = nn.Sequential(
            # conv1
            nn.Conv2d(3, 96, 11, 2),
            nn.BatchNorm2d(96, eps=1e-6, momentum=0.05),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2),
            # conv2
            nn.Conv2d(96, 256, 5, 1, groups=2),
            nn.BatchNorm2d(256, eps=1e-6, momentum=0.05),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2),
            # conv3
            nn.Conv2d(256, 384, 3, 1),
            nn.BatchNorm2d(384, eps=1e-6, momentum=0.05),
            nn.ReLU(inplace=True),
            # conv4
            nn.Conv2d(384, 384, 3, 1, groups=2),
            nn.BatchNorm2d(384, eps=1e-6, momentum=0.05),
            nn.ReLU(inplace=True),
            # conv5
            nn.Conv2d(384, 256, 3, 1, groups=2))

        self.classifier = nn.Sequential(
            torch.nn.Linear(in_features=18432, out_features=4096, bias=True),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(p=0.5, inplace=False),
            torch.nn.Linear(in_features=4096, out_features=4096, bias=True),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(p=0.5, inplace=False),
            torch.nn.Linear(in_features=4096, out_features=1000, bias=True),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(p=0.5, inplace=False),
            torch.nn.Linear(in_features=1000, out_features=4, bias=True))

def get_patch_path_and_annotation(train_base_path):
    #base_path = r'C:\Users\youchao\Desktop\test'
    video_paths = sorted(glob.glob(os.path.join(train_base_path,'*')))
    all_video_annotation = [] # shape: (video_num * frame_num, 4)
    for video_path in video_paths:
        video_name = os.path.basename(video_path)
        annotation_path = os.path.join(train_base_path,video_name,'IR_label.json')
        patch_paths = sorted(glob.glob(os.path.join(train_base_path, video_name, 'IR', '*.png')))
        with open(annotation_path,'r') as file:
            annotation = json.load(file)# {'exist', 'gt_rect'}
        video_annotation = [] # shape: (frame_num, 2)
        assert len(annotation['exist']) == len(patch_paths)
        for frame_num in range(len(annotation['exist'])):
            object_exist = annotation['exist'][frame_num]
            if object_exist == 0:
                continue
            patch_path = patch_paths[frame_num]
            coordinate = annotation['gt_rect'][frame_num]
            patch_annotation = [patch_path,coordinate]
            video_annotation.append(patch_annotation)
        for index in range(len(video_annotation)-1):
            current_frame = video_annotation[index]
            next_frame = video_annotation[index+1]
            current_next = current_frame + next_frame #list:[c_path,[x,y,w,h],n_path,[x,y,w,h]]
            all_video_annotation.append(current_next)
    return all_video_annotation

def get_test_path_and_annotation(test_base_path):
    #base_path = r'C:\Users\youchao\Desktop\test'
    video_paths = sorted(glob.glob(os.path.join(test_base_path,'*')))
    all_video_annotation = [] # shape: (video_num, frame_num, 4)
    video_name_list = []
    for video_path in video_paths:
        video_name = os.path.basename(video_path)
        video_name_list.append(video_name)
        annotation_path = os.path.join(test_base_path,video_name,'IR_label.json')
        patch_paths = sorted(glob.glob(os.path.join(test_base_path, video_name, 'IR', '*.png')))
        with open(annotation_path,'r') as file:
            annotation = json.load(file)# {'exist', 'gt_rect'}
        video_annotation = [] #shape :(frame_num,2)
        test_video_annotation = []
        assert len(annotation['exist']) == len(patch_paths)
        for frame_num in range(len(annotation['exist'])):
            patch_exist = annotation['exist'][frame_num]
            patch_path = patch_paths[frame_num]
            patch_annotation = [patch_path,patch_exist]
            video_annotation.append(patch_annotation)
        for index in range(len(video_annotation)-1):
            current_frame = video_annotation[index]
            index_current = index
            while current_frame[1] == 0:
                index_current -=1
                current_frame = video_annotation[index_current]
            next_frame = video_annotation[index+1]
            current_next = current_frame + next_frame #list:[c_path,1,n_path,exist]
            test_video_annotation.append(current_next)
        all_video_annotation.append(test_video_annotation)
    return all_video_annotation,video_name_list#list:[video_num , frame,4]
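
Since the classifier's first linear layer expects 18432 = 2 x 9216 input features, it is worth verifying that the feature extractor really produces a 256 x 6 x 6 map for a (127, 127) input. A quick standalone sanity check (not part of the scripts above):

import torch
import util

# A 1 x 3 x 127 x 127 dummy input should give a 1 x 256 x 6 x 6 feature map,
# i.e. 9216 values per image; two concatenated feature vectors then give the
# 18432 features the classifier expects.
model = util.SiamFC()
x = torch.randn(1, 3, 127, 127)
f = model.feature(x)
print(f.shape)               # expected: torch.Size([1, 256, 6, 6])
print(f.view(1, -1).shape)   # expected: torch.Size([1, 9216])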

6. Remaining problems

The framework looks good on paper: once the model is trained, feeding it the previous and current frames regresses the target box. In practice, however, errors accumulate. Starting from the first frame, the bounding box (bb) predicted for the second frame is off by a few pixels; when that bb is then used as the reference for cropping the next frame, the content of the crop shifts away from the original target, the shift keeps compounding, and eventually the tracker no longer knows what the target is. In the experiments, this accumulated error eventually shrank the predicted bb = [x, y, w, h] until w or h fell between 0 and 1, i.e. less than one pixel; once the value dropped below about 0.5, the int() truncation used when computing the crop coordinates produced a zero-sized crop, and the code crashed with an image index-out-of-range error.
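
One possible mitigation (not in the original code, just a hedged sketch) is to clamp the predicted box to a minimum size and to the image bounds before using it as the next crop region; this only prevents the crash, and the accumulated drift itself remains:

# Hypothetical guard, not part of the pipeline above: keep a predicted
# bb = [x, y, w, h] at least min_size pixels wide/high and inside the image.
def clamp_bb(bb, img_w, img_h, min_size=2.0):
    x, y, w, h = bb
    w = max(min(w, img_w), min_size)
    h = max(min(h, img_h), min_size)
    x = min(max(x, 0.0), img_w - w)
    y = min(max(y, 0.0), img_h - h)
    return [x, y, w, h]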
