yolo v1笔记

1.代码

1.1 net

class YOLO_V1(nn.Module):
    def __init__(self):
        super(YOLO_V1, self).__init__()
        C = 20  # number of classes
        print("\n------Initiating YOLO v1------\n")
        self.conv_layer1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=7//2),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.conv_layer2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=192, kernel_size=3, stride=1, padding=3//2),
            nn.BatchNorm2d(192),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.conv_layer3 = nn.Sequential(
            nn.Conv2d(in_channels=192, out_channels=128, kernel_size=1, stride=1, padding=1//2),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=3//2),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1, stride=1, padding=1//2),
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=3//2),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.conv_layer4 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=1//2),
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=3//2),
            nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=1//2),
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=3//2),
            nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=1//2),
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=3//2),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=1, stride=1, padding=1//2),
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=3//2),
            nn.BatchNorm2d(1024),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.conv_layer5 = nn.Sequential(
            nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=1//2),
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=3//2),
            nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=1//2),
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=3//2),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=3//2),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=2, padding=3//2),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
        )
        self.conv_layer6 = nn.Sequential(
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=3//2),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=3//2),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )
        self.flatten = Flatten()
        self.conn_layer1 = nn.Sequential(
            nn.Linear(in_features=7*7*1024, out_features=4096),
            nn.Dropout(),
            nn.LeakyReLU(0.1)
        )
        self.conn_layer2 = nn.Sequential(nn.Linear(in_features=4096, out_features=7 * 7 * (2 * 5 + C)))

    def forward(self, input):
        conv_layer1 = self.conv_layer1(input)
        conv_layer2 = self.conv_layer2(conv_layer1)
        conv_layer3 = self.conv_layer3(conv_layer2)
        conv_layer4 = self.conv_layer4(conv_layer3)
        conv_layer5 = self.conv_layer5(conv_layer4)
        conv_layer6 = self.conv_layer6(conv_layer5)
        flatten = self.flatten(conv_layer6)
        conn_layer1 = self.conn_layer1(flatten)
        output = self.conn_layer2(conn_layer1)
        return output

1.2 loss

class YoloLoss(nn.Module):
    def __init__(self, n_batch, B, C, lambda_coord, lambda_noobj, use_gpu=False):
        """
        :param n_batch: number of batches
        :param B: number of bounding boxes
        :param C: number of bounding classes
        :param lambda_coord: factor for loss which contain objects
        :param lambda_noobj: factor for loss which do not contain objects
        """
        super(YoloLoss, self).__init__()
        self.n_batch = n_batch
        self.B = B # assume there are two bounding boxes
        self.C = C
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        self.use_gpu = use_gpu

    def compute_iou(self, bbox1, bbox2):
        """
        Compute the intersection over union of two set of boxes, each box is [x1,y1,w,h]
        :param bbox1: (tensor) bounding boxes, size [N,4]
        :param bbox2: (tensor) bounding boxes, size [M,4]
        :return:
        """
        # compute [x1,y1,x2,y2] w.r.t. top left and bottom right coordinates separately
        b1x1y1 = bbox1[:,:2]-bbox1[:,2:]**2 # [N, (x1,y1)=2]
        b1x2y2 = bbox1[:,:2]+bbox1[:,2:]**2 # [N, (x2,y2)=2]
        b2x1y1 = bbox2[:,:2]-bbox2[:,2:]**2 # [M, (x1,y1)=2]
        b2x2y2 = bbox2[:,:2]+bbox2[:,2:]**2 # [M, (x1,y1)=2]
        box1 = torch.cat((b1x1y1.view(-1,2), b1x2y2.view(-1, 2)), dim=1) # [N,4], 4=[x1,y1,x2,y2]
        box2 = torch.cat((b2x1y1.view(-1,2), b2x2y2.view(-1, 2)), dim=1) # [M,4], 4=[x1,y1,x2,y2]
        N = box1.size(0)
        M = box2.size(0)

        tl = torch.max(
            box1[:,:2].unsqueeze(1).expand(N,M,2),  # [N,2] -> [N,1,2] -> [N,M,2]
            box2[:,:2].unsqueeze(0).expand(N,M,2),  # [M,2] -> [1,M,2] -> [N,M,2]
        )
        br = torch.min(
            box1[:,2:].unsqueeze(1).expand(N,M,2),  # [N,2] -> [N,1,2] -> [N,M,2]
            box2[:,2:].unsqueeze(0).expand(N,M,2),  # [M,2] -> [1,M,2] -> [N,M,2]
        )

        wh = br - tl  # [N,M,2]
        wh[(wh<0).detach()] = 0
        #wh[wh<0] = 0
        inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

        area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1])  # [N,]
        area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1])  # [M,]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N,1] -> [N,M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1,M] -> [N,M]

        iou = inter / (area1 + area2 - inter)
        return iou

    def forward(self, pred_tensor, target_tensor):
        """
        :param pred_tensor: [batch,SxSx(Bx5+20))]
        :param target_tensor: [batch,S,S,Bx5+20]
        :return: total loss
        """
        n_elements = self.B * 5 + self.C
        batch = target_tensor.size(0)
        target_tensor = target_tensor.view(batch,-1,n_elements)
        #print(target_tensor.size())
        #print(pred_tensor.size())
        pred_tensor = pred_tensor.view(batch,-1,n_elements)
        coord_mask = target_tensor[:,:,5] > 0
        noobj_mask = target_tensor[:,:,5] == 0
        coord_mask = coord_mask.unsqueeze(-1).expand_as(target_tensor)
        noobj_mask = noobj_mask.unsqueeze(-1).expand_as(target_tensor)

        coord_target = target_tensor[coord_mask].view(-1,n_elements)
        coord_pred = pred_tensor[coord_mask].view(-1,n_elements)
        class_pred = coord_pred[:,self.B*5:]
        class_target = coord_target[:,self.B*5:]
        box_pred = coord_pred[:,:self.B*5].contiguous().view(-1,5)
        box_target = coord_target[:,:self.B*5].contiguous().view(-1,5)

        noobj_target = target_tensor[noobj_mask].view(-1,n_elements)
        noobj_pred = pred_tensor[noobj_mask].view(-1,n_elements)

        # compute loss which do not contain objects
        if self.use_gpu:
            noobj_target_mask = torch.cuda.ByteTensor(noobj_target.size())
        else:
            noobj_target_mask = torch.ByteTensor(noobj_target.size())
        noobj_target_mask.zero_()
        for i in range(self.B):
            noobj_target_mask[:,i*5+4] = 1
        noobj_target_c = noobj_target[noobj_target_mask] # only compute loss of c size [2*B*noobj_target.size(0)]
        noobj_pred_c = noobj_pred[noobj_target_mask]
        noobj_loss = functional.mse_loss(noobj_pred_c, noobj_target_c, size_average=False)

        # compute loss which contain objects
        if self.use_gpu:
            coord_response_mask = torch.cuda.ByteTensor(box_target.size())
            coord_not_response_mask = torch.cuda.ByteTensor(box_target.size())
        else:
            coord_response_mask = torch.ByteTensor(box_target.size())
            coord_not_response_mask = torch.ByteTensor(box_target.size())
        coord_response_mask.zero_()
        coord_not_response_mask = ~coord_not_response_mask.zero_()
        for i in range(0,box_target.size()[0],self.B):
            box1 = box_pred[i:i+self.B]
            box2 = box_target[i:i+self.B]
            iou = self.compute_iou(box1[:, :4], box2[:, :4])
            max_iou, max_index = iou.max(0)
            if self.use_gpu:
                max_index = max_index.data.cuda()
            else:
                max_index = max_index.data
            coord_response_mask[i+max_index]=1
            coord_not_response_mask[i+max_index]=0

        # 1. response loss
        box_pred_response = box_pred[coord_response_mask].view(-1, 5)
        box_target_response = box_target[coord_response_mask].view(-1, 5)
        contain_loss = functional.mse_loss(box_pred_response[:, 4], box_target_response[:, 4], size_average=False)
        loc_loss = functional.mse_loss(box_pred_response[:, :2], box_target_response[:, :2], size_average=False) +\
                   functional.mse_loss(box_pred_response[:, 2:4], box_target_response[:, 2:4], size_average=False)
        # 2. not response loss
        box_pred_not_response = box_pred[coord_not_response_mask].view(-1, 5)
        box_target_not_response = box_target[coord_not_response_mask].view(-1, 5)

        # compute class prediction loss
        class_loss = functional.mse_loss(class_pred, class_target, size_average=False)

        # compute total loss
        total_loss = self.lambda_coord * loc_loss + contain_loss + self.lambda_noobj * noobj_loss + class_loss
        return total_loss



def test():
    voc = False
    vot = 1-voc
    if voc:
        img_folder = '../codedata/voc2012train/JPEGImages'
        file = '../voc2012.txt'
        img_size = 448
        train_dataset = YoloDataset(img_folder=img_folder, file=file, img_size=img_size, S=7, B=2, C=20, transforms=[transforms.ToTensor()])
        train_loader = DataLoader(train_dataset, batch_size=2, shuffle=False, num_workers=0)
        train_iter = iter(train_loader)
        img, target = next(train_iter)
        print(target.size())
        target = Variable(target)
        img = Variable(img)
        net = YOLO_V1()
        pred = net(img)
        yololoss = YoloLoss(n_batch=2, B=2, C=20, lambda_coord=5, lambda_noobj=0.5)
        print(pred.size())
        print(target.size())
        loss = yololoss(pred, target)
        print(loss)

    if vot:
        img_folder = './small_train_dataset'
        bboxes = dd.io.load('girl_bbox_4dim.h5')
        learning_rate = 0.0005
        img_size = 224
        num_epochs = 2
        lambda_coord = 5
        lambda_noobj = .5
        n_batch = 5
        S = 7
        B = 2
        C = 1
        train_dataset = VotDataset(img_folder=img_folder, bboxes=bboxes, img_size=img_size, S=S, B=B, C=C,
                                   transforms=[transforms.ToTensor()])
        train_loader = DataLoader(train_dataset, batch_size=n_batch, shuffle=False, num_workers=2)
        yololoss = YoloLoss(n_batch=n_batch, B=B, C=C, lambda_coord=5, lambda_noobj=0.5)
        train_iter = iter(train_loader)
        img, target = next(train_iter)
        target = Variable(target)
        img = Variable(img)

        model = models.vgg16(pretrained=True)
        model.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 11 * 7 * 7),
            nn.Sigmoid(),
        )
        model.train()

        loss_fn = YoloLoss(n_batch, B, C, lambda_coord, lambda_noobj)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

        use_gpu = False
        for epoch in range(num_epochs):
            for i, (images, target) in enumerate(train_loader):
                images = Variable(images)
                target = Variable(target)
                if use_gpu:
                    images, target = images.cuda(), target.cuda()

                pred = model(images)
                print(pred.size())
                print(target.size())
                loss = loss_fn(pred, target)
                print(i + 1, loss)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if i == 10:
                    break
            break

1.3 loadDataset

class VotTrainDataset(data.Dataset):
    def __init__(self, img_folder, file, img_size, S, B, C, transforms):
        self.img_folder = img_folder
        self.file = file
        self.file_names = []
        self.img_size = img_size
        self.S = S
        self.B = B
        self.C = C
        self.transforms = transforms
        self.bboxes = []
        self.labels = []

        with open(file) as f:
            lines = f.readlines()

        for line in lines:
            bbox = []
            label = []
            splited = line.strip().split()
            self.file_names.append(splited[0])
            n_objects = int(1) # only one object
            for i in range(n_objects):
                x1 = float(splited[i*5+1])
                y1 = float(splited[i*5+2])
                x2 = float(splited[i*5+3])
                y2 = float(splited[i*5+4])
                bbox.append([x1,y1,x2,y2])
                label.append(int(splited[i*5+5]))
                self.bboxes.append(torch.Tensor(bbox))
                self.labels.append(torch.IntTensor(label))
            self.n_data = len(self.labels)
            #print(self.bboxes)

    def __getitem__(self, index):
        bbox = self.bboxes[index].clone()
        label = self.labels[index].clone()
        img = imread(os.path.join(self.img_folder, self.file_names[index])) # default cv2.imread BGR image
        #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # convert BGR image into RGB channel which I prefer

        height, width, _ = img.shape

        img = imresize(img, (self.img_size, self.img_size))
        bbox = bbox / torch.Tensor([width, height, width, height])# * self.img_size
        target = self.encode_target(bbox, label)
        transform = transforms.Compose(self.transforms)
        img = transform(img)
        return img, target

    def encode_target(self, bbox, label):
        """
        :param bbox: [xc,yc,w,h] coordinates in the top left and bottom right separately
        :param label: class label
        :return: [normalized_xc,normalized_yc,sqrt(normalized_w),sqrt(normalized_h)]
        """
        n_elements = self.B * 5 + self.C
        n_bbox = len(label)
        target = torch.zeros((self.S, self.S, n_elements))
        class_info = torch.zeros((n_bbox, self.C))
        for i in range(n_bbox):
            class_info[i, label[i]] = 1
        w = bbox[:,2]
        w_sqrt = torch.sqrt(w)
        x_center = bbox[:,0]
        h = bbox[:,3]
        h_sqrt = torch.sqrt(h)
        y_center = bbox[:,1]
        x_index = (x_center / (1 / self.S)).ceil()-1
        y_index = (y_center / (1 / self.S)).ceil()-1
        c = torch.ones_like(x_center)
        # set w_sqrt and h_sqrt directly
        box_block = torch.cat((x_center.view(-1,1), y_center.view(-1,1), w_sqrt.view(-1,1), h_sqrt.view(-1,1), c.view(-1,1)), dim=1)
        box_info = box_block.repeat(1, self.B)
        target_infoblock = torch.cat((box_info, class_info), dim=1)
        for i in range(n_bbox):
            target[int(x_index[i]),int(y_index[i])] = target_infoblock[i].clone()
        return target

    def __len__(self):
        return self.n_data


def main():
    img_size = 224
    file = './routine_generate_vot2017_train/vot2017_train.txt'
    img_folder = './routine_generate_vot2017_train'
    train_dataset = VotTrainDataset(img_folder=img_folder, file=file, img_size=224, S=7, B=2, C=20, transforms=[transforms.ToTensor()])
    #img, target = train_dataset.__getitem__(0)

    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, num_workers=0)
    train_iter = iter(train_loader)
    img, target = next(train_iter)
    for i in range(7):
        for j in range(7):
            if target[0,i,j,4] != 0:
                print(i,j)
                print(target[0,i,j])

    # print(img.size())
    # print(type(img))
    # print(type(target))
    img, target = next(train_iter)

    print(img.size())
    print(target.size())
    img, target = next(train_iter)
    print(img.size())
    print(target.size())

2.yolo

2.1 结构

2.2 输入和输出的映射关系

输入就是原始图像，唯一要求就是缩放到448*448的大小，主要是因为yolo的网络中，卷积层最后连接了两个全连接层，全连接层要求固定大小的向量作为输入。输出是一个7*7*30的张量(tensor)

根据yolo的设计，输入图像被划分为7*7的网格，输出张量中的7*7就对应着输入图像的7*7的网格。下面看看30维向量

20个对象分类的概率：因为YOLO支持识别20种不同的对象（人、鸟、猫、汽车、椅子等），所以这里有20个值表示该网格位置存在任一种对象的概率。可以记为

2个bounding box的位置：每个bounding box需要4个数值表示其位置，(Center_x,Center_y,width,height)，即(bounding box的中心点的x坐标，y坐标，bounding box的宽度，高度)，2个bounding box共需要8个数值来表示其位置。

2个bounding box的置信度：bounding box的置信度=该bounding box内存在对象的概率*该bounding box与该对象实际bounding box的IOU，用公式表示就是。是bounding box内存在对象的概率，区别于上面。并不管是哪个对象，体现的是有或者没有对象的概率。是bounding box与对象真实bounding box的IOU（intersection over Union,交并比），体现了预测的bounding box与真实bounding box的接近程度。这个IOU是在训练阶段计算的，等到测试阶段，并不知道真实对象在哪里，只能完全依赖网络的输出，这时候也不需要计算IOU了。一个bounding box的置信度Cofidence意味着包含对象且位置准确的程度。置信度高表示这里存在一个对象且位置比较准确，置信度低表示可能没有对象或者即使有对象也存在较大的偏差。简单解释一下IOU。下图来自Andrew Ng的深度学习课程，IOU=交集部分面积/并集部分面积，2个box完全重合时IOU=1，不相交时IOU=0。总的来说，30维向量 = 20个对象的概率 + 2个bounding box * 4个坐标 + 2个bounding box的置信度

一张图片最多可以检测出49个对象，每个30维向量中只有一组（20个）对象分类概率，也就只能预测一个对象。所以输出的7*7=49个30维向量，最多表示出49个对象。总共98个候选区，一维每个30维向量中有两组bounding box。yolo的bounding box看起来很像一个grid中2个Anchor,但他们不是。yolo并没有预先设置2个bounding box的大小和形状，也没有对每个bounding box分别输出一个对象的预测。它的意思仅仅是对一个对象预测出2个bounding box，选择预测得先对比较准的那个。

在设置训练样本的时候，样本中的每个Object归属到且仅归属到一个grid，即便有时Object跨越了几个grid，也仅指定其中一个。具体就是计算出该Object的bounding box的中心位置，这个中心位置落在哪个grid，该grid对应的输出向量中该对象的类别概率是1（该gird负责预测该对象），所有其它grid对该Object的预测概率设为0（不负责预测该对象）。同样，虽然一个grid中会产生2个bounding box，但我们会选择其中一个作为预测结果，另一个会被忽略。

2.3 训练样本

20个对象分类的概率：

对于输入图像中的每个对象，先找到其中心点。比如图8中的自行车，其中心点在黄色圆点位置，中心点落在黄色网格内，所以这个黄色网格对应的30维向量中，自行车的概率是1，其它对象的概率是0。所有其它48个网格的30维向量中，该自行车的概率都是0。这就是所谓的"中心点所在的网格对预测该对象负责"。狗和汽车的分类概率也是同样的方法填写。

2个bounding box的位置：

根据输出的bounding box与对象实际bounding box的IOU来选择，所以要在训练过程中动态决定填哪一个bounding box。

2个bounding box的置信度:

举个例子，比如上图中自行车的中心点位于4行3列网格中，所以输出tensor中4行3列位置的30维向量如下图所示。

翻译成人话就是：4行3列网格位置有一辆自行车，它的中心点在这个网格内，它的位置边框是bounding box1所填写的自行车实际边框。

注意，图中将自行车的位置放在bounding box1，但实际上是在训练过程中等网络输出以后，比较两个bounding box与自行车实际位置的IOU，自行车的位置（实际bounding box）放置在IOU比较大的那个bounding box（图中假设是bounding box1），且该bounding box的置信度设为1。

2.4 损失函数

YOLO给出的损失函数如下：

总的来说，就是用网络输出与样本标签的各项内容的误差平方和作为一个样本的整体误差。
损失函数中的几个项是与输出的30维向量中的内容相对应的。

2.5 训练

YOLO先使用ImageNet数据集对前20层卷积网络进行预训练，然后使用完整的网络，在PASCAL VOC数据集上进行对象识别和定位的训练和预测。YOLO的网络结构如下图所示：

YOLO的最后一层采用线性激活函数，其它层都是Leaky ReLU。训练中采用了drop out和数据增强（data augmentation）来防止过拟合。训练好的YOLO网络，输入一张图片，将输出一个 7*7*30 的张量（tensor）来表示图片中所有网格包含的对象（概率）以及该对象可能的2个位置（bounding box）和可信程度（置信度）。为了从中提取出最有可能的那些对象和位置，YOLO采用NMS（Non-maximal suppression，非极大值抑制）算法。

江小北

发布了33 篇原创文章 · 获赞 3 · 访问量 1908

私信关注

猜你喜欢