3. Reading the evaluate code
3.1 evaluate.py
We start from the main() function.
def main():
    '''
    logging.basicConfig prints the log timestamp, the name of the file
    being executed, and the log message itself. After that, the various
    parameters are loaded.
    '''
    logging.basicConfig(level=logging.DEBUG,
                        format="[%(asctime)s %(filename)s] %(message)s")

    if len(sys.argv) != 2:
        logging.error("Usage: python eval.py params.py")
        sys.exit()
    params_path = sys.argv[1]
    if not os.path.isfile(params_path):
        logging.error("no params file found! path: {}".format(params_path))
        sys.exit()
    config = importlib.import_module(params_path[:-3]).TRAINING_PARAMS
    config["batch_size"] *= len(config["parallels"])

    # Start evaluation
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, config["parallels"]))
    evaluate(config)
The parameters come from evaluate/params.py (a plain config dict rather than a function), shown here:
TRAINING_PARAMS = \
{
    "model_params": {
        "backbone_name": "darknet_53",
        "backbone_pretrained": "",
    },
    "yolo": {
        "anchors": [[[116, 90], [156, 198], [373, 326]],
                    [[30, 61], [62, 45], [59, 119]],
                    [[10, 13], [16, 30], [33, 23]]],
        "classes": 20,
    },
    "batch_size": 4,
    "iou_thres": 0.5,
    "val_path": "../data/coco/5k.txt",
    "annotation_path": "../data/coco/annotations/instances_val2014.json",
    "img_h": 416,
    "img_w": 416,
    "parallels": [0],
    "pretrain_snapshot": "../weights/official_yolov3_weights_pytorch.pth",  # can be replaced with a checkpoint saved after training your own network
}
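The three anchor groups map to the three output scales: on a 416x416 input the strides are 32, 16 and 8, giving 13x13, 26x26 and 52x52 grids, with the largest anchors assigned to the coarsest grid. A quick sanity check of the box counts that appear later in the post (507, 2028, 8112 and their sum 10647):
# Each grid cell predicts 3 anchor boxes.
for stride in (32, 16, 8):
    cells = (416 // stride) ** 2
    print(stride, cells * 3)                           # 507, 2028, 8112
print(sum(3 * (416 // s) ** 2 for s in (32, 16, 8)))   # 10647
Next comes the evaluate() function itself.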
def evaluate(config):
    is_training = False

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)
    else:
        logging.warning("missing pretrain_snapshot!!!")

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(YOLOLoss(config["yolo"]["anchors"][i],
                                    config["yolo"]["classes"], (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(COCODataset(config["val_path"],
                                                         (config["img_w"], config["img_h"]),
                                                         is_training=False),
                                             batch_size=config["batch_size"],
                                             shuffle=False, num_workers=16, pin_memory=False)
At runtime, yolo_losses holds the following (reconstructed from the constructor calls above):
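# yolo_losses is a plain Python list of three YOLOLoss modules, one per scale:
#   yolo_losses[0]: anchors [[116, 90], [156, 198], [373, 326]]  -> 13x13 feature map
#   yolo_losses[1]: anchors [[30, 61], [62, 45], [59, 119]]      -> 26x26 feature map
#   yolo_losses[2]: anchors [[10, 13], [16, 30], [33, 23]]       -> 52x52 feature map
# each constructed with classes=20 and img_size=(416, 416).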
Now jump to the forward() method in yolo_loss.py.
def forward(self, input, targets=None):
    bs = input.size(0)
    in_h = input.size(2)
    in_w = input.size(3)
    stride_h = self.img_size[1] / in_h
    stride_w = self.img_size[0] / in_w
    scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
    prediction = input.view(bs, self.num_anchors,
                            self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()  # [4,3,13,13,25]
    # Get outputs
    x = torch.sigmoid(prediction[..., 0])          # Center x
    y = torch.sigmoid(prediction[..., 1])          # Center y
    w = prediction[..., 2]                         # Width
    h = prediction[..., 3]                         # Height
    conf = torch.sigmoid(prediction[..., 4])       # Conf [4,3,13,13]
    pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred. [4,3,13,13,20]
    if targets is not None:
        # build target
        mask, noobj_mask, tx, ty, tw, th, tconf, tcls = self.get_target(targets, scaled_anchors,
                                                                        in_w, in_h,
                                                                        self.ignore_threshold)
        mask, noobj_mask = mask.cuda(), noobj_mask.cuda()
        tx, ty, tw, th = tx.cuda(), ty.cuda(), tw.cuda(), th.cuda()
        tconf, tcls = tconf.cuda(), tcls.cuda()
        # losses.
        loss_x = self.bce_loss(x * mask, tx * mask)
        loss_y = self.bce_loss(y * mask, ty * mask)
        loss_w = self.mse_loss(w * mask, tw * mask)
        loss_h = self.mse_loss(h * mask, th * mask)
        loss_conf = self.bce_loss(conf * mask, mask) + \
            0.5 * self.bce_loss(conf * noobj_mask, noobj_mask * 0.0)
        loss_cls = self.bce_loss(pred_cls[mask == 1], tcls[mask == 1])
        # total loss = losses * weight
        loss = loss_x * self.lambda_xy + loss_y * self.lambda_xy + \
            loss_w * self.lambda_wh + loss_h * self.lambda_wh + \
            loss_conf * self.lambda_conf + loss_cls * self.lambda_cls
        return loss, loss_x.item(), loss_y.item(), loss_w.item(), \
            loss_h.item(), loss_conf.item(), loss_cls.item()
    else:
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
        # Calculate offsets for each grid
        # grid_x, grid_y, anchor_w, anchor_h are all [4,3,13,13]
        grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_w, 1).repeat(
            bs * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
        grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_h, 1).t().repeat(
            bs * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
        # Calculate anchor w, h
        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
        # Add offset and scale with anchors
        pred_boxes = FloatTensor(prediction[..., :4].shape)  # [4,3,13,13,4]
        pred_boxes[..., 0] = x.data + grid_x
        pred_boxes[..., 1] = y.data + grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
        # Results
        _scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor)  # [4]: [32., 32., 32., 32.]
        '''
        output: [4,507,25]. What does 507 stand for?
        507 = 13 x 13 x 3 (3 anchors per grid cell).
        The output of every scale goes through this forward pass, giving
        output[0]: [4,507,25], output[1]: [4,2028,25], output[2]: [4,8112,25].
        The second dimension is the number of anchor boxes one image produces
        at that scale; non-maximum suppression is applied to them afterwards.
        '''
        output = torch.cat((pred_boxes.view(bs, -1, 4) * _scale,
                            conf.view(bs, -1, 1), pred_cls.view(bs, -1, self.num_classes)), -1)
        return output.data
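The decode above implements the YOLOv3 box equations: b_x = sigmoid(t_x) + c_x, b_y = sigmoid(t_y) + c_y, b_w = p_w * exp(t_w), b_h = p_h * exp(t_h), all in grid units, after which _scale multiplies everything back to pixels by the stride. A minimal numeric sketch (the raw offsets t_* are made-up values, purely for illustration):
import math

def sigmoid(v):
    return 1.0 / (1.0 + math.exp(-v))

stride = 32                           # 13x13 scale on a 416x416 input
cx, cy = 6, 4                         # grid cell indices c_x, c_y
pw, ph = 116 / stride, 90 / stride    # first anchor, scaled to grid units
tx, ty, tw, th = 0.2, -0.5, 0.1, 0.3  # hypothetical raw network outputs

bx = (sigmoid(tx) + cx) * stride      # center x in pixels (~209.6)
by = (sigmoid(ty) + cy) * stride      # center y in pixels (~140.1)
bw = pw * math.exp(tw) * stride       # width in pixels   (~128.2)
bh = ph * math.exp(th) * stride       # height in pixels  (~121.5)
print(bx, by, bw, bh)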
In this evaluation code, the mAP is computed as follows: for each ground-truth box, the code looks at predictions whose class equals the ground-truth class, computes the IoU between the predicted box and the ground-truth box, and counts the box as correct if the IoU exceeds the threshold. The final value is the ratio of correct boxes to all ground-truth boxes, which is strictly speaking recall at a fixed IoU threshold. This differs from other implementations that compute an AP per class and then average over the classes; I plan to study this code alongside the per-class implementations and revise it. A sketch of the per-class computation follows.
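For reference, a minimal sketch of the standard per-class AP mentioned above (names and structure are illustrative, not taken from this repo): sort one class's detections by confidence, mark each as TP or FP against not-yet-matched ground truths, and integrate the precision-recall curve.
import numpy as np

def average_precision(tp, conf, n_gt):
    # tp[i] = 1 if detection i matched a previously unmatched ground truth
    # (IoU >= threshold), else 0; conf[i] is its confidence; n_gt is the
    # number of ground-truth boxes of this class.
    order = np.argsort(-np.asarray(conf, dtype=float))
    tp = np.asarray(tp, dtype=float)[order]
    recall = np.cumsum(tp) / max(n_gt, 1)
    precision = np.cumsum(tp) / np.arange(1, len(tp) + 1)
    # VOC-style all-point interpolation of the precision-recall curve
    r = np.concatenate(([0.0], recall, [1.0]))
    p = np.concatenate(([0.0], precision, [0.0]))
    for i in range(len(p) - 2, -1, -1):
        p[i] = max(p[i], p[i + 1])
    changed = np.where(r[1:] != r[:-1])[0]
    return float(np.sum((r[changed + 1] - r[changed]) * p[changed + 1]))

# mAP is then the mean of average_precision over all classes.
Back to the evaluation loop in evaluate.py.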
    # Start the eval loop
    logging.info("Start eval.")
    n_gt = 0
    correct = 0
    for step, samples in enumerate(dataloader):
        images, labels = samples["image"], samples["label"]
        labels = labels.cuda()
        with torch.no_grad():
            outputs = net(images)
            output_list = []
            for i in range(3):
                '''
                output_list is not the same kind of object as yolo_losses: the
                latter holds the three YOLOLoss modules, while output_list holds
                the decoded tensors they return.
                After non-maximum suppression, output becomes [4,?,7]: ? is the
                number of detections kept per image, and the 7 columns are
                (x1, y1, x2, y2, obj_conf, class_conf, class_pred).
                '''
                output_list.append(yolo_losses[i](outputs[i]))
            output = torch.cat(output_list, 1)  # [4,10647,25]
            output = non_max_suppression(output, config["yolo"]["classes"], conf_thres=0.2)
        # calculate
        '''
        samples is a dict holding image [4,3,416,416], label, image_path and
        origin_size. labels.size(0) is the batch size (4); target_sample below
        holds one image's non-dummy labels, e.g. shape [1,5].
        '''
        for sample_i in range(labels.size(0)):
            '''
            sample_i = 0, 1, 2, 3.
            labels: [4,?,5]; ? is the (padded) number of labels, i.e. boxes,
            per image in this batch; the 5 columns are the annotated class and
            the box coordinates.
            '''
            # Get labels for sample where width is not zero (dummies)
            target_sample = labels[sample_i, labels[sample_i, :, 3] != 0]
            for obj_cls, tx, ty, tw, th in target_sample:
                # Get rescaled gt coordinates
                '''
                tx1, tx2, ty1, ty2 are coordinates on the resized input image.
                box_gt is the ground-truth box in those coordinates, e.g.
                [206.3360, 296.1920, 270.4000, 414.8907]; sample_pred is a
                predicted detection, e.g.
                [200.0479, 294.9911, 278.0021, 420.8348, 0.5516, 1.0000, 14.0000].
                '''
                tx1, tx2 = config["img_w"] * (tx - tw / 2), config["img_w"] * (tx + tw / 2)
                ty1, ty2 = config["img_h"] * (ty - th / 2), config["img_h"] * (ty + th / 2)
                n_gt += 1
                box_gt = torch.cat([coord.unsqueeze(0) for coord in [tx1, ty1, tx2, ty2]]).view(1, -1)
                sample_pred = output[sample_i]
                if sample_pred is not None:
                    # Iterate through predictions where the class predicted is same as gt
                    for x1, y1, x2, y2, conf, obj_conf, obj_pred in sample_pred[sample_pred[:, 6] == obj_cls]:
                        box_pred = torch.cat([coord.unsqueeze(0) for coord in [x1, y1, x2, y2]]).view(1, -1)
                        iou = bbox_iou(box_pred, box_gt)
                        if iou >= config["iou_thres"]:
                            correct += 1
                            break
        '''
        A running value is logged after every step; it is the ratio of matched
        ground-truth boxes to all ground-truth boxes seen so far. With 7000
        validation images and batch_size = 4, len(dataloader) = 7000 / 4 = 1750 steps.
        '''
        if n_gt:
            logging.info('Batch [%d/%d] mAP: %.5f' % (step, len(dataloader), float(correct / n_gt)))
    logging.info('Mean Average Precision: %.5f' % float(correct / n_gt))
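bbox_iou itself is not listed in this post. A minimal sketch of what it computes for corner-format (x1, y1, x2, y2) boxes is given below; the repo's actual implementation may differ in details such as broadcasting or a center-format flag.
import torch

def bbox_iou_sketch(box1, box2):
    # box1: [1,4], box2: [N,4], both as (x1, y1, x2, y2)
    inter_x1 = torch.max(box1[:, 0], box2[:, 0])
    inter_y1 = torch.max(box1[:, 1], box2[:, 1])
    inter_x2 = torch.min(box1[:, 2], box2[:, 2])
    inter_y2 = torch.min(box1[:, 3], box2[:, 3])
    inter = (inter_x2 - inter_x1).clamp(min=0) * (inter_y2 - inter_y1).clamp(min=0)
    area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
    area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
    return inter / (area1 + area2 - inter + 1e-16)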
(1) Non-maximum suppression on output: the non_max_suppression function in common/utils.
def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
    """
    First drop every detection whose object_conf is below conf_thres, then run
    non-maximum suppression per class. In YOLO terms obj_conf = Pr(object) * IOU
    and class_conf = Pr(class_i | object), so their product is Pr(class_i) * IOU.
    Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression to further filter detections.
    Returns detections with shape:
        (x1, y1, x2, y2, object_conf, class_score, class_pred)
    """
    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    '''
    prediction [x, y, w, h, obj_conf]  ---->  box_corner [x1, y1, x2, y2, obj_conf]
    Given x, y, w, h, the corners are:
        x1 = x - w/2
        y1 = y - h/2
        x2 = x + w/2
        y2 = y + h/2
    (y grows downward in image coordinates, so y1 is the top edge.)
    What is the prediction tensor here? It is the output returned by the three
    scales concatenated together: [4,10647,25].
    '''
    box_corner = prediction.new(prediction.shape)
    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
    prediction[:, :, :4] = box_corner[:, :, :4]
    output = [None for _ in range(len(prediction))]  # [None, None, None, None]
    # Step 1: the anchors of all scales are already merged, [10647,25] per image.
    # Discard anchors whose conf is below the threshold; in this example 4 remain.
    for image_i, image_pred in enumerate(prediction):  # image_pred: [10647,25]
        # Filter out confidence scores below threshold
        # squeeze() removes all singleton dimensions; conf_mask is a [10647]
        # boolean mask over the anchors.
        conf_mask = (image_pred[:, 4] >= conf_thres).squeeze()  # [10647]
        # image_pred shrinks from [10647,25] to [4,25]: the low-score anchors are
        # filtered out and four anchors remain.
        image_pred = image_pred[conf_mask]  # [4,25]
        # If nothing was detected, move on to the next image.
        if not image_pred.size(0):
            continue
        # Get score and class with highest confidence
        # class_conf (class confidence): [4,1], e.g. [1.0, 1.0, 1.0, 1.0];
        # class_pred (class index):      [4,1], e.g. [14, 14, 14, 14]
        class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        # detections: [4,7]
        detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
        # Step 2: run non-maximum suppression per class on the remaining anchors.
        # Iterate through all predicted classes
        # Get every class detected in this image.
        unique_labels = detections[:, -1].cpu().unique()  # e.g. [14.]
        if prediction.is_cuda:
            unique_labels = unique_labels.cuda()
        for c in unique_labels:
            # Get the detections with the particular class
            # detections_class holds this class's detections: [4,7] if there are
            # four of them, [2,7] if two, and so on.
            detections_class = detections[detections[:, -1] == c]
            # Sort the detections by maximum objectness confidence
            _, conf_sort_index = torch.sort(detections_class[:, 4], descending=True)  # e.g. [1,3,0,2]
            detections_class = detections_class[conf_sort_index]  # [4,7]
            # Perform non-maximum suppression
            max_detections = []
            while detections_class.size(0):
                # Get detection with highest confidence and save as max detection
                max_detections.append(detections_class[0].unsqueeze(0))  # max_detections[-1]: [1,7]
                # Stop if we're at the last detection
                if len(detections_class) == 1:
                    break
                # Get the IOUs for all boxes with lower confidence
                # detections_class[1:] drops the box just kept, so [4,7] becomes
                # [3,7]. bbox_iou returns e.g. [0.7455, 0.8212, 0.6986]: the IOU
                # of the kept box against each remaining box of the same class.
                ious = bbox_iou(max_detections[-1], detections_class[1:])
                # Remove detections with IoU >= NMS threshold
                detections_class = detections_class[1:][ious < nms_thres]
            max_detections = torch.cat(max_detections).data
            # Add max detections to outputs
            output[image_i] = max_detections if output[image_i] is None else torch.cat((output[image_i], max_detections))
    return output
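A toy run of the function (the import path is assumed from the post's description of common/utils): two heavily overlapping boxes of the same class collapse into a single detection, while a distant box survives.
import torch
from common.utils import non_max_suppression  # import path assumed

num_classes = 20
# Three candidate boxes as (cx, cy, w, h, obj_conf, 20 class scores).
pred = torch.zeros(1, 3, 5 + num_classes)
pred[0, 0, :5] = torch.tensor([100., 100., 50., 50., 0.9])  # kept
pred[0, 1, :5] = torch.tensor([104., 102., 50., 50., 0.8])  # suppressed: IOU ~0.79 with box 0
pred[0, 2, :5] = torch.tensor([300., 300., 40., 40., 0.7])  # kept: no overlap with box 0
pred[0, :, 5 + 14] = 1.0  # all three predict class 14

out = non_max_suppression(pred, num_classes, conf_thres=0.5, nms_thres=0.4)
print(out[0].shape)  # torch.Size([2, 7]): two detections survive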