3. Reading the evaluate code
3.1 evaluate.py
We start from the main() function.
def main():
    '''
    logging.basicConfig prints the log timestamp, the name of the file
    being executed, and the log message itself. After that, the various
    parameters are loaded.
    '''
    logging.basicConfig(level=logging.DEBUG,
                        format="[%(asctime)s %(filename)s] %(message)s")

    if len(sys.argv) != 2:
        logging.error("Usage: python eval.py params.py")
        sys.exit()
    params_path = sys.argv[1]
    if not os.path.isfile(params_path):
        logging.error("no params file found! path: {}".format(params_path))
        sys.exit()
    config = importlib.import_module(params_path[:-3]).TRAINING_PARAMS
    config["batch_size"] *= len(config["parallels"])

    # Start evaluation
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, config["parallels"]))
    evaluate(config)
The parameters come from evaluate/params.py (a plain config dict rather than a function), shown here:
TRAINING_PARAMS = \
{
    "model_params": {
        "backbone_name": "darknet_53",
        "backbone_pretrained": "",
    },
    "yolo": {
        "anchors": [[[116, 90], [156, 198], [373, 326]],
                    [[30, 61], [62, 45], [59, 119]],
                    [[10, 13], [16, 30], [33, 23]]],
        "classes": 20,
    },
    "batch_size": 4,
    "iou_thres": 0.5,
    "val_path": "../data/coco/5k.txt",
    "annotation_path": "../data/coco/annotations/instances_val2014.json",
    "img_h": 416,
    "img_w": 416,
    "parallels": [0],
    "pretrain_snapshot": "../weights/official_yolov3_weights_pytorch.pth",  # can be replaced with a checkpoint saved after training your own network
}
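The three anchor groups map to the three output scales: on a 416x416 input the strides are 32, 16 and 8, giving 13x13, 26x26 and 52x52 grids, with the largest anchors assigned to the coarsest grid. A quick sanity check of the box counts that appear later in the post (507, 2028, 8112 and their sum 10647):
# Each grid cell predicts 3 anchor boxes.
for stride in (32, 16, 8):
    cells = (416 // stride) ** 2
    print(stride, cells * 3)                           # 507, 2028, 8112
print(sum(3 * (416 // s) ** 2 for s in (32, 16, 8)))   # 10647
Next comes the evaluate() function itself.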
def evaluate(config):
    is_training = False

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Set data parallel
    net = nn.DataParallel(net)
    net = net.cuda()

    # Restore pretrain model
    if config["pretrain_snapshot"]:
        state_dict = torch.load(config["pretrain_snapshot"])
        net.load_state_dict(state_dict)
    else:
        logging.warning("missing pretrain_snapshot!!!")

    # YOLO loss with 3 scales
    yolo_losses = []
    for i in range(3):
        yolo_losses.append(YOLOLoss(config["yolo"]["anchors"][i],
                                    config["yolo"]["classes"], (config["img_w"], config["img_h"])))

    # DataLoader
    dataloader = torch.utils.data.DataLoader(COCODataset(config["val_path"],
                                                         (config["img_w"], config["img_h"]),
                                                         is_training=False),
                                             batch_size=config["batch_size"],
                                             shuffle=False, num_workers=16, pin_memory=False)
At runtime, yolo_losses holds the following (reconstructed from the constructor calls above):
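# yolo_losses is a plain Python list of three YOLOLoss modules, one per scale:
#   yolo_losses[0]: anchors [[116, 90], [156, 198], [373, 326]]  -> 13x13 feature map
#   yolo_losses[1]: anchors [[30, 61], [62, 45], [59, 119]]      -> 26x26 feature map
#   yolo_losses[2]: anchors [[10, 13], [16, 30], [33, 23]]       -> 52x52 feature map
# each constructed with classes=20 and img_size=(416, 416).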
Now jump to the forward() method in yolo_loss.py.
def forward(self, input, targets=None):
    bs = input.size(0)
    in_h = input.size(2)
    in_w = input.size(3)
    stride_h = self.img_size[1] / in_h
    stride_w = self.img_size[0] / in_w
    scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
    prediction = input.view(bs, self.num_anchors,
                            self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()  # [4,3,13,13,25]
    # Get outputs
    x = torch.sigmoid(prediction[..., 0])          # Center x
    y = torch.sigmoid(prediction[..., 1])          # Center y
    w = prediction[..., 2]                         # Width
    h = prediction[..., 3]                         # Height
    conf = torch.sigmoid(prediction[..., 4])       # Conf [4,3,13,13]
    pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred. [4,3,13,13,20]
    if targets is not None:
        # build target
        mask, noobj_mask, tx, ty, tw, th, tconf, tcls = self.get_target(targets, scaled_anchors,
                                                                        in_w, in_h,
                                                                        self.ignore_threshold)
        mask, noobj_mask = mask.cuda(), noobj_mask.cuda()
        tx, ty, tw, th = tx.cuda(), ty.cuda(), tw.cuda(), th.cuda()
        tconf, tcls = tconf.cuda(), tcls.cuda()
        # losses.
        loss_x = self.bce_loss(x * mask, tx * mask)
        loss_y = self.bce_loss(y * mask, ty * mask)
        loss_w = self.mse_loss(w * mask, tw * mask)
        loss_h = self.mse_loss(h * mask, th * mask)
        loss_conf = self.bce_loss(conf * mask, mask) + \
            0.5 * self.bce_loss(conf * noobj_mask, noobj_mask * 0.0)
        loss_cls = self.bce_loss(pred_cls[mask == 1], tcls[mask == 1])
        # total loss = losses * weight
        loss = loss_x * self.lambda_xy + loss_y * self.lambda_xy + \
            loss_w * self.lambda_wh + loss_h * self.lambda_wh + \
            loss_conf * self.lambda_conf + loss_cls * self.lambda_cls
        return loss, loss_x.item(), loss_y.item(), loss_w.item(), \
            loss_h.item(), loss_conf.item(), loss_cls.item()
    else:
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
        # Calculate offsets for each grid
        # grid_x, grid_y, anchor_w, anchor_h are all [4,3,13,13]
        grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_w, 1).repeat(
            bs * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
        grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_h, 1).t().repeat(
            bs * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
        # Calculate anchor w, h
        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
        # Add offset and scale with anchors
        pred_boxes = FloatTensor(prediction[..., :4].shape)  # [4,3,13,13,4]
        pred_boxes[..., 0] = x.data + grid_x
        pred_boxes[..., 1] = y.data + grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
        # Results
        _scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor)  # [4]: [32., 32., 32., 32.]
        '''
        output: [4,507,25]. What does 507 stand for?
        507 = 13 x 13 x 3 (3 anchors per grid cell).
        The output of every scale goes through this forward pass, giving
        output[0]: [4,507,25], output[1]: [4,2028,25], output[2]: [4,8112,25].
        The second dimension is the number of anchor boxes one image produces
        at that scale; non-maximum suppression is applied to them afterwards.
        '''
        output = torch.cat((pred_boxes.view(bs, -1, 4) * _scale,
                            conf.view(bs, -1, 1), pred_cls.view(bs, -1, self.num_classes)), -1)
        return output.data
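The decode above implements the YOLOv3 box equations: b_x = sigmoid(t_x) + c_x, b_y = sigmoid(t_y) + c_y, b_w = p_w * exp(t_w), b_h = p_h * exp(t_h), all in grid units, after which _scale multiplies everything back to pixels by the stride. A minimal numeric sketch (the raw offsets t_* are made-up values, purely for illustration):
import math

def sigmoid(v):
    return 1.0 / (1.0 + math.exp(-v))

stride = 32                           # 13x13 scale on a 416x416 input
cx, cy = 6, 4                         # grid cell indices c_x, c_y
pw, ph = 116 / stride, 90 / stride    # first anchor, scaled to grid units
tx, ty, tw, th = 0.2, -0.5, 0.1, 0.3  # hypothetical raw network outputs

bx = (sigmoid(tx) + cx) * stride      # center x in pixels (~209.6)
by = (sigmoid(ty) + cy) * stride      # center y in pixels (~140.1)
bw = pw * math.exp(tw) * stride       # width in pixels   (~128.2)
bh = ph * math.exp(th) * stride       # height in pixels  (~121.5)
print(bx, by, bw, bh)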
In this evaluation code, the mAP is computed as follows: for each ground-truth box, the code looks at predictions whose class equals the ground-truth class, computes the IoU between the predicted box and the ground-truth box, and counts the box as correct if the IoU exceeds the threshold. The final value is the ratio of correct boxes to all ground-truth boxes, which is strictly speaking recall at a fixed IoU threshold. This differs from other implementations that compute an AP per class and then average over the classes; I plan to study this code alongside the per-class implementations and revise it. A sketch of the per-class computation follows.
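For reference, a minimal sketch of the standard per-class AP mentioned above (names and structure are illustrative, not taken from this repo): sort one class's detections by confidence, mark each as TP or FP against not-yet-matched ground truths, and integrate the precision-recall curve.
import numpy as np

def average_precision(tp, conf, n_gt):
    # tp[i] = 1 if detection i matched a previously unmatched ground truth
    # (IoU >= threshold), else 0; conf[i] is its confidence; n_gt is the
    # number of ground-truth boxes of this class.
    order = np.argsort(-np.asarray(conf, dtype=float))
    tp = np.asarray(tp, dtype=float)[order]
    recall = np.cumsum(tp) / max(n_gt, 1)
    precision = np.cumsum(tp) / np.arange(1, len(tp) + 1)
    # VOC-style all-point interpolation of the precision-recall curve
    r = np.concatenate(([0.0], recall, [1.0]))
    p = np.concatenate(([0.0], precision, [0.0]))
    for i in range(len(p) - 2, -1, -1):
        p[i] = max(p[i], p[i + 1])
    changed = np.where(r[1:] != r[:-1])[0]
    return float(np.sum((r[changed + 1] - r[changed]) * p[changed + 1]))

# mAP is then the mean of average_precision over all classes.
Back to the evaluation loop in evaluate.py.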
    # Start the eval loop
    logging.info("Start eval.")
    n_gt = 0
    correct = 0
    for step, samples in enumerate(dataloader):
        images, labels = samples["image"], samples["label"]
        labels = labels.cuda()
        with torch.no_grad():
            outputs = net(images)
            output_list = []
            for i in range(3):
                '''
                output_list is not the same kind of object as yolo_losses: the
                latter holds the three YOLOLoss modules, while output_list holds
                the decoded tensors they return.
                After non-maximum suppression, output becomes [4,?,7]: ? is the
                number of detections kept per image, and the 7 columns are
                (x1, y1, x2, y2, obj_conf, class_conf, class_pred).
                '''
                output_list.append(yolo_losses[i](outputs[i]))
            output = torch.cat(output_list, 1)  # [4,10647,25]
            output = non_max_suppression(output, config["yolo"]["classes"], conf_thres=0.2)
        # calculate
        '''
        samples is a dict holding image [4,3,416,416], label, image_path and
        origin_size. labels.size(0) is the batch size (4); target_sample below
        holds one image's non-dummy labels, e.g. shape [1,5].
        '''
        for sample_i in range(labels.size(0)):
            '''
            sample_i = 0, 1, 2, 3.
            labels: [4,?,5]; ? is the (padded) number of labels, i.e. boxes,
            per image in this batch; the 5 columns are the annotated class and
            the box coordinates.
            '''
            # Get labels for sample where width is not zero (dummies)
            target_sample = labels[sample_i, labels[sample_i, :, 3] != 0]
            for obj_cls, tx, ty, tw, th in target_sample:
                # Get rescaled gt coordinates
                '''
                tx1, tx2, ty1, ty2 are coordinates on the resized input image.
                box_gt is the ground-truth box in those coordinates, e.g.
                [206.3360, 296.1920, 270.4000, 414.8907]; sample_pred is a
                predicted detection, e.g.
                [200.0479, 294.9911, 278.0021, 420.8348, 0.5516, 1.0000, 14.0000].
                '''
                tx1, tx2 = config["img_w"] * (tx - tw / 2), config["img_w"] * (tx + tw / 2)
                ty1, ty2 = config["img_h"] * (ty - th / 2), config["img_h"] * (ty + th / 2)
                n_gt += 1
                box_gt = torch.cat([coord.unsqueeze(0) for coord in [tx1, ty1, tx2, ty2]]).view(1, -1)
                sample_pred = output[sample_i]
                if sample_pred is not None:
                    # Iterate through predictions where the class predicted is same as gt
                    for x1, y1, x2, y2, conf, obj_conf, obj_pred in sample_pred[sample_pred[:, 6] == obj_cls]:
                        box_pred = torch.cat([coord.unsqueeze(0) for coord in [x1, y1, x2, y2]]).view(1, -1)
                        iou = bbox_iou(box_pred, box_gt)
                        if iou >= config["iou_thres"]:
                            correct += 1
                            break
        '''
        A running value is logged after every step; it is the ratio of matched
        ground-truth boxes to all ground-truth boxes seen so far. With 7000
        validation images and batch_size = 4, len(dataloader) = 7000 / 4 = 1750 steps.
        '''
        if n_gt:
            logging.info('Batch [%d/%d] mAP: %.5f' % (step, len(dataloader), float(correct / n_gt)))
    logging.info('Mean Average Precision: %.5f' % float(correct / n_gt))
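bbox_iou itself is not listed in this post. A minimal sketch of what it computes for corner-format (x1, y1, x2, y2) boxes is given below; the repo's actual implementation may differ in details such as broadcasting or a center-format flag.
import torch

def bbox_iou_sketch(box1, box2):
    # box1: [1,4], box2: [N,4], both as (x1, y1, x2, y2)
    inter_x1 = torch.max(box1[:, 0], box2[:, 0])
    inter_y1 = torch.max(box1[:, 1], box2[:, 1])
    inter_x2 = torch.min(box1[:, 2], box2[:, 2])
    inter_y2 = torch.min(box1[:, 3], box2[:, 3])
    inter = (inter_x2 - inter_x1).clamp(min=0) * (inter_y2 - inter_y1).clamp(min=0)
    area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
    area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
    return inter / (area1 + area2 - inter + 1e-16)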
(1) Non-maximum suppression on output: the non_max_suppression function in common/utils.
def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
    """
    First drop every detection whose object_conf is below conf_thres, then run
    non-maximum suppression per class. In YOLO terms obj_conf = Pr(object) * IOU
    and class_conf = Pr(class_i | object), so their product is Pr(class_i) * IOU.
    Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression to further filter detections.
    Returns detections with shape:
        (x1, y1, x2, y2, object_conf, class_score, class_pred)
    """
    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    '''
    prediction [x, y, w, h, obj_conf]  ---->  box_corner [x1, y1, x2, y2, obj_conf]
    Given x, y, w, h, the corners are:
        x1 = x - w/2
        y1 = y - h/2
        x2 = x + w/2
        y2 = y + h/2
    (y grows downward in image coordinates, so y1 is the top edge.)
    What is the prediction tensor here? It is the output returned by the three
    scales concatenated together: [4,10647,25].
    '''
    box_corner = prediction.new(prediction.shape)
    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
    prediction[:, :, :4] = box_corner[:, :, :4]
    output = [None for _ in range(len(prediction))]  # [None, None, None, None]
    # Step 1: the anchors of all scales are already merged, [10647,25] per image.
    # Discard anchors whose conf is below the threshold; in this example 4 remain.
    for image_i, image_pred in enumerate(prediction):  # image_pred: [10647,25]
        # Filter out confidence scores below threshold
        # squeeze() removes all singleton dimensions; conf_mask is a [10647]
        # boolean mask over the anchors.
        conf_mask = (image_pred[:, 4] >= conf_thres).squeeze()  # [10647]
        # image_pred shrinks from [10647,25] to [4,25]: the low-score anchors are
        # filtered out and four anchors remain.
        image_pred = image_pred[conf_mask]  # [4,25]
        # If nothing was detected, move on to the next image.
        if not image_pred.size(0):
            continue
        # Get score and class with highest confidence
        # class_conf (class confidence): [4,1], e.g. [1.0, 1.0, 1.0, 1.0];
        # class_pred (class index):      [4,1], e.g. [14, 14, 14, 14]
        class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        # detections: [4,7]
        detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
        # Step 2: run non-maximum suppression per class on the remaining anchors.
        # Iterate through all predicted classes
        # Get every class detected in this image.
        unique_labels = detections[:, -1].cpu().unique()  # e.g. [14.]
        if prediction.is_cuda:
            unique_labels = unique_labels.cuda()
        for c in unique_labels:
            # Get the detections with the particular class
            # detections_class holds this class's detections: [4,7] if there are
            # four of them, [2,7] if two, and so on.
            detections_class = detections[detections[:, -1] == c]
            # Sort the detections by maximum objectness confidence
            _, conf_sort_index = torch.sort(detections_class[:, 4], descending=True)  # e.g. [1,3,0,2]
            detections_class = detections_class[conf_sort_index]  # [4,7]
            # Perform non-maximum suppression
            max_detections = []
            while detections_class.size(0):
                # Get detection with highest confidence and save as max detection
                max_detections.append(detections_class[0].unsqueeze(0))  # max_detections[-1]: [1,7]
                # Stop if we're at the last detection
                if len(detections_class) == 1:
                    break
                # Get the IOUs for all boxes with lower confidence
                # detections_class[1:] drops the box just kept, so [4,7] becomes
                # [3,7]. bbox_iou returns e.g. [0.7455, 0.8212, 0.6986]: the IOU
                # of the kept box against each remaining box of the same class.
                ious = bbox_iou(max_detections[-1], detections_class[1:])
                # Remove detections with IoU >= NMS threshold
                detections_class = detections_class[1:][ious < nms_thres]
            max_detections = torch.cat(max_detections).data
            # Add max detections to outputs
            output[image_i] = max_detections if output[image_i] is None else torch.cat((output[image_i], max_detections))
    return output
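A toy run of the function (the import path is assumed from the post's description of common/utils): two heavily overlapping boxes of the same class collapse into a single detection, while a distant box survives.
import torch
from common.utils import non_max_suppression  # import path assumed

num_classes = 20
# Three candidate boxes as (cx, cy, w, h, obj_conf, 20 class scores).
pred = torch.zeros(1, 3, 5 + num_classes)
pred[0, 0, :5] = torch.tensor([100., 100., 50., 50., 0.9])  # kept
pred[0, 1, :5] = torch.tensor([104., 102., 50., 50., 0.8])  # suppressed: IOU ~0.79 with box 0
pred[0, 2, :5] = torch.tensor([300., 300., 40., 40., 0.7])  # kept: no overlap with box 0
pred[0, :, 5 + 14] = 1.0  # all three predict class 14

out = non_max_suppression(pred, num_classes, conf_thres=0.5, nms_thres=0.4)
print(out[0].shape)  # torch.Size([2, 7]): two detections survive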