opencv进阶-YOLO对象检测的代码解读

视频教程：
基于OpenCV4.4.0的YOLO v4实战（C++编写）
文字教程：基于OpenCV和YOLOv3深度学习的目标检测

一、打印输出层

YOLO有多个输出层，打印所有的输出层：

// Fetch the names of all unconnected output layers of the network and print them.
vector<string> outNames = net.getUnconnectedOutLayersNames();
for (size_t k = 0; k < outNames.size(); k++)
{
    // c_str() exposes the string as a const char* so that printf's %s can consume it.
    printf("output layer name:%s\n", outNames[k].c_str());
}

c_str()函数返回一个const char*指针，指向字符串内部以'\0'结尾的字符数组；该指针只在字符串未被修改且未被销毁期间有效

在这里插入图片描述

二、推理输出

由于yolo有多个输出层，因此前向推理的时候跟SSD等模型不同，forward的第二个参数需要填入所有输出层的名称，结果就会有多个输出，每个输出都是一个Mat，代码如下表示：

// Receives the output blobs (as Mat) for the layers listed in outNames.
vector<Mat> outs;
net.forward(outs, outNames);

forward推理的所有输出结果都保存在outs数组中，即output blob，其类型为Mat。
补充：SSD等检测模型只有一个输出层，因此前向推理时只需要一行代码即可，即：

net.forward()

三、解析输出

在二中说到输出的所有信息都存放在了outs数组中，因此下面要对它进行解析，得到我们想要的数据，有矩形边框、类别名称、置信度，
我们用如下的数组进行表示和保存。

重点：每一行输出的结构为:
【center_x, center_y, width, height, confidence, class_score_1, …, class_score_N】（从第6个元素开始是各个类别的得分）

// Parallel arrays filled together while parsing: entry k of each one belongs to the same detection.
vector<int> classIds;       // class index of the detection
vector<float> confidences;  // score of that class
vector<Rect> boxes;         // bounding box as (left, top, width, height)

利用for循环解析输出(outputblob)：

// Walk every output Mat; `data` views its buffer as floats, starting at the first row.
	for (size_t i = 0; i < outs.size(); ++i)
		{
    
    
			float* data = (float*)outs[i].data;
// Visit each row of this output; `data` is advanced by one row (cols floats) per iteration.
			for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
			{
    
    
				Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
				Point classIdPoint;
				double confidence;
				minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);// confidence = largest value in scores, classIdPoint = where it is located

minMaxLoc用法参考：opencv学习-模板匹配

根据是否满足置信度输出classIds、confidences、boxes

if (confidence > 0.5)
				{
    
    // Scale the first four values of the row by the frame size, then derive the top-left corner from the centre and the size
					int centerX = (int)(data[0] * frame.cols);
					int centerY = (int)(data[1] * frame.rows);
					int width = (int)(data[2] * frame.cols);
					int height = (int)(data[3] * frame.rows);
					int left = centerX - width / 2;
					int top = centerY - height / 2;
					
					// Record the class index, the confidence and the box of this detection
					classIds.push_back(classIdPoint.x);
					confidences.push_back((float)confidence);
					boxes.push_back(Rect(left, top, width, height));
				}
			}
		}

四、NMS

YOLO有多个输出层，因此一个对象可以在多个层上被检测到，即三中输出的结果是对物体进行多次框选，所以需要去掉重复的box，这时我们需要用到NMS（非最大抑制）-------non maximum suppression

// indices receives the positions (into boxes/confidences) of the boxes kept by NMS.
vector<int> indices;
		NMSBoxes(boxes, confidences, 0.5, 0.2, indices); // score threshold 0.5, NMS threshold 0.2

API-NMSBoxes解读

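函数原型（C++）：void NMSBoxes(const std::vector<Rect>& bboxes, const std::vector<float>& scores, const float score_threshold, const float nms_threshold, std::vector<int>& indices, const float eta = 1.f, const int top_k = 0)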
各个参数含义：

1.bboxes：a set of bounding boxes to apply NMS.输入所有待处理的边界框 box

2.scores：a set of corresponding confidences. 对于待处理边界框的 scores

3.score_threshold：a threshold used to filter boxes by score. 过滤掉置信度低于该阈值的box

4.nms_threshold：a threshold used in non maximum suppression. 非最大抑制的阈值，即交并比（IoU）阈值，取值范围为0~1：与已保留边框的IoU超过该阈值的边框将会被抑制，因此该值越大，保留的边框就会越多。本文示例取0.2

5.indices：the kept indices of bboxes after NMS. 输出经过NMS后的结果，并保存在indices中

以下参数不怎么使用

eta：a coefficient in adaptive threshold formula:
$nms\_threshold_{i+1} = eta \cdot nms\_threshold_i$.
top_k：if >0, keep at most top_k picked indices.

原理解读

1）先对输入检测框按置信度由高到低排序

2）挑选第一个检测框(即最高置信度，记为A）和其它检测框（记为B）进行IOU计算。IOU计算是指两个矩形的交集与两个矩形并集的比值

3）如果iou大于nmsThreshold，那就将B清除掉，否则保留。

4）不管是否保留，下一步都将跳转到2），即从剩余的框集里面找置信度最大的框和其它框分别计算iou，然后进行3）。

5）直到所有框都过滤完。

五、二次解析输出

对indices进行解析

// Draw only the detections kept by NMS: every entry of `indices` is a position
// into the parallel arrays boxes / classIds / confidences.
for (size_t i = 0; i < indices.size(); ++i)
		{
			int idx = indices[i];
			Rect box = boxes[idx];
			rectangle(frame, box, Scalar(0, 0, 255), 2, 8, 0);
			String className = classNamesVec[classIds[idx]];// look up the class name by its class index
			// %s expects a C string, so pass c_str() rather than the String object itself.
			putText(frame, format("%s:%.2f", className.c_str(), confidences[idx]), box.tl(), FONT_HERSHEY_SIMPLEX, 1.0, Scalar(255, 0, 0), 2, 8);
		}

全部代码

#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <cstdlib>

using namespace std;
using namespace cv;
using namespace cv::dnn;
// NOTE(review): declared here but neither defined nor called anywhere in this listing.
void image_detection();

// Paths of the Darknet network configuration and weights that main() passes to readNetFromDarknet().
String yolo_cfg = "D:/opencv-4.4.0/models/yolov4/yolov4.cfg";
String yolo_model = "D:/opencv-4.4.0/models/yolov4/yolov4.weights";

int main()
{
    
    
	Net net = readNetFromDarknet(yolo_cfg, yolo_model);
	vector<string> classNamesVec;
	ifstream classNamesFile("D:/opencv-4.4.0/models/yolov4/coco.names");
	if (classNamesFile.is_open())
	{
    
    
		string className = "";
		while (std::getline(classNamesFile, className))
			classNamesVec.push_back(className);
	}

	//net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
	net.setPreferableTarget(DNN_TARGET_CPU);
	net.setPreferableBackend(DNN_BACKEND_OPENCV);

	std::vector<String> outNames = net.getUnconnectedOutLayersNames();
	for (int i = 0; i < outNames.size(); i++) {
    
    
		printf("output layer name : %s\n", outNames[i].c_str());
	}

	VideoCapture capture("D:/images/balltest.mp4");
	Mat frame;
	while (capture.read(frame))
	{
    
    
	/*	transpose(frame, frame);
		flip(frame, frame, 1);*/
		Mat inputBlob = blobFromImage(frame, 1 / 255.F, Size(416, 416), Scalar(), true, false);
		net.setInput(inputBlob);

		// 输出检测频率和每帧耗时
		std::vector<Mat> outs;
		net.forward(outs, outNames);
		vector<double> layersTimings;
		double freq = getTickFrequency() / 1000;
		double time = net.getPerfProfile(layersTimings) / freq;
		ostringstream ss;
		ss << "FPS" << 1000 / time << ";time:" << time << "ms";
		putText(frame, ss.str(), Point(20, 20), FONT_HERSHEY_PLAIN, 1, Scalar(0, 0, 255), 2, 8);

		// 输出检测框和置信度
		vector<Rect> boxes;
		vector<int> classIds; 
		vector<float> confidences;
		for (size_t i = 0; i < outs.size(); ++i)
		{
    
    
			float* data = (float*)outs[i].data;
			for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
			{
    
    
				Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
				Point classIdPoint;
				double confidence;
				minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
				if (confidence > 0.5)
				{
    
    
					int centerX = (int)(data[0] * frame.cols);
					int centerY = (int)(data[1] * frame.rows);
					int width = (int)(data[2] * frame.cols);
					int height = (int)(data[3] * frame.rows);
					int left = centerX - width / 2;
					int top = centerY - height / 2;

					classIds.push_back(classIdPoint.x);
					confidences.push_back((float)confidence);
					boxes.push_back(Rect(left, top, width, height));

				}
			}
		}

		vector<int> indices;
		NMSBoxes(boxes, confidences, 0.5, 0.2, indices);
		for (size_t i = 0; i < indices.size(); ++i)
		{
    
    
			int idx = indices[i];
			Rect box = boxes[idx];
			rectangle(frame, box, Scalar(0, 0, 255), 2, 8, 0);
			String className = classNamesVec[classIds[idx]];
			putText(frame, format("%s:%.2f",className.c_str(), confidences[idx]), box.tl(), FONT_HERSHEY_SIMPLEX, 1.0, Scalar(255, 0, 0), 2, 8);
		}

		imshow("YOLOv4", frame);
		char c = waitKey(5);
		if (c == 27) {
    
     // ESC退出
			break;
		}
	}
	capture.release();//释放资源
	waitKey(0);
	return 0;
}