YOLOv5 + OpenVINO deployment: solving the slow loading of the quantized model

Pain point

After training, the YOLOv5 model is quantized with NNCF and converted into INT8 XML and BIN files (the OpenVINO IR). When the XML model is loaded for C++ deployment, the very first load takes a long time. The quantized model roughly doubles detection speed, but the long load time is a real problem in industrial production: on a line that is already running, starting the inspection software means the first batch of products passes through before the model is ready and is judged PreNG because it was never inspected. We therefore need a way to get rid of the long initial load time of the XML model.

Solution

Testing showed that the unquantized ONNX model loads quickly enough for production (my guess is that the ONNX file stores the graph and the weights together, while the IR splits them into an XML graph file and a BIN weight file, and this split storage makes loading slower). So we start by loading the ONNX model and detecting with it, and spawn a separate thread to load the XML model, i.e. detect with ONNX while the XML model loads in the background. We also keep a global counter (onnx_detect_count in the code below) that counts ONNX detections. Once the counter reaches a threshold and the XML infer requests have been created, and only when both conditions hold, we enter the XML inference code and use the accelerated model for all subsequent inference. The logs show exactly this: detection is relatively slow at first, then speeds up abruptly the moment the XML model finishes loading.
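Before walking through the project code, here is a minimal sketch of the pattern, with illustrative file names, thresholds, and helper names (not the project's actual API): detect with the ONNX model right away, compile the INT8 IR on a detached background thread, and switch over once the IR request exists and the counter passes a threshold.

#include <openvino/openvino.hpp>
#include <atomic>
#include <thread>

ov::InferRequest onnx_request;              // created synchronously at startup
ov::InferRequest xml_request;               // filled in by the background thread
std::atomic<bool> xml_ready{ false };
std::atomic<int>  onnx_detect_count{ 0 };

void load_xml_async() {
	std::thread([]() {
		ov::Core core;
		auto compiled = core.compile_model("yolov5s_int8.xml", "CPU");
		xml_request = compiled.create_infer_request();  // the slow step
		xml_ready = true;                               // publish only after the request exists
	}).detach();
}

ov::InferRequest& pick_request() {
	// keep using ONNX until the IR is ready and enough frames have been processed
	if (xml_ready && onnx_detect_count >= 20)
		return xml_request;
	++onnx_detect_count;
	return onnx_request;
}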

Code details

Setting up the global variables

#include"yolov5_detect.h"
#include "mobilenet.h"
//#include <opencv2/cudaarithm.hpp>
//#include <opencv2/cudaoptflow.hpp>
//#include <opencv2/cudaimgproc.hpp>
#include<crtdbg.h>

#ifdef _DEBUG
#define new new(_NORMAL_BLOCK,__FILE__,__LINE__)
#endif
#include <time.h>
#include <chrono>
using namespace std::chrono;
//#include <windows.h>

InferRequest infer_request_left_up;
InferRequest infer_request_left_down;
InferRequest infer_request_right_up;
InferRequest infer_request_right_down;
InferRequest infer_request_WuzhuangSeCha_left;
InferRequest infer_request_WuzhuangSeCha_right;

InferRequest infer_request_left_up_;
InferRequest infer_request_left_down_;
InferRequest infer_request_right_up_;
InferRequest infer_request_right_down_;
InferRequest infer_request_WuzhuangSeCha_left_;
InferRequest infer_request_WuzhuangSeCha_right_;

string model_file = "yolov5s_int8.xml";
string model_file_onnx = "yolo_detect.onnx";
string model_file_WuZhuangSeCha = "yolo_WuZhuangSeCha_int8.xml";
string model_file_WuZhuangSeCha_onnx = "yolo_WuZhuangSeCha_detect.onnx";
string device_name = "CPU";
const float threshold_nms = 0.45; // no need to change
const float confidence_score = 0.6; // no need to change
const float threshold_score = 0.25;
const int OVERLAP = 100; // no need to change
const int ADD_HEIGHT = 900; // 4000 / 4 - 100;
vector<string> class_names(0);
vector<string> class_names_WuzhuangSeCha(0);
int detect_num_classes = 0;  // no need to change
int detect_num_classes_WuzhuangSeCha = 0;  // no need to change
int max_detection = 100;
int tt = 0;
bool load_xml_left = true;
bool load_xml_right = true;
bool load_xml_Wu_left = true;
bool load_xml_Wu_right = true;
bool xml_loading_left = false;
bool xml_loading_right = false;
bool xml_loading_Wu_left = false;
bool xml_loading_Wu_right = false;

// counter for ONNX-model detections
int onnx_detect_count = 0;

Defining the initialization functions for the detection models (XML)

CPPDLL_API void Initial_model_xml_left()
{
	if (load_xml_left && xml_loading_left == false)
	{
		Core core;
		// load and compile the INT8 IR model
		CompiledModel compiled_model_left_up_ = core.compile_model(model_file, device_name);

		// load and compile the INT8 IR model
		CompiledModel compiled_model_left_down_ = core.compile_model(model_file, device_name);

		if (compiled_model_left_up_ && compiled_model_left_down_)
		{
			infer_request_left_down_ = compiled_model_left_down_.create_infer_request();
			infer_request_left_up_ = compiled_model_left_up_.create_infer_request();

			xml_loading_left = true;
		}
	}
	if (onnx_detect_count >= 60 && infer_request_left_down && infer_request_left_up)
	{
		// release the ONNX infer requests once the XML model has taken over
		infer_request_left_down.~InferRequest();
		infer_request_left_up.~InferRequest();
	}
}

CPPDLL_API void Initial_model_xml_right()
{
	if (load_xml_right && xml_loading_right == false)
	{
		Core core;
		// load and compile the INT8 IR model
		CompiledModel compiled_model_right_up_ = core.compile_model(model_file, device_name);

		// load and compile the INT8 IR model
		CompiledModel compiled_model_right_down_ = core.compile_model(model_file, device_name);

		if (compiled_model_right_up_ && compiled_model_right_down_)
		{
			infer_request_right_down_ = compiled_model_right_down_.create_infer_request();
			infer_request_right_up_ = compiled_model_right_up_.create_infer_request();

			xml_loading_right = true;
		}
	}
	if (onnx_detect_count >= 60 && infer_request_right_down && infer_request_right_up)
	{
		// release the ONNX infer requests once the XML model has taken over
		infer_request_right_down.~InferRequest();
		infer_request_right_up.~InferRequest();
	}
}

CPPDLL_API void Initial_model_xml_Wu_left()
{
	if (load_xml_Wu_left && xml_loading_Wu_left == false)
	{
		Core core;
		// load and compile the INT8 IR model
		CompiledModel compiled_model_WuzhuangSeCha_left_ = core.compile_model(model_file_WuZhuangSeCha, device_name);

		if (compiled_model_WuzhuangSeCha_left_)
		{
			infer_request_WuzhuangSeCha_left_ = compiled_model_WuzhuangSeCha_left_.create_infer_request();

			xml_loading_Wu_left = true;
		}
	}
	if (onnx_detect_count >= 60 && infer_request_WuzhuangSeCha_left)
	{
		// release the ONNX infer request once the XML model has taken over
		infer_request_WuzhuangSeCha_left.~InferRequest();
	}
}

CPPDLL_API void Initial_model_xml_Wu_right()
{
	if (load_xml_Wu_right && xml_loading_Wu_right == false)
	{
		Core core;
		// load and compile the INT8 IR model
		CompiledModel compiled_model_WuzhuangSeCha_right_ = core.compile_model(model_file_WuZhuangSeCha, device_name);

		if (compiled_model_WuzhuangSeCha_right_)
		{
			infer_request_WuzhuangSeCha_right_ = compiled_model_WuzhuangSeCha_right_.create_infer_request();

			xml_loading_Wu_right = true;
		}
	}
	if (onnx_detect_count >= 60 && infer_request_WuzhuangSeCha_right)
	{
		// release the ONNX infer request once the XML model has taken over
		infer_request_WuzhuangSeCha_right.~InferRequest();
	}
}

void xml_init()
{
	Initial_model_xml_left();
	Initial_model_xml_right();
	Initial_model_xml_Wu_left();
	Initial_model_xml_Wu_right();
}

The xml_init() function gathers the XML loading routines for all of our detection models; later, inside the ONNX initialization function, it is handed to a newly created thread. A few details on each XML loading routine: load_xml_left is a global flag set to true, meaning the XML model still needs to be loaded, and xml_loading_left is another global, initialized to false, meaning the XML model has not finished loading yet. When both conditions hold we enter the first if block, create a Core, and compile the two models. Once both compiled models exist we enter the next if block and create their infer requests; my earlier tests showed that creating the infer requests is what takes most of the time. After the requests are created we set xml_loading_left to true, marking the XML model as loaded, so the first if block is never entered again. The third if block fires when the detection counter reaches 60 and both ONNX infer requests still exist: we destruct them to free the memory they occupy. After all of this, the function no longer enters the first or the third if block, so there is no risk of re-running the initialization work while detection is in progress.
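If you want to check where the time actually goes on your own machine, a standalone timing snippet like the one below (an illustration, not part of the project code) separates the compile_model() call from create_infer_request():

#include <openvino/openvino.hpp>
#include <chrono>
#include <iostream>

int main() {
	using Clock = std::chrono::steady_clock;
	ov::Core core;

	auto t0 = Clock::now();
	ov::CompiledModel compiled = core.compile_model("yolov5s_int8.xml", "CPU");
	auto t1 = Clock::now();
	ov::InferRequest request = compiled.create_infer_request();
	auto t2 = Clock::now();

	auto ms = [](auto a, auto b) {
		return std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
	};
	std::cout << "compile_model:        " << ms(t0, t1) << " ms\n";
	std::cout << "create_infer_request: " << ms(t1, t2) << " ms\n";
	return 0;
}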

Defining the initialization function for the detection models (ONNX), and handing the XML initialization to a child thread

CPPDLL_API void Initial_Model() {
	//1. create the OpenVINO Runtime Core object
	Core core;

	//2. load and compile the ONNX model
	CompiledModel compiled_model_left_up = core.compile_model(model_file_onnx, device_name);

	//3. create the infer request
	infer_request_left_up = compiled_model_left_up.create_infer_request();

	//2. load and compile the ONNX model
	CompiledModel compiled_model_left_down = core.compile_model(model_file_onnx, device_name);

	//3. create the infer request
	infer_request_left_down = compiled_model_left_down.create_infer_request();

	//2. load and compile the ONNX model
	CompiledModel compiled_model_right_up = core.compile_model(model_file_onnx, device_name);

	//3. create the infer request
	infer_request_right_up = compiled_model_right_up.create_infer_request();

	//2. load and compile the ONNX model
	CompiledModel compiled_model_right_down = core.compile_model(model_file_onnx, device_name);

	//3. create the infer request
	infer_request_right_down = compiled_model_right_down.create_infer_request();

	class_names = getClassName("cocoName.txt");
	detect_num_classes = class_names.size();

	initial_model_classification();
	Initial_Model_WuzhuangSeCha();

	// load the XML models on a detached background thread
	thread xml_thread(xml_init);
	xml_thread.detach();


	// warm-up
	//Warm_CPU();
}

The key is the last two lines. We first construct a std::thread object and pass it the XML initialization function, then call detach() to detach the child thread from the calling thread. That way, once the ONNX models in this function have finished loading, the call does not get stuck waiting for the XML models: the calling thread continues on, while the child (detached) thread keeps running in the background. With join(), by contrast, the calling thread would wait for the child thread to finish before executing the code that follows. For a detailed comparison of join() and detach() in std::thread, see the post "std::thread中join与detach的区别与实现".
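A tiny standalone example of the difference (illustrative only): with detach() the main thread moves on immediately, while join() would block until the worker finishes.

#include <chrono>
#include <iostream>
#include <thread>

void slow_load() {
	std::this_thread::sleep_for(std::chrono::seconds(2));
	std::cout << "background load finished\n";
}

int main() {
	std::thread t(slow_load);
	t.detach();   // main continues immediately; slow_load keeps running in the background
	// t.join();  // would instead block here for ~2 s before continuing
	std::cout << "main keeps going while the load runs\n";
	std::this_thread::sleep_for(std::chrono::seconds(3));  // keep the process alive so the demo can finish
	return 0;
}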

Preprocessing, inference, and postprocessing

We take one of the models as an example. The project actually performs six inference passes, but they all follow much the same flow, so we only show one of them.

void detect_left_up(cv::Mat frame, string& return_result) {
	Shape tensor_shape;
	Tensor input_node;
	if (xml_loading_left && onnx_detect_count >= 20)
	{
		// XML (INT8) branch: the quantized model has finished loading, use it from now on
		input_node = infer_request_left_up_.get_input_tensor();
		tensor_shape = input_node.get_shape();

		//Mat frame = imread(".\\test_data0\\2.png", IMREAD_COLOR);
		//Letterbox resize is the default resize method in YOLOv5.
		int w = frame.cols;
		int h = frame.rows;
		int _max = max(h, w);
		Mat image(Size(_max, _max), CV_8UC3, cv::Scalar(255, 255, 255));
		Rect roi(0, 0, w, h);
		frame.copyTo(image(roi));
		// swap the R and B channels
		cvtColor(image, image, COLOR_BGR2RGB);
		// compute the scaling factors
		size_t num_channels = tensor_shape[1];
		size_t height = tensor_shape[2];
		size_t width = tensor_shape[3];
		float x_factor = image.cols / float(width);
		float y_factor = image.rows / float(height);

		//int64 start = cv::getTickCount();
		// resize the image and normalize it
		Mat blob_image;
		resize(image, blob_image, cv::Size(width, height));
		blob_image.convertTo(blob_image, CV_32F);
		blob_image = blob_image / 255.0;

		// 4.3 fill the input tensor with the image data
		Tensor input_tensor = infer_request_left_up_.get_input_tensor();
		// get a pointer to the data block of the model input node
		float* input_tensor_data = input_node.data<float>();
		// copy the image into the model input node
		// the image is HWC, the model input expects CHW
		for (size_t c = 0; c < num_channels; c++) {
			for (size_t h = 0; h < height; h++) {
				for (size_t w = 0; w < width; w++) {
					input_tensor_data[c * width * height + h * width + w] = blob_image.at<Vec<float, 3>>(h, w)[c];
				}
			}
		}

		// 5. run inference
		auto start1 = std::chrono::system_clock::now();
		infer_request_left_up_.infer();
		auto end1 = std::chrono::system_clock::now();
		//auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end1 - start1).count();
		//cout << "left_up inference time: " << duration << "ms" << endl;


		// 6. process the inference results
		// 6.1 get the output tensor
		const ov::Tensor& output = infer_request_left_up_.get_tensor("output");
		const float* output_buffer = output.data<const float>();

		// 6.2 parse the results, YOLOv5 output format: cx,cy,w,h,score
		int out_rows = output.get_shape()[1]; // rows of the "output" node
		int out_cols = output.get_shape()[2]; // cols of the "output" node
		Mat det_output(out_rows, out_cols, CV_32F, (float*)output_buffer);

		vector<cv::Rect> boxes;
		vector<int> classIds;
		vector<float> confidences;

		for (int i = 0; i < det_output.rows; i++) {
			float confidence = det_output.at<float>(i, 4);
			if (confidence < confidence_score) {
				continue;
			}
			int endindex = detect_num_classes + 5;
			Mat classes_scores = det_output.row(i).colRange(5, endindex);
			Point classIdPoint;
			double score;
			minMaxLoc(classes_scores, 0, &score, 0, &classIdPoint);

			// class score in [0, 1]
			if (score > confidence_score)
			{
				float cx = det_output.at<float>(i, 0);
				float cy = det_output.at<float>(i, 1);
				float ow = det_output.at<float>(i, 2);
				float oh = det_output.at<float>(i, 3);
				int x = static_cast<int>((cx - 0.5 * ow) * x_factor);
				int y = static_cast<int>((cy - 0.5 * oh) * y_factor);
				int width = static_cast<int>(ow * x_factor);
				int height = static_cast<int>(oh * y_factor);
				Rect box;
				box.x = x;
				box.y = y;
				box.width = width;
				box.height = height;

				boxes.push_back(box);
				classIds.push_back(classIdPoint.x);
				confidences.push_back(score);
			}
		}

		// NMS
		vector<int> indexes;
		dnn::NMSBoxes(boxes, confidences, threshold_score, threshold_nms, indexes);
		int count = 0;
		if (indexes.size() > max_detection) {
			count = max_detection;
		}
		else {
			count = indexes.size();
		}
		for (size_t i = 0; i < count; i++) {
			int index = indexes[i];
			int idx = classIds[index];


			int x0 = int(boxes[index].x);
			int y0 = int(boxes[index].y);
			int x1 = int(boxes[index].x) + int(boxes[index].width);
			int y1 = int(boxes[index].y) + int(boxes[index].height);
			int area = boxes[index].width * boxes[index].height;

			return_result += class_names[idx] + "," + std::to_string(int(x0)) + "," + std::to_string(int(y0)) + "," + std::to_string(int(x1)) + "," +
				std::to_string(int(y1)) + "," + std::to_string(int(area)) + "," + std::to_string(confidences[index]) + ";" + "\n";

			rectangle(frame, Point(x0, y0), Point(x1, y1), Scalar(0, 255, 255), 4);

			cout << " " << endl;

			//putText(frame, class_names[idx], Point(boxes[index].tl().x, boxes[index].tl().y - 10), FONT_HERSHEY_SIMPLEX, .5, Scalar(0, 0, 0));
		}
		//cv::namedWindow("left_up", WINDOW_NORMAL);
		//cv::imshow("left_up", frame);
		//cv::waitKey(0);
	}
	else
	{
		// ONNX branch: the XML model is not ready yet, keep detecting with the ONNX model
		input_node = infer_request_left_up.get_input_tensor();
		tensor_shape = input_node.get_shape();

		//Mat frame = imread(".\\test_data0\\2.png", IMREAD_COLOR);
		//Letterbox resize is the default resize method in YOLOv5.
		int w = frame.cols;
		int h = frame.rows;
		int _max = max(h, w);
		Mat image(Size(_max, _max), CV_8UC3, cv::Scalar(255, 255, 255));
		Rect roi(0, 0, w, h);
		frame.copyTo(image(roi));
		// swap the R and B channels
		cvtColor(image, image, COLOR_BGR2RGB);
		// compute the scaling factors
		size_t num_channels = tensor_shape[1];
		size_t height = tensor_shape[2];
		size_t width = tensor_shape[3];
		float x_factor = image.cols / float(width);
		float y_factor = image.rows / float(height);

		//int64 start = cv::getTickCount();
		// resize the image and normalize it
		Mat blob_image;
		resize(image, blob_image, cv::Size(width, height));
		blob_image.convertTo(blob_image, CV_32F);
		blob_image = blob_image / 255.0;

		// 4.3 fill the input tensor with the image data
		Tensor input_tensor = infer_request_left_up.get_input_tensor();
		// get a pointer to the data block of the model input node
		float* input_tensor_data = input_node.data<float>();
		// copy the image into the model input node
		// the image is HWC, the model input expects CHW
		for (size_t c = 0; c < num_channels; c++) {
			for (size_t h = 0; h < height; h++) {
				for (size_t w = 0; w < width; w++) {
					input_tensor_data[c * width * height + h * width + w] = blob_image.at<Vec<float, 3>>(h, w)[c];
				}
			}
		}

		// 5. run inference
		auto start1 = std::chrono::system_clock::now();
		infer_request_left_up.infer();
		auto end1 = std::chrono::system_clock::now();
		//auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end1 - start1).count();
		//cout << "left_up inference time: " << duration << "ms" << endl;


		// 6. process the inference results
		// 6.1 get the output tensor
		const ov::Tensor& output = infer_request_left_up.get_tensor("output");
		const float* output_buffer = output.data<const float>();

		// 6.2 parse the results, YOLOv5 output format: cx,cy,w,h,score
		int out_rows = output.get_shape()[1]; // rows of the "output" node
		int out_cols = output.get_shape()[2]; // cols of the "output" node
		Mat det_output(out_rows, out_cols, CV_32F, (float*)output_buffer);

		vector<cv::Rect> boxes;
		vector<int> classIds;
		vector<float> confidences;

		for (int i = 0; i < det_output.rows; i++) {
			float confidence = det_output.at<float>(i, 4);
			if (confidence < confidence_score) {
				continue;
			}
			int endindex = detect_num_classes + 5;
			Mat classes_scores = det_output.row(i).colRange(5, endindex);
			Point classIdPoint;
			double score;
			minMaxLoc(classes_scores, 0, &score, 0, &classIdPoint);

			// class score in [0, 1]
			if (score > confidence_score)
			{
				float cx = det_output.at<float>(i, 0);
				float cy = det_output.at<float>(i, 1);
				float ow = det_output.at<float>(i, 2);
				float oh = det_output.at<float>(i, 3);
				int x = static_cast<int>((cx - 0.5 * ow) * x_factor);
				int y = static_cast<int>((cy - 0.5 * oh) * y_factor);
				int width = static_cast<int>(ow * x_factor);
				int height = static_cast<int>(oh * y_factor);
				Rect box;
				box.x = x;
				box.y = y;
				box.width = width;
				box.height = height;

				boxes.push_back(box);
				classIds.push_back(classIdPoint.x);
				confidences.push_back(score);
			}
		}


		// NMS
		vector<int> indexes;
		dnn::NMSBoxes(boxes, confidences, threshold_score, threshold_nms, indexes);
		int count = 0;
		if (indexes.size() > max_detection) {
			count = max_detection;
		}
		else {
			count = indexes.size();
		}
		for (size_t i = 0; i < count; i++) {
			int index = indexes[i];
			int idx = classIds[index];


			int x0 = int(boxes[index].x);
			int y0 = int(boxes[index].y);
			int x1 = int(boxes[index].x) + int(boxes[index].width);
			int y1 = int(boxes[index].y) + int(boxes[index].height);
			int area = boxes[index].width * boxes[index].height;

			return_result += class_names[idx] + "," + std::to_string(int(x0)) + "," + std::to_string(int(y0)) + "," + std::to_string(int(x1)) + "," +
				std::to_string(int(y1)) + "," + std::to_string(int(area)) + "," + std::to_string(confidences[index]) + ";" + "\n";

			rectangle(frame, Point(x0, y0), Point(x1, y1), Scalar(0, 255, 255), 4);
			// count this ONNX detection; the switch to the XML model checks this counter
			onnx_detect_count++;

			cout << " " << endl;

			//putText(frame, class_names[idx], Point(boxes[index].tl().x, boxes[index].tl().y - 10), FONT_HERSHEY_SIMPLEX, .5, Scalar(0, 0, 0));
		}
		//cv::namedWindow("left_up", WINDOW_NORMAL);
		//cv::imshow("left_up", frame);
		//cv::waitKey(0);
	}
	// compute FPS
	//float t = (getTickCount() - start) / static_cast<float>(getTickFrequency());
	//cout << "Infer time(ms): " << t * 1000 << "ms; Detections: " << indexes.size() << endl;
	//putText(frame, format("FPS: %.2f", 1.0 / t), Point(20, 40), FONT_HERSHEY_PLAIN, 2.0, Scalar(255, 0, 0), 2, 8);
	//cv::namedWindow("YOLOv5-6.1 + OpenVINO 2022.1 C++ Demo", WINDOW_NORMAL);
	//imshow("YOLOv5-6.1 + OpenVINO 2022.1 C++ Demo", frame);

	//waitKey(0);
	//destroyAllWindows();

}

Only two details matter here. The first is the if condition: xml_loading_left being true means the XML model has finished loading, and onnx_detect_count >= 20. The value 20 is fairly arbitrary; it just has to stay below the 60 used in the model-loading functions. Why? Originally the ONNX destruction condition in the XML loading functions was >= 30 rather than >= 60. With 30, the ONNX models were destructed as soon as the counter reached 30, while the XML models had not finished loading yet, leaving neither an XML model nor an ONNX model available, and the program exited abnormally. I therefore raised the threshold to 60 to give the XML models enough time to load. The second detail is the else branch: it runs the same pipeline on the ONNX infer request (infer_request_left_up, without the trailing underscore) and increments onnx_detect_count, which is what eventually triggers the switch.
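One way to make that constraint explicit (a hypothetical refactor, not in the original code) is to name both thresholds and assert their ordering at compile time, so the "switch before destroy" relationship cannot silently break:

// Hypothetical named constants for the two magic numbers used above.
constexpr int SWITCH_TO_XML_AFTER = 20;  // detect_* starts using the XML request at this count
constexpr int DESTROY_ONNX_AFTER = 60;   // the XML init functions free the ONNX requests at this count

// The switch must happen strictly before the ONNX requests are destroyed,
// otherwise there is a window with no usable model and the program crashes.
static_assert(SWITCH_TO_XML_AFTER < DESTROY_ONNX_AFTER,
	"switch to the XML model before destroying the ONNX infer requests");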

Summary

With this, we have solved the problem of the NNCF-quantized XML model taking too long to load at initialization, which had caused large numbers of products to go uninspected in industrial production. Along the way I learned how to use multiple models flexibly for detection and picked up some std::thread techniques.


Reposted from blog.csdn.net/ycx_ccc/article/details/131965908