YOLOv5 duplicate detection boxes: solving the large-box-wrapping-small-box problem, with C++ and Python code calling the ONNX model

After training is completed, YOLOv5 produces a model (.pt) file, which can also be converted to an ONNX file. When running inference on images, the following situation can occur: a large box encloses a small box, so a single target in the scene is recognized as two, with both drawn boxes sitting on the same target. The phenomenon is more severe in scenes with many targets in a single image, as shown in the figure below.

As shown in the figure above, the hat in the upper right corner gets a duplicate label, with the large box wrapping the small box.

Searching for information reveals that, in the latest code, the anchor box decoding process is baked into the model when it is generated and exported to ONNX. The exported ONNX model is shown in the figure below (only the anchor box decoding part is shown).
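
To confirm this on a given model, you can inspect the exported file's outputs; below is a minimal sketch, assuming the onnx package is installed and using a hypothetical yolov5s.onnx path:

import onnx

model = onnx.load('yolov5s.onnx')  # hypothetical path to the exported model
# the stock export has a single, already-decoded output, e.g. (1, 25200, 85) for a 640x640 input
for out in model.graph.output:
    dims = [d.dim_value if d.dim_value > 0 else d.dim_param for d in out.type.tensor_type.shape.dim]
    print(out.name, dims)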

Use opencv's dnn module to do yolov5 target detection — nihate's blog (CSDN)

That post (linked above) describes in detail how to modify the code and process the model; the relevant code and processing steps are summarized below (they may differ slightly from the post, because the code was modified to fix errors reported during use).

Make the following modification to the Detect class in models/yolo.py (the sigmoid can be applied here or left out).

The modified forward method is as follows:

    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
        # if torch.onnx.is_in_onnx_export():
        for i in range(self.nl):  # process each of the three output layers
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            y = x[i].sigmoid()
            z.append(y.view(bs, -1, self.no))
            # z.append(x[i].view(bs, self.na * nx * ny, self.no))
        return torch.cat(z, 1)
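
With this change the model outputs the raw sigmoid values concatenated over the three detection layers, so the grid/anchor decode has to be performed by the caller. A minimal numpy sketch of that decode for one output layer (the names and shapes are illustrative; it mirrors the formulas used in the inference code further below):

import numpy as np

def decode_layer(y, grid, anchors, stride):
    # y: (na*ny*nx, no) sigmoid outputs of one layer (first 4 columns are the box)
    # grid: (ny*nx, 2) cell indices; anchors: (na, 2) anchor sizes in pixels; stride: 8/16/32
    y = y.copy()
    na, cells = anchors.shape[0], grid.shape[0]
    y[:, 0:2] = (y[:, 0:2] * 2.0 - 0.5 + np.tile(grid, (na, 1))) * stride   # box center x, y in pixels
    y[:, 2:4] = (y[:, 2:4] * 2.0) ** 2 * np.repeat(anchors, cells, axis=0)  # box width, height in pixels
    return y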

Add a my_export_onnx function to the export.py file:

@try_export
def my_export_onnx(model, im, file, opset, dynamic, simplify, prefix=colorstr('ONNX:')):
    print('anchors:', model.yaml['anchors'])
    wtxt = open('class.names', 'w')
    for name in model.names:
        wtxt.write(name+'\n')
    wtxt.close()
    # YOLOv5 ONNX export
    # print(im.shape)
    if not dynamic:
        f = os.path.splitext(file)[0] + '.onnx'
        torch.onnx.export(model, im, f, verbose=False, opset_version=12, input_names=['images'], output_names=['output'])
    else:
        f = os.path.splitext(file)[0] + '_dynamic.onnx'
        torch.onnx.export(model, im, f, verbose=False, opset_version=12, input_names=['images'],
                          output_names=['output'], dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # shape(1,3,640,640)
                                        'output': {0: 'batch', 1: 'anchors'}  # shape(1,25200,85)
                                        })
    try:
        import cv2
        net = cv2.dnn.readNet(f)
    except Exception:
        exit(f'export {f} failed')
    exit(f'export {f} success')

Also modify export_onnx in the export.py file; the beginning of the modified function looks like this:

@try_export
def export_onnx(model, im, file, opset, dynamic, simplify, prefix=colorstr('ONNX:')):
    # YOLOv5 ONNX export
    check_requirements('onnx')
# ============== 2022.12.14: added the call that exports the model with the yolov5 decode part pruned ========================
    my_export_onnx(model, im, file, opset,  False, simplify)
 
    import onnx

    LOGGER.info(f'\n{prefix} starting export with onnx {onnx.__version__}...')
    f = file.with_suffix('.onnx')

After the modification is complete, use the following command to generate the onnx model file

python export.py --weights yolov5s.pt --img 640 --batch 1 --include=onnx --simplify 
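
As a quick sanity check that the export worked (a minimal sketch, assuming the yolov5s.onnx produced by the command above), the model can be loaded with OpenCV's dnn module and run on a dummy input to inspect the output shape:

import cv2
import numpy as np

net = cv2.dnn.readNet('yolov5s.onnx')                # model exported by the command above
blob = np.zeros((1, 3, 640, 640), dtype=np.float32)  # dummy 640x640 input
net.setInput(blob)
out = net.forward(net.getUnconnectedOutLayersNames())[0]
print(out.shape)  # expect (1, 25200, num_classes + 5) of raw sigmoid values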

The anchor box decode portion of the modified ONNX model file is shown in the figure below.

The model inference code also needs to change accordingly; the Python code is as follows:

import cv2
import argparse
import numpy as np

class yolov5():
    def __init__(self, modelpath, confThreshold=0.5, nmsThreshold=0.5, objThreshold=0.5):
        with open(r'F:\XunLeiDownLoad\yolov5-v6.1-opencv-onnxrun-main\opencv/safetyclass.names', 'rt') as f:
            self.classes = f.read().rstrip('\n').split('\n')
        self.num_classes = len(self.classes)
        if modelpath.endswith('6.onnx'):
            self.inpHeight, self.inpWidth = 1280, 1280
            anchors = [[19, 27, 44, 40, 38, 94], [96, 68, 86, 152, 180, 137], [140, 301, 303, 264, 238, 542],
                       [436, 615, 739, 380, 925, 792]]
            self.stride = np.array([8., 16., 32., 64.])
        else:
            self.inpHeight, self.inpWidth = 640, 640
            anchors = [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]]
            self.stride = np.array([8., 16., 32.])
        self.nl = len(anchors)
        self.na = len(anchors[0]) // 2
        self.grid = [np.zeros(1)] * self.nl
        self.anchor_grid = np.asarray(anchors, dtype=np.float32).reshape(self.nl, -1, 2)
        self.net = cv2.dnn.readNet(modelpath)
        self.confThreshold = confThreshold
        self.nmsThreshold = nmsThreshold
        self.objThreshold = objThreshold
        self._inputNames = ''

    def resize_image(self, srcimg, keep_ratio=True, dynamic=False):
        top, left, newh, neww = 0, 0, self.inpWidth, self.inpHeight
        if keep_ratio and srcimg.shape[0] != srcimg.shape[1]:
            hw_scale = srcimg.shape[0] / srcimg.shape[1]
            if hw_scale > 1:
                newh, neww = self.inpHeight, int(self.inpWidth / hw_scale)
                img = cv2.resize(srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
                if not dynamic:
                    left = int((self.inpWidth - neww) * 0.5)
                    img = cv2.copyMakeBorder(img, 0, 0, left, self.inpWidth - neww - left, cv2.BORDER_CONSTANT,
                                             value=(114, 114, 114))  # add border
            else:
                newh, neww = int(self.inpHeight * hw_scale), self.inpWidth
                img = cv2.resize(srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
                if not dynamic:
                    top = int((self.inpHeight - newh) * 0.5)
                    img = cv2.copyMakeBorder(img, top, self.inpHeight - newh - top, 0, 0, cv2.BORDER_CONSTANT,
                                             value=(114, 114, 114))
        else:
            img = cv2.resize(srcimg, (self.inpWidth, self.inpHeight), interpolation=cv2.INTER_AREA)
        return img, newh, neww, top, left

    def _make_grid(self, nx=20, ny=20):
        xv, yv = np.meshgrid(np.arange(ny), np.arange(nx))
        return np.stack((xv, yv), 2).reshape((-1, 2)).astype(np.float32)

    def preprocess(self, img):
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0
        return img

    def postprocess(self, frame, outs, padsize=None):
        frameHeight = frame.shape[0]
        frameWidth = frame.shape[1]
        newh, neww, padh, padw = padsize
        ratioh, ratiow = frameHeight / newh, frameWidth / neww
        # Scan through all the bounding boxes output from the network and keep only the
        # ones with high confidence scores. Assign the box's class label as the class with the highest score.

        confidences = []
        boxes = []
        classIds = []
        for detection in outs:
            if detection[4] > self.objThreshold:
                scores = detection[5:]
                classId = np.argmax(scores)
                confidence = scores[classId] * detection[4]
                if confidence > self.confThreshold:
                    center_x = int((detection[0] - padw) * ratiow)
                    center_y = int((detection[1] - padh) * ratioh)
                    width = int(detection[2] * ratiow)
                    height = int(detection[3] * ratioh)
                    left = int(center_x - width * 0.5)
                    top = int(center_y - height * 0.5)

                    confidences.append(float(confidence))
                    boxes.append([left, top, width, height])
                    classIds.append(classId)
        # Perform non maximum suppression to eliminate redundant overlapping boxes with
        # lower confidences.
        indices = cv2.dnn.NMSBoxes(boxes, confidences, self.confThreshold, self.nmsThreshold).flatten()
        for i in indices:
            box = boxes[i]
            left = box[0]
            top = box[1]
            width = box[2]
            height = box[3]
            frame = self.drawPred(frame, classIds[i], confidences[i], left, top, left + width, top + height)
        return frame

    def drawPred(self, frame, classId, conf, left, top, right, bottom):
        # Draw a bounding box.
        cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), thickness=4)

        label = '%.2f' % conf
        label = '%s:%s' % (self.classes[classId], label)

        # Display the label at the top of the bounding box
        labelSize, baseLine = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        top = max(top, labelSize[1])
        # cv.rectangle(frame, (left, top - round(1.5 * labelSize[1])), (left + round(1.5 * labelSize[0]), top + baseLine), (255,255,255), cv.FILLED)
        cv2.putText(frame, label, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), thickness=2)
        return frame

    def detect(self, srcimg):
        img, newh, neww, padh, padw = self.resize_image(srcimg)
        blob = cv2.dnn.blobFromImage(img, scalefactor=1 / 255.0, swapRB=True)
        # blob = cv2.dnn.blobFromImage(self.preprocess(img))
        # Sets the input to the network
        self.net.setInput(blob, self._inputNames)

        # Runs the forward pass to get output of the output layers
        outs = self.net.forward(self.net.getUnconnectedOutLayersNames())[0].squeeze(axis=0)

        # inference output
        row_ind = 0
        for i in range(self.nl):
            h, w = int(self.inpHeight / self.stride[i]), int(self.inpWidth / self.stride[i])
            length = int(self.na * h * w)
            if self.grid[i].shape[2:4] != (h, w):
                self.grid[i] = self._make_grid(w, h)

            outs[row_ind:row_ind + length, 0:2] = (outs[row_ind:row_ind + length, 0:2] * 2. - 0.5 + np.tile(
                self.grid[i], (self.na, 1))) * int(self.stride[i])
            outs[row_ind:row_ind + length, 2:4] = (outs[row_ind:row_ind + length, 2:4] * 2) ** 2 * np.repeat(
                self.anchor_grid[i], h * w, axis=0)
            row_ind += length
        srcimg = self.postprocess(srcimg, outs, padsize=(newh, neww, padh, padw))
        return srcimg

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--imgpath', type=str, default=r'E:\code\detect\yolov5\dataset\safety_clothing\test_safety_clothing\images1\398404624-1-16_6675.jpg', help="image path")
    parser.add_argument('--modelpath', type=str, default=r'E:\code\detect\yolov5\runs\train\all_safetly\best_all_che.onnx')
    parser.add_argument('--confThreshold', default=0.3, type=float, help='class confidence')
    parser.add_argument('--nmsThreshold', default=0.5, type=float, help='nms iou thresh')
    parser.add_argument('--objThreshold', default=0.3, type=float, help='object confidence')
    args = parser.parse_args()

    yolonet = yolov5(args.modelpath, confThreshold=args.confThreshold, nmsThreshold=args.nmsThreshold,
                     objThreshold=args.objThreshold)
    srcimg = cv2.imread(args.imgpath)
    srcimg = yolonet.detect(srcimg)

    winName = 'Deep learning object detection in OpenCV'
    cv2.namedWindow(winName, 0)
    cv2.imshow(winName, srcimg)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

safetyclass.names is the label file; the figure below is a screenshot of its contents.
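
The file simply lists one class name per line, in the same order as the training labels; a hypothetical example for a safety-gear model might look like this:

hat
no_hat
safety_vest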

The C++ inference code is as follows:

#include <fstream>
#include <sstream>
#include <iostream>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

using namespace cv;
using namespace dnn;
using namespace std;

struct Net_config
{
	float confThreshold; // Confidence threshold
	float nmsThreshold;  // Non-maximum suppression threshold
	float objThreshold;  //Object Confidence threshold
	string modelpath;
};

//int endsWith(const string& s, const string& sub) {
//	return s.rfind(sub) == (s.length() - sub.length()) ? 1 : 0;
//}

const float anchors_640[3][6] = { {10.0,  13.0, 16.0,  30.0,  33.0,  23.0},
								 {30.0,  61.0, 62.0,  45.0,  59.0,  119.0},
								 {116.0, 90.0, 156.0, 198.0, 373.0, 326.0} };

//const float anchors_1280[4][6] = { {19, 27, 44, 40, 38, 94},{96, 68, 86, 152, 180, 137},{140, 301, 303, 264, 238, 542},
//					   {436, 615, 739, 380, 925, 792} };

class YOLO
{
public:
	explicit YOLO(const Net_config& config);
	void detect(Mat& frame);
private:
	float* anchors;
	int num_stride;
	int inpWidth;
	int inpHeight;
	vector<string> class_names;
//	int num_class;
	
	float confThreshold;
	float nmsThreshold;
	float objThreshold;
	const bool keep_ratio = true;
	Net net;
	void drawPred(float conf, int left, int top, int right, int bottom, Mat& frame, int classid);
	Mat resize_image(const Mat& srcimg, int *newh, int *neww, int *top, int *left) const;
};

YOLO::YOLO(const Net_config& config)
{
	this->confThreshold = config.confThreshold;
	this->nmsThreshold = config.nmsThreshold;
	this->objThreshold = config.objThreshold;

	this->net = readNet(config.modelpath);
//	ifstream ifs("F:\\XunLeiDownLoad\\yolov5-v6.1-opencv-onnxrun-main\\opencv/class.names");
    ifstream ifs(R"(F:\XunLeiDownLoad\yolov5-v6.1-opencv-onnxrun-main\opencv/safetyclass.names)");
	string line;
	while (getline(ifs, line)) this->class_names.push_back(line);
//	this->num_class = class_names.size();

//	if (endsWith(config.modelpath, "6.onnx"))
//	{
//		anchors = (float*)anchors_1280;
//		this->num_stride = 4;
//		this->inpHeight = 1280;
//		this->inpWidth = 1280;
//	}
//	else
//	{
		anchors = (float*)anchors_640;
		this->num_stride = 3;
		this->inpHeight = 640;
		this->inpWidth = 640;
//	}
}

Mat YOLO::resize_image(const Mat& srcimg, int *newh, int *neww, int *top, int *left) const
{
	int srch = srcimg.rows, srcw = srcimg.cols;
	*newh = this->inpHeight;
	*neww = this->inpWidth;
	Mat dstimg;
	if (this->keep_ratio && srch != srcw) {
		float hw_scale = (float)srch / srcw;
		if (hw_scale > 1) {
			*newh = this->inpHeight;
			*neww = int(this->inpWidth / hw_scale);
			resize(srcimg, dstimg, Size(*neww, *newh), INTER_AREA);
			*left = int((this->inpWidth - *neww) * 0.5);
			copyMakeBorder(dstimg, dstimg, 0, 0, *left, this->inpWidth - *neww - *left, BORDER_CONSTANT, Scalar(114, 114, 114));
		}
		else {
			*newh = int(this->inpHeight * hw_scale);
			*neww = this->inpWidth;
			resize(srcimg, dstimg, Size(*neww, *newh), INTER_AREA);
			*top = int((this->inpHeight - *newh) * 0.5);
			copyMakeBorder(dstimg, dstimg, *top, this->inpHeight - *newh - *top, 0, 0, BORDER_CONSTANT, Scalar(114, 114, 114));
		}
	}
	else {
		resize(srcimg, dstimg, Size(*neww, *newh), INTER_AREA);
	}
	return dstimg;
}

void YOLO::drawPred(float conf, int left, int top, int right, int bottom, Mat& frame, int classid)   // Draw the predicted bounding box
{
	//Draw a rectangle displaying the bounding box
	rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 0, 255), 2);

	//Get the label for the class name and its confidence
	string label = format("%.2f", conf);
	label = this->class_names[classid] + ":" + label;

	//Display the label at the top of the bounding box
	int baseLine;
	Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
	top = max(top, labelSize.height);
	//rectangle(frame, Point(left, top - int(1.5 * labelSize.height)), Point(left + int(1.5 * labelSize.width), top + baseLine), Scalar(0, 255, 0), FILLED);
	putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
}

void YOLO::detect(Mat& frame)
{
	int newh = 0, neww = 0, padh = 0, padw = 0;
	Mat dstimg = this->resize_image(frame, &newh, &neww, &padh, &padw);
	Mat blob = blobFromImage(dstimg, 1 / 255.0, Size(this->inpWidth, this->inpHeight), Scalar(0, 0, 0), true, false);
	this->net.setInput(blob);
	vector<Mat> outs;
	this->net.forward(outs, this->net.getUnconnectedOutLayersNames());

	int num_proposal = outs[0].size[1];
	int nout = outs[0].size[2];
	if (outs[0].dims > 2)
	{
		outs[0] = outs[0].reshape(0, num_proposal);
	}
	// generate proposals
	vector<float> confidences;
	vector<Rect> boxes;
	vector<int> classIds;
	float ratioh = (float)frame.rows / newh, ratiow = (float)frame.cols / neww;
	int n = 0, q = 0, i = 0, j = 0, row_ind = 0; ///xmin,ymin,xmax,ymax,box_score,class_score
	auto* pdata = (float*)outs[0].data;
	for (n = 0; n < this->num_stride; n++)   /// feature map scales
	{
		const float stride = pow(2, n + 3);
		int num_grid_x = (int)ceil((this->inpWidth / stride));
		int num_grid_y = (int)ceil((this->inpHeight / stride));
		for (q = 0; q < 3; q++)    ///anchor
		{
			const float anchor_w = this->anchors[n * 6 + q * 2];
			const float anchor_h = this->anchors[n * 6 + q * 2 + 1];
			for (i = 0; i < num_grid_y; i++)
			{
				for (j = 0; j < num_grid_x; j++)
				{
					float box_score = pdata[4];
					if (box_score > this->objThreshold)
					{
						Mat scores = outs[0].row(row_ind).colRange(5, nout);
						Point classIdPoint;
						double max_class_score;
						// Get the value and location of the maximum score
						minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);
						max_class_score *= box_score;
						if (max_class_score > this->confThreshold)
						{ 
							const int class_idx = classIdPoint.x;
							float cx = (pdata[0] * 2.f - 0.5f + j) * stride;  ///cx
							float cy = (pdata[1] * 2.f - 0.5f + i) * stride;   ///cy
							float w = powf(pdata[2] * 2.f, 2.f) * anchor_w;   ///w
							float h = powf(pdata[3] * 2.f, 2.f) * anchor_h;  ///h

							int left = int((cx - padw - 0.5 * w)*ratiow);
							int top = int((cy - padh - 0.5 * h)*ratioh);

							confidences.push_back((float)max_class_score);
//							boxes.push_back(Rect(left, top, (int)(w*ratiow), (int)(h*ratioh)));
                            boxes.emplace_back(left, top, (int)(w*ratiow), (int)(h*ratioh));
							classIds.push_back(class_idx);
						}
					}
					row_ind++;
					pdata += nout;
				}
			}
		}
	}

	// Perform non maximum suppression to eliminate redundant overlapping boxes with
	// lower confidences
	vector<int> indices;
    /* dnn::NMSBoxes
     * Purpose: run NMS (non-maximum suppression) on the given detection boxes and their scores
     * NMSBoxes(bboxes,
             scores,
             score_threshold,
             nms_threshold,
             eta=None,
             top_k=None)
            Parameters:
            boxes: the bounding boxes to process
            scores: the scores of those bounding boxes
            score_threshold: score threshold used to filter the boxes
            nms_threshold: IoU threshold used by NMS
            indices: indices of the boxes kept after NMS
            eta: coefficient in the adaptive threshold formula
     * */
	dnn::NMSBoxes(boxes, confidences, this->confThreshold, this->nmsThreshold, indices);
//	for (size_t i = 0; i < indices.size(); ++i)
//	{
//		int idx = indices[i];
//		Rect box = boxes[idx];
//		this->drawPred(confidences[idx], box.x, box.y,
//			box.x + box.width, box.y + box.height, frame, classIds[idx]);
//	}
    for (int idx : indices)
    {
        Rect box = boxes[idx];
        this->drawPred(confidences[idx], box.x, box.y,
                       box.x + box.width, box.y + box.height, frame, classIds[idx]);
    }
}

int main()
{
//	Net_config yolo_nets = { 0.3, 0.5, 0.3, "F:\\XunLeiDownLoad\\yolov5-v6.1-opencv-onnxrun-main\\opencv/weights/yolov5s.onnx" };
  Net_config yolo_nets = { 0.3, 0.5, 0.3, R"(E:\code\detect\yolov5\runs\train\all_safetly\best_all_che.onnx)" };
	YOLO yolo_model(yolo_nets);
	string imgpath = R"(E:\code\detect\yolov5\dataset\safety_clothing\test_safety_clothing\images1\398404624-1-16_6675.jpg)";
	Mat srcimg = imread(imgpath);
	yolo_model.detect(srcimg);
    string saveimg_path= R"(E:\code\detect\yolov5\testsave\safety2.jpg)";
    imwrite(saveimg_path, srcimg);


//	static const string kWinName = "Deep learning object detection in OpenCV";
//	namedWindow(kWinName, WINDOW_NORMAL);
//	imshow(kWinName, srcimg);
//	waitKey(5000);
//	destroyAllWindows();
}

After the above process, the inference result on the image is as follows.

It can be seen that the phenomenon of a large box wrapping a small box has disappeared.

Reference article:
Use opencv's dnn module to do yolov5 target detection (CSDN blog, yolov5 to onnx)

Original code address: https://github.com/hpc203/yolov5-v6.1-opencv-onnxrun


Original article: blog.csdn.net/weixin_45994963/article/details/128338286