yolov5face TensorRT deployment

I. Environment

My environment: Ubuntu 18.04, kernel 5.4, CUDA 11.2, RTX 3050, CMake 3.22, no cuDNN installed.

II. ONNX deployment

1. Installation

In a terminal, inside your virtual environment, run:

pip install onnx
pip install onnxruntime-gpu  (only needed if you run inference with ONNX Runtime)
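To confirm the GPU build is actually being used, a quick check (a sketch; run it in the same virtual environment):

import onnxruntime as ort
print(ort.get_device())                # 'GPU' if onnxruntime-gpu is working
print(ort.get_available_providers())   # should include 'CUDAExecutionProvider'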

2. Run export.py in the models folder to produce the .onnx file

Note that you must pass an img_size when running it; it has to match the img_size you later use for detection, and it apparently must be a multiple of 64.

After the changes, inference runs normally, but the output contains no detection boxes.

Cause: after some debugging, it turns out that the pred tensor from PyTorch is 1×30240×30, where 30 corresponds to (4 xywh + 1 confidence + 8 landmark coordinates + 17 classes), while the pred output by ONNX is 1×3×96×96×30. The point is that some post-processing still has to run after ONNX Runtime finishes, and export.py does not provide it. That post-processing is integrated into main.py in the torch2tensorrt folder, so the next step is to convert the model straight to TensorRT.
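A quick way to inspect the raw ONNX outputs with ONNX Runtime (a minimal sketch; the model filename follows the one used later in this post, and the all-zero input only probes the shapes):

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("2000+3000.onnx", providers=ort.get_available_providers())
inp = sess.get_inputs()[0]
print(inp.name, inp.shape)  # the input size is fixed at export time
# build a dummy input matching the declared shape (assume 1 for any dynamic dim)
dummy = np.zeros([d if isinstance(d, int) else 1 for d in inp.shape], dtype=np.float32)
for out in sess.run(None, {inp.name: dummy}):
    print(out.shape)  # raw head outputs such as (1, 3, 96, 96, 30): still undecoded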

P.S. If you comment out the line model.model[-1].export = True  # set Detect() layer export=True, the convolution output is correct, but the model can no longer be converted to ONNX.

III. TensorRT deployment

1. Install cuDNN

Reference: "Three ways to install CUDA and cuDNN on Ubuntu 16.04 (all personally tested)" — 隔壁老王's CSDN blog.

P.S. Read the first comment under that post before following it.

2. Download TensorRT

https://developer.nvidia.cn/nvidia-tensorrt-7x-download

Download the .tar package.


3. Install TensorRT

Reference: "TensorRT deployment of yolov5 [tensorrt + cudnn on the host]" — epic_Lin's CSDN blog.

After installing by that blogger's steps, running import tensorrt fails with:

libcudnn.so.8: cannot open shared object file:....

Fix: adjust the environment variables.

In a terminal, run gedit ~/.bashrc

Comment out the line added earlier:

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/TensorRT-7.2.2.3/lib

and add:
export LD_LIBRARY_PATH="/opt/TensorRT-7.2.2.3/lib:$LD_LIBRARY_PATH"
export PATH="/usr/local/cuda-11.1/bin:$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-11.1/lib64:$LD_LIBRARY_PATH"

Save and close (P.S. remember to substitute your own versions).

Then run source ~/.bashrc in the terminal.
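To verify the fix, the import that failed before should now print a version instead of raising the libcudnn error:

import tensorrt
print(tensorrt.__version__)  # e.g. 7.2.2.3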

4. PyCharm error

After the steps above, the script runs from the command line, but running it in PyCharm still raises the original error.

Cause: the newly configured path was not added to PyCharm's environment variables.

Fix:

In PyCharm, open the run configuration (top right: Edit Configurations) --> Environment --> Environment variables,

and add LD_LIBRARY_PATH there; just copy in the /usr/local/cuda-11.1/lib64 value from the LD_LIBRARY_PATH set above.
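A quick sanity check from inside PyCharm (a sketch) confirms the variable actually reaches the interpreter:

import os
print(os.environ.get("LD_LIBRARY_PATH", "<not set>"))  # should contain /usr/local/cuda-11.1/lib64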

Reference: "TensorFlow runs in the terminal and in PyCharm launched from a terminal, but not in PyCharm launched directly" — Youpeng, 博客园 (cnblogs).

5. Program gives no feedback

Execution gets stuck at the step: Building an engine from file '/home/luoxinhao/桌面/yolov5-face-master-lxh/yoloface训练/torch2tensorrt/2000+3000esay.onnx' this may take a while...

Fix: wait about ten minutes (it really is just slow...).

6. Modify the main.py code

The original code:

import os
import sys
import cv2
import copy
import torch
root_path=os.path.dirname(os.path.abspath(os.path.dirname(__file__))) # project root: the parent of the current file's directory
sys.path.append(root_path)  # add the project root to sys.path
from utils.general import check_img_size,non_max_suppression_face,scale_coords,xyxy2xywh
from utils.datasets import letterbox
from torch2tensorrt.yolo_trt_model import YoloTrtModel
from detect_face import scale_coords_landmarks,show_results
cur_path=os.path.abspath(os.path.dirname(__file__))
def img_process(img_path,long_side=320,stride_max=32):
    '''
    Image preprocessing
    '''
    orgimg=cv2.imread(img_path)
    img0 = copy.deepcopy(orgimg)
    h0, w0 = orgimg.shape[:2]  # orig hw
    r = long_side/ max(h0, w0)  # resize image to img_size
    if r != 1:  # always resize down, only resize up if training with augmentation
        interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
        img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)

    imgsz = check_img_size(long_side, s=stride_max)  # check img_size

    img = letterbox(img0, new_shape=imgsz,auto=False)[0] # auto=True: minimal rectangle; False: fixed size
    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, to 3x416x416
    img = torch.from_numpy(img)
    img = img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    return img,orgimg

def img_vis(img,orgimg,pred,device,vis_thres = 0.6):
    '''
    Visualize predictions
    vis_thres: visualization threshold
    '''

    print('img.shape: ', img.shape)
    print('orgimg.shape: ', orgimg.shape)

    no_vis_nums=0
    # Process detections
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]].to(device)  # normalization gain whwh
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]].to(device)  # normalization gain landmarks
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()

            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class

            det[:, 5:15] = scale_coords_landmarks(img.shape[2:], det[:, 5:15], orgimg.shape).round()

            for j in range(det.size()[0]):
                
                
                if det[j, 4].cpu().numpy() < vis_thres:
                    no_vis_nums+=1
                    continue

                xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
                conf = det[j, 4].cpu().numpy()
                landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(-1).tolist()
                class_num = det[j, 15].cpu().numpy()
                orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)

    cv2.imwrite(cur_path+'/result.jpg', orgimg)
    print('result save in '+cur_path+'/result.jpg')


if __name__ == '__main__':
    # ============ Parameters ============
    img_path=cur_path+"/sample.jpg" # test image path
    device="cuda:0" 
    onnx_model_path=cur_path+"/2000+3000.onnx" # ONNX model path
    fp16_mode=True  # True enables FP16 inference

    # ============ Image preprocessing ============
    img,orgimg=img_process(img_path) #[1,3,320,320]
    
    # ============ TensorRT inference ============
    # initialize the TensorRT engine
    yolo_trt_model=YoloTrtModel(device,onnx_model_path,fp16_mode)

    # timing = TensorRT inference + torch post-processing
    pred=yolo_trt_model(img.cpu().numpy()) # TensorRT inference
    pred=yolo_trt_model.after_process(pred,device) # torch post-processing

    # Apply NMS
    pred = non_max_suppression_face(pred, conf_thres=0.3, iou_thres=0.5)
   
    # ============ Visualization ============
    img_vis(img,orgimg,pred,device)


After the changes:

import os
import sys
import cv2
import copy
import torch
root_path=os.path.dirname(os.path.abspath(os.path.dirname(__file__))) # project root: the parent of the current file's directory
sys.path.append(root_path)  # add the project root to sys.path
from utils.general import check_img_size,non_max_suppression_face,scale_coords,xyxy2xywh
from utils.datasets import letterbox
from torch2tensorrt.yolo_trt_model import YoloTrtModel
from detect_face import scale_coords_landmarks,show_results
cur_path=os.path.abspath(os.path.dirname(__file__))
def img_process(img_path,long_side=640,stride_max=32):
    '''
    Image preprocessing
    '''
    orgimg=cv2.imread(img_path)
    img0 = copy.deepcopy(orgimg)
    h0, w0 = orgimg.shape[:2]  # orig hw
    r = long_side/ max(h0, w0)  # resize image to img_size
    if r != 1:  # always resize down, only resize up if training with augmentation
        interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
        img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)

    imgsz = check_img_size(long_side, s=stride_max)  # check img_size

    img = letterbox(img0, new_shape=imgsz,auto=False)[0] # auto=True: minimal rectangle; False: fixed size
    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, to 3x416x416
    img = torch.from_numpy(img)
    img = img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    return img,orgimg

def img_vis(img,orgimg,pred,device,vis_thres = 0.6):
    '''
    Visualize predictions
    vis_thres: visualization threshold
    '''

    print('img.shape: ', img.shape)
    print('orgimg.shape: ', orgimg.shape)

    no_vis_nums=0
    # Process detections
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]].to(device)  # normalization gain whwh
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0]].to(device)  # normalization gain landmarks (two entries removed)
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()

            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class

            det[:, 5:13] = scale_coords_landmarks(img.shape[2:], det[:, 5:13], orgimg.shape).round()   # 15 changed to 13

            for j in range(det.size()[0]):
                
                
                if det[j, 4].cpu().numpy() < vis_thres:
                    no_vis_nums+=1
                    continue

                xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
                conf = det[j, 4].cpu().numpy()
                landmarks = (det[j, 5:13].view(1, 8) / gn_lks).view(-1).tolist()   # 15 changed to 13, 10 changed to 8
                class_num = det[j, 13].cpu().numpy()             # 15 changed to 13
                orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)

    cv2.imwrite(cur_path+'/result.jpg', orgimg)
    print('result save in '+cur_path+'/result.jpg')


if __name__ == '__main__':
    # ============ Parameters ============
    img_path=cur_path+"/sample.jpeg" # test image path
    device="cuda:0" 
    onnx_model_path=cur_path+"/2000+-3000.onnx" # ONNX model path
    fp16_mode=True  # True enables FP16 inference

    # ============ Image preprocessing ============
    img,orgimg=img_process(img_path) #[1,3,640,640]
    
    # ============ TensorRT inference ============
    # initialize the TensorRT engine
    yolo_trt_model=YoloTrtModel(device,onnx_model_path,fp16_mode)

    # timing = TensorRT inference + torch post-processing
    pred=yolo_trt_model(img.cpu().numpy()) # TensorRT inference
    pred=yolo_trt_model.after_process(pred,device) # torch post-processing

    # Apply NMS
    pred = non_max_suppression_face(pred, conf_thres=0.3, iou_thres=0.5)
   
    # ============ Visualization ============
    img_vis(img,orgimg,pred,device)


7. Running the TensorRT engine

First, replace the ONNX file path with the .trt file's path.

Then open YoloTrtModel and comment out the conversion code (see the sketch below).

P.S. Mind that the image size matches.
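For reference, the change inside YoloTrtModel.__init__ looks roughly like this (a sketch pieced together from the Init_TensorRT and ONNX_to_TensorRT calls shown later in this post):

trt_engine_path = onnx_model_path.replace('.onnx','.trt')
# one-off ONNX --> TRT conversion, now commented out:
# ONNX_to_TensorRT(fp16_mode=fp16_mode, onnx_model_path=onnx_model_path, trt_engine_path=trt_engine_path)
self.model_params = Init_TensorRT(trt_engine_path)  # load the prebuilt .trt engine directly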

7.1 Error

pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument

Cause: the input image size does not match the model's (the size is fixed when converting to ONNX, so the TRT engine is fixed too; the only option is to change the input image size).
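A minimal guard before the copy to the GPU makes this failure mode obvious (a sketch; 640×640 is an assumption, substitute the size your ONNX model was exported at):

expected = (1, 3, 640, 640)  # must equal the export-time input size
assert tuple(img.shape) == expected, f"input {tuple(img.shape)} != engine input {expected}"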

7.2 Error

ValueError: cannot reshape array of size 576000 into shape (1,3,80,80,16)

Cause: since I changed the landmarks to 4 points and the number of classes to 17, the output feature shapes have to be rewritten, with 16 becoming 30 (xywh + confidence + landmarks + classes). The original lines:

        # output feature shapes
        self.stride8_shape=(1,3,80,80,16)
        self.stride16_shape=(1,3,40,40,16)
        self.stride32_shape=(1,3,20,20,16)
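With 4 landmarks (8 coordinates) and 17 classes, no = 4 + 1 + 8 + 17 = 30, and 1×3×80×80×30 = 576000, which matches the array size in the error above. For a 640×640 input the corrected shapes would therefore be (a sketch; the generalized 13+cls version appears in section 8):

        # output feature shapes
        self.stride8_shape=(1,3,80,80,30)
        self.stride16_shape=(1,3,40,40,30)
        self.stride32_shape=(1,3,20,20,30)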

7.3 Runs normally, but the results are wrong

Fix:

Modify the function

def after_process(self,pred,device):

Original code:

    def after_process(self,pred,device):
        '''
        PyTorch post-processing
        pred: TensorRT output
        device: "cuda:0"
        '''

        # downsampling factors of 8, 16, and 32
        stride= torch.tensor([8.,16.,32.]).to(device)

        x=[torch.from_numpy(pred[0]).to(device),torch.from_numpy(pred[1]).to(device),torch.from_numpy(pred[2]).to(device)]
        # ===== extracted from models/yolo.py =====
        no=16 # 4 box coords + 1 confidence + 10 landmark coords + 1 class
        nl=3
     
        grid=[torch.zeros(1).to(device)] * nl 

        anchor_grid=torch.tensor([[[[[[  4.,   5.]]],
            [[[  8.,  10.]]],
            [[[ 13.,  16.]]]]],
            [[[[[ 23.,  29.]]],
            [[[ 43.,  55.]]],
            [[[ 73., 105.]]]]],
            [[[[[146., 217.]]],
            [[[231., 300.]]],
            [[[335., 433.]]]]]]).to(device)
       
        
        z = [] 
        for i in range(len(x)):
        
            bs,ny, nx = x[i].shape[0],x[i].shape[2] ,x[i].shape[3] 
            if grid[i].shape[2:4] != x[i].shape[2:4]:
                grid[i] = self._make_grid(nx, ny).to(x[i].device)
            y = torch.full_like(x[i], 0)
            y[..., [0,1,2,3,4,15]] = x[i][..., [0,1,2,3,4,15]].sigmoid()
            y[..., 5:15] = x[i][..., 5:15]
            #y = x[i].sigmoid()

            y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + grid[i].to(x[i].device)) * stride[i]  # xy
            y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor_grid[i]  # wh

            #y[..., 5:15] = y[..., 5:15] * 8 - 4
            y[..., 5:7]   = y[..., 5:7] *   anchor_grid[i] + grid[i].to(x[i].device) * stride[i] # landmark x1 y1
            y[..., 7:9]   = y[..., 7:9] *   anchor_grid[i] + grid[i].to(x[i].device) * stride[i]# landmark x2 y2
            y[..., 9:11]  = y[..., 9:11] *  anchor_grid[i] + grid[i].to(x[i].device) * stride[i]# landmark x3 y3
            y[..., 11:13] = y[..., 11:13] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]# landmark x4 y4
            y[..., 13:15] = y[..., 13:15] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]# landmark x5 y5

            #y[..., 5:7] = (y[..., 5:7] * 2 -1) * anchor_grid[i]  # landmark x1 y1
            #y[..., 7:9] = (y[..., 7:9] * 2 -1) * anchor_grid[i]  # landmark x2 y2
            #y[..., 9:11] = (y[..., 9:11] * 2 -1) * anchor_grid[i]  # landmark x3 y3
            #y[..., 11:13] = (y[..., 11:13] * 2 -1) * anchor_grid[i]  # landmark x4 y4
            #y[..., 13:15] = (y[..., 13:15] * 2 -1) * anchor_grid[i]  # landmark x5 y5

            z.append(y.view(bs, -1, no))
        return torch.cat(z, 1)

Modified version:

    def after_process(self,pred,device):
        '''
        PyTorch post-processing
        pred: TensorRT output
        device: "cuda:0"
        '''

        # downsampling factors of 8, 16, and 32
        stride= torch.tensor([8.,16.,32.]).to(device)

        x=[torch.from_numpy(pred[0]).to(device),torch.from_numpy(pred[1]).to(device),torch.from_numpy(pred[2]).to(device)]
        # ===== extracted from models/yolo.py =====
        no=30 # 4 box coords + 1 confidence + 8 landmark coords + 17 classes
        nc=17
        nl=3
     
        grid=[torch.zeros(1).to(device)] * nl 

        anchor_grid=torch.tensor([[[[[[  4.,   5.]]],
            [[[  8.,  10.]]],
            [[[ 13.,  16.]]]]],
            [[[[[ 23.,  29.]]],
            [[[ 43.,  55.]]],
            [[[ 73., 105.]]]]],
            [[[[[146., 217.]]],
            [[[231., 300.]]],
            [[[335., 433.]]]]]]).to(device)
       
        
        z = [] 
        for i in range(len(x)):   # x is a list; this iterates over its elements
        
            bs,ny, nx = x[i].shape[0],x[i].shape[2] ,x[i].shape[3] 
            if grid[i].shape[2:4] != x[i].shape[2:4]:
                grid[i] = self._make_grid(nx, ny).to(x[i].device)
            y = torch.full_like(x[i], 0)
            class_range = list(range(5)) + list(range(13, 13 + nc))
            y[..., class_range] = x[i][..., class_range].sigmoid()  # apply sigmoid to everything except the landmark values
            y[..., 5:13] = x[i][..., 5:13]
            #y = x[i].sigmoid()

            y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + grid[i].to(x[i].device)) * stride[i]  # xy
            y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor_grid[i]  # wh

            #y[..., 5:15] = y[..., 5:15] * 8 - 4
            y[..., 5:7]   = y[..., 5:7] *   anchor_grid[i] + grid[i].to(x[i].device) * stride[i] # landmark x1 y1
            y[..., 7:9]   = y[..., 7:9] *   anchor_grid[i] + grid[i].to(x[i].device) * stride[i]# landmark x2 y2
            y[..., 9:11]  = y[..., 9:11] *  anchor_grid[i] + grid[i].to(x[i].device) * stride[i]# landmark x3 y3
            y[..., 11:13] = y[..., 11:13] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]# landmark x4 y4
            # y[..., 13:15] = y[..., 13:15] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]# landmark x5 y5

            #y[..., 5:7] = (y[..., 5:7] * 2 -1) * anchor_grid[i]  # landmark x1 y1
            #y[..., 7:9] = (y[..., 7:9] * 2 -1) * anchor_grid[i]  # landmark x2 y2
            #y[..., 9:11] = (y[..., 9:11] * 2 -1) * anchor_grid[i]  # landmark x3 y3
            #y[..., 11:13] = (y[..., 11:13] * 2 -1) * anchor_grid[i]  # landmark x4 y4
            #y[..., 13:15] = (y[..., 13:15] * 2 -1) * anchor_grid[i]  # landmark x5 y5

            z.append(y.view(bs, -1, no))
        return torch.cat(z, 1)
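As a sanity check (assuming a 640×640 input), the concatenated prediction should come out as 3×(80×80 + 40×40 + 20×20) = 25200 rows of 30 values each:

# expected shape of the concatenated output for a 640x640 input (sketch)
assert tuple(pred.shape) == (1, 25200, 30)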

7.4 Timing of each stage (seconds)

Engine initialization: 1.2982511520385742
TensorRT inference: 0.005248308181762695
torch post-processing: 0.0032684803009033203
NMS: 0.0015017986297607422

As you can see, the torch post-processing and non-maximum suppression still run in PyTorch rather than TensorRT, which is why these two stages take a relatively large share of the time.

8. Rewriting main.py

Because main.py bundles the ONNX-to-TensorRT conversion together with TensorRT inference, offers no way to adjust certain parameters, and cannot process video, I decided to rewrite main.py as a new file named trtdetect.py.

Problem encountered: cv2.namedWindow() hangs and no image is shown.

Cause: the OpenCV environment is misconfigured; Qt is not installed.

Fix: search for qt in PyCharm's interpreter package list and install it from there (pip install qt fails in the virtual environment).

After the changes, trtdetect.py looks like this:

import os
import sys
import cv2
import copy
import torch
import argparse

root_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))  # project root: the parent of the current file's directory
sys.path.append(root_path)  # add the project root to sys.path
from utils.general import check_img_size, non_max_suppression_face, scale_coords, xyxy2xywh
from utils.datasets import letterbox
from torch2tensorrt.yolo_trt_model import YoloTrtModel
from detect_face import scale_coords_landmarks, show_results

cur_path = os.path.abspath(os.path.dirname(__file__))
from utils.torch_utils import time_synchronized


def img_process(img_path, img1=str(0),long_side=640, stride_max=32):
    '''
    Image preprocessing.
    If the input is an image path, img_path is the path and img1 is "0".
    If the input is video, img_path does not matter and img1 is the raw video frame.
    No matter what the input video or image path looks like, the detection output is always 640*640.
    It turns out the last two parameters are unused: the input is always the original image and the output always matches the ONNX size.
    '''
    if img1!= str(0):   # if the input is video, i.e. the source passed in is 0 (default is none)
        orgimg=img1     # use the video frame as orgimg
    else:               # otherwise, the input is an image path
        orgimg = cv2.imread(img_path)
    img0 = copy.deepcopy(orgimg)
    # h0, w0 = orgimg.shape[:2]  # orig hw, read directly
    # r = long_side / max(h0, w0)  # resize image to img_size
    # if r != 1:  # always resize down, only resize up if training with augmentation
    #     interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
    #     img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)
    #
    # imgsz = check_img_size(long_side, s=stride_max)  # check img_size

    img = letterbox(img0, new_shape=opt.trt_size, auto=False)[0]  # auto=True: minimal rectangle scaled to the target; False: fixed size. img0 is the unscaled image; img is the rectangle obtained by scaling along the long side
    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, to 3x416x416
    img = torch.from_numpy(img)
    img = img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    return img, orgimg




def img_vis(img, orgimg, pred, device, vis_thres=0.6,save_jpg=True):
    '''
    Visualize predictions.
    vis_thres: visualization threshold.
    img is only passed in for the print below; it serves no other purpose here.
    By default images are saved; video frames are shown with imshow and not saved.
    '''

    print('         img.shape: ', img.shape)
    print('         orgimg.shape: ', orgimg.shape)

    no_vis_nums = 0
    # Process detections
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]].to(device)  # normalization gain whwh
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0]].to(device)  # normalization gain landmarks (two entries removed)
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()

            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class

            det[:, 5:13] = scale_coords_landmarks(img.shape[2:], det[:, 5:13], orgimg.shape).round()  # 15 changed to 13

            for j in range(det.size()[0]):

                if det[j, 4].cpu().numpy() < vis_thres:
                    no_vis_nums += 1
                    continue

                xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
                conf = det[j, 4].cpu().numpy()
                landmarks = (det[j, 5:13].view(1, 8) / gn_lks).view(-1).tolist()  # 15 changed to 13, 10 changed to 8
                class_num = det[j, 13].cpu().numpy()  # 15 changed to 13
                orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
    if save_jpg == True:
        cv2.imwrite(cur_path + '/result.jpg', orgimg)
        print('result save in ' + cur_path + '/result.jpg')


if __name__ == '__main__':
    # ============ Parameters ============
    parser = argparse.ArgumentParser()   # all paths are rooted at the yoloface训练 project directory
    parser.add_argument('--weights', type=str, default= 'weights/2000+3000.trt', help='weights path')  # from yolov5/models/
    parser.add_argument('--trt_size', nargs='+', type=int, default=[640, 640], help='must match the onnx model; input 2 numbers')  # used by the letterbox call; must match the ONNX model
    parser.add_argument('--source', type=str, default='data/images/1.jpeg', help='file/dir/URL/glob, 0 for webcam') # input source; 0, 1, 2 or 200 is treated as a video stream
    parser.add_argument('--cls', type=int, default=17, help='class numbers')  # used when initializing the engine; must match the ONNX model
    parser.add_argument('--conf_thres', type=float, default=0.25, help='confidence threshold')  # used in NMS
    parser.add_argument('--iou_thres', type=float, default=0.45, help='NMS IoU threshold') # used in NMS
    opt = parser.parse_args()
    if len(opt.trt_size)!=2:
        print("error:--trt_size must imput 2 numbers")
        exit()


    # ============ Parameter handling ============
    device = "cuda:0"
    fp16_mode = True  # True enables FP16 inference
    onnx_model_path = root_path + "/" + opt.weights  # weights path (here the .trt engine)
    # print(opt.img_size)

    # ============ Initialize the TensorRT engine ============
    t0 = time_synchronized()  # timing
    yolo_trt_model = YoloTrtModel(device, onnx_model_path, fp16_mode, opt.cls,opt.trt_size[0],opt.trt_size[1])
    t = time_synchronized()  # timing
    print("Engine initialization:", t - t0)

    # ============ Video or image? ============
    if (opt.source == str(0) or opt.source == str(1) or opt.source == str(2) or opt.source == str(200)):
        capture = cv2.VideoCapture(int(opt.source))
        while (True):
            t1 = time_synchronized()
            ret, frame = capture.read()  # ret is the success flag; frame is the current video frame

            img, orgimg = img_process(opt.source,frame)  # frame preprocessing
            t2 = time_synchronized() # preprocessing time

            pred = yolo_trt_model(img.cpu().numpy())  # TensorRT inference
            t3 = time_synchronized() # TensorRT inference time

            pred = yolo_trt_model.after_process(pred, device)  # torch post-processing
            t4 = time_synchronized() # torch post-processing time

            pred = non_max_suppression_face(pred, conf_thres=opt.conf_thres, iou_thres=opt.iou_thres) # non-maximum suppression
            t5 = time_synchronized() # NMS time

            img_vis(img, orgimg, pred, device,save_jpg=False) # visualization
            cv2.imshow("video", orgimg)  # show the annotated frame
            c = cv2.waitKey(1)
            if c == 27:  # press Esc to exit
                break
            t6 = time_synchronized()  # visualization time

            print("视频流读取:", t2 - t1)
            print("tensorrt推理:", t3 - t2)
            print("torch后处理:", t4 - t3)
            print("非极大值抑制:", t5 - t4)
            print("可视化:", t6 - t5)
            print("总耗时", t6 - t1)

    else:
        img_path = root_path + "/" + opt.source  # image path
        t1 = time_synchronized()

        img, orgimg = img_process(img_path)  # image preprocessing
        t2 = time_synchronized()  # preprocessing time

        pred = yolo_trt_model(img.cpu().numpy())  # TensorRT inference
        t3 = time_synchronized()  # TensorRT inference time

        pred = yolo_trt_model.after_process(pred, device)  # torch post-processing
        t4 = time_synchronized()  # torch post-processing time

        pred = non_max_suppression_face(pred, conf_thres=opt.conf_thres, iou_thres=opt.iou_thres) # non-maximum suppression
        t5 = time_synchronized() # NMS time

        img_vis(img, orgimg, pred, device)
        t6 = time_synchronized() # visualization time

        print("图像预处理:", t2 - t1)
        print("tensorrt推理:", t3 - t2)
        print("torch后处理:", t4 - t3)
        print("非极大值抑制:", t5 - t4)
        print("图像可视化:", t6 - t5)

At the same time, the onnx-->trt conversion statement in YoloTrtModel() is commented out, with the following changes:

    def __init__(self,device_id="cuda:0",onnx_model_path=None,fp16_mode=False,cls=17,x=640,y=640):
        '''
        device_id: "cuda:0"
        onnx_model_path: path to the ONNX model
        output_size: output size, e.g. (1,-1)
        fp16_mode: True enables FP16 inference
        cls: number of classes
        x,y: image width and height
        '''

        trt_engine_path = onnx_model_path.replace('.onnx','.trt')

        # initialize TensorRT and load the .trt engine file
        self.model_params=Init_TensorRT(trt_engine_path)
        self.cls=cls
        # output feature shapes
        self.stride8_shape=(1,3,x//8,y//8,13+cls)
        self.stride16_shape=(1,3,x//16,y//16,13+cls)
        self.stride32_shape=(1,3,x//32,y//32,13+cls)

The main function then keeps only the onnx-->trt conversion, as follows:

if __name__ == '__main__':

    # ============ Parameters ============
    # img_path=cur_path+"/sample.jpeg" # test image path
    # device="cuda:0"
    onnx_model_path=cur_path+"/2000+3000.onnx" # ONNX model path
    fp16_mode=True  # True enables FP16 inference
    trt_engine_path = onnx_model_path.replace('.onnx', '.trt')
    ONNX_to_TensorRT(fp16_mode=fp16_mode, onnx_model_path=onnx_model_path, trt_engine_path=trt_engine_path)
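Running this file once writes 2000+3000.trt next to the ONNX model (via the .onnx --> .trt replace above); trtdetect.py then loads that engine through --weights.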


Reprinted from blog.csdn.net/m0_58348465/article/details/122375970