Deploying a PyTorch Model with TensorRT

When deploying an AI model there are several choices to make: whether it runs on CPU or GPU, and whether to serve it directly with PyTorch or to convert it to TensorRT, which accelerates inference. This post focuses on how to deploy a PyTorch model with TensorRT; first, a summary of the deployment paths we commonly use.

Common deployment paths:

CPU: pytorch -> onnx -> onnxruntime

GPU: pytorch -> onnx -> onnx2trt -> tensorRT

ARM: pytorch -> onnx -> ncnn / mace / mnn, etc.

1. Converting the PyTorch model to ONNX

The conversion procedure is described in detail in another post of mine (https://blog.csdn.net/u012505617/article/details/108770840); for convenience, the conversion code is also included here:

import torch
 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
def model_converter():
    # The .pth file here stores the complete model object, not just a state_dict;
    # map_location makes the load work even if the checkpoint was saved on another device
    model = torch.load('resnet50.pth', map_location=device)
    model.to(device)
    model.eval()
 
    # The dummy input fixes the input shape that gets baked into the ONNX graph
    dummy_input = torch.randn(1, 3, 96, 96, device=device)
    input_names = ['data']
    output_names = ['fc']
    torch.onnx.export(model, dummy_input, 'resnet50.onnx', 
                      export_params=True,  # store the trained weights in the file
                      verbose=True, 
                      input_names=input_names, 
                      output_names=output_names)
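
Before moving on, it is worth sanity-checking the exported file. Below is a minimal sketch using onnx and onnxruntime (the 'data'/'fc' tensor names come from the export call above; the two packages need to be installed separately):

import numpy as np
import onnx
import onnxruntime as ort

onnx_model = onnx.load('resnet50.onnx')
onnx.checker.check_model(onnx_model)  # raises if the graph is structurally invalid

session = ort.InferenceSession('resnet50.onnx')
dummy = np.random.randn(1, 3, 96, 96).astype(np.float32)
outputs = session.run(['fc'], {'data': dummy})
print('onnxruntime output shape:', outputs[0].shape)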

2. Converting the ONNX model to a TensorRT engine

Here we convert with the trtexec tool that ships with the TensorRT-7.0.0.11 installation. Go to the TensorRT-7.0.0.11/bin directory and run:

./trtexec --onnx=resnet50.onnx --saveEngine=resnet50.trt
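
trtexec also exposes precision flags. On GPUs with fast FP16 support, building a half-precision engine usually reduces latency further, at a small cost in numerical accuracy that should be re-validated for your model:

./trtexec --onnx=resnet50.onnx --saveEngine=resnet50_fp16.trt --fp16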

3. Inference with the TensorRT engine

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

import numpy as np
import time
import cv2

TRT_LOGGER = trt.Logger()

def get_img_np_nchw(image):
    # BGR -> RGB and resize to the network input size
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (112, 112))
    # Normalize with ImageNet mean/std
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_np = np.array(image_cv, dtype=float) / 255.
    img_np = (img_np - mean) / std
    # HWC -> CHW, then add the batch dimension (NCHW)
    img_np = img_np.transpose((2, 0, 1))
    img_np_nchw = np.expand_dims(img_np, axis=0)
    return img_np_nchw

class HostDeviceMem(object):
    """Pairs a page-locked host buffer with its device-side counterpart."""
    def __init__(self, host_mem, device_mem):
        super(HostDeviceMem, self).__init__()
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
        

def allocate_buffers(engine):
    """Allocate host/device buffer pairs for every binding of the engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()  # CUDA stream used for async copies and execution
    for binding in engine:
        # Number of elements for this binding, including the (implicit) batch dim
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host buffer plus a matching device allocation
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))

        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def get_engine(engine_file_path=""):
    # Deserialize a previously built engine file from disk
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]  # copy inputs host -> device
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)  # run inference
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]  # copy outputs device -> host
    stream.synchronize()  # wait for all async work on the stream to finish
    return [out.host for out in outputs]

def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs

def landmark_detection(image_path):
    trt_engine_path = './models/landmark_detect_106.trt'

    engine = get_engine(trt_engine_path)
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    image = cv2.imread(image_path)
    image = cv2.resize(image, (112, 112))
    img_np_nchw = get_img_np_nchw(image)
    img_np_nchw = img_np_nchw.astype(dtype=np.float32)

    inputs[0].host = img_np_nchw.reshape(-1)
    t1 = time.time()
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    t2 = time.time()
    print('used time: ', t2-t1)

    # 106 landmarks flattened to 212 values (x, y pairs); for this model the
    # landmarks come out of the second output binding, hence trt_outputs[1]
    shape_of_output = (1, 212)
    landmarks = postprocess_the_outputs(trt_outputs[1], shape_of_output)
    landmarks = landmarks.reshape(landmarks.shape[0], -1, 2)

    # Scale the normalized landmark coordinates back to pixel coordinates
    height, width = image.shape[:2]
    pred_landmark = landmarks[0] * [height, width]

    for (x, y) in pred_landmark.astype(np.int32):
        cv2.circle(image, (x, y), 1, (0, 255, 255), -1)

    cv2.imshow('landmarks', image)
    cv2.waitKey(0)

    return pred_landmark

if __name__ == '__main__':
    image_path = './images/3766_20190805_12_10.png'
    landmarks = landmark_detection(image_path)
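
One caveat about the script above: it assumes the landmarks come out of the second output binding (trt_outputs[1]), which is specific to this model. If you are unsure about the binding layout of an engine, the bindings can be listed with standard TensorRT 7 API calls; a minimal sketch:

engine = get_engine('./models/landmark_detect_106.trt')
for i in range(engine.num_bindings):
    kind = 'input' if engine.binding_is_input(i) else 'output'
    print(i, kind,
          engine.get_binding_name(i),
          engine.get_binding_shape(i),
          trt.nptype(engine.get_binding_dtype(i)))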

4. Performance comparison

PyTorch: 0.00593 s

TensorRT: 0.00109 s

So on this model, TensorRT is more than 5x faster than PyTorch.
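
Note that single-shot timings like these can be noisy, and the first call after engine creation may include one-off warm-up costs. A more reliable comparison averages many runs after a warm-up phase; a sketch reusing do_inference from above (the run counts are arbitrary choices for illustration):

N_WARMUP, N_RUNS = 10, 100  # arbitrary run counts

for _ in range(N_WARMUP):
    do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

t0 = time.time()
for _ in range(N_RUNS):
    do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
print('average latency: %.5f s' % ((time.time() - t0) / N_RUNS))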

Reference:

https://blog.csdn.net/qq_37546267/article/details/106767640
