Load multiple models simultaneously on one or more GPUs using TensorRT on Win10 with C++

Before reading this article, please read my earlier tutorial on deploying yolov5 with TensorRT on Win10; it will make this article much easier to follow.


When working on a deep-learning object detection project, you sometimes face requirements like those in the following table:

Algorithm FPS requirement    ≥ 50
Operating system             Win10
Development language         C++
Model loading method         One GPU loads multiple models;
                             multiple GPUs each load a single model

The rest of this post walks through how to meet these two requirements in turn.


  One GPU loads multiple models

To have one GPU load multiple models, we essentially load different engine files in sequence. That means we need two sets of CUDA stream and buffer variables. A clean way to do this is to encapsulate the stream and buffer variables in a class, create two instances of that class, and call the corresponding API on each instance to load multiple models on one GPU. Let's take loading yolov5 weight engines as the example. First, we encapsulate a class:

  • The implementation of the .h file looks like this:
    class True_interface
    {
    public:
        True_interface();
        virtual void load_model(int, std::string);   // API: load a model onto the given GPU
        virtual void release_buffer();                // API: release buffers and pointer variables
        virtual void interface(cv::Mat, float);       // API: external inference entry point (input image and confidence threshold)
        void doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output, int batchSize); // API: forward inference

        const int INPUT_H = Yolo::INPUT_H;
        const int INPUT_W = Yolo::INPUT_W;
        const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;
        float data[BATCH_SIZE * 3 * Yolo::INPUT_H * Yolo::INPUT_W];
        float prob[BATCH_SIZE * (Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1)];
        std::vector<std::vector<int>> result;         // detection results: {x1, y1, x2, y2, class_id, confidence(%)}
        ICudaEngine* engine;
        IExecutionContext* context;                   // execution context; holds workspace for intermediate activations
        void* buffers[2];                             // pointers to the input and output buffers on the GPU
        cudaStream_t stream;                          // CUDA stream
        int inputIndex;
        int outputIndex;
        IRuntime* runtime;                            // TensorRT runtime
        const char* INPUT_BLOB_NAME = "data";
        const char* OUTPUT_BLOB_NAME = "prob";
        Logger gLogger;
    };
  • The API implementations in the .cpp file are as follows. In fact, only a handful of lines do the real work:
    • load_model():
      void True_interface::load_model(int gpu_id, std::string model_Name_)
      {
          if ((gpu_id == 0) || (gpu_id == 1)) // select the GPU device to load onto
          {
              cudaError_t cudaError = cudaSetDevice(gpu_id);
              if (cudaError == cudaSuccess)
                  std::cout << "GPU device selected successfully!" << std::endl;
              else
                  std::cout << "GPU device selection failed!" << std::endl;
          }
          else
          {
              std::cerr << "GPU device assignment error!" << std::endl;
              return;
          }
          std::string engine_name;
          if (model_Name_ == "model_one")
          {
              engine_name = std::string("model_one.engine"); // engine (weight) file path
          }
          else if (model_Name_ == "model_two")
          {
              engine_name = std::string("model_two.engine"); // engine (weight) file path
          }
          else
          {
              std::cout << "Please enter the correct detector type!" << std::endl;
              return;
          }
          std::ifstream file(engine_name, std::ios::binary);
          if (!file.good())
          {
              std::cerr << "read " << engine_name << " error!" << std::endl;
              return;
          }
          char* trtModelStream = nullptr;
          size_t size = 0;
          file.seekg(0, file.end);
          size = file.tellg();
          file.seekg(0, file.beg);
          trtModelStream = new char[size];
          file.read(trtModelStream, size);
          file.close();

          runtime = createInferRuntime(gLogger);                         // create the runtime
          engine = runtime->deserializeCudaEngine(trtModelStream, size); // deserialize the engine
          context = engine->createExecutionContext();                    // create an execution context (workspace for intermediate activations)
          delete[] trtModelStream;
          inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);   // look up the input binding index by blob name
          outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME); // look up the output binding index by blob name
          CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float))); // allocate GPU memory for the input
          CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));          // allocate GPU memory for the output
          CUDA_CHECK(cudaStreamCreate(&stream)); // create the CUDA stream

          if (model_Name_ == "model_one")
          {
              std::cout << "model_one loaded successfully!" << std::endl;
          }
          else if (model_Name_ == "model_two")
          {
              std::cout << "model_two loaded successfully!" << std::endl;
          }
      }
    • doInference():
      void True_interface::doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output, int batchSize)
      {
          // --------------------------- copy the input data from host (CPU) to device (GPU) ---------------------------
          CUDA_CHECK(cudaMemcpyAsync(buffers[0],                                                  // device buffer that receives the input data
                                     input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float),    // input data in host memory
                                     cudaMemcpyHostToDevice, stream));
          context.enqueue(batchSize, buffers, stream, nullptr); // asynchronous inference
          // --------------------------- copy the output data from device (GPU) back to host (CPU) ---------------------------
          CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
          cudaStreamSynchronize(stream); // synchronize the CUDA stream
      }
    •  interface():
      void True_interface::interface(cv::Mat img, float conf_thresh)
      {
          // --------------------------- clear the result vector ---------------------------
          result.clear();
          // --------------------------- image preprocessing ---------------------------
          cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox-resize the image
          int i = 0;
          for (int row = 0; row < INPUT_H; ++row)
          {
              uchar* uc_pixel = pr_img.data + row * pr_img.step;
              for (int col = 0; col < INPUT_W; ++col)
              {
                  // BGR -> RGB, normalize to [0, 1], and store in planar (CHW) order
                  data[i] = (float)uc_pixel[2] / 255.0;
                  data[i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
                  data[i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
                  uc_pixel += 3;
                  ++i;
              }
          }
          // --------------------------- forward inference ---------------------------
          doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
          // --------------------------- non-maximum suppression ---------------------------
          std::vector<std::vector<Yolo::Detection>> batch_res(1);
          auto& res = batch_res[0];
          nms(res, &prob[0], conf_thresh, NMS_THRESH);
          // --------------------------- collect object coordinates and confidences ---------------------------
          for (size_t j = 0; j < res.size(); j++)
          {
              cv::Rect r = get_rect(img, res[j].bbox);
              std::vector<int> temp = { r.tl().x, r.tl().y, r.br().x, r.br().y, (int)res[j].class_id, int(100 * res[j].conf) };
              result.push_back(temp);
          }
      }
    •  release_buffer():
      void True_interface::release_buffer()
      {
          // release CUDA resources
          cudaStreamDestroy(stream);
          CUDA_CHECK(cudaFree(buffers[inputIndex]));
          CUDA_CHECK(cudaFree(buffers[outputIndex]));
          // destroy the TensorRT objects
          context->destroy();
          engine->destroy();
          runtime->destroy();
          std::cout << "Memory freed successfully!" << std::endl;
      }
  •  Then, we only need to create two instances of this class and call the load_model() method on each to load two models on one GPU.

If you want to load the models on different GPUs instead, you only need to pass the ID of a different GPU as the first argument when calling load_model(). Isn't that convenient and simple? A minimal usage sketch follows.
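The sketch below drives both scenarios from a single main(). It is only an illustration built on the True_interface class above; the header file name, the test image test.jpg, and the 0.5 confidence threshold are assumptions you would replace with your own setup.

    #include <opencv2/opencv.hpp>
    #include "true_interface.h" // hypothetical header containing the True_interface class shown above

    int main()
    {
        cv::Mat img = cv::imread("test.jpg"); // hypothetical test image

        // Scenario 1: one GPU (id 0) loads two models; each instance owns its own
        // engine, execution context, CUDA stream and GPU buffers.
        True_interface detector_one, detector_two;
        detector_one.load_model(0, "model_one");
        detector_two.load_model(0, "model_two");
        detector_one.interface(img, 0.5f); // detections end up in detector_one.result
        detector_two.interface(img, 0.5f); // detections end up in detector_two.result
        detector_one.release_buffer();
        detector_two.release_buffer();

        // Scenario 2: two GPUs each load one model; only the first argument changes.
        True_interface detector_gpu0, detector_gpu1;
        detector_gpu0.load_model(0, "model_one"); // model_one on GPU 0
        detector_gpu1.load_model(1, "model_two"); // model_two on GPU 1
        detector_gpu0.interface(img, 0.5f);
        detector_gpu1.interface(img, 0.5f);
        detector_gpu0.release_buffer();
        detector_gpu1.release_buffer();

        return 0;
    }

Note that cudaSetDevice() is only called inside load_model(), so in a single-threaded program like this sketch you may want to call cudaSetDevice() again before each interface() and release_buffer() call in scenario 2 (or run each detector in its own thread), so that every CUDA call runs on the GPU its engine was deserialized on.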

Origin: blog.csdn.net/qq_42308217/article/details/123462056