Dynamic shape during tensorRT model inference

dynamic shape

The so-called dynamic shape is to specify the dynamic range [LH] at compile time, and L<=shape<=H can be allowed during reasoning. In the fully convolutional network, we usually have this appeal. The shape during inference can be changed dynamically, and it does not have to be restricted. This dynamic shape is not necessarily only width and height, but also means that the batch size is also dynamic.

The operation of realizing dynamic shape is mainly to modify the two aspects mentioned below.

1. When building the network :
1.1. The input dimension must be given as -1 when the model is defined, otherwise the dimension will not be dynamic. Pay attention to two points:

  • For an onnx file, if the dimension is a letter or -1 after the onnx file is opened, its dimension is considered dynamic.
  • If there are reshape operations in your model, then the parameters of reshape must be calculated dynamically. And most of the time that's the problem. Unless you are a fully convolutional model, most of the time you only need to set dynamic for the batch_size dimension, and try to avoid setting dynamic for other dimensions

1.2.Configure profile:

  • create: builder->createOptimizationProfile()
  • set: setDimensions() sets a series of input size ranges for kMIN, kOPT, kMAX
  • add:config->addOptimizationProfile(profile); add profile to the network configuration

2. During the reasoning stage :

2.1. It is necessary to set the dimension of the input, namely the shape, after selecting the index of the profile:

//这里就是指的把输入的shape设置成(1,1,3,3),Bindings的第0个索引表示输入(具体bindings的概念参见文章:tensorRT实现模型的推理过程)
execution_context->setBindingDimensions(0, nvinfer1::Dims4(1, 1, 3, 3))

code example

The only difference from the previous fully connected code is two points, one is that the definition of the network structure is replaced by CNN, and the other is the dynamic shape configuration createOptimizationProfile. OptimizationProfile is an optimization profile, which is used to specify the range of the input shape that can be transformed. Don't be blinded by the word optimization. In fact, it is to tell tensorRT what range my shape is!


// tensorRT include
#include <NvInfer.h>
#include <NvInferRuntime.h>

// cuda include
#include <cuda_runtime.h>

// system include
#include <stdio.h>
#include <math.h>

#include <iostream> 
#include <fstream> // 后面要用到ios这个库
#include <vector>

using namespace std;

class TRTLogger : public nvinfer1::ILogger{
    
    
public:
    virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override{
    
    
        if(severity <= Severity::kINFO){
    
    
            printf("%d: %s\n", severity, msg);
        }
    }
} logger;

nvinfer1::Weights make_weights(float* ptr, int n){
    
    
    nvinfer1::Weights w;
    w.count = n;
    w.type = nvinfer1::DataType::kFLOAT;
    w.values = ptr;
    return w;
}

bool build_model(){
    
    
    TRTLogger logger;

    // ----------------------------- 1. 定义 builder, config 和network -----------------------------
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1);

    // 构建一个模型
    /*
        Network definition:

        image
          |
        conv(3x3, pad=1)  input = 1(指的是channel), output = 1, bias = True     w=[[1.0, 2.0, 0.5], [0.1, 0.2, 0.5], [0.2, 0.2, 0.1]], b=0.0
          |
        relu
          |
        prob
    */


    // ----------------------------- 2. 输入,模型结构和输出的基本信息 -----------------------------
    const int num_input = 1;
    const int num_output = 1;
    float layer1_weight_values[] = {
    
    
        1.0, 2.0, 3.1, 
        0.1, 0.1, 0.1, 
        0.2, 0.2, 0.2
    }; // 行优先
    float layer1_bias_values[]   = {
    
    0.0};

    // 如果要使用动态shape,必须让NetworkDefinition的维度定义为-1,in_channel是固定的
    nvinfer1::ITensor* input = network->addInput("image", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4(-1, num_input, -1, -1));
    nvinfer1::Weights layer1_weight = make_weights(layer1_weight_values, 9);
    nvinfer1::Weights layer1_bias   = make_weights(layer1_bias_values, 1);
    //网络定义卷积层
    auto layer1 = network->addConvolution(*input, num_output, nvinfer1::DimsHW(3, 3), layer1_weight, layer1_bias);
    layer1->setPadding(nvinfer1::DimsHW(1, 1));

    auto prob = network->addActivation(*layer1->getOutput(0), nvinfer1::ActivationType::kRELU); // *(layer1->getOutput(0))
     
    // 将我们需要的prob标记为输出
    network->markOutput(*prob->getOutput(0));

    int maxBatchSize = 10;
    printf("Workspace Size = %.2f MB\n", (1 << 28) / 1024.0f / 1024.0f);
    // 配置暂存存储器,用于layer实现的临时存储,也用于保存中间激活值
    config->setMaxWorkspaceSize(1 << 28);

    // --------------------------------- 2.1 关于profile ----------------------------------
    // profile就是关于实现模型编译时动态shape的配置!如果模型有多个输入,则必须多个profile
    auto profile = builder->createOptimizationProfile();

    // 配置最小允许1 x 1 x 3 x 3,kMIN表示配置的最小值
    profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, num_input, 3, 3));
    //kOPT是表示配置的最优值
    profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(1, num_input, 3, 3));

    // 配置最大允许10 x 1 x 5 x 5,kMAX表示配置的最大值
    // if networkDims.d[i] != -1, then minDims.d[i] == optDims.d[i] == maxDims.d[i] == networkDims.d[i]
    profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(maxBatchSize, num_input, 5, 5));
    config->addOptimizationProfile(profile);

    nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    if(engine == nullptr){
    
    
        printf("Build engine failed.\n");
        return false;
    }

    // -------------------------- 3. 序列化 ----------------------------------
    // 将模型序列化,并储存为文件
    nvinfer1::IHostMemory* model_data = engine->serialize();
    FILE* f = fopen("engine.trtmodel", "wb");
    fwrite(model_data->data(), 1, model_data->size(), f);
    fclose(f);

    // 卸载顺序按照构建顺序倒序
    model_data->destroy();
    engine->destroy();
    network->destroy();
    config->destroy();
    builder->destroy();
    printf("Done.\n");
    return true;
}

vector<unsigned char> load_file(const string& file){
    
    
    ifstream in(file, ios::in | ios::binary);
    if (!in.is_open())
        return {
    
    };

    in.seekg(0, ios::end);
    size_t length = in.tellg();

    std::vector<uint8_t> data;
    if (length > 0){
    
    
        in.seekg(0, ios::beg);
        data.resize(length);

        in.read((char*)&data[0], length);
    }
    in.close();
    return data;
}

void inference(){
    
    
    // ------------------------------- 1. 加载model并反序列化 -------------------------------
    TRTLogger logger;
    auto engine_data = load_file("engine.trtmodel");
    nvinfer1::IRuntime* runtime   = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
    if(engine == nullptr){
    
    
        printf("Deserialize cuda engine failed.\n");
        runtime->destroy();
        return;
    }

    nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
    cudaStream_t stream = nullptr;
    cudaStreamCreate(&stream);

    /*
        Network definition:

        image
          |
        conv(3x3, pad=1)  input = 1, output = 1, bias = True     w=[[1.0, 2.0, 0.5], [0.1, 0.2, 0.5], [0.2, 0.2, 0.1]], b=0.0
          |
        relu
          |
        prob
    */

    // ------------------------------- 2. 输入与输出 -------------------------------
    float input_data_host[] = {
    
    
        // batch 0
        1,   1,   1,
        1,   1,   1,
        1,   1,   1,

        // batch 1
        -1,   1,   1,
        1,   0,   1,
        1,   1,   -1
    };
    float* input_data_device = nullptr;

    // 3x3输入,对应3x3输出
    int ib = 2;
    int iw = 3;
    int ih = 3;
    float output_data_host[ib * iw * ih];
    float* output_data_device = nullptr;
    cudaMalloc(&input_data_device, sizeof(input_data_host));
    cudaMalloc(&output_data_device, sizeof(output_data_host));
    cudaMemcpyAsync(input_data_device, input_data_host, sizeof(input_data_host), cudaMemcpyHostToDevice, stream);


    // ------------------------------- 3. 推理 -------------------------------
    // 明确当前推理时,使用的数据输入大小,setBindingDimensions第一个参数为0指的是输入
    execution_context->setBindingDimensions(0, nvinfer1::Dims4(ib, 1, ih, iw));
    float* bindings[] = {
    
    input_data_device, output_data_device};
    bool success      = execution_context->enqueueV2((void**)bindings, stream, nullptr);
    cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);


    // ------------------------------- 4. 输出结果 -------------------------------
    for(int b = 0; b < ib; ++b){
    
    
        printf("batch %d. output_data_host = \n", b);
        for(int i = 0; i < iw * ih; ++i){
    
    
            printf("%f, ", output_data_host[b * iw * ih + i]);
            if((i + 1) % iw == 0)
                printf("\n");
        }
    }

    printf("Clean memory\n");
    cudaStreamDestroy(stream);
    cudaFree(input_data_device);
    cudaFree(output_data_device);
    execution_context->destroy();
    engine->destroy();
    runtime->destroy();
}

int main(){
    
    

    if(!build_model()){
    
    
        return -1;
    }
    inference();
    return 0;
}

Guess you like

Origin blog.csdn.net/Rolandxxx/article/details/127705038#comments_28500103