TensorRT Basic Workflow

TensorRT acceleration is split into two phases: a build phase and a runtime phase. In the build phase, the network is defined either through the ONNX parser or directly with the TensorRT layer API, then built and serialized into an engine model. In the runtime phase, the serialized engine is loaded and deserialized, host and device memory is allocated, and network inference is executed.

TensorRT Build Phase

1. Create the logger (Logger)
Logger gLogger;
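
The Logger type here is user-defined: it derives from nvinfer1::ILogger and implements log(). A minimal sketch (the kWARNING severity threshold is an assumption, adjust as needed):

class Logger : public nvinfer1::ILogger {
    // Forward TensorRT messages of warning severity or higher to stdout.
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};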

2. Create the Builder and BuilderConfig
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();

3. Create the Network (via the API)
ICudaEngine* engine = nullptr;
engine = build_det_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
{
INetworkDefinition* network = builder->createNetworkV2(0U);

ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0");
auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
...
auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2});

yolo->getOutput(0)->setName(kOutputTensorName);
network->markOutput(*yolo->getOutput(0));

builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(16 * (1 << 20));
config->setFlag(BuilderFlag::kFP16);
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

network->destroy();
}
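
As mentioned in the overview, the network can also be populated from an ONNX file with the ONNX parser instead of the layer-by-layer API shown above. A minimal sketch (the file name model.onnx is an assumption):

// ONNX models require an explicit-batch network.
const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
parser->parseFromFile("model.onnx", static_cast<int>(ILogger::Severity::kWARNING));

// The engine is then built with buildEngineWithConfig(*network, *config) as above.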

4. Generate the SerializedNetwork
IHostMemory* serialized_engine = engine->serialize();
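
In practice the serialized engine is written to a file so the runtime phase can load it back later; a minimal sketch, assuming engine_name is the same path that is read in the runtime phase below:

std::ofstream p(engine_name, std::ios::binary);
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());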

5. Release resources
engine->destroy();
builder->destroy();
config->destroy();
serialized_engine->destroy();

TensorRT Runtime Phase

1. Build the Engine
std::ifstream file(engine_name, std::ios::binary);
file.seekg(0, file.end);
size_t size = file.tellg();
file.seekg(0, file.beg);
char* trtModelStream = new char[size];
file.read(trtModelStream, size);

IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
delete[] trtModelStream;

2. Create the context
IExecutionContext* context = engine->createExecutionContext();

3. Prepare buffers (host side and device side)
const int inputIndex = engine->getBindingIndex("data");   // input
const int outputIndex = engine->getBindingIndex("prob");  // bbox output

float* input = new float[kBatchSize * 3 * kInputH * kInputW];
float* cpu_output_buffer = new float[kBatchSize * kOutputSize];

float* gpu_buffers[2];
cudaMalloc(&gpu_buffers[inputIndex], kBatchSize * 3 * kInputH * kInputW * sizeof(float));
cudaMalloc(&gpu_buffers[outputIndex], kBatchSize * kOutputSize * sizeof(float));
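
The asynchronous copies and the inference call in the following steps all use a CUDA stream, which needs to be created beforehand:

cudaStream_t stream;
cudaStreamCreate(&stream);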

4. Copy buffers, host to device
cudaMemcpyAsync(gpu_buffers[inputIndex], input, kBatchSize * 3 * kInputH * kInputW * sizeof(float), cudaMemcpyHostToDevice, stream);

5. Execute inference
context->enqueue(kBatchSize, (void**)gpu_buffers, stream, nullptr);
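
Note that enqueue() is the implicit-batch API; for an engine built from an explicit-batch network (for example one parsed from ONNX), the corresponding call would be:

context->enqueueV2((void**)gpu_buffers, stream, nullptr);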

6. Copy buffers, device to host
cudaMemcpyAsync(cpu_output_buffer, gpu_buffers[outputIndex], kBatchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream);
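
Since the copies and the inference are all enqueued asynchronously, the stream has to be synchronized before the host output buffer is read:

cudaStreamSynchronize(stream);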

7. Release resources
cudaFree(gpu_buffers[inputIndex]);
cudaFree(gpu_buffers[outputIndex]);
context->destroy();
engine->destroy();
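
The remaining resources created above, the CUDA stream, the host buffers, and the runtime, are usually released as well:

cudaStreamDestroy(stream);
delete[] input;
delete[] cpu_output_buffer;
runtime->destroy();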

Reposted from blog.csdn.net/threestooegs/article/details/130344329