pytorch usa tensorrt para acelerar
Directorio de artículos
Registro completo de implementación del modelo
Construcción del engine
Entorno
ubuntu 16.04
tensorrt 7.0.0.11
cuda 10.0
#include <cuda_runtime.h>
#include <iostream>
#include <string>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"
// File-scope configuration for the ONNX -> TensorRT engine builder program.
// NOTE(review): none of INPUT_H, INPUT_W, INPUT_C, OUTPUT_SIZE, gArgs, the blob
// names or gSampleName is referenced by the main() that follows; they look like
// leftovers from the TensorRT sample this program was derived from.

// Presumably the input tensor height/width/channel count -- confirm against the
// exported ONNX model.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
// Presumably the number of values in the model output -- TODO confirm.
static const int OUTPUT_SIZE = 2;
// Command-line argument holder type declared by the included sample headers.
samplesCommon::Args gArgs;
// Tensor names; presumably they must match the names used when the ONNX file
// was exported -- verify against the model.
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
// static Logger
// gLogger; // Create a global Logger, passed as the input argument to the various TensorRT calls
const std::string gSampleName = "TensorRT.sample_onnx_image";
// bool OnnxToTRTModel(const std::string& )
int main() {
Logger gLogger;
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch =
1U << static_cast<uint32_t>(
nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
// 创建模型容器
std::cout << "创建模型容器" << std::endl;
nvinfer1::INetworkDefinition* network =
builder->createNetworkV2(explicitBatch);
// 开始填充模型
std::cout << "开始填充模型" << std::endl;
auto parser = nvonnxparser::createParser(*network, gLogger);
const char* onnx_filename = "../gpu.onnx";
parser->parseFromFile(onnx_filename,
static_cast<int>(Logger::Severity::kWARNING));
// 开始构建enigen
std::cout << "开始构建enigen" << std::endl;
builder->setMaxBatchSize(1); // 设置batch size
// builder->setMaxWorkspaceSize(1600 * (1 << 20)); // 最大占用显存1600M
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
config->setMaxWorkspaceSize(160 * (1 << 20));
config->setFlag(nvinfer1::BuilderFlag::kFP16);
std::cout << "Building engine, please wait for a while..." << std::endl;
nvinfer1::ICudaEngine* enigen =
builder->buildEngineWithConfig(*network, *config);
std::cout << "Build engine successfully!" << std::endl;
// 序列化模型保存
std::cout << "序列化模型保存" << std::endl;
nvinfer1::IHostMemory* giemodelstream = enigen->serialize();
std::string serialize_str;
std::ofstream serialize_output_stream;
serialize_str.resize(giemodelstream->size());
// memcpy((void*)serialize_str.data(), giemodelstream->data(),
// giemodelstream->size());
serialize_output_stream.open("./serialize_engine_output.trt",
std::ios::binary | std::ios::out);
serialize_output_stream.write(
reinterpret_cast<const char*>(giemodelstream->data()),
giemodelstream->size());
std::cout << "writing engine file..." << std::endl;
// serialize_output_stream << serialize_str;
serialize_output_stream.close();
}
Implementar el modelo para inferencia
#include <cuda_runtime.h>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <string>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"
// File-scope configuration for the inference program.
// NOTE(review): DEVICE and CLASS_NUM are not referenced anywhere in the
// visible code of this program.
#define DEVICE 0 // GPU id
// Number of images per inference call; sizes the host and device buffers.
#define BATCH_SIZE 1
// Binding names looked up on the deserialized engine via getBindingIndex().
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
// Spatial size of the network input; also the size images are letterboxed to.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int CLASS_NUM = 2;
// Leftover from a YOLO sample, kept for reference:
// static const int OUTPUT_SIZE =
// Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) +
// 1;
// Number of float values copied back from the output binding per image.
static const int OUTPUT_SIZE = 2;
/**
 * Runs one inference pass on `stream` and blocks until the result is on the
 * host.
 *
 * Copies `input` to buffers[0], enqueues the execution context, copies
 * buffers[1] back into `output`, then synchronizes the stream.
 *
 * @param context   TensorRT execution context to enqueue.
 * @param stream    CUDA stream used for both copies and the inference.
 * @param buffers   Device binding array; buffers[0] is used as the input and
 *                  buffers[1] as the output (binding indices are not looked up
 *                  here, the caller must guarantee this layout).
 * @param input     Host buffer of batchSize * 3 * INPUT_H * INPUT_W floats.
 * @param output    Host buffer of batchSize * OUTPUT_SIZE floats.
 * @param batchSize Number of samples in `input`.
 *
 * Any CUDA error or a failed enqueue is reported on std::cerr and aborts the
 * process: the function returns void, so there is no other way to keep the
 * caller from consuming stale `output` data.
 */
void doInference(IExecutionContext& context, cudaStream_t& stream,
                 void** buffers, float* input, float* output, int batchSize) {
  // Aborts with a readable message if a CUDA runtime call failed.
  const auto check = [](cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
      std::cerr << "doInference: " << what
                << " failed: " << cudaGetErrorString(status) << std::endl;
      std::abort();
    }
  };

  const size_t inputBytes =
      static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
  const size_t outputBytes =
      static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

  // DMA input batch data to device, infer on the batch asynchronously, and DMA
  // output back to host.
  check(cudaMemcpyAsync(buffers[0], input, inputBytes, cudaMemcpyHostToDevice,
                        stream),
        "host-to-device copy");
  if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
    std::cerr << "doInference: enqueue failed" << std::endl;
    std::abort();
  }
  check(cudaMemcpyAsync(output, buffers[1], outputBytes,
                        cudaMemcpyDeviceToHost, stream),
        "device-to-host copy");
  // Errors from the asynchronous work above surface here at the latest.
  check(cudaStreamSynchronize(stream), "stream synchronize");
}
/**
 * Letterboxes an image into an input_w x input_h canvas.
 *
 * The image is scaled by the smaller of the two axis ratios so that it fits
 * entirely inside the target while keeping its aspect ratio, then centred on
 * a canvas filled with (128, 128, 128).
 *
 * @param img     Source image; must not be empty.
 *                NOTE(review): the canvas is CV_8UC3, so this presumably
 *                expects an 8-bit 3-channel image (as returned by cv::imread
 *                with default flags) -- confirm for other inputs.
 * @param input_w Target width in pixels, must be > 0.
 * @param input_h Target height in pixels, must be > 0.
 * @return A new input_h x input_w CV_8UC3 image.
 * @throws cv::Exception if `img` is empty or a target dimension is not
 *         positive (previously an empty image caused a division by zero and
 *         a NaN-to-int conversion before failing).
 */
cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
  CV_Assert(!img.empty());
  CV_Assert(input_w > 0 && input_h > 0);

  const float r_w = input_w / (img.cols * 1.0);
  const float r_h = input_h / (img.rows * 1.0);

  int w, h, x, y;
  if (r_h > r_w) {
    // Width is the limiting side: fill it and pad top/bottom.
    w = input_w;
    h = r_w * img.rows;
    x = 0;
    y = (input_h - h) / 2;
  } else {
    // Height is the limiting side: fill it and pad left/right.
    w = r_h * img.cols;
    h = input_h;
    x = (input_w - w) / 2;
    y = 0;
  }
  // For extreme aspect ratios the scaled side truncates to 0, which
  // cv::resize rejects; keep at least one pixel.
  if (w < 1) w = 1;
  if (h < 1) h = 1;

  cv::Mat re;
  cv::resize(img, re, cv::Size(w, h), 0, 0, cv::INTER_LINEAR);
  cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
  re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
  return out;
}
int main() {
Logger gLogger;
std::string engine_name = "serialize_engine_output.trt"; // load engine name
// start load engine
char* trtModelStream{nullptr};
size_t size{0};
std::ifstream engine_file(engine_name, std::ios::binary);
if (engine_file.good()) {
engine_file.seekg(0,
engine_file.end); // 定位输入流结束位置地址偏移量为0初
size = engine_file.tellg();
engine_file.seekg(0, engine_file.beg);
trtModelStream = new char[size];
engine_file.read(trtModelStream, size);
engine_file.close();
}
static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
static float prob[BATCH_SIZE * OUTPUT_SIZE];
IRuntime* runtime = createInferRuntime(gLogger);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
IExecutionContext* context = engine->createExecutionContext();
delete[] trtModelStream;
void* buffers[2];
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
// 使用上面的indices,在GPU上创建一个指向input和output缓冲区的buffer数组
// Create GPU buffers on device
cudaMalloc(&buffers[inputIndex],
BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float));
cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float));
// Create stream
cudaStream_t stream;
cudaStreamCreate(&stream);
//
cv::Mat img = cv::imread("../3.jpg");
cv::Mat pre_img = preprocess_img(img, INPUT_W, INPUT_H);
auto start = std::chrono::system_clock::now();
int i = 0;
int fcount = 0;
// 分离BGR并变化成RGB
for (int row = 0; row < INPUT_H; ++row) {
uchar* uc_pixel = pre_img.data + row * pre_img.step;
for (int col = 0; col < INPUT_W; ++col) {
data[fcount * 3 * INPUT_H * INPUT_W + i] =
static_cast<float>(uc_pixel[2]) / 255.0;
data[fcount * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] =
static_cast<float>(uc_pixel[1]) / 255.0;
data[fcount * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] =
static_cast<float>(uc_pixel[0]) / 255.0;
uc_pixel += 3;
++i;
}
}
// Run inference
doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
auto end = std::chrono::system_clock::now();
std::cout << "prob: " << std::endl;
std::cout << prob[0] << " " << prob[1] << std::endl;
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
start)
.count()
<< "ms" << std::endl;
}
Visualización de resultados
prob:
-9.97636 10.7392
8ms
Problemas encontrados
[TensorRT] ERROR: la red debe tener al menos una salida
La causa es que la versión de PyTorch es demasiado reciente.
Con PyTorch 1.4.0, el modelo ONNX exportado provoca un error de análisis en el parser de TensorRT (según se informa, este problema ocurre con TensorRT 5.0–6.0 y no aparece con TensorRT 7.0; véanse los problemas conocidos de TensorRT 6 con modelos ONNX exportados desde torch 1.2 y superiores).
Utilice tensorrt7.0.0.11 en su lugar
"ONNX parser only supports networks with an explicit batch dimension"
Solución
nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);