PyTorch model deployment: using TensorRT to accelerate inference


Full record of model deployment

Building the engine

Environment

Ubuntu 16.04
TensorRT 7.0.0.11
CUDA 10.0
#include <cuda_runtime.h>

#include <fstream>
#include <iostream>
#include <string>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"

static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 2;
samplesCommon::Args gArgs;

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";

// static Logger
//     gLogger;  // global Logger instance, passed into the various TensorRT API calls

const std::string gSampleName = "TensorRT.sample_onnx_image";

// bool OnnxToTRTModel(const std::string& )

int main() {
  Logger gLogger;

  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);

  const auto explicitBatch =
      1U << static_cast<uint32_t>(
          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  // Create the network definition (the model container)
  std::cout << "Creating the network definition" << std::endl;
  nvinfer1::INetworkDefinition* network =
      builder->createNetworkV2(explicitBatch);

  // Populate the network by parsing the ONNX model
  std::cout << "Parsing the ONNX model" << std::endl;
  auto parser = nvonnxparser::createParser(*network, gLogger);
  const char* onnx_filename = "../gpu.onnx";
  parser->parseFromFile(onnx_filename,
                        static_cast<int>(Logger::Severity::kWARNING));

  // Build the engine
  std::cout << "Building the engine" << std::endl;
  builder->setMaxBatchSize(1);  // set the max batch size
  //   builder->setMaxWorkspaceSize(1600 * (1 << 20));  // at most 1600 MB of GPU memory

  nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(160 * (1 << 20));
  config->setFlag(nvinfer1::BuilderFlag::kFP16);
  std::cout << "Building engine, please wait for a while..." << std::endl;

  nvinfer1::ICudaEngine* engine =
      builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Serialize the engine and save it to disk
  std::cout << "Serializing the engine" << std::endl;
  nvinfer1::IHostMemory* giemodelstream = engine->serialize();
  std::string serialize_str;
  std::ofstream serialize_output_stream;
  serialize_str.resize(giemodelstream->size());
  //   memcpy((void*)serialize_str.data(), giemodelstream->data(),
  //          giemodelstream->size());
  serialize_output_stream.open("./serialize_engine_output.trt",
                               std::ios::binary | std::ios::out);
  serialize_output_stream.write(
      reinterpret_cast<const char*>(giemodelstream->data()),
      giemodelstream->size());
  std::cout << "writing engine file..." << std::endl;
  //   serialize_output_stream << serialize_str;
  serialize_output_stream.close();
}
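
The listing above does not check whether parsing or engine building succeeded, and never frees the TensorRT objects it creates. Below is a minimal sketch of how those calls could be wrapped and cleaned up, reusing the variable names from the code above; the early returns are an assumption of this sketch, not part of the original sample.

// Sketch: wrap the parse/build calls above with basic success checks (TensorRT 7 API).
if (!parser->parseFromFile(onnx_filename,
                           static_cast<int>(Logger::Severity::kWARNING))) {
  std::cerr << "Failed to parse ONNX file: " << onnx_filename << std::endl;
  return -1;
}
nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
if (!engine) {
  std::cerr << "Engine build failed" << std::endl;
  return -1;
}

// In TensorRT 7, objects are released with destroy() once the .trt file has been written.
giemodelstream->destroy();
parser->destroy();
network->destroy();
config->destroy();
builder->destroy();
engine->destroy();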

Deploy the model for inference

#include <cuda_runtime.h>

#include <chrono>
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <string>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"

using namespace nvinfer1;

#define DEVICE 0  // GPU id
#define BATCH_SIZE 1

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int CLASS_NUM = 2;
// static const int OUTPUT_SIZE =
//     Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) +
//     1;
static const int OUTPUT_SIZE = 2;
void doInference(IExecutionContext& context, cudaStream_t& stream,
                 void** buffers, float* input, float* output, int batchSize) {
  // DMA input batch data to device, infer on the batch asynchronously, and DMA
  // output back to host
  cudaMemcpyAsync(buffers[0], input,
                  batchSize * 3 * INPUT_H * INPUT_W * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  context.enqueue(batchSize, buffers, stream, nullptr);
  cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
}

// Letterbox preprocessing: resize the image while keeping its aspect ratio, then
// pad the remaining area with gray (128, 128, 128) to reach input_w x input_h.
cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
  int w, h, x, y;
  float r_w = input_w / (img.cols * 1.0);
  float r_h = input_h / (img.rows * 1.0);
  if (r_h > r_w) {
    w = input_w;
    h = r_w * img.rows;
    x = 0;
    y = (input_h - h) / 2;
  } else {
    w = r_h * img.cols;
    h = input_h;
    x = (input_w - w) / 2;
    y = 0;
  }
  cv::Mat re(h, w, CV_8UC3);
  cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
  cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
  re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
  return out;
}

int main() {
  Logger gLogger;

  std::string engine_name = "serialize_engine_output.trt";  // load engine name

  // start load engine
  char* trtModelStream{nullptr};
  size_t size{0};
  std::ifstream engine_file(engine_name, std::ios::binary);
  if (engine_file.good()) {
    engine_file.seekg(0, engine_file.end);  // seek to the end to measure the file size
    size = engine_file.tellg();
    engine_file.seekg(0, engine_file.beg);
    trtModelStream = new char[size];
    engine_file.read(trtModelStream, size);
    engine_file.close();
  }

  static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
  static float prob[BATCH_SIZE * OUTPUT_SIZE];

  IRuntime* runtime = createInferRuntime(gLogger);

  ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);

  IExecutionContext* context = engine->createExecutionContext();
  delete[] trtModelStream;

  void* buffers[2];
  const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
  const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

  // Use the binding indices above to create an array of device buffers pointing
  // to the input and output on the GPU

  // Create GPU buffers on device
  cudaMalloc(&buffers[inputIndex],
             BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float));
  cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float));

  // Create stream
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Load and preprocess the test image
  cv::Mat img = cv::imread("../3.jpg");
  cv::Mat pre_img = preprocess_img(img, INPUT_W, INPUT_H);
  auto start = std::chrono::system_clock::now();

  int i = 0;
  int fcount = 0;

  // Split the interleaved BGR pixels into planar RGB and normalize to [0, 1]
  for (int row = 0; row < INPUT_H; ++row) {
    uchar* uc_pixel = pre_img.data + row * pre_img.step;
    for (int col = 0; col < INPUT_W; ++col) {
      data[fcount * 3 * INPUT_H * INPUT_W + i] =
          static_cast<float>(uc_pixel[2]) / 255.0;
      data[fcount * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] =
          static_cast<float>(uc_pixel[1]) / 255.0;
      data[fcount * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] =
          static_cast<float>(uc_pixel[0]) / 255.0;
      uc_pixel += 3;
      ++i;
    }
  }
  // Run inference
  doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
  auto end = std::chrono::system_clock::now();
  std::cout << "prob: " << std::endl;
  std::cout << prob[0] << " " << prob[1] << std::endl;
  std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
                                                                     start)
                   .count()
            << "ms" << std::endl;
}
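
The inference listing above never releases the resources it creates. A minimal cleanup sketch, assuming the same variable names as in the code above (TensorRT 7 objects are released with destroy() rather than delete):

// Sketch: release the CUDA and TensorRT resources created in the inference code above.
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
context->destroy();
engine->destroy();
runtime->destroy();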

Results

prob: 
-9.97636 10.7392
8ms
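
The two numbers are the raw scores for the two classes; the exported model here apparently does not end in a softmax layer, so they are logits rather than probabilities. If probabilities are wanted, a softmax can be applied on the host. A small standalone sketch using the values printed above:

#include <algorithm>
#include <cmath>
#include <iostream>

int main() {
  // Convert the two raw logits printed above into class probabilities with a softmax.
  float logits[2] = {-9.97636f, 10.7392f};
  float m = std::max(logits[0], logits[1]);  // subtract the max for numerical stability
  float e0 = std::exp(logits[0] - m);
  float e1 = std::exp(logits[1] - m);
  std::cout << e0 / (e0 + e1) << " " << e1 / (e0 + e1) << std::endl;  // ~0 and ~1
}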

Pitfalls encountered

[TensorRT] ERROR: Network must have at least one output

The cause is that the PyTorch version used to export the ONNX model is too new for the parser.

Here PyTorch 1.4.0 was used, and the ONNX file it exports triggers this parsing problem. (It is reported that the problem occurs with TensorRT 5.0-6.0 and no longer appears in TensorRT 7.0; see the discussions of parsing ONNX models exported by PyTorch 1.2+ with TensorRT 6.)

Switching to TensorRT 7.0.0.11 resolves it.
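
If staying on TensorRT 5/6 were unavoidable, a workaround that is often suggested for this error is to mark the last layer's output manually after parsing; a sketch of that idea (not used in this article, since switching to TensorRT 7 removed the need for it):

// Sketch: after parsing the ONNX model, explicitly mark the last layer's output,
// a commonly suggested workaround for "Network must have at least one output" on TensorRT 5/6.
network->markOutput(*network->getLayer(network->getNbLayers() - 1)->getOutput(0));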

"ONNX parser only supports networks with an explicit batch dimension"

Solution: create the network with the explicit-batch flag, as the engine-building code above already does:

nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);


Origin: blog.csdn.net/ahelloyou/article/details/114895955