PyTorch Model Deployment: Accelerating PyTorch Inference with TensorRT


A complete walkthrough of model deployment

Building the engine

Environment

Ubuntu 16.04
TensorRT 7.0.0.11
CUDA 10.0

#include <cuda_runtime.h>

#include <fstream>
#include <iostream>
#include <string>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"

static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 2;
samplesCommon::Args gArgs;

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";

// static Logger
//     gLogger;  // a global Logger could be created here to pass into the TensorRT API calls

const std::string gSampleName = "TensorRT.sample_onnx_image";

// bool OnnxToTRTModel(const std::string& )

int main() {
  Logger gLogger;

  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);

  const auto explicitBatch =
      1U << static_cast<uint32_t>(
          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
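  // kEXPLICIT_BATCH is required because the ONNX parser only supports networks
  // with an explicit batch dimension (see the pitfalls section below).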
  // Create the model container (network definition)
  std::cout << "Creating model container" << std::endl;
  nvinfer1::INetworkDefinition* network =
      builder->createNetworkV2(explicitBatch);

  // Start populating the model from the ONNX file
  std::cout << "Start populating model" << std::endl;
  auto parser = nvonnxparser::createParser(*network, gLogger);
  const char* onnx_filename = "../gpu.onnx";
  parser->parseFromFile(onnx_filename,
                        static_cast<int>(Logger::Severity::kWARNING));

  // Start building the engine
  std::cout << "Start building engine" << std::endl;
  builder->setMaxBatchSize(1);  // set the max batch size
  //   builder->setMaxWorkspaceSize(1600 * (1 << 20));  // up to 1600 MB of GPU workspace

  nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(160 * (1 << 20));
  config->setFlag(nvinfer1::BuilderFlag::kFP16);
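  // kFP16 lets TensorRT pick half-precision kernels where possible; this only
  // pays off on GPUs with fast FP16 support (builder->platformHasFastFp16()).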
  std::cout << "Building engine, please wait for a while..." << std::endl;

  nvinfer1::ICudaEngine* engine =
      builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Serialize the engine and save it to disk
  std::cout << "Serializing engine to disk" << std::endl;
  nvinfer1::IHostMemory* giemodelstream = engine->serialize();
  std::ofstream serialize_output_stream("./serialize_engine_output.trt",
                                        std::ios::binary | std::ios::out);
  serialize_output_stream.write(
      reinterpret_cast<const char*>(giemodelstream->data()),
      giemodelstream->size());
  std::cout << "writing engine file..." << std::endl;
  serialize_output_stream.close();
}
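
Note that parseFromFile returns false when the ONNX model cannot be parsed, and an engine cannot be built from a network that has no outputs (the "Network must have at least one output" pitfall described below). A minimal sketch of the checks that could wrap the parseFromFile call above, using the nvonnxparser error-reporting API (the error-handling style here is my own assumption, not part of the original post):

  if (!parser->parseFromFile(onnx_filename,
                             static_cast<int>(Logger::Severity::kWARNING))) {
    // Print every error the parser recorded, then bail out.
    for (int i = 0; i < parser->getNbErrors(); ++i) {
      std::cerr << "ONNX parse error: " << parser->getError(i)->desc()
                << std::endl;
    }
    return -1;
  }
  if (network->getNbOutputs() < 1) {
    std::cerr << "Network must have at least one output" << std::endl;
    return -1;
  }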

Deploying the model for inference

#include <cuda_runtime.h>

#include <chrono>
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <string>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"

#define DEVICE 0  // GPU id
#define BATCH_SIZE 1

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int CLASS_NUM = 2;
static const int OUTPUT_SIZE = 2;

void doInference(IExecutionContext& context, cudaStream_t& stream,
                 void** buffers, float* input, float* output, int batchSize) {
  // DMA input batch data to device, infer on the batch asynchronously, and DMA
  // output back to host
  cudaMemcpyAsync(buffers[0], input,
                  batchSize * 3 * INPUT_H * INPUT_W * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
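  // Note: for an explicit-batch engine, TensorRT's recommended call is
  // enqueueV2(buffers, stream, nullptr); enqueue() with a batch size is the
  // older implicit-batch API.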
  context.enqueue(batchSize, buffers, stream, nullptr);
  cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
}

cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
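  // Letterbox resize: scale the image to fit input_w x input_h while keeping
  // its aspect ratio, then pad the remaining area with gray (128, 128, 128).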
  int w, h, x, y;
  float r_w = input_w / (img.cols * 1.0);
  float r_h = input_h / (img.rows * 1.0);
  if (r_h > r_w) {
    w = input_w;
    h = r_w * img.rows;
    x = 0;
    y = (input_h - h) / 2;
  } else {
    w = r_h * img.cols;
    h = input_h;
    x = (input_w - w) / 2;
    y = 0;
  }
  cv::Mat re(h, w, CV_8UC3);
  cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
  cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
  re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
  return out;
}

int main() {
  Logger gLogger;

  std::string engine_name = "serialize_engine_output.trt";  // load engine name

  // start load engine
  char* trtModelStream{nullptr};
  size_t size{0};
  std::ifstream engine_file(engine_name, std::ios::binary);
  if (engine_file.good()) {
    engine_file.seekg(0, engine_file.end);  // seek to the end to get the file size
    size = engine_file.tellg();
    engine_file.seekg(0, engine_file.beg);  // seek back to the beginning
    trtModelStream = new char[size];
    engine_file.read(trtModelStream, size);
    engine_file.close();
  }

  static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
  static float prob[BATCH_SIZE * OUTPUT_SIZE];

  IRuntime* runtime = createInferRuntime(gLogger);

  ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);

  IExecutionContext* context = engine->createExecutionContext();
  delete[] trtModelStream;

  void* buffers[2];
  const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
  const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

  // Use the binding indices above to create GPU buffers for the input and
  // output on the device
  cudaMalloc(&buffers[inputIndex],
             BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float));
  cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float));

  // Create stream
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Read and preprocess the input image
  cv::Mat img = cv::imread("../3.jpg");
  cv::Mat pre_img = preprocess_img(img, INPUT_W, INPUT_H);
  auto start = std::chrono::system_clock::now();

  int i = 0;
  int fcount = 0;

  // Convert BGR HWC to planar RGB CHW and normalize to [0, 1]
  for (int row = 0; row < INPUT_H; ++row) {
    uchar* uc_pixel = pre_img.data + row * pre_img.step;
    for (int col = 0; col < INPUT_W; ++col) {
      data[fcount * 3 * INPUT_H * INPUT_W + i] =
          static_cast<float>(uc_pixel[2]) / 255.0;
      data[fcount * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] =
          static_cast<float>(uc_pixel[1]) / 255.0;
      data[fcount * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] =
          static_cast<float>(uc_pixel[0]) / 255.0;
      uc_pixel += 3;
      ++i;
    }
  }
  // Run inference
  doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
  auto end = std::chrono::system_clock::now();
  std::cout << "prob: " << std::endl;
  std::cout << prob[0] << " " << prob[1] << std::endl;
  std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
                                                                     start)
                   .count()
            << "ms" << std::endl;
}
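
The sample above never releases the stream, the device buffers, or the TensorRT objects before main returns. A minimal cleanup sketch that could be appended at the end of main, assuming the TensorRT 7 destroy() API (later TensorRT versions delete these objects instead):

  // Release the CUDA stream and the device buffers
  cudaStreamDestroy(stream);
  cudaFree(buffers[inputIndex]);
  cudaFree(buffers[outputIndex]);

  // Destroy the TensorRT objects (TensorRT 7 API)
  context->destroy();
  engine->destroy();
  runtime->destroy();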

Results

prob: 
-9.97636 10.7392
8ms
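
The two numbers printed are the raw logits for the two classes (class 1 wins here by a wide margin). If class probabilities are needed, a softmax over the output converts them; a small sketch, not part of the original post (requires <cmath> and <algorithm>):

  // Convert the two raw logits in prob[] into softmax probabilities.
  // Subtracting the max logit first keeps exp() numerically stable.
  float m = std::max(prob[0], prob[1]);
  float e0 = std::exp(prob[0] - m);
  float e1 = std::exp(prob[1] - m);
  float p0 = e0 / (e0 + e1);  // probability of class 0
  float p1 = e1 / (e0 + e1);  // probability of class 1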

Pitfalls encountered

[TensorRT] ERROR: Network must have at least one output

The cause is that your PyTorch version is too new.

The model here was exported with PyTorch 1.4.0, and the newer PyTorch version triggers an ONNX parsing problem. (Reportedly this parsing issue affects TensorRT 5.0-6.0 and does not occur with TensorRT 7.0; see the write-ups on parsing ONNX models exported from PyTorch 1.2+ with TensorRT 6.)

The fix: switch to TensorRT 7.0.0.11.

"ONNX parser only supports networks with an explicit batch dimension"

Solution: create the network with an explicit batch dimension:

nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);


Reposted from blog.csdn.net/ahelloyou/article/details/114895955