PyTorch model deployment: accelerating PyTorch inference with TensorRT

A complete record of deploying the model

Building the engine

Environment

Ubuntu 16.04
TensorRT 7.0.0.11
CUDA 10.0

The following program parses the ONNX model with the TensorRT ONNX parser, builds an engine, and serializes it to disk:

#include <cuda_runtime.h>

#include <fstream>
#include <iostream>
#include <string>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"

static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 2;
samplesCommon::Args gArgs;

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";

// static Logger
//     gLogger;  // global Logger instance passed into the TensorRT API calls

const std::string gSampleName = "TensorRT.sample_onnx_image";

// bool OnnxToTRTModel(const std::string& )

int main() {
  Logger gLogger;

  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);

  const auto explicitBatch =
      1U << static_cast<uint32_t>(
          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  // Create the network definition (the model container)
  std::cout << "Creating the network definition" << std::endl;
  nvinfer1::INetworkDefinition* network =
      builder->createNetworkV2(explicitBatch);

  // Populate the network from the ONNX model
  std::cout << "Parsing the ONNX model" << std::endl;
  auto parser = nvonnxparser::createParser(*network, gLogger);
  const char* onnx_filename = "../gpu.onnx";
  parser->parseFromFile(onnx_filename,
                        static_cast<int>(Logger::Severity::kWARNING));

  // Build the engine
  std::cout << "Start building the engine" << std::endl;
  builder->setMaxBatchSize(1);  // set the maximum batch size
  //   builder->setMaxWorkspaceSize(1600 * (1 << 20));  // up to 1600 MB of workspace memory

  nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(160 * (1 << 20));
  config->setFlag(nvinfer1::BuilderFlag::kFP16);
  std::cout << "Building engine, please wait for a while..." << std::endl;

  nvinfer1::ICudaEngine* enigen =
      builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Serialize the engine and save it to disk
  std::cout << "Serializing the engine" << std::endl;
  nvinfer1::IHostMemory* giemodelstream = enigen->serialize();
  std::ofstream serialize_output_stream("./serialize_engine_output.trt",
                                        std::ios::binary | std::ios::out);
  serialize_output_stream.write(
      reinterpret_cast<const char*>(giemodelstream->data()),
      giemodelstream->size());
  std::cout << "Writing engine file..." << std::endl;
  serialize_output_stream.close();
}

Deploying the model for inference

The following program deserializes the saved engine, preprocesses an image with OpenCV, and runs inference:

#include <cuda_runtime.h>

#include <chrono>
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <string>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"

using namespace nvinfer1;

#define DEVICE 0  // GPU id
#define BATCH_SIZE 1

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int CLASS_NUM = 2;
static const int OUTPUT_SIZE = 2;
void doInference(IExecutionContext& context, cudaStream_t& stream,
                 void** buffers, float* input, float* output, int batchSize) {
  // DMA input batch data to device, infer on the batch asynchronously, and DMA
  // output back to host
  cudaMemcpyAsync(buffers[0], input,
                  batchSize * 3 * INPUT_H * INPUT_W * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  context.enqueue(batchSize, buffers, stream, nullptr);
  cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
}

cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
  int w, h, x, y;
  float r_w = input_w / (img.cols * 1.0);
  float r_h = input_h / (img.rows * 1.0);
  if (r_h > r_w) {
    w = input_w;
    h = r_w * img.rows;
    x = 0;
    y = (input_h - h) / 2;
  } else {
    w = r_h * img.cols;
    h = input_h;
    x = (input_w - w) / 2;
    y = 0;
  }
  cv::Mat re(h, w, CV_8UC3);
  cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
  cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
  re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
  return out;
}

int main() {
  cudaSetDevice(DEVICE);
  Logger gLogger;

  std::string engine_name = "serialize_engine_output.trt";  // engine file to load

  // start load engine
  char* trtModelStream{nullptr};
  size_t size{0};
  std::ifstream engine_file(engine_name, std::ios::binary);
  if (engine_file.good()) {
    engine_file.seekg(0, engine_file.end);  // seek to the end to get the file size
    size = engine_file.tellg();
    engine_file.seekg(0, engine_file.beg);
    trtModelStream = new char[size];
    engine_file.read(trtModelStream, size);
    engine_file.close();
  }

  static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
  static float prob[BATCH_SIZE * OUTPUT_SIZE];

  IRuntime* runtime = createInferRuntime(gLogger);

  ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);

  IExecutionContext* context = engine->createExecutionContext();
  delete[] trtModelStream;

  void* buffers[2];
  const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
  const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

  // Using the binding indices above, create GPU buffers for the input and
  // output on the device
  cudaMalloc(&buffers[inputIndex],
             BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float));
  cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float));

  // Create stream
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Load and preprocess the input image
  cv::Mat img = cv::imread("../3.jpg");
  cv::Mat pre_img = preprocess_img(img, INPUT_W, INPUT_H);
  auto start = std::chrono::system_clock::now();

  int i = 0;
  int fcount = 0;

  // Split the BGR channels into planar RGB (HWC -> CHW), normalized to [0, 1]
  for (int row = 0; row < INPUT_H; ++row) {
    uchar* uc_pixel = pre_img.data + row * pre_img.step;
    for (int col = 0; col < INPUT_W; ++col) {
      data[fcount * 3 * INPUT_H * INPUT_W + i] =
          static_cast<float>(uc_pixel[2]) / 255.0;
      data[fcount * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] =
          static_cast<float>(uc_pixel[1]) / 255.0;
      data[fcount * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] =
          static_cast<float>(uc_pixel[0]) / 255.0;
      uc_pixel += 3;
      ++i;
    }
  }
  // Run inference
  doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
  auto end = std::chrono::system_clock::now();
  std::cout << "prob: " << std::endl;
  std::cout << prob[0] << " " << prob[1] << std::endl;
  std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
                                                                     start)
                   .count()
            << "ms" << std::endl;
}
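
The program above leaves the CUDA stream, the device buffers, and the TensorRT objects alive until the process exits. A minimal cleanup sketch that could go at the end of main, assuming the variables from the inference program above (in TensorRT 7, objects are released with destroy()):

  // Sketch: release the resources created in the inference program above.
  cudaStreamDestroy(stream);
  cudaFree(buffers[inputIndex]);
  cudaFree(buffers[outputIndex]);
  context->destroy();
  engine->destroy();
  runtime->destroy();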

Results

prob: 
-9.97636 10.7392
8ms
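
The two values printed are the raw logits for the two classes. If class probabilities are wanted, a small softmax over the prob array can be applied after doInference; the sketch below is not part of the original program and only assumes the prob/OUTPUT_SIZE definitions above.

#include <algorithm>
#include <cmath>

// Sketch: convert raw logits (such as the two values printed above) into
// class probabilities with a numerically stable softmax.
void softmax(const float* logits, float* probs, int n) {
  float max_logit = *std::max_element(logits, logits + n);
  float sum = 0.f;
  for (int i = 0; i < n; ++i) {
    probs[i] = std::exp(logits[i] - max_logit);  // subtract the max for stability
    sum += probs[i];
  }
  for (int i = 0; i < n; ++i) probs[i] /= sum;
}

For the logits above (-9.97636 and 10.7392), this gives a probability of roughly 1e-9 for class 0 and essentially 1.0 for class 1.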

Pitfalls encountered

[TensorRT] ERROR: Network must have at least one output

The cause is that the PyTorch version used for the ONNX export is too new.

The PyTorch version here is 1.4.0, and ONNX files exported by such recent versions trigger this parsing problem. Reportedly it occurs with TensorRT 5.0-6.0 but does not appear with TensorRT 7.0 (see the known issues of TensorRT 6 with ONNX models exported from PyTorch 1.2 and above).

Use TensorRT 7.0.0.11 instead.
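
Whatever the version, it also helps to check whether the ONNX parse actually succeeded before building the engine, since a failed parse leaves the network without outputs. A minimal sketch, assuming the parser and onnx_filename variables from the build program above and the error-reporting calls of the TensorRT 7 ONNX parser:

  // Sketch: abort early if ONNX parsing failed, printing the parser errors.
  if (!parser->parseFromFile(onnx_filename,
                             static_cast<int>(Logger::Severity::kWARNING))) {
    for (int i = 0; i < parser->getNbErrors(); ++i) {
      std::cout << "ONNX parse error: " << parser->getError(i)->desc()
                << std::endl;
    }
    return -1;  // otherwise buildEngineWithConfig fails with
                // "Network must have at least one output"
  }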

"ONNX parser only supports networks with an explicit batch dimension"

Solution

nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);

Source: blog.csdn.net/ahelloyou/article/details/114895955