PyTorch model deployment: accelerating inference with TensorRT

A complete record of deploying the model.

Building the engine

Environment

Ubuntu 16.04
TensorRT 7.0.0.11
CUDA 10.0
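Before building anything, it can help to confirm which TensorRT and CUDA versions the program actually links against. A minimal sketch (the NV_TENSORRT_* version macros come from NvInferVersion.h, which NvInfer.h pulls in):

#include <cuda_runtime.h>
#include <iostream>

#include "NvInfer.h"  // brings in the NV_TENSORRT_* version macros

int main() {
  int cuda_runtime_version = 0;
  cudaRuntimeGetVersion(&cuda_runtime_version);  // e.g. 10000 for CUDA 10.0
  std::cout << "TensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR
            << "." << NV_TENSORRT_PATCH << ", CUDA runtime "
            << cuda_runtime_version << std::endl;
  return 0;
}

The full engine-building program: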
#include <cuda_runtime.h>

#include <fstream>
#include <iostream>
#include <string>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"

static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 2;
samplesCommon::Args gArgs;

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";

// static Logger gLogger;  // a global Logger could also be defined here and
//                         // passed to the TensorRT factory calls below

const std::string gSampleName = "TensorRT.sample_onnx_image";

int main() {
  Logger gLogger;

  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);

  const auto explicitBatch =
      1U << static_cast<uint32_t>(
          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  // Create the network definition (the model container)
  std::cout << "Creating the network definition" << std::endl;
  nvinfer1::INetworkDefinition* network =
      builder->createNetworkV2(explicitBatch);

  // Populate the network by parsing the ONNX model
  std::cout << "Parsing the ONNX model" << std::endl;
  auto parser = nvonnxparser::createParser(*network, gLogger);
  const char* onnx_filename = "../gpu.onnx";
  parser->parseFromFile(onnx_filename,
                        static_cast<int>(Logger::Severity::kWARNING));

  // Build the engine
  std::cout << "Building the engine" << std::endl;
  builder->setMaxBatchSize(1);  // set the batch size
  //   builder->setMaxWorkspaceSize(1600 * (1 << 20));  // up to 1600 MB workspace

  nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(160 * (1 << 20));
  config->setFlag(nvinfer1::BuilderFlag::kFP16);
  std::cout << "Building engine, please wait for a while..." << std::endl;

  nvinfer1::ICudaEngine* enigen =
      builder->buildEngineWithConfig(*network, *config);
  std::cout << "Build engine successfully!" << std::endl;

  // Serialize the engine and save it to disk
  std::cout << "Serializing and saving the engine" << std::endl;
  nvinfer1::IHostMemory* giemodelstream = enigen->serialize();
  std::ofstream serialize_output_stream("./serialize_engine_output.trt",
                                        std::ios::binary | std::ios::out);
  serialize_output_stream.write(
      reinterpret_cast<const char*>(giemodelstream->data()),
      giemodelstream->size());
  std::cout << "writing engine file..." << std::endl;
  serialize_output_stream.close();
}
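The program above assumes every call succeeds. A sketch of checks and cleanup that could be added inside main(), using the same TensorRT 7 API and the variable names from the code above:

  // After parser->parseFromFile(...): report any parse errors and bail out.
  if (parser->getNbErrors() > 0) {
    for (int i = 0; i < parser->getNbErrors(); ++i) {
      std::cout << parser->getError(i)->desc() << std::endl;
    }
    return -1;
  }

  // After buildEngineWithConfig(...): the engine pointer is null on failure.
  if (enigen == nullptr) {
    std::cout << "Engine build failed" << std::endl;
    return -1;
  }

  // Once serialize_engine_output.trt has been written, release everything.
  giemodelstream->destroy();
  enigen->destroy();
  parser->destroy();
  network->destroy();
  config->destroy();
  builder->destroy();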

Running inference with the deployed model

#include <cuda_runtime.h>

#include <chrono>
#include <fstream>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <string>

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "common.h"
#include "common/logger.h"

#define DEVICE 0  // GPU id
#define BATCH_SIZE 1

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int CLASS_NUM = 2;
static const int OUTPUT_SIZE = 2;
void doInference(IExecutionContext& context, cudaStream_t& stream,
                 void** buffers, float* input, float* output, int batchSize) {
  // DMA input batch data to device, infer on the batch asynchronously, and DMA
  // output back to host
  cudaMemcpyAsync(buffers[0], input,
                  batchSize * 3 * INPUT_H * INPUT_W * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  context.enqueue(batchSize, buffers, stream, nullptr);
  cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
}

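// Letterbox preprocessing: scale the image while keeping its aspect ratio,
// then pad the remaining border with gray (128) so the result is exactly
// input_w x input_h.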
cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
  int w, h, x, y;
  float r_w = input_w / (img.cols * 1.0);
  float r_h = input_h / (img.rows * 1.0);
  if (r_h > r_w) {
    w = input_w;
    h = r_w * img.rows;
    x = 0;
    y = (input_h - h) / 2;
  } else {
    w = r_h * img.cols;
    h = input_h;
    x = (input_w - w) / 2;
    y = 0;
  }
  cv::Mat re(h, w, CV_8UC3);
  cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
  cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
  re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
  return out;
}

int main() {
  Logger gLogger;

  std::string engine_name = "serialize_engine_output.trt";  // load engine name

  // start load engine
  char* trtModelStream{nullptr};
  size_t size{0};
  std::ifstream engine_file(engine_name, std::ios::binary);
  if (engine_file.good()) {
    engine_file.seekg(0, engine_file.end);  // seek to the end to get the size
    size = engine_file.tellg();
    engine_file.seekg(0, engine_file.beg);
    trtModelStream = new char[size];
    engine_file.read(trtModelStream, size);
    engine_file.close();
  }

  static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
  static float prob[BATCH_SIZE * OUTPUT_SIZE];

  IRuntime* runtime = createInferRuntime(gLogger);

  ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);

  IExecutionContext* context = engine->createExecutionContext();
  delete[] trtModelStream;

  void* buffers[2];
  const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
  const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

  // Use the binding indices above to fill the buffers[] array that points to
  // the input and output device memory.

  // Create GPU buffers on device
  cudaMalloc(&buffers[inputIndex],
             BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float));
  cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float));

  // Create stream
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Read the test image and letterbox it to the network input size
  cv::Mat img = cv::imread("../3.jpg");
  cv::Mat pre_img = preprocess_img(img, INPUT_W, INPUT_H);
  auto start = std::chrono::system_clock::now();

  int i = 0;
  int fcount = 0;

  // Split interleaved BGR into planar RGB (HWC -> CHW) and scale to [0, 1]
  for (int row = 0; row < INPUT_H; ++row) {
    uchar* uc_pixel = pre_img.data + row * pre_img.step;
    for (int col = 0; col < INPUT_W; ++col) {
      data[fcount * 3 * INPUT_H * INPUT_W + i] =
          static_cast<float>(uc_pixel[2]) / 255.0;
      data[fcount * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] =
          static_cast<float>(uc_pixel[1]) / 255.0;
      data[fcount * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] =
          static_cast<float>(uc_pixel[0]) / 255.0;
      uc_pixel += 3;
      ++i;
    }
  }
  // Run inference
  doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
  auto end = std::chrono::system_clock::now();
  std::cout << "prob: " << std::endl;
  std::cout << prob[0] << " " << prob[1] << std::endl;
  std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
                                                                     start)
                   .count()
            << "ms" << std::endl;
}
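None of the cudaMalloc/cudaMemcpyAsync/cudaStreamCreate calls above check their return codes. The common.h shipped with the TensorRT samples typically provides a CHECK macro for this; a minimal standalone equivalent (CUDA_CHECK is a name chosen here, sketched only) would look like:

#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                               \
  do {                                                                 \
    cudaError_t status_ = (call);                                      \
    if (status_ != cudaSuccess) {                                      \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                   cudaGetErrorString(status_), __FILE__, __LINE__);   \
      std::exit(EXIT_FAILURE);                                         \
    }                                                                  \
  } while (0)

// Usage, e.g.:
//   CUDA_CHECK(cudaMalloc(&buffers[inputIndex],
//                         BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));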

Results

prob: 
-9.97636 10.7392
8ms
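The two values printed are raw logits (the exported model apparently has no softmax layer). Class probabilities can be recovered on the host with a softmax over the OUTPUT_SIZE values; a sketch that works on the prob[] array from the program above:

#include <algorithm>
#include <cmath>
#include <iostream>

// Softmax over the two logits returned in prob[] (OUTPUT_SIZE == 2).
inline void print_probabilities(const float* prob) {
  float max_logit = std::max(prob[0], prob[1]);
  float e0 = std::exp(prob[0] - max_logit);
  float e1 = std::exp(prob[1] - max_logit);
  std::cout << "P(class 0) = " << e0 / (e0 + e1)
            << "  P(class 1) = " << e1 / (e0 + e1) << std::endl;
}

With the logits above (-9.98 and 10.74), essentially all of the probability mass lands on class 1.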

Pitfalls encountered

[TensorRT] ERROR: Network must have at least one output

The cause is that the PyTorch version used to export the ONNX model is too new.

The PyTorch version here is 1.4.0. ONNX models exported by PyTorch 1.2 and above reportedly trigger this parsing problem on TensorRT 5.0-6.0, while TensorRT 7.0 is not affected (see the discussions of using TensorRT 6 with ONNX exported from PyTorch 1.2+).

The fix used here is to switch to TensorRT 7.0.0.11.
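If upgrading TensorRT is not an option, a workaround commonly reported for this error on TensorRT 5/6 is to mark the last layer's output explicitly after parsing. A sketch (not verified here, since this setup moved to TensorRT 7):

// Workaround sketch for "Network must have at least one output" on TRT 5/6:
// explicitly mark the last layer's output after the ONNX parse.
nvinfer1::ILayer* last_layer = network->getLayer(network->getNbLayers() - 1);
network->markOutput(*last_layer->getOutput(0));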

"ONNX parser only supports networks with an explicit batch dimension"

Solution

nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);
const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);

Origin: blog.csdn.net/ahelloyou/article/details/114895955