Primeiros passos com TensorRT (6) tensorrtx lenet

Artigo Diretório

0. Prefácio

Existem poucos materiais para aprender TensorRT
- Conta-se principalmente com a documentação oficial e com as demonstrações (demos) oficiais
- Outros projetos de código aberto tratam basicamente de uma rede específica ou de algumas poucas redes, e não há material em formato de tutorial.
Além das informações oficiais, a estrela mais famosa no Github é tensorrtx
- Este projeto provavelmente começou como um exercício do próprio autor e, mais tarde, foi enriquecido por terceiros por meio de pull requests (PRs)
- O conteúdo principal deste projeto é construir uma rede por meio da API TensorRT Network, converter os pesos do modelo original e importá-los para a rede construída.
- Esta biblioteca contém muitos modelos, o que é mais conveniente, então quero aprender o código-fonte e me familiarizar com C ++
O conteúdo principal deste artigo
- Aprenda o código-fonte Lenet (lenet é o submodelo mais simples em tensorrtx)
- Processo de uso: como usar
- Análise do código-fonte: Happy C ++

1. Processo de uso

Etapa 1: Gere um arquivo de peso.
- Use o pytorch para gerar arquivos de peso.
- Na verdade, ele usa outro repo do autor para gerar pesos.
- O processo consiste em executar dois scripts Python, um após o outro (python lenet5.py && python inference.py), e registrar os resultados da inferência do modelo.
- Obtenha lenet5.wts e coloque-o em /path/to/tensorrtx/lenet.
Etapa 2: compilar e executar o código C ++.
- Compilar:cd /path/to/tensorrtx/lenet && mkdir build && cd build && cmake .. && make
- Executar: gere o arquivo do mecanismo (engine) com sudo ./lenet -s; em seguida, execute a inferência do modelo com sudo ./lenet -d e registre os resultados da inferência.
- Compare se os resultados de inferência de C ++ e Python são iguais.

2. Análise do código fonte

O código-fonte do PyTorch para gerar o arquivo de peso é ignorado, principalmente relacionado ao C ++.
Existem dois arquivos C ++ originais:
- lenet.cpp: inclui a construção do modelo e a inferência; é o arquivo analisado em detalhe a seguir.
- logging.h: Parece o código-fonte nos exemplos oficiais do TensorRT, então não vou dar uma olhada mais de perto, principalmente por causa de algumas funções de saída de log.
As principais funções de lenet.cpp incluem
- Construção do arquivo do mecanismo (engine)
- Inferência do modelo

2.1. Criar arquivo do motor

Na prática, corresponde ao comando ./lenet -s; a função principal é construir o modelo por meio da API, importar os pesos e salvar o modelo localmente.
A função loadWeights recebe o caminho do arquivo de pesos e retorna um objeto std::map<std::string, Weights>.
A função da API para criar uma rede e importar pesos é a seguinte
- Observe que, ao construir a rede, parece não ter nada a ver com o tamanho do lote

// Builds the LeNet engine layer by layer through the network-definition API;
// no model parser is involved.
//
// maxBatchSize: value passed to builder->setMaxBatchSize().
// builder:      used to create the network definition and to build the engine.
// config:       builder configuration; its max workspace size is set here.
// dt:           data type of the network input tensor.
//
// Returns the result of buildEngineWithConfig(); the caller is expected to
// check it (APIToModel asserts that it is not nullptr).
ICudaEngine* createLenetEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    // Shapes shared by the two convolution + pooling stages.
    const DimsHW convKernel{5, 5};
    const DimsHW convStride{1, 1};
    const DimsHW poolWindow{2, 2};
    const DimsHW poolStride{2, 2};

    // Everything starts from an empty network definition (creation flags: 0U).
    INetworkDefinition* net = builder->createNetworkV2(0U);

    // Network input: named INPUT_BLOB_NAME, type dt, shape {1, INPUT_H, INPUT_W}.
    ITensor* data = net->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});
    assert(data);

    // All layer weights are read from the weight file and looked up by name below.
    std::map<std::string, Weights> weightMap = loadWeights("../lenet5.wts");

    // Stage 1: 5x5 convolution with 6 output maps. Only the stride is set
    // explicitly; padding and other options are left untouched.
    IConvolutionLayer* conv1 = net->addConvolutionNd(*data, 6, convKernel, weightMap["conv1.weight"], weightMap["conv1.bias"]);
    assert(conv1);
    conv1->setStrideNd(convStride);

    // ReLU applied to the convolution output.
    ITensor* conv1Out = conv1->getOutput(0);
    IActivationLayer* relu1 = net->addActivation(*conv1Out, ActivationType::kRELU);
    assert(relu1);

    // 2x2 pooling with stride 2x2. Note that the pooling type is kAVERAGE
    // (average pooling), not max pooling.
    ITensor* relu1Out = relu1->getOutput(0);
    IPoolingLayer* pool1 = net->addPoolingNd(*relu1Out, PoolingType::kAVERAGE, poolWindow);
    assert(pool1);
    pool1->setStrideNd(poolStride);

    // Stage 2: same pattern with 16 output maps.
    ITensor* pool1Out = pool1->getOutput(0);
    IConvolutionLayer* conv2 = net->addConvolutionNd(*pool1Out, 16, convKernel, weightMap["conv2.weight"], weightMap["conv2.bias"]);
    assert(conv2);
    conv2->setStrideNd(convStride);

    ITensor* conv2Out = conv2->getOutput(0);
    IActivationLayer* relu2 = net->addActivation(*conv2Out, ActivationType::kRELU);
    assert(relu2);

    ITensor* relu2Out = relu2->getOutput(0);
    IPoolingLayer* pool2 = net->addPoolingNd(*relu2Out, PoolingType::kAVERAGE, poolWindow);
    assert(pool2);
    pool2->setStrideNd(poolStride);

    // Classifier head: three fully connected layers (120 -> 84 -> OUTPUT_SIZE
    // outputs), with a ReLU after the first two.
    ITensor* pool2Out = pool2->getOutput(0);
    IFullyConnectedLayer* fc1 = net->addFullyConnected(*pool2Out, 120, weightMap["fc1.weight"], weightMap["fc1.bias"]);
    assert(fc1);

    ITensor* fc1Out = fc1->getOutput(0);
    IActivationLayer* relu3 = net->addActivation(*fc1Out, ActivationType::kRELU);
    assert(relu3);

    ITensor* relu3Out = relu3->getOutput(0);
    IFullyConnectedLayer* fc2 = net->addFullyConnected(*relu3Out, 84, weightMap["fc2.weight"], weightMap["fc2.bias"]);
    assert(fc2);

    ITensor* fc2Out = fc2->getOutput(0);
    IActivationLayer* relu4 = net->addActivation(*fc2Out, ActivationType::kRELU);
    assert(relu4);

    ITensor* relu4Out = relu4->getOutput(0);
    IFullyConnectedLayer* fc3 = net->addFullyConnected(*relu4Out, OUTPUT_SIZE, weightMap["fc3.weight"], weightMap["fc3.bias"]);
    assert(fc3);

    // Softmax over the last fully connected output.
    ITensor* fc3Out = fc3->getOutput(0);
    ISoftMaxLayer* prob = net->addSoftMax(*fc3Out);
    assert(prob);

    // Name the softmax output OUTPUT_BLOB_NAME (doInference looks the binding
    // up by this name) and mark it as the network output.
    ITensor* probOut = prob->getOutput(0);
    probOut->setName(OUTPUT_BLOB_NAME);
    net->markOutput(*probOut);

    // Build settings: the batch size only enters here, not in the layer
    // definitions above. Workspace limit is 1 << 20; see
    // https://developer.nvidia.com/blog/speeding-up-deep-learning-inference-using-tensorrt/
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*net, *config);

    // The network definition is no longer used once the engine has been built.
    net->destroy();

    // Release the host buffers that hold the loaded weight values.
    for (auto& entry : weightMap)
    {
        free(const_cast<void*>(entry.second.values));
    }

    return engine;
}

Outros processos incluem

// Builds the LeNet engine through the network API and serializes it.
//
// maxBatchSize: forwarded unchanged to createLenetEngine().
// modelStream:  out-parameter; *modelStream receives the IHostMemory returned
//               by engine->serialize(). This function does not destroy it.
//               NOTE(review): the caller presumably has to release it - confirm.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // Create the builder and its configuration. Both are checked the same way
    // the engine is checked below, before they are dereferenced.
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    IBuilderConfig* config = builder->createBuilderConfig();
    assert(config != nullptr);

    // Populate the network, mark its output and build the engine.
    ICudaEngine* engine = createLenetEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine into the caller-provided output pointer.
    (*modelStream) = engine->serialize();

    // Close everything down. The builder config was previously never
    // destroyed (leak); it is released here, before the builder that created it.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

2.2. Modelo de Raciocínio

A essência é ./lenet -d.
As etapas principais incluem: leitura do arquivo do mecanismo (engine), preparação dos dados de entrada, execução da inferência e liberação de memória.
O código relevante após a leitura do arquivo é o seguinte

// 准备输入数据
// 虽然代码注释中写了要减去均值，但其实这个是固定输入
// Subtract mean from image
float data[INPUT_H * INPUT_W];
for (int i = 0; i < INPUT_H * INPUT_W; i++)
    data[i] = 1.0;

// 模型推理所需的各种类创建
// IRuntime 不知道是啥
// ICudaEngine其实就是一个优化方案（即Engine文件解析），读取的Engine文件保存在trtModelStream中
// IExecutionContext 是推理上下文，管理整个推理流程
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);

// 具体执行
// 模型结果就保存在 prob 数组中
// 其他一些代码是inference time计算
// Run inference
float prob[OUTPUT_SIZE];
for (int i = 0; i < 1000; i++) {
    
    
    auto start = std::chrono::system_clock::now();
    doInference(*context, data, prob, 1);
    auto end = std::chrono::system_clock::now();
    //std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
}

// 推理结束，释放资源
// Destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();

Como pode ser visto no código acima, a inferência do modelo é de fato realizada na função doInference
- Os dados de entrada incluem o contexto (que dá acesso ao mecanismo/engine, etc.), o buffer de entrada, o buffer de saída e o tamanho do lote

// Runs one inference pass for a batch on the GPU.
//
// context:   execution context; its engine must expose exactly two bindings,
//            named INPUT_BLOB_NAME and OUTPUT_BLOB_NAME.
// input:     host buffer holding batchSize * INPUT_H * INPUT_W floats.
// output:    host buffer receiving batchSize * OUTPUT_SIZE floats.
// batchSize: number of samples in the batch.
//
// Device buffers and the CUDA stream are created and released on every call.
// CUDA calls are wrapped in CHECK (defined elsewhere in the file).
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    // The engine the context was created from.
    const ICudaEngine& engine = context.getEngine();

    // Pointers to the input and output device buffers handed to the engine.
    // The engine requires exactly getNbBindings() buffers; here that is 2.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // Binding slots are looked up by tensor name. Validate them before they
    // are used to index buffers[], so an unknown name cannot write out of bounds.
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Byte sizes of the input and output batches.
    const auto inputBytes = batchSize * INPUT_H * INPUT_W * sizeof(float);
    const auto outputBytes = batchSize * OUTPUT_SIZE * sizeof(float);

    // Allocate the device buffers.
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Stream on which the copies and the inference are queued.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Host -> device: copy the input batch into the input binding.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));

    // Queue the inference on the same stream; the result was previously discarded.
    const bool enqueued = context.enqueue(batchSize, buffers, stream, nullptr);
    assert(enqueued);
    static_cast<void>(enqueued);  // keeps NDEBUG builds free of unused-variable warnings

    // Device -> host: copy the result out of the output binding.
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));

    // The copies above are asynchronous, so wait for the stream to finish
    // before the caller reads output. Now checked like the other CUDA calls.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers.
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

Primeiros passos com TensorRT (6) tensorrtx lenet

Artigo Diretório

0. Prefácio

1. Processo de uso

2. Análise do código fonte

2.1. Criar arquivo do motor

2.2. Modelo de Raciocínio

Acho que você gosta