TensorRT basics + converting the Torch version of LeNet to TRT

Official documentation

API documentation

0. Installation

1. Install TensorRT

Download the .deb package from the official website, making sure it matches your CUDA version.

sudo dpkg -i nv-tensorrt-repo-ubuntu1604-cuda10.0-trt7.0.0.11-ga-20191216_1-1_amd64.deb
sudo apt update
sudo apt install tensorrt

Engine plan compatibility depends on the GPU's compute capability and the TensorRT version; it does not depend on the CUDA and cuDNN versions.
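
Since the plan is tied to the compute capability, it is worth checking what your GPU reports before building an engine. One quick way, as an illustrative sketch using PyTorch (which this tutorial already uses):

import torch

# prints e.g. (7, 5); an engine built on a GPU with one compute
# capability generally cannot be deserialized on a GPU with another
print(torch.cuda.get_device_capability(0))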

2. Install OpenCV

sudo apt-get update
sudo apt install libopencv-dev

apt-get install tensorrt reported an error:

https://github.com/NVIDIA/TensorRT/issues/792

 tensorrt : Depends: libnvinfer7 (= 7.0.0-1+cuda10.0) but 7.2.2-1+cuda11.1 is to be installed
            Depends: libnvinfer-plugin7 (= 7.0.0-1+cuda10.0) but 7.2.2-1+cuda11.1 is to be installed
            Depends: libnvparsers7 (= 7.0.0-1+cuda10.0) but 7.2.2-1+cuda11.1 is to be installed
            Depends: libnvonnxparsers7 (= 7.0.0-1+cuda10.0) but 7.2.2-1+cuda11.1 is to be installed
            Depends: libnvinfer-bin (= 7.0.0-1+cuda10.0) but it is not going to be installed
            Depends: libnvinfer-dev (= 7.0.0-1+cuda10.0) but 7.2.2-1+cuda11.1 is to be installed
            Depends: libnvinfer-plugin-dev (= 7.0.0-1+cuda10.0) but 7.2.2-1+cuda11.1 is to be installed
            Depends: libnvparsers-dev (= 7.0.0-1+cuda10.0) but 7.2.2-1+cuda11.1 is to be installed
            Depends: libnvonnxparsers-dev (= 7.0.0-1+cuda10.0) but 7.2.2-1+cuda11.1 is to be installed
            Depends: libnvinfer-samples (= 7.0.0-1+cuda10.0) but it is not going to be installed
            Depends: libnvinfer-doc (= 7.0.0-1+cuda10.0) but it is not going to be installed

mv /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/nvidia-ml.list.bak

Then simply install tensorrt with apt-get again.

1. The optimization workflow

Building the engine: the network structure, written with the C++ API or imported from a third-party format, is described by an INetworkDefinition; the builder loads the model weights, applies a number of optimizations, and then serializes the model into an engine.

Inference: simply deserialize the engine, create an execution context, and run inference.
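
A rough Python sketch of this inference phase, assuming the TensorRT Python bindings are installed (API of the TensorRT 7 era; the full C++ version appears in section 2.4):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# deserialize a previously serialized engine plan from disk
runtime = trt.Runtime(TRT_LOGGER)
with open("lenet.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

# the execution context holds the per-inference state; once input and
# output device buffers are bound, it launches the CUDA kernels
context = engine.create_execution_context()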

During optimization, Conv, BN, and ReLU are fused vertically, and layers with the same structure but different weights are merged horizontally into a single wider layer, which reduces the number of CUDA kernel launches.

2. Torch version of LeNet to TRT

2.1 The Torch version of the code:

lenet.py

# coding:utf-8
import torch
from torch import nn
from torch.nn import functional as F


class Lenet5(nn.Module):
    """
    for cifar10 dataset.
    """

    def __init__(self):
        super(Lenet5, self).__init__()

        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)
        self.pool1 = nn.AvgPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # print('input: ', x.shape)
        x = F.relu(self.conv1(x))
        # print('conv1', x.shape)
        x = self.pool1(x)
        # print('pool1: ', x.shape)
        x = F.relu(self.conv2(x))
        # print('conv2', x.shape)
        x = self.pool1(x)
        # print('pool2', x.shape)
        x = x.view(x.size(0), -1)
        # print('view: ', x.shape)
        x = F.relu(self.fc1(x))
        # print('fc1: ', x.shape)
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return x


def main():
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    print('cuda device count: ', torch.cuda.device_count())
    torch.manual_seed(1234)
    net = Lenet5()
    net = net.to('cuda:0')
    net.eval()
    import time
    st_time = time.time()
    nums = 10000
    for i in range(nums):
        tmp = torch.ones(1, 1, 32, 32).to('cuda:0')
        out = net(tmp)
        # print('lenet out shape:', out.shape)
    print('lenet out:', out)
    end_time = time.time()
    print('==cost time{}'.format((end_time - st_time)))
    torch.save(net, "lenet5.pth")


if __name__ == '__main__':
    main()

This saves the model weights as a .pth file; the elapsed time is printed at the end of the run.

2.2 Saving the .pth as .onnx

This makes it easy to inspect the network structure (for example with a viewer such as Netron).

# coding:utf-8
import torch
from torch import nn
from torch.nn import functional as F

class Lenet5(nn.Module):
    """
    for cifar10 dataset.
    """

    def __init__(self):
        super(Lenet5, self).__init__()

        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)
        self.pool1 = nn.AvgPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # print('input: ', x.shape)
        x = F.relu(self.conv1(x))
        # print('conv1', x.shape)
        x = self.pool1(x)
        # print('pool1: ', x.shape)
        x = F.relu(self.conv2(x))
        # print('conv2', x.shape)
        x = self.pool1(x)
        # print('pool2', x.shape)
        x = x.view(x.size(0), -1)
        # print('view: ', x.shape)
        x = F.relu(self.fc1(x))
        # print('fc1: ', x.shape)
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return x

def main():
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    print('cuda device count: ', torch.cuda.device_count())
    torch.manual_seed(1234)
    net = Lenet5()
    net = net.to('cuda:0')
    net.eval()
    import time
    st_time = time.time()
    nums = 10000
    for i in range(nums):
        tmp = torch.ones(1, 1, 32, 32).to('cuda:0')
        out = net(tmp)
        # print('lenet out shape:', out.shape)
    print('lenet out:', out)
    end_time = time.time()
    print('==cost time{}'.format((end_time - st_time)))
    torch.save(net, "lenet5.pth")

def model_onnx():
    input = torch.ones(1, 1, 32, 32, dtype=torch.float32).cuda()
    model = Lenet5()
    model = model.cuda()
    torch.onnx.export(model, input, "./lenet.onnx", verbose=True)

if __name__ == '__main__':
    # main()
    model_onnx()

When converting to ONNX I ran into a few problems, and basically all of them were solved with this:


    torch.onnx.export(model,                 # model being run
                      input,                 # model input (or a tuple for multiple inputs)
                      "./xxxx.onnx",         # where to save the model
                      opset_version=10,      # the ONNX opset version to export to
                      verbose=False,         # print a human-readable graph if True
                      training=False,        # export in inference mode
                      do_constant_folding=True,  # pre-compute constant expressions
                      input_names=['input'],     # name the input tensor
                      output_names=['output']    # name the output tensor
                      )
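
To sanity-check the exported file beyond reading the verbose dump, a small sketch with the onnx package (assumed installed; Netron is another convenient way to view the graph):

import onnx

model = onnx.load("./lenet.onnx")
onnx.checker.check_model(model)                  # validate the graph structure
print(onnx.helper.printable_graph(model.graph))  # text dump of the layers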

2.3 Saving the .pth as .wts

This saves the model weights as a hex text file in key-value format.

inference.py

import torch
from torch import nn
from lenet5 import Lenet5
import os
import struct

def main():
    print('cuda device count: ', torch.cuda.device_count())
    net = torch.load('lenet5.pth')
    net = net.to('cuda:0')
    net.eval()
    #print('model: ', net)
    #print('state dict: ', net.state_dict()['conv1.weight'])
    tmp = torch.ones(1, 1, 32, 32).to('cuda:0')
    #print('input: ', tmp)
    out = net(tmp)
    print('lenet out:', out)

    f = open("lenet5.wts", 'w')
    print('==net.state_dict().keys():', net.state_dict().keys())
    f.write("{}\n".format(len(net.state_dict().keys())))
    for k, v in net.state_dict().items():
        print('key: ', k)
        print('value: ', v.shape)
        vr = v.reshape(-1).cpu().numpy()
        f.write("{} {}".format(k, len(vr)))
        for vv in vr:
            # print('=vv:', vv)
            f.write(" ")
            # print(struct.pack(">f", float(vv)).hex())#
            f.write(struct.pack(">f", float(vv)).hex())
        f.write("\n")
    print('==f:', f)
    f.close()

def test_struct():
    vv = 16
    print(struct.pack(">f", float(vv)))  #

if __name__ == '__main__':
    main()
    # test_struct()
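
The resulting .wts file is plain text: the first line is the number of tensors, then one line per tensor with its name, element count, and the values as big-endian float32 hex words. A small read-back sketch to verify the file (a hypothetical helper of mine, mirroring the C++ loadWeights in section 2.4):

import struct

def load_wts(path):
    """Parse a .wts file back into a {name: [float, ...]} dict."""
    weights = {}
    with open(path) as f:
        count = int(f.readline())
        for _ in range(count):
            parts = f.readline().split()
            name, n = parts[0], int(parts[1])
            # each value is 8 hex chars encoding a big-endian float32
            vals = [struct.unpack(">f", bytes.fromhex(h))[0] for h in parts[2:2 + n]]
            assert len(vals) == n
            weights[name] = vals
    return weights

w = load_wts("lenet5.wts")
print(len(w), "tensors; conv1.weight[0] =", w["conv1.weight"][0])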

2.4 Converting the .wts to a .engine and running inference with the .engine

lenet.cpp 

#include <map>
#include <chrono>
#include <fstream>
#include <iostream>
#include <cassert>
#include "NvInfer.h"
#include "logging.h"
#include "cuda_runtime_api.h"


static const int INPUT_H=32;
static const int INPUT_W=32;
static const int BATCH_SIZE=32;
static const int OUTPUT_SIZE=10;
static const int INFER_NUMS=10000;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;
static Logger gLogger;



#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)


std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob
        // allocate host memory for the blob (4 bytes per float32 value)
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

ICudaEngine* createLenetEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    // start defining the network; the 0U flag selects implicit-batch mode
    INetworkDefinition* network = builder->createNetworkV2(0U);
    ITensor* input = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});
    assert(input);

    std::map<std::string, Weights> weightMap = loadWeights("../lenet5.wts"); // load the weights into weightMap
    // std::cout<<weightMap["conv1.weight"]<<std::endl;  

    // convolution layer
    IConvolutionLayer* conv1 = network->addConvolution(*input, 6, DimsHW{5, 5}, weightMap["conv1.weight"], weightMap["conv1.bias"]);
    assert(conv1);
    // set the stride
    conv1->setStrideNd(DimsHW{1, 1});
    
    // activation layer
    IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    // pooling layer
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kAVERAGE, DimsHW{2, 2});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});

    // convolution layer
    IConvolutionLayer* conv2 = network->addConvolution(*pool1->getOutput(0), 16, DimsHW{5, 5}, weightMap["conv2.weight"], weightMap["conv2.bias"]);
    assert(conv2);
    // set the stride
    conv2->setStrideNd(DimsHW{1, 1});
    // activation layer
    IActivationLayer* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    // pooling layer
    IPoolingLayer* pool2 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kAVERAGE, DimsHW{2, 2});
    assert(pool2);
    pool2->setStrideNd(DimsHW{2, 2});

    // fully connected layer
    IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 120, weightMap["fc1.weight"], weightMap["fc1.bias"]);
    assert(fc1);
    // activation layer
    IActivationLayer* relu3 = network->addActivation(*fc1->getOutput(0), ActivationType::kRELU);
    assert(relu3);

    // fully connected layer
    IFullyConnectedLayer* fc2 = network->addFullyConnected(*relu3->getOutput(0), 84, weightMap["fc2.weight"], weightMap["fc2.bias"]);
    assert(fc2);
    // activation layer
    IActivationLayer* relu4 = network->addActivation(*fc2->getOutput(0), ActivationType::kRELU);
    assert(relu4);

    // fully connected layer
    IFullyConnectedLayer* fc3 = network->addFullyConnected(*relu4->getOutput(0), OUTPUT_SIZE, weightMap["fc3.weight"], weightMap["fc3.bias"]);
    assert(fc3);

    // softmax classification layer
    ISoftMaxLayer* prob = network->addSoftMax(*fc3->getOutput(0));
    assert(prob);
    prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*prob->getOutput(0));

    // build the engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // the network has been serialized into the engine, so it can be destroyed
    network->destroy();
    // free the host-side weight buffers
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
    // create the builder, the entry point for network construction
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // build the network layer by layer and create the engine
    ICudaEngine* engine = createLenetEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // serialize the engine
    (*modelStream) = engine->serialize();

    // destroy the objects
    engine->destroy();
    builder->destroy();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    // recover the engine from the context passed in
    const ICudaEngine& engine = context.getEngine();
    // there should be exactly two bindings: one input and one output
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // look up the binding indices of the input and output tensors
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // allocate device memory for the input and output tensors
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    // create a CUDA stream to manage concurrent copies and computation
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // host to device: input holds the data in host memory; buffers[inputIndex] is the device buffer for the input
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    // launch the CUDA kernels and run inference asynchronously
    context.enqueue(batchSize, buffers, stream, nullptr);
    // device to host: buffers[outputIndex] is the device buffer holding the model output; output is the host buffer
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // wait for the stream to finish
    cudaStreamSynchronize(stream);

    // release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
int main(int argc, char ** argv)  
{   
    if (argc!=2)
    {   
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./lenet -s   // serialize model to plan file" << std::endl;
        std::cerr << "./lenet -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }
    
    // serialize the model to a .engine file
    if (std::string(argv[1]) == "-s")
    {
        IHostMemory* modelStream{nullptr}; // modelStream is a host memory block holding the serialized engine
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);
        // write it out as a .engine file (binary mode, since the plan is binary data)
        std::ofstream p("lenet.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "can not open plan file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        // destroy the object
        modelStream->destroy();
    }
    else if (std::string(argv[1])=="-d")
    {      
        char *trtModelStream{nullptr};
        size_t size{0};

        std::ifstream file("lenet.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }
        else
        {
            return -1;
        }
        // dummy input data
        float data[INPUT_H * INPUT_W];
        for (int i = 0; i < INPUT_W * INPUT_H; i++)
        {
            data[i] = 1.0;
        }
        // create the IRuntime object
        IRuntime* runtime = createInferRuntime(gLogger);
        assert(runtime != nullptr);
        ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
        assert(engine != nullptr);
        delete[] trtModelStream; // the serialized buffer is no longer needed

        // create the execution context, used mainly to launch the CUDA kernels in doInference
        IExecutionContext* context = engine->createExecutionContext();
        assert(context != nullptr);

        // run inference INFER_NUMS (10000) times and store the result
        float prob[OUTPUT_SIZE];
        auto start = std::chrono::system_clock::now(); // start time
        for (int i=0;i<INFER_NUMS;i++)
        {               
            // std::cout<<"data[i]:"<<data[i]<<std::endl;
            doInference(*context, data, prob, 1);            
        }
        auto end = std::chrono::system_clock::now(); // end time
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        
        context->destroy();
        engine->destroy();
        runtime->destroy();

        std::cout<<"prob:";
        for (int i=0;i<OUTPUT_SIZE;i++)
        {         
            std::cout<<prob[i]<<",";
        }   

    }
    else
    {
       return -1;
    }

    return 0;
}

CMakeLists.txt

cmake_minimum_required(VERSION 2.6)

project(lenet)

add_definitions(-std=c++11)

set(TARGET_NAME "lenet")

option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu)
link_directories(/usr/lib/x86_64-linux-gnu)

FILE(GLOB SRC_FILES ${PROJECT_SOURCE_DIR}/lenet.cpp ${PROJECT_SOURCE_DIR}/include/*.h)

add_executable(${TARGET_NAME} ${SRC_FILES})
target_link_libraries(${TARGET_NAME} nvinfer)
target_link_libraries(${TARGET_NAME} cudart)

add_definitions(-O2 -pthread)

Build with the usual CMake flow (create a build directory, run cmake .. and make), then:

./lenet -s serializes the model to the .engine file

./lenet -d deserializes the .engine file and runs inference

Inference time:

The inference is at least four times faster than Torch, while the results are essentially the same.


Reprinted from: blog.csdn.net/fanzonghao/article/details/112675500