篇章六:TensorRT部署自定义CNN模型
目录
PS:纯粹为学习分享经验,不参与商用价值运作,若有侵权请及时联系!!!
深度学习模型部署TensorRT加速(七):TensorRT部署一个图像分类模型
深度学习模型部署TensorRT加速(八):TensorRT部署目标检测YOLO模型
深度学习模型部署TensorRT加速(九):TensorRT部署Transformer模型
本章内容:TensorRT部署自定义CNN模型
目前比较成熟的分类模型有VGG(Visual Geometry Group)、ResNet(Residual Network)、InceptionNet、MobileNet、DenseNet、SqueezeNet等,可以根据不同的应用场景和需求选择最适合的模型进行分类部署。此类模型一般都有成熟的封装库,可以直接在Torch上调用;针对不同模型,官方也都给出了操作实例,可以直接搜索相关经典模型的代码进行解析。
如果需要进行更详细的开发和进一步的研究,则需要自行手动搭建网络层:根据不同需求设置具体的神经网络层,也可以直接自行搭建后再封装加速。以下是自定义CNN的参考代码:
Python代码(以MobileNetV3为例):
import os
import sys
import struct
import argparse
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
# Batch size passed to API_to_model when serializing (-s); also the batch
# factor used to size the dummy input in the -d path.
BATCH_SIZE = 1
# Height and width of the input tensor registered in create_engine_*.
INPUT_H = 224
INPUT_W = 224
# Number of outputs of the final fully connected layer.
OUTPUT_SIZE = 1000
# Multiplier applied to the fully connected widths inside add_se_layer.
BS = 1
# Names given to the network input tensor and to the marked output tensor.
INPUT_BLOB_NAME = "data"
OUTPUT_BLOB_NAME = "prob"
# Epsilon handed to add_batch_norm_2d by every caller in this script.
EPS = 1e-5
# Weight file read by both create_engine_small and create_engine_large.
WEIGHT_PATH_SMALL = "./mobilenetv3.wts"
# Serialized engine written by API_to_model and read back in the -d path.
ENGINE_PATH = "./mobilenetv3.engine"
# Logger shared by the trt.Builder and the trt.Runtime.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
# Load the model weights from a text weight file.
def load_weights(file):
    """Parse a ``.wts`` text file into a ``{name: float32 array}`` map.

    The first non-empty line holds the number of weight blobs.  Each
    following line is ``<name> <count> <hex> <hex> ...`` where every hex
    token encodes one big-endian IEEE-754 float32.

    Args:
        file: Path of the weight file.

    Returns:
        dict mapping each blob name to a 1-D ``np.float32`` array with
        ``count`` elements.

    Raises:
        AssertionError: If the file does not exist, or the blob count /
            per-blob element count does not match the file contents.
    """
    print(f"Loading weights: {file}")
    assert os.path.exists(file), 'Unable to load weight file.'

    # Blank lines (e.g. a trailing newline at end of file) carry no data
    # and must not count as blobs.
    with open(file, "r") as f:
        lines = [stripped for stripped in (line.strip() for line in f) if stripped]

    count = int(lines[0])
    assert count == len(lines) - 1

    unpack_float = struct.Struct(">f").unpack
    weight_map = {}
    for line in lines[1:count + 1]:
        # split() with no argument also tolerates repeated spaces or tabs.
        name, cur_count, *hex_values = line.split()
        assert int(cur_count) == len(hex_values)
        # unpack() always returns a tuple; take element 0 so the result is
        # a flat (N,) array rather than an (N, 1) column of 1-tuples.
        weight_map[name] = np.array(
            [unpack_float(bytes.fromhex(token))[0] for token in hex_values],
            dtype=np.float32,
        )
    return weight_map
# Add a 2-D batch-normalization layer to the TensorRT network.
def add_batch_norm_2d(network, weight_map, input, layer_name, eps):
    """Express batch normalization as a single per-channel scale layer.

    Reads ``<layer_name>.weight``, ``.bias``, ``.running_mean`` and
    ``.running_var`` from ``weight_map`` and folds them into the ``scale``
    and ``shift`` arrays of ``network.add_scale``.

    Args:
        network: Network the scale layer is added to.
        weight_map: Mapping from weight names to numpy arrays.
        input: Tensor fed into the scale layer.
        layer_name: Prefix of the four batch-norm entries in ``weight_map``.
        eps: Value added to the running variance before the square root.

    Returns:
        The layer returned by ``network.add_scale``.
    """
    weight = weight_map[layer_name + ".weight"]
    bias = weight_map[layer_name + ".bias"]
    running_mean = weight_map[layer_name + ".running_mean"]
    running_var = weight_map[layer_name + ".running_var"]

    std = np.sqrt(running_var + eps)
    return network.add_scale(
        input=input,
        mode=trt.ScaleMode.CHANNEL,
        shift=-running_mean / std * weight + bias,
        scale=weight / std,
    )
# Add an H-Swish activation to the TensorRT network.
def add_h_swish(network, input):
    """Append ``input * hard_sigmoid(input)`` to ``network``.

    The hard sigmoid is configured with ``alpha = 1/6`` and ``beta = 0.5``.

    Args:
        network: Network the layers are added to.
        input: Tensor to activate.

    Returns:
        The element-wise product layer.
    """
    gate = network.add_activation(input, type=trt.ActivationType.HARD_SIGMOID)
    assert gate
    gate.alpha, gate.beta = 1.0 / 6.0, 0.5

    product = network.add_elementwise(
        input, gate.get_output(0), trt.ElementWiseOperation.PROD)
    assert product
    return product
def conv_bn_h_swish(network, weight_map, input, outch, ksize, s, g, lname):
    """Add a bias-free convolution, batch norm and H-Swish in sequence.

    Args:
        network: Network the layers are added to.
        weight_map: Mapping from weight names to numpy arrays.
        input: Input tensor.
        outch: Number of convolution output maps.
        ksize: Square kernel size; padding is ``(ksize - 1) // 2``.
        s: Stride applied in both spatial directions.
        g: Number of convolution groups.
        lname: Name prefix; the convolution reads ``lname + "0.weight"`` and
            the batch norm reads the ``lname + "1"`` entries.

    Returns:
        The H-Swish product layer.
    """
    conv = network.add_convolution(
        input=input,
        num_output_maps=outch,
        kernel_shape=(ksize, ksize),
        kernel=weight_map[lname + "0.weight"],
        bias=trt.Weights(),
    )
    assert conv

    pad = (ksize - 1) // 2
    conv.stride = (s, s)
    conv.padding = (pad, pad)
    conv.num_groups = g

    norm = add_batch_norm_2d(network, weight_map, conv.get_output(0), lname + "1", EPS)
    activated = add_h_swish(network, norm.get_output(0))
    assert activated
    return activated
def add_se_layer(network, weight_map, input, c, w, lname):
    """Add a squeeze-and-excitation block that re-weights ``input``.

    Pipeline: average-pool over a ``w x w`` window -> fully connected
    (``BS * c // 4`` outputs) -> ReLU -> fully connected (``BS * c``
    outputs) -> hard sigmoid -> element-wise product with ``input``.

    Args:
        network: Network the layers are added to.
        weight_map: Mapping from weight names to numpy arrays.
        input: Feature map to re-weight.
        c: Channel count used to size the two fully connected layers.
        w: Pooling window and stride, applied to both spatial dimensions.
        lname: Name prefix of the ``fc.0.*`` and ``fc.2.*`` weights.

    Returns:
        The element-wise product layer ``input * gate``.
    """
    squeeze = network.add_pooling(input=input,
                                  type=trt.PoolingType.AVERAGE,
                                  window_size=trt.DimsHW(w, w))
    assert squeeze
    squeeze.stride_nd = (w, w)

    fc_reduce = network.add_fully_connected(input=squeeze.get_output(0),
                                            num_outputs=BS * c // 4,
                                            kernel=weight_map[lname + "fc.0.weight"],
                                            bias=weight_map[lname + "fc.0.bias"])
    assert fc_reduce
    relu = network.add_activation(fc_reduce.get_output(0), type=trt.ActivationType.RELU)
    assert relu
    fc_expand = network.add_fully_connected(input=relu.get_output(0),
                                            num_outputs=BS * c,
                                            kernel=weight_map[lname + "fc.2.weight"],
                                            bias=weight_map[lname + "fc.2.bias"])
    assert fc_expand

    # The excitation is a hard sigmoid of the FC output, not an h-swish of
    # it: the gate must scale the block *input*.  Multiplying the FC output
    # by its own gate would return a pooled tensor and drop the feature map.
    gate = network.add_activation(fc_expand.get_output(0),
                                  type=trt.ActivationType.HARD_SIGMOID)
    assert gate
    gate.alpha = 1.0 / 6.0
    gate.beta = 0.5

    se = network.add_elementwise(input, gate.get_output(0),
                                 trt.ElementWiseOperation.PROD)
    assert se
    return se
def conv_seq_1(network, weight_map, input, output, hdim, k, s, use_se, use_hs, w, lname):
    """Add the two-convolution body used when no expansion conv is needed.

    Layers: ``k x k`` grouped conv (``hdim`` maps, ``hdim`` groups) -> BN ->
    H-Swish or ReLU -> optional SE block -> ``1 x 1`` conv to ``output``
    maps -> BN.

    Args:
        network: Network the layers are added to.
        weight_map: Mapping from weight names to numpy arrays.
        input: Input tensor.
        output: Number of maps produced by the final ``1 x 1`` convolution.
        hdim: Number of maps (and groups) of the first convolution.
        k: Kernel size of the first convolution.
        s: Stride of the first convolution.
        use_se: Truthy to insert ``add_se_layer`` after the activation.
        use_hs: Truthy for H-Swish, falsy for ReLU.
        w: Spatial size forwarded to ``add_se_layer``.
        lname: Name prefix for the weights of this sequence.

    Returns:
        The final batch-norm (scale) layer.
    """
    pad = (k - 1) // 2
    grouped = network.add_convolution(
        input=input,
        num_output_maps=hdim,
        kernel_shape=(k, k),
        kernel=weight_map[lname + "0.weight"],
        bias=trt.Weights(),
    )
    assert grouped
    grouped.stride = (s, s)
    grouped.padding = (pad, pad)
    grouped.num_groups = hdim
    grouped_bn = add_batch_norm_2d(network, weight_map, grouped.get_output(0), lname + "1", EPS)

    if use_hs:
        activation = add_h_swish(network, grouped_bn.get_output(0))
    else:
        activation = network.add_activation(grouped_bn.get_output(0),
                                            type=trt.ActivationType.RELU)
    features = activation.get_output(0)

    if use_se:
        features = add_se_layer(
            network, weight_map, features, hdim, w, lname + "3.").get_output(0)

    pointwise = network.add_convolution(
        input=features,
        num_output_maps=output,
        kernel_shape=(1, 1),
        kernel=weight_map[lname + "4.weight"],
        bias=trt.Weights(),
    )
    pointwise_bn = add_batch_norm_2d(network, weight_map, pointwise.get_output(0), lname + "5", EPS)
    assert pointwise_bn
    return pointwise_bn
def conv_seq_2(network, weight_map, input, output, hdim, k, s, use_se, use_hs, w, lname):
    """Add the three-convolution body that starts with a ``1 x 1`` conv.

    Layers: ``1 x 1`` conv to ``hdim`` maps -> BN -> activation ->
    ``k x k`` grouped conv (``hdim`` groups) -> BN -> optional SE block ->
    activation -> ``1 x 1`` conv to ``output`` maps -> BN.

    Args:
        network: Network the layers are added to.
        weight_map: Mapping from weight names to numpy arrays.
        input: Input tensor.
        output: Number of maps produced by the final ``1 x 1`` convolution.
        hdim: Number of maps of the first two convolutions.
        k: Kernel size of the grouped convolution.
        s: Stride of the grouped convolution.
        use_se: Truthy to insert ``add_se_layer`` after the second BN.
        use_hs: Truthy for H-Swish, falsy for ReLU (both activations).
        w: Spatial size forwarded to ``add_se_layer``.
        lname: Name prefix for the weights of this sequence.

    Returns:
        The final batch-norm (scale) layer.
    """
    def activate(tensor):
        # Both activation sites in this sequence follow the same use_hs switch.
        if use_hs:
            layer = add_h_swish(network, tensor)
        else:
            layer = network.add_activation(tensor, type=trt.ActivationType.RELU)
        return layer.get_output(0)

    pad = (k - 1) // 2

    expand = network.add_convolution(
        input=input,
        num_output_maps=hdim,
        kernel_shape=(1, 1),
        kernel=weight_map[lname + "0.weight"],
        bias=trt.Weights(),
    )
    expand_bn = add_batch_norm_2d(network, weight_map, expand.get_output(0), lname + "1", EPS)
    expanded = activate(expand_bn.get_output(0))

    grouped = network.add_convolution(
        input=expanded,
        num_output_maps=hdim,
        kernel_shape=(k, k),
        kernel=weight_map[lname + "3.weight"],
        bias=trt.Weights(),
    )
    grouped.stride = (s, s)
    grouped.padding = (pad, pad)
    grouped.num_groups = hdim
    grouped_bn = add_batch_norm_2d(network, weight_map, grouped.get_output(0), lname + "4", EPS)

    if use_se:
        gated = add_se_layer(
            network, weight_map, grouped_bn.get_output(0), hdim, w, lname + "5.").get_output(0)
    else:
        gated = grouped_bn.get_output(0)
    features = activate(gated)

    project = network.add_convolution(
        input=features,
        num_output_maps=output,
        kernel_shape=(1, 1),
        kernel=weight_map[lname + "7.weight"],
        bias=trt.Weights(),
    )
    project_bn = add_batch_norm_2d(network, weight_map, project.get_output(0), lname + "8", EPS)
    assert project_bn
    return project_bn
def inverted_res(network, weight_map, input, lname, inch, outch, s, hidden, k, use_se, use_hs, w):
    """Add one inverted-residual block.

    ``conv_seq_1`` is used when ``inch == hidden``, otherwise
    ``conv_seq_2``.  When the stride is 1 and ``inch == outch`` the block
    input is added to the convolution branch.

    Args:
        network: Network the layers are added to.
        weight_map: Mapping from weight names to numpy arrays.
        input: Input tensor.
        lname: Name prefix of the block; ``"conv."`` is appended for the branch.
        inch: Number of input channels.
        outch: Number of output channels.
        s: Stride of the block.
        hidden: Hidden width forwarded as ``hdim``.
        k: Kernel size forwarded to the branch.
        use_se: Forwarded to the branch.
        use_hs: Forwarded to the branch.
        w: Forwarded to the branch.

    Returns:
        The element-wise sum layer when the shortcut applies, otherwise the
        last layer of the convolution branch.
    """
    build_branch = conv_seq_1 if inch == hidden else conv_seq_2
    branch = build_branch(network, weight_map, input, outch, hidden, k, s,
                          use_se, use_hs, w, lname + "conv.")

    if s == 1 and inch == outch:
        shortcut = network.add_elementwise(
            input, branch.get_output(0), trt.ElementWiseOperation.SUM)
        assert shortcut
        return shortcut
    return branch
def create_engine_small(max_batch_size, builder, config, dt):
    """Build the small-variant engine layer by layer with the TensorRT API.

    Args:
        max_batch_size: Value assigned to ``builder.max_batch_size``.
        builder: ``trt.Builder`` used to create and build the network.
        config: Builder config passed to ``builder.build_engine``.
        dt: TensorRT data type of the network input.

    Returns:
        Whatever ``builder.build_engine`` returns for the assembled network.
    """
    weight_map = load_weights(WEIGHT_PATH_SMALL)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    stem = conv_bn_h_swish(network, weight_map, data, 16, 3, 2, 1, "features.0.")

    # One row per inverted-residual block "features.<i>." (i starts at 1):
    # (inch, outch, stride, hidden, kernel, use_se, use_hs, w)
    block_specs = (
        (16, 16, 2, 16, 3, 1, 0, 56),
        (16, 24, 2, 72, 3, 0, 0, 28),
        (24, 24, 1, 88, 3, 0, 0, 28),
        (24, 40, 2, 96, 5, 1, 1, 14),
        (40, 40, 1, 240, 5, 1, 1, 14),
        (40, 40, 1, 240, 5, 1, 1, 14),
        (40, 48, 1, 120, 5, 1, 1, 14),
        (48, 48, 1, 144, 5, 1, 1, 14),
        (48, 96, 2, 288, 5, 1, 1, 7),
        (96, 96, 1, 576, 5, 1, 1, 7),
        (96, 96, 1, 576, 5, 1, 1, 7),
    )
    tensor = stem.get_output(0)
    for index, spec in enumerate(block_specs, start=1):
        block = inverted_res(network, weight_map, tensor, f"features.{index}.", *spec)
        tensor = block.get_output(0)

    ew2 = conv_bn_h_swish(network, weight_map, tensor, 576, 1, 1, 1, "conv.0.")
    se1 = add_se_layer(network, weight_map, ew2.get_output(0), 576, 7, "conv.1.")

    pool1 = network.add_pooling(input=se1.get_output(0),
                                type=trt.PoolingType.AVERAGE,
                                window_size=trt.DimsHW(7, 7))
    assert pool1
    pool1.stride_nd = (7, 7)
    sw1 = add_h_swish(network, pool1.get_output(0))

    fc1 = network.add_fully_connected(input=sw1.get_output(0),
                                      num_outputs=1280,
                                      kernel=weight_map["classifier.0.weight"],
                                      bias=weight_map["classifier.0.bias"])
    assert fc1
    bn1 = add_batch_norm_2d(network, weight_map, fc1.get_output(0), "classifier.1", EPS)
    sw2 = add_h_swish(network, bn1.get_output(0))

    fc2 = network.add_fully_connected(input=sw2.get_output(0),
                                      num_outputs=OUTPUT_SIZE,
                                      kernel=weight_map["classifier.3.weight"],
                                      bias=weight_map["classifier.3.bias"])
    bn2 = add_batch_norm_2d(network, weight_map, fc2.get_output(0), "classifier.4", EPS)
    sw3 = add_h_swish(network, bn2.get_output(0))

    sw3.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(sw3.get_output(0))

    # Build Engine
    builder.max_batch_size = max_batch_size
    # The engine is built from `config`, so the workspace limit has to be
    # set there; a value set on the builder is not part of this build.
    config.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)

    del network
    del weight_map

    return engine
def create_engine_large(max_batch_size, builder, config, dt):
    """Build the large-variant engine layer by layer with the TensorRT API.

    Args:
        max_batch_size: Value assigned to ``builder.max_batch_size``.
        builder: ``trt.Builder`` used to create and build the network.
        config: Builder config passed to ``builder.build_engine``.
        dt: TensorRT data type of the network input.

    Returns:
        Whatever ``builder.build_engine`` returns for the assembled network.
    """
    # NOTE(review): the large variant reads the same WEIGHT_PATH_SMALL file
    # as the small one; that path must hold large-model weights when this
    # function is used.
    weight_map = load_weights(WEIGHT_PATH_SMALL)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    stem = conv_bn_h_swish(network, weight_map, data, 16, 3, 2, 1, "features.0.")

    # One row per inverted-residual block "features.<i>." (i starts at 1):
    # (inch, outch, stride, hidden, kernel, use_se, use_hs, w)
    block_specs = (
        (16, 16, 1, 16, 3, 0, 0, 112),
        (16, 24, 2, 64, 3, 0, 0, 56),
        (24, 24, 1, 72, 3, 0, 0, 56),
        (24, 40, 2, 72, 5, 1, 0, 28),
        (40, 40, 1, 120, 5, 1, 0, 28),
        (40, 40, 1, 120, 5, 1, 0, 28),
        (40, 80, 2, 240, 3, 0, 1, 14),
        (80, 80, 1, 200, 3, 0, 1, 14),
        (80, 80, 1, 184, 3, 0, 1, 14),
        (80, 80, 1, 184, 3, 0, 1, 14),
        (80, 112, 1, 480, 3, 1, 1, 14),
        (112, 112, 1, 672, 3, 1, 1, 14),
        (112, 160, 1, 672, 5, 1, 1, 14),
        (160, 160, 2, 672, 5, 1, 1, 7),
        (160, 160, 1, 960, 5, 1, 1, 7),
    )
    tensor = stem.get_output(0)
    for index, spec in enumerate(block_specs, start=1):
        block = inverted_res(network, weight_map, tensor, f"features.{index}.", *spec)
        tensor = block.get_output(0)

    ew2 = conv_bn_h_swish(network, weight_map, tensor, 960, 1, 1, 1, "conv.0.")

    pool1 = network.add_pooling(input=ew2.get_output(0),
                                type=trt.PoolingType.AVERAGE,
                                window_size=trt.DimsHW(7, 7))
    assert pool1
    pool1.stride_nd = (7, 7)
    sw1 = add_h_swish(network, pool1.get_output(0))

    fc1 = network.add_fully_connected(input=sw1.get_output(0),
                                      num_outputs=1280,
                                      kernel=weight_map["classifier.0.weight"],
                                      bias=weight_map["classifier.0.bias"])
    assert fc1
    sw2 = add_h_swish(network, fc1.get_output(0))

    fc2 = network.add_fully_connected(input=sw2.get_output(0),
                                      num_outputs=OUTPUT_SIZE,
                                      kernel=weight_map["classifier.3.weight"],
                                      bias=weight_map["classifier.3.bias"])

    fc2.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(fc2.get_output(0))

    # Build Engine
    builder.max_batch_size = max_batch_size
    # The engine is built from `config`, so the workspace limit has to be
    # set there; a value set on the builder is not part of this build.
    config.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)

    del network
    del weight_map

    return engine
""" 将TensorRT的API调用转换为模型。 """
def API_to_model(max_batch_size, model_type):
    """Build an engine through the TensorRT API and write it to ENGINE_PATH.

    Args:
        max_batch_size: Forwarded to the engine-construction function.
        model_type: ``"small"`` selects ``create_engine_small``; any other
            value selects ``create_engine_large``.
    """
    builder = trt.Builder(TRT_LOGGER)
    config = builder.create_builder_config()

    create_engine = create_engine_small if model_type == "small" else create_engine_large
    engine = create_engine(max_batch_size, builder, config, trt.float32)
    assert engine

    with open(ENGINE_PATH, "wb") as f:
        f.write(engine.serialize())

    del engine
    del builder
    del config
class HostDeviceMem(object):
    """Pair of a host buffer and the device buffer allocated for it."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

    def __repr__(self):
        return str(self)
def allocate_buffers(engine):
    """Allocate one host/device buffer pair per engine binding.

    Each buffer holds ``trt.volume(binding shape) * engine.max_batch_size``
    elements of the binding's numpy dtype.

    Args:
        engine: Engine whose bindings are iterated.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: lists of
        ``HostDeviceMem`` for input and output bindings, the integer device
        addresses of all bindings in iteration order, and a new CUDA stream.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        element_count = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host memory plus a device allocation of equal size.
        host_buf = cuda.pagelocked_empty(element_count, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        bindings.append(int(device_buf))

        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_buf, device_buf))

    return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one inference pass on ``stream`` and return the host outputs.

    Args:
        context: Execution context used for ``execute_async``.
        bindings: Device addresses passed to ``execute_async``.
        inputs: Objects with ``host`` / ``device`` attributes to upload.
        outputs: Objects with ``host`` / ``device`` attributes to download.
        stream: CUDA stream all copies and the execution are queued on.
        batch_size: Batch size passed to ``execute_async``.

    Returns:
        List with the ``host`` buffer of every entry in ``outputs``.
    """
    # Upload every input.
    for buf in inputs:
        cuda.memcpy_htod_async(buf.device, buf.host, stream)

    # Queue the inference itself.
    context.execute_async(batch_size=batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)

    # Download every output.
    for buf in outputs:
        cuda.memcpy_dtoh_async(buf.host, buf.device, stream)

    # Wait until all queued work has finished before touching host data.
    stream.synchronize()

    host_results = []
    for buf in outputs:
        host_results.append(buf.host)
    return host_results
if __name__ == "__main__":
    # Command line: exactly one of -s (serialize) or -d (deserialize and
    # run) must be given; -t selects the model variant for -s.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", action='store_true')
    parser.add_argument("-d", action='store_true')
    parser.add_argument("-t", help='indicate small or large model')
    args = parser.parse_args()

    if not (args.s ^ args.d):
        # NOTE(review): the usage text names mobilenet_v2.py while the
        # weight/engine paths in this script are mobilenetv3 - confirm the
        # intended script name.
        print(
            "arguments not right!\n"
            "python mobilenet_v2.py -s   # serialize model to plan file\n"
            "python mobilenet_v2.py -d   # deserialize plan file and run inference"
        )
        sys.exit()

    if args.s:
        API_to_model(BATCH_SIZE, args.t)
    else:
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime

        with open(ENGINE_PATH, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32)
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        # Copy into the page-locked buffer instead of rebinding `.host` to
        # the pageable `data` array, so the asynchronous upload still reads
        # from the memory allocate_buffers prepared for it.
        np.copyto(inputs[0].host, data)

        trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

        print(f'Output: \n{trt_outputs[0][:10]}\n{trt_outputs[0][-10:]}')
同样地,根据相同的逻辑也可以写出C++代码(下面的示例搭建的是Inception v3网络,而不是上面的MobileNetV3):
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>
// Evaluate `status` once; if the result is non-zero, print it to std::cerr
// and abort the process. Wrapped in do { } while (0) so the macro behaves
// like a single statement.
#define CHECK(status) \
do\
{\
auto ret = (status);\
if (ret != 0)\
{\
std::cerr << "Cuda failure: " << ret << std::endl;\
abort();\
}\
} while (0)
// Fixed facts about the network: input height/width, number of output
// values, and the names given to the input and output tensors.
static const int INPUT_H = 299;
static const int INPUT_W = 299;
static const int OUTPUT_SIZE = 1000;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
using namespace nvinfer1;
// Logger instance handed to createInferBuilder and createInferRuntime.
static Logger gLogger;
// Load weights from files shared with TensorRT samples.
// The weight file is plain text: the first token is the number of blobs,
// followed for each blob by
//   [name] [size] <data x size in hex>
// Every hex value is read as one 32-bit word and stored unchanged; the blob
// is tagged as DataType::kFLOAT.
// The returned map owns the malloc'ed value buffers; the caller frees them.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        // Read name and element count of the blob
        std::string name;
        uint32_t size;
        input >> name >> std::dec >> size;

        // Load blob. Size the buffer by the element type (*val), not by the
        // pointer type: sizeof(val) is the pointer size and over-allocates
        // on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
        for (uint32_t x = 0; x < size; ++x)
        {
            input >> std::hex >> val[x];
        }

        Weights wt{DataType::kFLOAT, nullptr, 0};
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}
// Add batch normalization as one per-channel IScaleLayer.
// Reads lname + ".weight" / ".bias" / ".running_mean" / ".running_var" from
// weightMap, computes scale = gamma / sqrt(var + eps) and
// shift = beta - mean * gamma / sqrt(var + eps) with power fixed to 1, and
// stores the three new buffers back into weightMap under lname + ".scale",
// ".shift" and ".power".
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    float* scval = static_cast<float*>(malloc(sizeof(float) * len));
    float* shval = static_cast<float*>(malloc(sizeof(float) * len));
    float* pval = static_cast<float*>(malloc(sizeof(float) * len));

    // Fill all three per-channel arrays in a single pass.
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
        pval[i] = 1.0;
    }

    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};

    // Keep the buffers reachable through the map so they can be released
    // together with the loaded weights.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bn);
    return bn;
}
// Add a bias-free convolution (weights lname + "conv.weight"), a batch norm
// built from the lname + "bn" entries with eps = 1e-3, and a ReLU.
// s is the stride in both directions, p the padding. Returns the ReLU layer.
IActivationLayer* basicConv2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, DimsHW ksize, int s, DimsHW p, std::string lname) {
    const Weights noBias{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, ksize, weightMap[lname + "conv.weight"], noBias);
    assert(conv);
    conv->setStrideNd(DimsHW{s, s});
    conv->setPaddingNd(p);

    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + "bn", 1e-3);

    IActivationLayer* act = network->addActivation(*bn->getOutput(0), ActivationType::kRELU);
    assert(act);
    return act;
}
// Inception block with four parallel branches (1x1, 5x5, double 3x3 and
// average-pool + 1x1 projection) whose outputs are concatenated.
// pool_proj is the number of output maps of the pooling-branch projection.
IConcatenationLayer* inceptionA(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname,
                                int pool_proj) {
    // Shorthand for a stride-1 basicConv2d unit named lname + suffix.
    auto unit = [&](ITensor& in, int outch, DimsHW k, DimsHW pad, const char* suffix) {
        return basicConv2d(network, weightMap, in, outch, k, 1, pad, lname + suffix);
    };

    IActivationLayer* b1x1 = unit(input, 64, DimsHW{1, 1}, DimsHW{0, 0}, "branch1x1.");

    IActivationLayer* b5x5 = unit(input, 48, DimsHW{1, 1}, DimsHW{0, 0}, "branch5x5_1.");
    b5x5 = unit(*b5x5->getOutput(0), 64, DimsHW{5, 5}, DimsHW{2, 2}, "branch5x5_2.");

    IActivationLayer* b3x3dbl = unit(input, 64, DimsHW{1, 1}, DimsHW{0, 0}, "branch3x3dbl_1.");
    b3x3dbl = unit(*b3x3dbl->getOutput(0), 96, DimsHW{3, 3}, DimsHW{1, 1}, "branch3x3dbl_2.");
    b3x3dbl = unit(*b3x3dbl->getOutput(0), 96, DimsHW{3, 3}, DimsHW{1, 1}, "branch3x3dbl_3.");

    IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{1, 1});
    avgPool->setPaddingNd(DimsHW{1, 1});
    avgPool->setAverageCountExcludesPadding(false);
    IActivationLayer* bPool = unit(*avgPool->getOutput(0), pool_proj, DimsHW{1, 1}, DimsHW{0, 0}, "branch_pool.");

    ITensor* branches[] = {b1x1->getOutput(0), b5x5->getOutput(0), b3x3dbl->getOutput(0), bPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 4);
    assert(merged);
    return merged;
}
// Inception block with three branches (stride-2 3x3, double 3x3 ending in a
// stride-2 conv, and a stride-2 max pool) whose outputs are concatenated.
IConcatenationLayer* inceptionB(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    // Shorthand for a basicConv2d unit named lname + suffix.
    auto unit = [&](ITensor& in, int outch, DimsHW k, int stride, DimsHW pad, const char* suffix) {
        return basicConv2d(network, weightMap, in, outch, k, stride, pad, lname + suffix);
    };

    IActivationLayer* b3x3 = unit(input, 384, DimsHW{3, 3}, 2, DimsHW{0, 0}, "branch3x3.");

    IActivationLayer* b3x3dbl = unit(input, 64, DimsHW{1, 1}, 1, DimsHW{0, 0}, "branch3x3dbl_1.");
    b3x3dbl = unit(*b3x3dbl->getOutput(0), 96, DimsHW{3, 3}, 1, DimsHW{1, 1}, "branch3x3dbl_2.");
    b3x3dbl = unit(*b3x3dbl->getOutput(0), 96, DimsHW{3, 3}, 2, DimsHW{0, 0}, "branch3x3dbl_3.");

    IPoolingLayer* maxPool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
    assert(maxPool);
    maxPool->setStrideNd(DimsHW{2, 2});

    ITensor* branches[] = {b3x3->getOutput(0), b3x3dbl->getOutput(0), maxPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 3);
    assert(merged);
    return merged;
}
// Inception block with four branches built from 1x1 and factorized 1x7 / 7x1
// convolutions plus an average-pool projection; outputs are concatenated.
// c7 is the number of maps used inside the two 7x7 branches.
IConcatenationLayer* inceptionC(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname,
                                int c7) {
    // Shorthand for a stride-1 basicConv2d unit named lname + suffix.
    auto unit = [&](ITensor& in, int outch, DimsHW k, DimsHW pad, const char* suffix) {
        return basicConv2d(network, weightMap, in, outch, k, 1, pad, lname + suffix);
    };

    IActivationLayer* b1x1 = unit(input, 192, DimsHW{1, 1}, DimsHW{0, 0}, "branch1x1.");

    IActivationLayer* b7x7 = unit(input, c7, DimsHW{1, 1}, DimsHW{0, 0}, "branch7x7_1.");
    b7x7 = unit(*b7x7->getOutput(0), c7, DimsHW{1, 7}, DimsHW{0, 3}, "branch7x7_2.");
    b7x7 = unit(*b7x7->getOutput(0), 192, DimsHW{7, 1}, DimsHW{3, 0}, "branch7x7_3.");

    IActivationLayer* b7x7dbl = unit(input, c7, DimsHW{1, 1}, DimsHW{0, 0}, "branch7x7dbl_1.");
    b7x7dbl = unit(*b7x7dbl->getOutput(0), c7, DimsHW{7, 1}, DimsHW{3, 0}, "branch7x7dbl_2.");
    b7x7dbl = unit(*b7x7dbl->getOutput(0), c7, DimsHW{1, 7}, DimsHW{0, 3}, "branch7x7dbl_3.");
    b7x7dbl = unit(*b7x7dbl->getOutput(0), c7, DimsHW{7, 1}, DimsHW{3, 0}, "branch7x7dbl_4.");
    b7x7dbl = unit(*b7x7dbl->getOutput(0), 192, DimsHW{1, 7}, DimsHW{0, 3}, "branch7x7dbl_5.");

    IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{1, 1});
    avgPool->setPaddingNd(DimsHW{1, 1});
    avgPool->setAverageCountExcludesPadding(false);
    IActivationLayer* bPool = unit(*avgPool->getOutput(0), 192, DimsHW{1, 1}, DimsHW{0, 0}, "branch_pool.");

    ITensor* branches[] = {b1x1->getOutput(0), b7x7->getOutput(0), b7x7dbl->getOutput(0), bPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 4);
    assert(merged);
    return merged;
}
// Inception block with three branches (1x1 -> stride-2 3x3,
// 1x1 -> 1x7 -> 7x1 -> stride-2 3x3, and a stride-2 max pool) whose outputs
// are concatenated.
IConcatenationLayer* inceptionD(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    // Shorthand for a basicConv2d unit named lname + suffix.
    auto unit = [&](ITensor& in, int outch, DimsHW k, int stride, DimsHW pad, const char* suffix) {
        return basicConv2d(network, weightMap, in, outch, k, stride, pad, lname + suffix);
    };

    IActivationLayer* b3x3 = unit(input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, "branch3x3_1.");
    b3x3 = unit(*b3x3->getOutput(0), 320, DimsHW{3, 3}, 2, DimsHW{0, 0}, "branch3x3_2.");

    IActivationLayer* b7x7x3 = unit(input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, "branch7x7x3_1.");
    b7x7x3 = unit(*b7x7x3->getOutput(0), 192, DimsHW{1, 7}, 1, DimsHW{0, 3}, "branch7x7x3_2.");
    b7x7x3 = unit(*b7x7x3->getOutput(0), 192, DimsHW{7, 1}, 1, DimsHW{3, 0}, "branch7x7x3_3.");
    b7x7x3 = unit(*b7x7x3->getOutput(0), 192, DimsHW{3, 3}, 2, DimsHW{0, 0}, "branch7x7x3_4.");

    IPoolingLayer* maxPool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3});
    assert(maxPool);
    maxPool->setStrideNd(DimsHW{2, 2});

    ITensor* branches[] = {b3x3->getOutput(0), b7x7x3->getOutput(0), maxPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 3);
    assert(merged);
    return merged;
}
// Inception block with four branches; the 3x3 and double-3x3 branches each
// end in a parallel 1x3 / 3x1 pair that is concatenated before the final
// four-way concatenation with the 1x1 and average-pool branches.
IConcatenationLayer* inceptionE(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname) {
    // Shorthand for a stride-1 basicConv2d unit named lname + suffix.
    auto unit = [&](ITensor& in, int outch, DimsHW k, DimsHW pad, const char* suffix) {
        return basicConv2d(network, weightMap, in, outch, k, 1, pad, lname + suffix);
    };

    IActivationLayer* b1x1 = unit(input, 320, DimsHW{1, 1}, DimsHW{0, 0}, "branch1x1.");

    // 3x3 branch: 1x1 stem, then a 1x3 / 3x1 split.
    IActivationLayer* stem3x3 = unit(input, 384, DimsHW{1, 1}, DimsHW{0, 0}, "branch3x3_1.");
    IActivationLayer* split3x3a = unit(*stem3x3->getOutput(0), 384, DimsHW{1, 3}, DimsHW{0, 1}, "branch3x3_2a.");
    IActivationLayer* split3x3b = unit(*stem3x3->getOutput(0), 384, DimsHW{3, 1}, DimsHW{1, 0}, "branch3x3_2b.");
    ITensor* pair3x3[] = {split3x3a->getOutput(0), split3x3b->getOutput(0)};
    IConcatenationLayer* cat3x3 = network->addConcatenation(pair3x3, 2);
    assert(cat3x3);

    // Double-3x3 branch: 1x1, 3x3, then a 1x3 / 3x1 split.
    IActivationLayer* stemDbl = unit(input, 448, DimsHW{1, 1}, DimsHW{0, 0}, "branch3x3dbl_1.");
    stemDbl = unit(*stemDbl->getOutput(0), 384, DimsHW{3, 3}, DimsHW{1, 1}, "branch3x3dbl_2.");
    IActivationLayer* splitDblA = unit(*stemDbl->getOutput(0), 384, DimsHW{1, 3}, DimsHW{0, 1}, "branch3x3dbl_3a.");
    IActivationLayer* splitDblB = unit(*stemDbl->getOutput(0), 384, DimsHW{3, 1}, DimsHW{1, 0}, "branch3x3dbl_3b.");
    ITensor* pairDbl[] = {splitDblA->getOutput(0), splitDblB->getOutput(0)};
    IConcatenationLayer* catDbl = network->addConcatenation(pairDbl, 2);
    assert(catDbl);

    IPoolingLayer* avgPool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3});
    assert(avgPool);
    avgPool->setStrideNd(DimsHW{1, 1});
    avgPool->setPaddingNd(DimsHW{1, 1});
    avgPool->setAverageCountExcludesPadding(false);
    IActivationLayer* bPool = unit(*avgPool->getOutput(0), 192, DimsHW{1, 1}, DimsHW{0, 0}, "branch_pool.");

    ITensor* branches[] = {b1x1->getOutput(0), cat3x3->getOutput(0), catDbl->getOutput(0), bPool->getOutput(0)};
    IConcatenationLayer* merged = network->addConcatenation(branches, 4);
    assert(merged);
    return merged;
}
// Create the engine using only the layer API and not any parser.
// Loads "../inception.wts", assembles the network, builds the engine with
// `config`, then destroys the network and frees all host weight buffers.
// Returns whatever buildEngineWithConfig returns; the caller checks it.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../inception.wts");

    // Per-channel affine transform applied to the input before the first
    // convolution. These stack arrays stay alive until the engine is built
    // below, which is all the network needs.
    float shval[3] = {(0.485 - 0.5) / 0.5, (0.456 - 0.5) / 0.5, (0.406 - 0.5) / 0.5};
    float scval[3] = {0.229 / 0.5, 0.224 / 0.5, 0.225 / 0.5};
    float pval[3] = {1.0, 1.0, 1.0};
    Weights shift{DataType::kFLOAT, shval, 3};
    Weights scale{DataType::kFLOAT, scval, 3};
    Weights power{DataType::kFLOAT, pval, 3};
    IScaleLayer* scale1 = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale1);

    // Stem: plain convolutions and two stride-2 max pools.
    IActivationLayer* relu1 = basicConv2d(network, weightMap, *scale1->getOutput(0), 32, DimsHW{3, 3}, 2, DimsHW{0, 0}, "Conv2d_1a_3x3.");
    relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 32, DimsHW{3, 3}, 1, DimsHW{0, 0}, "Conv2d_2a_3x3.");
    relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 64, DimsHW{3, 3}, 1, DimsHW{1, 1}, "Conv2d_2b_3x3.");
    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    relu1 = basicConv2d(network, weightMap, *pool1->getOutput(0), 80, DimsHW{1, 1}, 1, DimsHW{0, 0}, "Conv2d_3b_1x1.");
    relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 192, DimsHW{3, 3}, 1, DimsHW{0, 0}, "Conv2d_4a_3x3.");
    pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});

    // Inception stages.
    auto cat1 = inceptionA(network, weightMap, *pool1->getOutput(0), "Mixed_5b.", 32);
    cat1 = inceptionA(network, weightMap, *cat1->getOutput(0), "Mixed_5c.", 64);
    cat1 = inceptionA(network, weightMap, *cat1->getOutput(0), "Mixed_5d.", 64);
    cat1 = inceptionB(network, weightMap, *cat1->getOutput(0), "Mixed_6a.");
    cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6b.", 128);
    cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6c.", 160);
    cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6d.", 160);
    cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6e.", 192);
    cat1 = inceptionD(network, weightMap, *cat1->getOutput(0), "Mixed_7a.");
    cat1 = inceptionE(network, weightMap, *cat1->getOutput(0), "Mixed_7b.");
    cat1 = inceptionE(network, weightMap, *cat1->getOutput(0), "Mixed_7c.");

    // Classifier head: 8x8 average pool followed by a 1000-way FC layer.
    IPoolingLayer* pool2 = network->addPoolingNd(*cat1->getOutput(0), PoolingType::kAVERAGE, DimsHW{8, 8});
    assert(pool2);
    IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(fc1);

    fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*fc1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory (loaded blobs and the buffers addBatchNorm2d
    // registered in the map).
    for (auto& mem : weightMap)
    {
        free((void*) (mem.second.values));
    }

    return engine;
}
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
// Create builder
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
// Create model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
assert(engine != nullptr);
// Serialize the engine
(*modelStream) = engine->serialize();
// Close everything down
engine->destroy();
builder->destroy();
config->destroy();
}
// Run one inference pass.
// Allocates device buffers for the input (batchSize * 3 * INPUT_H * INPUT_W
// floats) and the output (batchSize * OUTPUT_SIZE floats), copies `input`
// to the device, enqueues the context, copies the result into `output`, and
// releases the stream and buffers. Any failing CUDA call aborts via CHECK.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    const size_t inputBytes = batchSize * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = batchSize * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    // Errors from the asynchronous work queued above are reported by the
    // synchronize call, so its result must be checked like every other
    // CUDA call in this function.
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
// Entry point.
//   -s : build the engine through the API and write it to inception.engine
//   -d : read inception.engine, run 100 timed inferences on an all-ones
//        input and print the output values
// Returns 0 on success and -1 on a usage or file error.
int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./inception -s   // serialize model to plan file" << std::endl;
        std::cerr << "./inception -d   // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    const std::string mode(argv[1]);

    // Serialized engine read from disk in -d mode
    char *trtModelStream{nullptr};
    size_t size{0};

    if (mode == "-s") {
        // create a model using the API directly and serialize it to a stream
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("inception.engine", std::ios::binary);
        if (!p)
        {
            std::cerr << "could not open plan output file" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // Serialization succeeded: report success with exit status 0.
        return 0;
    } else if (mode == "-d") {
        std::ifstream file("inception.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a missing plan file would pass a null
            // buffer to deserializeCudaEngine below.
            std::cerr << "could not open plan file inception.engine" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        return -1;
    }

    // Dummy input: every element is 1.0
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference 100 times and print the wall-clock time of each run
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the output values
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        std::cout << prob[i] << ", ";
        if (i % 10 == 0) std::cout << i / 10 << std::endl;
    }
    std::cout << std::endl;

    return 0;
}
总结:
TensorRT在部署自定义CNN模型时的优势体现在适配性强、高性能加速、低内存占用、多硬件支持、部署便捷以及支持多精度计算等方面,这使其成为高效且适用于各种硬件环境的首选解决方案。
PS:纯粹为学习分享经验,不参与商用价值运作,若有侵权请及时联系!!!
下篇内容预告:
-
深度学习模型部署TensorRT加速(七):TensorRT部署图像分类模型
-
深度学习模型部署TensorRT加速(八):TensorRT部署目标检测YOLO模型
-
深度学习模型部署TensorRT加速(九):TensorRT部署Transformer模型