Neural Network Deployment
Basic knowledge
A dominant player proposes the standard, and each company then has its own way of implementing it.
Neural Networks
Benefits of, and recommendations for, deploying networks
onnxruntime running example
Speed up your network with onnxruntime
import onnxruntime
import numpy as np
# -------------------------------------------------------------------
# Onnxruntime needs a feed dict and the output names to run inference.
# The feed dict maps input name -> data.
# If you don't know the input/output names, open the onnx file with a
# visualization tool (e.g. Netron) and you will see them there.
# -------------------------------------------------------------------
MODEL = 'model.onnx'
FEED_DICT = {
    'input name': np.zeros(shape=[1, 3, 224, 224], dtype=np.float32)}  # onnxruntime expects float32, not numpy's default float64
OUTPUT_NAMES = ['output name']
session = onnxruntime.InferenceSession(MODEL, providers=['CUDAExecutionProvider'])
result = session.run(OUTPUT_NAMES, FEED_DICT)
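If you would rather not open a visualizer, the session can report its own input and output names. A minimal sketch using onnxruntime's get_inputs/get_outputs, reusing the session created above:
for inp in session.get_inputs():
    print('input :', inp.name, inp.shape, inp.type)
for out in session.get_outputs():
    print('output:', out.name, out.shape, out.type)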
Quantization-aware training and ONNX export (pytorch_quantization)
import torch
import torch.utils.data
import torchvision
from absl import logging
# Install the library below first (NVIDIA's pytorch-quantization)
from pytorch_quantization import nn as quant_nn
logging.set_verbosity(logging.FATAL)  # Disable logging; it is too noisy in a notebook
from pytorch_quantization import quant_modules
# Call quant_modules.initialize(),
# then just train as usual ...
quant_modules.initialize()
model = torchvision.models.resnet50()
model.cuda()
# Quantization Aware Training is based on the Straight Through Estimator (STE) derivative approximation.
# It is sometimes known as "quantization aware training". We don't use that name because it doesn't
# reflect the underlying assumption; if anything, the STE approximation makes training "unaware"
# of quantization.
# After calibration is done, Quantization Aware Training is simply a matter of selecting a training
# schedule and continuing to train the calibrated model.
# It usually doesn't need to fine-tune for very long. We usually use around 10% of the original
# training schedule, starting at 1% of the initial training learning rate, with a cosine annealing
# learning rate schedule that follows the decreasing half of a cosine period, down to 1% of the
# initial fine-tuning learning rate (0.01% of the initial training learning rate).
# Quantization Aware Training (essentially a discrete numerical optimization problem) is not a
# mathematically solved problem. Based on our experience, here are some recommendations:
# - For the STE approximation to work well, it is better to use a small learning rate. A large
#   learning rate is more likely to enlarge the variance introduced by the STE approximation
#   and destroy the trained network.
# - Do not change the quantization representation (scale) during training, at least not too
#   frequently. Changing the scale every step is effectively like changing the data format
#   (e8m7, e5m10, e3m4, etc.) every step, which will easily hurt convergence.
# Source: https://github.com/NVIDIA/TensorRT/blob/main/tools/pytorch-quantization/examples/finetune_quant_resnet50.ipynb
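The calibration step that the notes above assume is not shown in this excerpt. Below is a condensed sketch following the collect_stats/compute_amax pattern from the NVIDIA notebook linked above; calibrate, data_loader, and num_batches are illustrative names, and it assumes the default max calibrator (histogram calibrators need extra arguments to load_calib_amax). The fine-tuning setup at the end plugs in the "10% of the schedule, 1% of the learning rate, cosine annealing" recipe with made-up ImageNet-style numbers.
import torch
from pytorch_quantization import nn as quant_nn

def calibrate(model, data_loader, num_batches=4):
    # Switch every TensorQuantizer from quantizing to collecting statistics.
    for module in model.modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()
    # Feed a few batches of calibration data through the network.
    with torch.no_grad():
        for i, (images, _) in enumerate(data_loader):
            model(images.cuda())
            if i + 1 >= num_batches:
                break
    # Load the collected amax values and switch back to quantized mode.
    for module in model.modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.load_calib_amax()
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()

# Fine-tune recipe from the notes above (illustrative numbers: 90 original epochs
# at lr 0.1 -> ~9 fine-tune epochs starting at lr 0.001, annealed to 1% of that):
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=9, eta_min=0.001 * 0.01)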
def export_onnx(model, onnx_filename, batch_onnx):
    model.eval()
    # Shift to pytorch's fake quant ops before exporting the model to ONNX.
    quant_nn.TensorQuantizer.use_fb_fake_quant = True
    opset_version = 13

    # Export ONNX for the requested batch size.
    print("Creating ONNX file: " + onnx_filename)
    dummy_input = torch.randn(batch_onnx, 3, 224, 224, device='cuda')  # TODO: switch input dims by model
    # Note: the enable_onnx_checker argument used in the original notebook was
    # removed from torch.onnx.export in recent PyTorch releases, so it is omitted here.
    torch.onnx.export(model, dummy_input, onnx_filename, verbose=False,
                      opset_version=opset_version, do_constant_folding=True)
    return True
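With the model calibrated (and optionally fine-tuned), the export helper above can be called directly; the filename and batch size here are illustrative:
export_onnx(model, 'quant_resnet50.onnx', batch_onnx=1)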
onnxruntime performance comparison
# ---------------------------------------------------------------
# This script shows how to run a PPQ-exported model with Onnxruntime.
# Onnxruntime offers a series of providers that implement neural network
# inference on different hardware: CPUExecutionProvider and
# CUDAExecutionProvider are provided by Onnxruntime itself, while
# TensorrtExecutionProvider is provided by Nvidia.
# Different providers place different requirements on the model format;
# PPQ exports models in the CPUExecutionProvider format.
# Onnxruntime has no CUDA implementations of its INT8 operators, so if you
# deploy your model with Onnxruntime on the CUDAExecutionProvider, do not
# expect any speedup from quantization.
# ---------------------------------------------------------------
import torchvision
import torch
import ppq
import ppq.api as API
calibration_dataloader = [torch.rand(size=[1, 3, 224, 224]).cuda()]
model = torchvision.models.shufflenet_v2_x1_0().cuda()
with API.ENABLE_CUDA_KERNEL():
    quantized = API.quantize_torch_model(
        model=model, calib_dataloader=calibration_dataloader,
        calib_steps=8, input_shape=[1, 3, 224, 224],
        platform=ppq.TargetPlatform.ONNXRUNTIME)

    API.export_ppq_graph(
        quantized, platform=ppq.TargetPlatform.ONNXRUNTIME,
        graph_save_to='Quantized.onnx')

    API.export_ppq_graph(
        quantized, platform=ppq.TargetPlatform.ONNX,
        graph_save_to='FP32.onnx')

from ppq.utils.OnnxruntimeUtil import Benchmark, Profile

Benchmark('FP32.onnx', providers=['CPUExecutionProvider'])
Benchmark('Quantized.onnx', providers=['CPUExecutionProvider'])

Profile('FP32.onnx', providers=['CPUExecutionProvider'])
Profile('Quantized.onnx', providers=['CPUExecutionProvider'])
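As a quick sanity check after export, you can feed the same input to both graphs and compare the outputs. This sketch assumes both models keep the standard [1, 3, 224, 224] float32 input and is purely illustrative:
import numpy as np
import onnxruntime

x = np.random.rand(1, 3, 224, 224).astype(np.float32)
fp32_sess = onnxruntime.InferenceSession('FP32.onnx', providers=['CPUExecutionProvider'])
int8_sess = onnxruntime.InferenceSession('Quantized.onnx', providers=['CPUExecutionProvider'])

fp32_out = fp32_sess.run(None, {fp32_sess.get_inputs()[0].name: x})[0]
int8_out = int8_sess.run(None, {int8_sess.get_inputs()[0].name: x})[0]
print('max abs difference:', np.abs(fp32_out - int8_out).max())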