Installation and configuration environment
code running cloud GPU platform: public account artificial intelligence tips reply gpu
Tongji Zihao brother 2022-8-22 2023-4-28 2023-5-8
Install Pytorch
pip3 install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113
Install ONNX
pip install onnx -i https://pypi.tuna.tsinghua.edu.cn/simple
Install the inference engine ONNX Runtime
pip install onnxruntime -i https://pypi.tuna.tsinghua.edu.cn/simple
Install other third-party toolkits
pip install numpy pandas matplotlib tqdm opencv-python pillow -i https://pypi.tuna.tsinghua.edu.cn/simple
Verify installation configuration is successful
import torch
import onnx
import onnxruntime as ort
#验证安装配置成功
print('PyTorch版本:',torch.__version__)
print('ONNX版本:',onnx.__version__)
print('ONNX Runtime版本:',ort.__version__)
Download material files
https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/dataset/imagenet/imagenet_class_index.csv
https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/test/banana1.jpg
https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/test/video_4.mp4
Download ONNX model files
https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/dataset/fruit30/onnx/resnet18_imagenet.onnx
Deployment of the inference engine ONNX Runtime - Predicting a single image
. Use the inference engine ONNX Runtime to read the model file in the ONNX format and predict the single image file.
Brother Tongji Zihao https://space.bilibili.com/1900783
2022-8-22 2023-5-8
Application Scenario
The following code runs on the hardware that needs to be deployed (local PC, embedded development board, Raspberry Pi, Jetson Nano, server)
Just send the onnx model file to the deployment hardware, and install the ONNX Runtime environment. Use the following lines of code to run the model.
import onnxruntime
import torch
import torch.nn.functional as F
import pandas as pd
from PIL import Image # 用pillow载入
from torchvision import transforms
# 载入onnx模型,获取ONNX Runtime推理器
ort_session=onnxruntime.InferenceSession('resnet18_imagenet.onnx',
providers=['CUDAExecutionProvider'])
# # 构造随随机输入,获取输出结果
# x=torch.rand(1,3,256,256).numpy()
# print('random:',x.shape)
# # onnx runtime 输入
# ort_inputs={'input':x}
# # onnx runtime 输出
# ort_output=ort_session.run(['output'],ort_inputs)[0]
# # 注意:输入输出张量的名称需要和torch.onnx.export中设置的输入输出名对应
# print('random ort_output:',ort_output.shape)
# 载入一张真正的测试图像
img_path='banana1.jpg'
img_pil=Image.open(img_path)
# img_pil.show() #显示这张图象
# 测试集图像预处理-RCNN:缩放旋转、转Tensor、归一化
test_transform=transforms.Compose([transforms.Resize(256),
transforms.CenterCrop(256),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485,0.456,0.406],
std=[0.229,0.224,0.225])
])
# 运行预处理
input_img=test_transform(img_pil)
# print('input_img_shape:',input_img.shape)
input_tensor=input_img.unsqueeze(0).numpy()
# print('input_img_tensor:',input_tensor.shape)
# 推理预测
# ONN Runtime 输入
ort_inputs={
'input':input_tensor}
# ONN Runtime 输出
pred_logits=ort_session.run(['output'],ort_inputs)[0]
pred_logits=torch.tensor(pred_logits)
# print('pred_logits:',pred_logits.shape)
# 对logit分数做softmax运算,得到置信度概率
pred_softmax=F.softmax(pred_logits,dim=1)
# print('pre_softmax:',pred_softmax.shape)
# 解析预测结果
# 取置信度最高的前n个结果
n=3
top_n=torch.topk(pred_softmax,n)
# print('top_n:',top_n)
# 预测结果
pred_ids=top_n.indices.numpy()[0]
# print('pre_ids:',pred_ids)
# 预测置信度
confs=top_n.values.numpy()[0]
# print('confs:',confs)
# 载入ID和 类别名称 对应关系
df=pd.read_csv('imagenet_class_index.csv')
idx_to_labels={
}
for idx,row in df.iterrows():
idx_to_labels[row['ID']]=row['class'] # 英文
# idx_to_labels[row['ID']] = row['Chinese']# 中文
# print('idx_to_labels:',idx_to_labels)
# 分别用英文和中文打印预测结果
for i in range(n):
class_name=idx_to_labels[pred_ids[i]] # 获取类别名称
confidence=confs[i]*100
text='{:<20}{:>.3f}%'.format(class_name,confidence)
print(text)
ImageNet-ONNX Runtime Deployment - Camera and Video - English
Use the ONNX Runtime inference engine to load the ImageNet pre-trained image classification onnx model to predict the real-time camera footage.
Brother Tongji Zihao: https://space.bilibili.com/1900783
Test running environment: Macbook Pro
Notes
This code needs to be run locally where the camera is connected and cannot be run on the cloud GPU platform.
run locally
pip install onnxruntime
Install onnx runtime and prepare onnx model files.
Video processing English template frame by frame
from PIL import Image
import onnxruntime
import torch
import torch.nn.functional as F
from torchvision import transforms
import pandas as pd
import cv2
import time
from tqdm import tqdm
# 处理帧函数
def process_frame(img_bgr):
'''
输入摄像头拍摄画面bgr-array,输出图像分类预测结果bgr-array
'''
# 载入 onnx 模型,获取 ONNX Runtime 推理器
cuda = torch.cuda.is_available()
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
ort_session = onnxruntime.InferenceSession('resnet18_imagenet.onnx',None,providers=providers)
# 载入ImageNet 1000图像分类标签
df = pd.read_csv('imagenet_class_index.csv')
idx_to_labels = {
}
for idx, row in df.iterrows():
idx_to_labels[row['ID']] = row['class']
# 图像预处理
# 测试集图像预处理-RCTN:缩放裁剪、转 Tensor、归一化
test_transform = transforms.Compose([transforms.Resize(256),
transforms.CenterCrop(256),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
])
# 记录该帧开始处理的时间
start_time = time.time()
## 画面转成 RGB 的 Pillow 格式
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB) # BGR转RGB
img_pil = Image.fromarray(img_rgb) # array 转 PIL
## 预处理
input_img = test_transform(img_pil) # 预处理
input_tensor = input_img.unsqueeze(0).numpy()
## onnx runtime 预测
ort_inputs = {
'input': input_tensor} # onnx runtime 输入
pred_logits = ort_session.run(['output'], ort_inputs)[0] # onnx runtime 输出
pred_logits = torch.tensor(pred_logits)
pred_softmax = F.softmax(pred_logits, dim=1) # 对 logit 分数做 softmax 运算
## 解析top-n预测结果的类别和置信度
top_n = torch.topk(pred_softmax, 5) # 取置信度最大的 n 个结果
pred_ids = top_n[1].cpu().detach().numpy().squeeze() # 解析预测类别
confs = top_n[0].cpu().detach().numpy().squeeze() # 解析置信度
# 在图像上写英文
for i in range(len(confs)):
pred_class = idx_to_labels[pred_ids[i]]
# 写字:图片,添加的文字,左上角坐标,字体,字体大小,颜色,线宽,线型
text = '{:<15} {:>.3f}'.format(pred_class, confs[i])
img_bgr = cv2.putText(img_bgr, text, (50, 160 + 80 * i), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 4,
cv2.LINE_AA)
# 记录该帧处理完毕的时间
end_time = time.time()
# 计算每秒处理图像帧数FPS
FPS = 1 / (end_time - start_time)
# 图片,添加的文字,左上角坐标,字体,字体大小,颜色,线宽,线型
img_bgr = cv2.putText(img_bgr, 'FPS ' + str(int(FPS)), (50, 80), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 255), 4,
cv2.LINE_AA)
return img_bgr
def generate_video(input_path):
filehead = input_path.split('/')[-1]
output_path = "out-" + filehead
print('视频开始处理', input_path)
# 获取视频总帧数
cap = cv2.VideoCapture(input_path)
frame_count = 0
while (cap.isOpened()):
success, frame = cap.read()
frame_count += 1
if not success:
break
cap.release()
print('视频总帧数为', frame_count)
# cv2.namedWindow('Crack Detection and Measurement Video Processing')
cap = cv2.VideoCapture(input_path)
frame_size = (cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# fourcc = int(cap.get(cv2.CAP_PROP_FOURCC))
# fourcc = cv2.VideoWriter_fourcc(*'XVID')
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
out = cv2.VideoWriter(output_path, fourcc, fps, (int(frame_size[0]), int(frame_size[1])))
# 进度条绑定视频总帧数
with tqdm(total=frame_count - 1) as pbar:
try:
while (cap.isOpened()):
success, frame = cap.read()
if not success:
break
# 处理帧
# frame_path = './temp_frame.png'
# cv2.imwrite(frame_path, frame)
try:
frame = process_frame(frame)
except:
print('报错!', error)
pass
if success == True:
# cv2.imshow('Video Processing', frame)
out.write(frame)
# 进度条更新一帧
pbar.update(1)
# if cv2.waitKey(1) & 0xFF == ord('q'):
# break
except:
print('中途中断')
pass
cv2.destroyAllWindows()
out.release()
cap.release()
print('视频已保存', output_path)
generate_video(input_path='video_4.mp4')
Call the camera to obtain the English template of each frame
from PIL import Image
import onnxruntime
import torch
import torch.nn.functional as F
from torchvision import transforms
import pandas as pd
import cv2
import time
# 处理帧函数
def process_frame(img_bgr):
'''
输入摄像头拍摄画面bgr-array,输出图像分类预测结果bgr-array
'''
# 载入 onnx 模型,获取 ONNX Runtime 推理器
cuda = torch.cuda.is_available()
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
ort_session = onnxruntime.InferenceSession('resnet18_imagenet.onnx',None,providers=providers)
# 载入ImageNet 1000图像分类标签
df = pd.read_csv('imagenet_class_index.csv')
idx_to_labels = {
}
for idx, row in df.iterrows():
idx_to_labels[row['ID']] = row['class']
# 图像预处理
# 测试集图像预处理-RCTN:缩放裁剪、转 Tensor、归一化
test_transform = transforms.Compose([transforms.Resize(256),
transforms.CenterCrop(256),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
])
# 记录该帧开始处理的时间
start_time = time.time()
## 画面转成 RGB 的 Pillow 格式
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB) # BGR转RGB
img_pil = Image.fromarray(img_rgb) # array 转 PIL
## 预处理
input_img = test_transform(img_pil) # 预处理
input_tensor = input_img.unsqueeze(0).numpy()
## onnx runtime 预测
ort_inputs = {
'input': input_tensor} # onnx runtime 输入
pred_logits = ort_session.run(['output'], ort_inputs)[0] # onnx runtime 输出
pred_logits = torch.tensor(pred_logits)
pred_softmax = F.softmax(pred_logits, dim=1) # 对 logit 分数做 softmax 运算
## 解析top-n预测结果的类别和置信度
top_n = torch.topk(pred_softmax, 5) # 取置信度最大的 n 个结果
pred_ids = top_n[1].cpu().detach().numpy().squeeze() # 解析预测类别
confs = top_n[0].cpu().detach().numpy().squeeze() # 解析置信度
# 在图像上写英文
for i in range(len(confs)):
pred_class = idx_to_labels[pred_ids[i]]
# 写字:图片,添加的文字,左上角坐标,字体,字体大小,颜色,线宽,线型
text = '{:<15} {:>.3f}'.format(pred_class, confs[i])
img_bgr = cv2.putText(img_bgr, text, (50, 160 + 80 * i), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 4,
cv2.LINE_AA)
# 记录该帧处理完毕的时间
end_time = time.time()
# 计算每秒处理图像帧数FPS
FPS = 1 / (end_time - start_time)
# 图片,添加的文字,左上角坐标,字体,字体大小,颜色,线宽,线型
img_bgr = cv2.putText(img_bgr, 'FPS ' + str(int(FPS)), (50, 80), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 255), 4,
cv2.LINE_AA)
return img_bgr
cap = cv2.VideoCapture(1)
# 打开cap
cap.open(0)
# 无限循环,直到break被触发
while cap.isOpened():
# 获取画面
success, frame = cap.read()
if not success: # 如果获取画面不成功,则退出
print('获取画面不成功,退出')
break
## 逐帧处理
frame = process_frame(frame)
# 展示处理后的三通道图像
cv2.imshow('my_window', frame)
key_pressed = cv2.waitKey(60) # 每隔多少毫秒毫秒,获取键盘哪个键被按下
# print('键盘上被按下的键:', key_pressed)
if key_pressed in [ord('q'), 27]: # 按键盘上的q或esc退出(在英文输入法下)
break
# 关闭摄像头
cap.release()
# 关闭图像窗口
cv2.destroyAllWindows()
# 按键盘上的q键退出
ImageNet-ONNX Runtime Deployment-Camera and Video-Chinese
Use ImageNet to pre-train the image classification model to predict the real-time camera footage.
This code needs to be run locally where the camera is connected and cannot be run on the cloud GPU platform.
Brother Tongji Zihao: https://space.bilibili.com/1900783
Test running environment: Macbook Pro
imports Chinese fonts
https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/dataset/SimHei.ttf
Video frame by frame processing Chinese template
from PIL import Image, ImageFont, ImageDraw
import onnxruntime
import torch
import torch.nn.functional as F
from torchvision import transforms
import pandas as pd
import cv2
import numpy as np
import time
from tqdm import tqdm
# 处理帧函数
def process_frame(img_bgr):
'''
输入摄像头拍摄画面bgr-array,输出图像分类预测结果bgr-array
'''
# 导入中文字体,指定字体大小
font = ImageFont.truetype('SimHei.ttf', 32)
# 载入 onnx 模型,获取 ONNX Runtime 推理器
cuda = torch.cuda.is_available()
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
ort_session = onnxruntime.InferenceSession('resnet18_imagenet.onnx',None,providers=providers)
# 载入ImageNet 1000图像分类标签
df = pd.read_csv('imagenet_class_index.csv')
idx_to_labels = {
}
for idx, row in df.iterrows():
idx_to_labels[row['ID']] = row['Chinese']
# 图像预处理
# 测试集图像预处理-RCTN:缩放裁剪、转 Tensor、归一化
test_transform = transforms.Compose([transforms.Resize(256),
transforms.CenterCrop(256),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
])
# 记录该帧开始处理的时间
start_time = time.time()
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB) # BGR转RGB
img_pil = Image.fromarray(img_rgb) # array 转 PIL
## 预处理
input_img = test_transform(img_pil) # 预处理
input_tensor = input_img.unsqueeze(0).numpy()
## onnx runtime 预测
ort_inputs = {
'input': input_tensor} # onnx runtime 输入
pred_logits = ort_session.run(['output'], ort_inputs)[0] # onnx runtime 输出
pred_logits = torch.tensor(pred_logits)
pred_softmax = F.softmax(pred_logits, dim=1) # 对 logit 分数做 softmax 运算
## 解析图像分类预测结果
n = 5
top_n = torch.topk(pred_softmax, n) # 取置信度最大的 n 个结果
pred_ids = top_n[1].cpu().detach().numpy().squeeze() # 解析出类别
confs = top_n[0].cpu().detach().numpy().squeeze() # 解析出置信度
## 在图像上写中文
draw = ImageDraw.Draw(img_pil)
for i in range(len(confs)):
pred_class = idx_to_labels[pred_ids[i]]
# 写中文:文字坐标,中文字符串,字体,rgba颜色
text = '{:<15} {:>.3f}'.format(pred_class, confs[i]) # 中文字符串
draw.text((50, 100 + 50 * i), text, font=font, fill=(255, 0, 0, 1))
img_rgb = np.array(img_pil) # PIL 转 array
img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR) # RGB转BGR
# 记录该帧处理完毕的时间
end_time = time.time()
# 计算每秒处理图像帧数FPS
FPS = 1 / (end_time - start_time)
# 图片,添加的文字,左上角坐标,字体,字体大小,颜色,线宽,线型
img_bgr = cv2.putText(img_bgr, 'FPS ' + str(int(FPS)), (50, 80), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 255), 4,
cv2.LINE_AA)
return img_bgr
def generate_video(input_path='videos/robot.mp4'):
filehead = input_path.split('/')[-1]
output_path = "out-" + filehead
print('视频开始处理', input_path)
# 获取视频总帧数
cap = cv2.VideoCapture(input_path)
frame_count = 0
while (cap.isOpened()):
success, frame = cap.read()
frame_count += 1
if not success:
break
cap.release()
print('视频总帧数为', frame_count)
# cv2.namedWindow('Crack Detection and Measurement Video Processing')
cap = cv2.VideoCapture(input_path)
frame_size = (cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# fourcc = int(cap.get(cv2.CAP_PROP_FOURCC))
# fourcc = cv2.VideoWriter_fourcc(*'XVID')
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
out = cv2.VideoWriter(output_path, fourcc, fps, (int(frame_size[0]), int(frame_size[1])))
# 进度条绑定视频总帧数
with tqdm(total=frame_count - 1) as pbar:
try:
while (cap.isOpened()):
success, frame = cap.read()
if not success:
break
# 处理帧
# frame_path = './temp_frame.png'
# cv2.imwrite(frame_path, frame)
try:
frame = process_frame(frame)
except:
print('报错!', error)
pass
if success == True:
# cv2.imshow('Video Processing', frame)
out.write(frame)
# 进度条更新一帧
pbar.update(1)
# if cv2.waitKey(1) & 0xFF == ord('q'):
# break
except:
print('中途中断')
pass
cv2.destroyAllWindows()
out.release()
cap.release()
print('视频已保存', output_path)
generate_video(input_path='video_4.mp4')
Call the camera to obtain the Chinese template of each frame
from PIL import Image, ImageFont, ImageDraw
import onnxruntime
import torch
import torch.nn.functional as F
from torchvision import transforms
import pandas as pd
import cv2
import numpy as np
import time
from tqdm import tqdm
# 处理帧函数
def process_frame(img_bgr):
'''
输入摄像头拍摄画面bgr-array,输出图像分类预测结果bgr-array
'''
# 导入中文字体,指定字体大小
font = ImageFont.truetype('SimHei.ttf', 32)
# 载入 onnx 模型,获取 ONNX Runtime 推理器
cuda = torch.cuda.is_available()
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
ort_session = onnxruntime.InferenceSession('resnet18_imagenet.onnx',None,providers=providers)
# 载入ImageNet 1000图像分类标签
df = pd.read_csv('imagenet_class_index.csv')
idx_to_labels = {
}
for idx, row in df.iterrows():
idx_to_labels[row['ID']] = row['Chinese']
# 图像预处理
# 测试集图像预处理-RCTN:缩放裁剪、转 Tensor、归一化
test_transform = transforms.Compose([transforms.Resize(256),
transforms.CenterCrop(256),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
])
# 记录该帧开始处理的时间
start_time = time.time()
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB) # BGR转RGB
img_pil = Image.fromarray(img_rgb) # array 转 PIL
## 预处理
input_img = test_transform(img_pil) # 预处理
input_tensor = input_img.unsqueeze(0).numpy()
## onnx runtime 预测
ort_inputs = {
'input': input_tensor} # onnx runtime 输入
pred_logits = ort_session.run(['output'], ort_inputs)[0] # onnx runtime 输出
pred_logits = torch.tensor(pred_logits)
pred_softmax = F.softmax(pred_logits, dim=1) # 对 logit 分数做 softmax 运算
## 解析图像分类预测结果
n = 5
top_n = torch.topk(pred_softmax, n) # 取置信度最大的 n 个结果
pred_ids = top_n[1].cpu().detach().numpy().squeeze() # 解析出类别
confs = top_n[0].cpu().detach().numpy().squeeze() # 解析出置信度
## 在图像上写中文
draw = ImageDraw.Draw(img_pil)
for i in range(len(confs)):
pred_class = idx_to_labels[pred_ids[i]]
# 写中文:文字坐标,中文字符串,字体,rgba颜色
text = '{:<15} {:>.3f}'.format(pred_class, confs[i]) # 中文字符串
draw.text((50, 100 + 50 * i), text, font=font, fill=(255, 0, 0, 1))
img_rgb = np.array(img_pil) # PIL 转 array
img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR) # RGB转BGR
# 记录该帧处理完毕的时间
end_time = time.time()
# 计算每秒处理图像帧数FPS
FPS = 1 / (end_time - start_time)
# 图片,添加的文字,左上角坐标,字体,字体大小,颜色,线宽,线型
img_bgr = cv2.putText(img_bgr, 'FPS ' + str(int(FPS)), (50, 80), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 255), 4,
cv2.LINE_AA)
return img_bgr
# 获取摄像头,传入0表示获取系统默认摄像头
cap = cv2.VideoCapture(1)
# 打开cap
cap.open(0)
# 无限循环,直到break被触发
while cap.isOpened():
# 获取画面
success, frame = cap.read()
if not success: # 如果获取画面不成功,则退出
print('获取画面不成功,退出')
break
## 逐帧处理
frame = process_frame(frame)
# 展示处理后的三通道图像
cv2.imshow('my_window', frame)
key_pressed = cv2.waitKey(60) # 每隔多少毫秒毫秒,获取键盘哪个键被按下
# print('键盘上被按下的键:', key_pressed)
if key_pressed in [ord('q'), 27]: # 按键盘上的q或esc退出(在英文输入法下)
break
# 关闭摄像头
cap.release()
# 关闭图像窗口
cv2.destroyAllWindows()
# 按键盘上的q键退出