Face mask detection (including running code + data set)
- The purpose of this tutorial is to let developers understand the complete process in deep learning, which includes:
1. Data set import and preprocessing process
2. Network model selection and parameter setting process
3. Model training and export process
4. Model loading/optimization and inference
Project source code and data set download:
https://download.csdn.net/download/kunhe0512/85360655
-
This tutorial uses the following main software and hardware environments:
1.NVIDIA Xavier NX
2.Jetpack 4.6
3.TensorRT 8.0.1
4.Pytorch 1.10.0
5.Python 3.6.9
6.OpenCV 4.1.1
Experiment content:
- The experimental content of this tutorial is to use the deep learning method to complete the task of mask detection.
- The detection target categories are: Background, face, mask, mask_weared_incorrect
- During the experiment, a dataset in OpenImages CSV format and the SSD-MobileNet model are used.
- This experiment uses Pytorch for model training, converts the trained model into ONNX format, and finally uses TensorRT for inference
- For more exciting content, please scan the QR code below to join the NVIDIA Developer Program
start experiment
1. Import the required tool library
#1
import os
import sys
sys.executable
import logging
import argparse
import itertools
import torch
from torch.utils.data import DataLoader, ConcatDataset
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiStepLR
from vision.utils.misc import str2bool, Timer, freeze_net_layers, store_labels
from vision.ssd.ssd import MatchPrior
from vision.ssd.vgg_ssd import create_vgg_ssd
from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd
from vision.ssd.mobilenetv1_ssd_lite import create_mobilenetv1_ssd_lite
from vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite
from vision.ssd.squeezenet_ssd_lite import create_squeezenet_ssd_lite
from vision.datasets.voc_dataset import VOCDataset
from vision.datasets.open_images import OpenImagesDataset
from vision.nn.multibox_loss import MultiboxLoss
from vision.ssd.config import vgg_ssd_config
from vision.ssd.config import mobilenetv1_ssd_config
from vision.ssd.config import squeezenet_ssd_config
from vision.ssd.data_preprocessing import TrainAugmentation, TestTransform
2. Use GPU to complete training
#2
# Train on the first CUDA GPU when one is available; otherwise fall back to
# the CPU so the remaining cells (net.to(DEVICE), MultiboxLoss(device=DEVICE))
# still run on a machine without CUDA.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Ask cuDNN to benchmark and pick convolution algorithms for the input sizes
# it sees (see the torch.backends.cudnn documentation).
torch.backends.cudnn.benchmark = True
3. Set the training method
#3
def train(loader, net, criterion, optimizer, device, debug_steps=100, epoch=-1):
    """Run one training epoch and periodically print averaged losses.

    Args:
        loader: iterable yielding ``(images, boxes, labels)`` batches; each
            element must provide ``.to(device)``.
        net: model; called as ``net(images)`` and expected to return
            ``(confidence, locations)``.
        criterion: callable ``criterion(confidence, locations, labels, boxes)``
            returning ``(regression_loss, classification_loss)``.
        optimizer: optimizer providing ``zero_grad()`` and ``step()``.
        device: device the batch is moved to before the forward pass.
        debug_steps: a progress line is printed whenever the batch index is a
            non-zero multiple of this value.
        epoch: epoch number, used only in the printed progress line.

    Returns:
        None. Progress is reported through ``print``.
    """
    net.train(True)
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    # Number of batches accumulated since the last report. The first report
    # happens at index ``debug_steps`` and therefore covers debug_steps + 1
    # batches, so the averages divide by this counter, not by debug_steps.
    steps_in_window = 0

    for i, (images, boxes, labels) in enumerate(loader):
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        confidence, locations = net(images)
        # TODO CHANGE BOXES (note carried over unchanged from the original code)
        regression_loss, classification_loss = criterion(confidence, locations, labels, boxes)
        loss = regression_loss + classification_loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_regression_loss += regression_loss.item()
        running_classification_loss += classification_loss.item()
        steps_in_window += 1

        if i and i % debug_steps == 0:
            avg_loss = running_loss / steps_in_window
            avg_reg_loss = running_regression_loss / steps_in_window
            avg_clf_loss = running_classification_loss / steps_in_window
            print(
                f"Epoch: {epoch}, Step: {i}/{len(loader)}, " +
                f"Avg Loss: {avg_loss:.4f}, " +
                f"Avg Regression Loss {avg_reg_loss:.4f}, " +
                f"Avg Classification Loss: {avg_clf_loss:.4f}"
            )
            running_loss = 0.0
            running_regression_loss = 0.0
            running_classification_loss = 0.0
            steps_in_window = 0
4. Set the test method
#4
def test(loader, net, criterion, device):
    """Evaluate ``net`` on ``loader`` and return the mean per-batch losses.

    Args:
        loader: iterable yielding ``(images, boxes, labels)`` batches; each
            element must provide ``.to(device)``.
        net: model; switched to eval mode and called as ``net(images)``,
            expected to return ``(confidence, locations)``.
        criterion: callable ``criterion(confidence, locations, labels, boxes)``
            returning ``(regression_loss, classification_loss)``.
        device: device the batch is moved to before the forward pass.

    Returns:
        Tuple ``(loss, regression_loss, classification_loss)``, each averaged
        over the number of batches.

    Raises:
        ValueError: if ``loader`` yields no batches (the averages would
            otherwise fail with an unexplained ZeroDivisionError).
    """
    net.eval()
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    num = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for images, boxes, labels in loader:
            images = images.to(device)
            boxes = boxes.to(device)
            labels = labels.to(device)

            confidence, locations = net(images)
            regression_loss, classification_loss = criterion(confidence, locations, labels, boxes)
            loss = regression_loss + classification_loss

            running_loss += loss.item()
            running_regression_loss += regression_loss.item()
            running_classification_loss += classification_loss.item()
            num += 1

    if num == 0:
        raise ValueError("Validation loader produced no batches; cannot average the loss.")
    return running_loss / num, running_regression_loss / num, running_classification_loss / num
5. Set training parameters
#5
# --- model / data selection -------------------------------------------------
net_name = "mb1-ssd"                  # prefix of the saved checkpoint file names
voc_or_open_images = "open_images"    # dataset format: "voc" or "open_images"
datasets_path = ["data/mask"]         # one dataset is loaded per entry
datasets = []                         # filled by the dataset-loading cell
model_dir = "models/mask/"            # labels.txt and checkpoints are written here

# --- loop sizes ---------------------------------------------------------------
batch_size = 4
num_workers = 2                       # DataLoader worker processes
num_epochs = 6
validation_epochs = 2                 # validate and save every N epochs

# --- optimizer ----------------------------------------------------------------
lr = 0.01                             # default learning rate passed to SGD
base_net_lr = 0.001                   # learning rate of the base-net parameter group
extra_layers_lr = 0.01                # learning rate of the extra-layer parameter group
momentum = 0.9
weight_decay = 5e-4
6. Load the dataset
#6
timer = Timer()
create_net = create_mobilenetv1_ssd
config = mobilenetv1_ssd_config

# create data transforms for train/test/val
train_transform = TrainAugmentation(config.image_size, config.image_mean, config.image_std)
target_transform = MatchPrior(config.priors, config.center_variance,
                              config.size_variance, 0.5)
test_transform = TestTransform(config.image_size, config.image_mean, config.image_std)

# labels.txt is written into model_dir, so make sure the directory exists
# before the first store_labels() call.
os.makedirs(model_dir, exist_ok=True)
label_file = os.path.join(model_dir, "labels.txt")

# load datasets (could be multiple)
print("Prepare training datasets.")
for dataset_path in datasets_path:
    if voc_or_open_images == 'voc':
        dataset = VOCDataset(dataset_path, transform=train_transform,
                             target_transform=target_transform)
    elif voc_or_open_images == 'open_images':
        dataset = OpenImagesDataset(dataset_path, transform=train_transform,
                                    target_transform=target_transform,
                                    dataset_type="train", balance_data=False)
        print(dataset)
    else:
        raise ValueError(f"Dataset type {voc_or_open_images!r} is not supported.")

    # Both dataset types share the label bookkeeping. With several datasets
    # the last one determines labels.txt and num_classes, as before.
    store_labels(label_file, dataset.class_names)
    num_classes = len(dataset.class_names)
    datasets.append(dataset)
7. Split the loaded dataset into training set and validation set
#7
# ---- training split ----------------------------------------------------------
print(f"Stored labels into file {label_file}.")
train_dataset = ConcatDataset(datasets)
print("Train dataset size: {}".format(len(train_dataset)))
train_loader = DataLoader(
    train_dataset,
    batch_size,
    num_workers=num_workers,
    shuffle=True,
)

# ---- validation split --------------------------------------------------------
# NOTE: dataset_path is the value left behind by the dataset-loading loop,
# i.e. the last entry of datasets_path.
print("Prepare Validation datasets.")
if voc_or_open_images == "voc":
    val_dataset = VOCDataset(
        dataset_path,
        transform=test_transform,
        target_transform=target_transform,
        is_test=True,
    )
elif voc_or_open_images == 'open_images':
    val_dataset = OpenImagesDataset(
        dataset_path,
        transform=test_transform,
        target_transform=target_transform,
        dataset_type="test",
    )
    print(val_dataset)
print("Validation dataset size: {}".format(len(val_dataset)))
# Validation batches are not shuffled.
val_loader = DataLoader(
    val_dataset,
    batch_size,
    num_workers=num_workers,
    shuffle=False,
)
8. Create the network model
#8
# Build the SSD network for the number of classes found in the dataset.
print("Build network.")
net = create_net(num_classes)
min_loss = -10000.0
last_epoch = -1  # -1: training starts at epoch 0

# Three optimizer parameter groups with their own learning rates.
base_net_group = {'params': net.base_net.parameters(), 'lr': base_net_lr}
extra_layers_group = {
    'params': itertools.chain(
        net.source_layer_add_ons.parameters(),
        net.extras.parameters(),
    ),
    'lr': extra_layers_lr,
}
# No explicit 'lr' here: this group uses the optimizer's default learning rate.
headers_group = {
    'params': itertools.chain(
        net.regression_headers.parameters(),
        net.classification_headers.parameters(),
    ),
}
params = [base_net_group, extra_layers_group, headers_group]
9. Define whether to use a pretrained model or train from scratch
- We have designed three modes here:
1. Start training from a base network: just assign your model path to base_net: base_net = "path/to/the/basic/model"
2. Continue a previous training run that was not finished: just assign the model path to resume: resume = "path/to/the/resume/model"
3. Using our pre-trained model, just assign the model path to pretrained_ssd: pretrained_ssd = "path/to/the/pretrained_ssd/model" - If you don't know what model you want to choose, you can assign None to resume, base_net and pretrained_ssd, and the training will automatically start from scratch.
#9
# Optionally initialise the network from existing weights. The first non-empty
# setting wins, in the order: resume, base_net, pretrained_ssd. If all three
# are None, no weights are loaded.
timer.start("Load Model")

resume = None
base_net = None
pretrained_ssd = "models/face-mask-pretrain-model.pth"

if resume:
    # Load a full checkpoint saved by an earlier run.
    print(f"Resume from the model {resume}")
    net.load(resume)
elif base_net:
    # Initialise from base-network weights.
    print(f"Init from base net {base_net}")
    net.init_from_base_net(base_net)
elif pretrained_ssd:
    # Initialise from a pre-trained SSD model.
    print(f"Init from pretrained ssd {pretrained_ssd}")
    net.init_from_pretrained_ssd(pretrained_ssd)

print(f'Took {timer.end("Load Model"):.2f} seconds to load the model.')
10. Start training the model
#10
# move the model to the training device
net.to(DEVICE)

# define loss function and optimizer
criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3,
                         center_variance=0.1, size_variance=0.2, device=DEVICE)
# Use the configured `momentum` setting instead of repeating the literal 0.9.
optimizer = torch.optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay)
print(f"Learning rate: {lr}, Base net learning rate: {base_net_lr}, "
      + f"Extra Layers learning rate: {extra_layers_lr}.")

# set learning rate policy
print("Uses CosineAnnealingLR scheduler.")
scheduler = CosineAnnealingLR(optimizer, 100, last_epoch=last_epoch)

# Checkpoints are written into model_dir; make sure it exists before saving.
os.makedirs(model_dir, exist_ok=True)

# train for the desired number of epochs
print(f"Start training from epoch {last_epoch + 1}.")
for epoch in range(last_epoch + 1, num_epochs):
    train(train_loader, net, criterion, optimizer,
          device=DEVICE, debug_steps=10, epoch=epoch)
    # Step the scheduler AFTER the epoch's optimizer updates. Stepping it
    # first (PyTorch >= 1.1) skips the first value of the schedule and
    # triggers a UserWarning.
    scheduler.step()

    # Validate and save a checkpoint every `validation_epochs` epochs and
    # after the final epoch.
    if epoch % validation_epochs == 0 or epoch == num_epochs - 1:
        val_loss, val_regression_loss, val_classification_loss = test(val_loader, net, criterion, DEVICE)
        print(
            f"Epoch: {epoch}, " +
            f"Validation Loss: {val_loss:.4f}, " +
            f"Validation Regression Loss {val_regression_loss:.4f}, " +
            f"Validation Classification Loss: {val_classification_loss:.4f}"
        )
        model_path = os.path.join(model_dir, f"{net_name}-Epoch-{epoch}-Loss-{val_loss}.pth")
        net.save(model_path)
        print(f"Saved model {model_path}")

print("Task done, exiting program.")
11. Convert the trained model to ONNX format
#11
# Notebook shell command: run the project's onnx_export.py on the checkpoints
# stored in models/mask to produce the ONNX model used in the next step.
!python3 onnx_export.py --model-dir=models/mask
12. Optimize the converted ONNX format with TensorRT to generate a TensorRT inference engine
Note here that Onnx2TensorRT needs to be installed
#12
# Notebook shell command: convert the ONNX model into a serialized TensorRT
# engine. The output name follows the 'models/TRT_<model>.bin' pattern that
# TrtSSD._load_engine() opens for model "ssd_mobilenet_v1_face2".
!onnx2trt models/mask/ssd-mobilenet.onnx -o models/TRT_ssd_mobilenet_v1_face2.bin
13. Load the tool library required for engine inference
#13
import sys
import time
import argparse
import cv2
import pycuda.autoinit
import numpy as np
from utils.ssd_classes import get_cls_dict
from utils.camera import add_camera_args, Camera
from utils.display import open_window, set_display, show_fps
from utils.visualization import BBoxVisualization
import ctypes
import tensorrt as trt
import pycuda.driver as cuda
14. Design engine input and output preprocessing and postprocessing methods
#14
def do_nms(det, boxes, confs, clss):
    """Merge one detection into the kept detections using an IoU test.

    The detection is compared with the kept boxes in order. At the first kept
    box whose IoU with it exceeds 0.6, the detection either replaces that box
    (when its confidence is higher) or is discarded. If no kept box overlaps
    that much, the detection is appended.

    Args:
        det: sequence ``[x1, y1, x2, y2, confidence, class_id, ...]``;
            only the first six entries are read.
        boxes: list of kept ``(x1, y1, x2, y2)`` tuples; modified in place.
        confs: list of kept confidences, parallel to ``boxes``; modified in place.
        clss: list of kept class ids, parallel to ``boxes``; modified in place.

    Returns:
        The same ``(boxes, confs, clss)`` list objects that were passed in.
    """
    det_box = (det[0], det[1], det[2], det[3])

    # Nothing kept yet: the detection is accepted unconditionally.
    if not boxes:
        boxes.append(det_box)
        confs.append(det[4])
        clss.append(det[5])
        return boxes, confs, clss

    # The +1 terms treat coordinates as inclusive pixel indices.
    area_det = (det[2] - det[0] + 1) * (det[3] - det[1] + 1)

    drop = False
    for i, bbox in enumerate(boxes):
        # Intersection rectangle between the detection and the kept box.
        xx1 = np.maximum(det[0], bbox[0])
        yy1 = np.maximum(det[1], bbox[1])
        xx2 = np.minimum(det[2], bbox[2])
        yy2 = np.minimum(det[3], bbox[3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)

        area_bbox = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1)
        inter = w * h
        ovr = inter / (area_det + area_bbox - inter)

        if ovr > 0.6:
            # Same object as a kept box: keep whichever has the higher score.
            if det[4] > confs[i]:
                boxes[i] = det_box
                confs[i] = det[4]
                clss[i] = det[5]
            drop = True
            # The detection has been handled; later boxes cannot change that.
            break

    if not drop:
        boxes.append(det_box)
        confs.append(det[4])
        clss.append(det[5])
    return boxes, confs, clss
def _preprocess_trt(img, shape=(300, 300)):
    """Turn an OpenCV image into the float32 array fed to the TRT SSD engine.

    The image is resized to ``shape``, converted from BGR to RGB, moved from
    height-width-channel to channel-height-width order and rescaled with
    ``x * 2 / 255 - 1`` (presumably mapping 8-bit pixels to [-1, 1]; confirm
    the input dtype against the caller).

    Args:
        img: image array as read by OpenCV.
        shape: target size handed to ``cv2.resize``.

    Returns:
        float32 array in channel-first order.
    """
    resized = cv2.resize(img, shape)
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    chw = rgb.transpose((2, 0, 1)).astype(np.float32)
    chw *= (2.0/255.0)
    chw -= 1.0
    return chw
def _postprocess_trt(img, output, conf_th, output_layout, num_classes=4):
    """Convert raw TRT SSD outputs into boxes, confidences and class ids.

    Args:
        img: original image; only ``img.shape`` is read, to scale boxes to
            pixel coordinates.
        output: pair of flat sequences. ``output[0]`` holds ``num_classes``
            scores per candidate and ``output[1]`` holds four box values per
            candidate, multiplied here by image width and height (presumably
            normalised x1, y1, x2, y2; confirm against the exported model).
        conf_th: minimum score for a candidate to be kept.
        output_layout: unused; kept so existing callers keep working.
        num_classes: number of scores per candidate, including class 0.
            Defaults to 4, the value that was previously hard-coded.

    Returns:
        Tuple ``(boxes, confs, clss)`` of parallel lists produced by feeding
        each accepted candidate through ``do_nms``.
    """
    img_h, img_w, _ = img.shape
    scores, coords = output[0], output[1]
    boxes, confs, clss = [], [], []

    # Four coordinates per candidate box.
    for n in range(len(coords) // 4):
        max_score = -1000.0
        max_class = 0
        # Class 0 is never reported (the tutorial lists it as Background),
        # so the search starts at class 1.
        for m in range(1, num_classes):
            score = scores[n * num_classes + m]
            if score < conf_th:
                continue
            if score > max_score:
                max_score = score
                max_class = m

        # No class reached the threshold for this candidate.
        if max_score < conf_th:
            continue

        x1 = int(coords[n * 4 + 0] * img_w)
        y1 = int(coords[n * 4 + 1] * img_h)
        x2 = int(coords[n * 4 + 2] * img_w)
        y2 = int(coords[n * 4 + 3] * img_h)
        det = [x1, y1, x2, y2, max_score, max_class, n]
        boxes, confs, clss = do_nms(det, boxes, confs, clss)
    return boxes, confs, clss
15. Define the loading of the inference engine of the SSD-mobilenet v1 model
- When we have optimized the engine, we can write the optimized engine to the hard disk in the form of a file, which we call a serialized file or a PLAN file
- The next time we want to use the optimized engine directly, we can generate an executable engine by reading the serialized file on the hard disk and deserialize it using the deserialize_cuda_engine() method
- Using serialized files to generate executable engines can save us a lot of time
- The serialized files of engines generated on different platforms (software or hardware platforms) cannot be used directly. Only the serialized files of engines generated on the same platform (software and hardware platforms) or on the same device can be used directly.
#15
class TrtSSD(object):
    """TrtSSD class encapsulates things needed to run TRT SSD.

    It loads a serialized TensorRT engine from ``models/TRT_<model>.bin``,
    allocates one host and one device buffer per engine binding, and exposes
    ``detect()`` to run inference on a single image.
    """

    # Initialise the engine.
    def __init__(self, model, input_shape, output_layout=7):
        """Initialize TensorRT plugins, engine and context.

        Args:
            model: model name; selects the engine file ``models/TRT_<model>.bin``.
            input_shape: size passed to ``_preprocess_trt`` for resizing.
            output_layout: forwarded to ``_postprocess_trt``.
        """
        self.model = model
        self.input_shape = input_shape
        self.output_layout = output_layout
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        self._load_plugins()
        self.engine = self._load_engine()

        self.host_inputs = []
        self.cuda_inputs = []
        self.host_outputs = []
        self.cuda_outputs = []
        self.bindings = []
        self.stream = cuda.Stream()
        self.context = self._create_context()

    # Load custom plugins. Per the original note, TensorRT versions below 7.0
    # additionally need a separately built FlattenConcat plugin library.
    def _load_plugins(self):
        """Register the TensorRT plugin library with this instance's logger."""
        trt.init_libnvinfer_plugins(self.trt_logger, '')

    # Load the serialized inference engine from disk.
    def _load_engine(self):
        """Read ``models/TRT_<model>.bin`` and deserialize it into an engine."""
        TRTbin = 'models/TRT_%s.bin' % self.model
        with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    # Create an executable context from the loaded engine.
    def _create_context(self):
        """Allocate buffers for every binding and return an execution context."""
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * \
                   self.engine.max_batch_size
            # NOTE: host_mem must be page-locked memory so that it is not
            # released while in use (original author's note).
            # Buffers are allocated as float32 for every binding; this
            # assumes all bindings are float32 (TODO confirm).
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(cuda_mem))
            if self.engine.binding_is_input(binding):
                self.host_inputs.append(host_mem)
                self.cuda_inputs.append(cuda_mem)
            else:
                self.host_outputs.append(host_mem)
                self.cuda_outputs.append(cuda_mem)
        return self.engine.create_execution_context()

    # Release the CUDA stream and the device buffers.
    def __del__(self):
        """Free CUDA memories.

        The attributes are removed only if they exist, so an instance whose
        ``__init__`` failed part-way (for example, engine file not found)
        does not raise AttributeError during garbage collection.
        """
        for attr in ("stream", "cuda_outputs", "cuda_inputs"):
            if hasattr(self, attr):
                delattr(self, attr)

    # Run inference with the execution context.
    def detect(self, img, conf_th=0.3):
        """Detect objects in the input image.

        Args:
            img: image as read by OpenCV.
            conf_th: confidence threshold forwarded to ``_postprocess_trt``.

        Returns:
            ``(boxes, confs, clss)`` as returned by ``_postprocess_trt``.
        """
        img_resized = _preprocess_trt(img, self.input_shape)
        np.copyto(self.host_inputs[0], img_resized.ravel())
        # Copy the preprocessed image from host memory to GPU memory.
        cuda.memcpy_htod_async(
            self.cuda_inputs[0], self.host_inputs[0], self.stream)
        # Launch inference.
        self.context.execute_async(
            batch_size=1,
            bindings=self.bindings,
            stream_handle=self.stream.handle)
        # Copy both outputs from GPU memory back to host memory. The engine
        # is expected to have exactly two output bindings.
        cuda.memcpy_dtoh_async(
            self.host_outputs[1], self.cuda_outputs[1], self.stream)
        cuda.memcpy_dtoh_async(
            self.host_outputs[0], self.cuda_outputs[0], self.stream)
        # Wait for the copies and the inference queued on the stream.
        self.stream.synchronize()

        output = self.host_outputs
        return _postprocess_trt(img, output, conf_th, self.output_layout)
16. Set up the model library
- 1. Multiple model libraries are defined here. We choose face mask detection, which is ssd_mobilenet_v1_face2
- 2. Here also defines the inputs to our model (300,300)
#16
# Size the input image is resized to before inference.
INPUT_HW = (300, 300)

# Model names known to this notebook; the mask detector used below is
# 'ssd_mobilenet_v1_face2'.
SUPPORTED_MODELS = [
    'ssd_mobilenet_v1_coco',
    'ssd_mobilenet_v1_egohands',
    'ssd_mobilenet_v2_coco',
    'ssd_mobilenet_v2_egohands',
    'ssd_mobilenet_v2_face',
    'ssd_resnet18_5th',
    'ssd_mobilenet_v1_face2',
    'ssd_mobilenet_v1_fruit',
]
17. Start defining methods to read the data and draw the output visualization onto the image
- The detect_one() method is to detect a single image, and the detect_video() method is to detect the video
- Note: The fps value printed here includes the time spent writing each frame to the result video. If writing the result video is disabled, the speed will be greatly improved
#17-1
def detect_video(video, trt_ssd, conf_th, vis, result_file_name):
    """Run detection on every frame of ``video`` and write an annotated copy.

    The annotated frames are written to ``result.AVI`` in the current
    directory, and a smoothed frames-per-second figure is printed while
    processing.

    Args:
        video: opened ``cv2.VideoCapture``.
        trt_ssd: detector providing ``detect(img, conf_th)``.
        conf_th: confidence threshold passed to the detector.
        vis: visualiser providing ``draw_bboxes(img, boxes, confs, clss)``.
        result_file_name: unused; kept for backward compatibility. The caller
            in this file passes the INPUT file name here, so it must not be
            used as the output path.
    """
    frame_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Frame rate of the source video, used only for the output file. It is
    # kept separate from the measured processing rate below.
    video_fps = video.get(cv2.CAP_PROP_FPS)

    # Define the output encoding.
    fourcc = cv2.VideoWriter_fourcc('M', 'P', '4', 'V')
    video_writer = cv2.VideoWriter('result.AVI', fourcc, video_fps, (frame_width, frame_height))

    fps = 0.0  # measured processing rate; 0.0 means "no measurement yet"
    tic = time.time()
    try:
        # Detect frame by frame and write the results to result.AVI.
        while True:
            _, img = video.read()
            if img is None:
                break
            boxes, confs, clss = trt_ssd.detect(img, conf_th)
            img = vis.draw_bboxes(img, boxes, confs, clss)
            video_writer.write(img)

            toc = time.time()
            curr_fps = 1.0 / (toc - tic)
            # Exponential moving average, seeded with the first measurement.
            fps = curr_fps if fps == 0.0 else (fps*0.95 + curr_fps*0.05)
            tic = toc
            print("\rfps: "+str(fps),end="")
    finally:
        # Finalise the output file even if detection fails mid-stream.
        video_writer.release()
#17-2
def detect_one(img, trt_ssd, conf_th, vis):
    """Run detection on one image, save it as ``result.jpg`` and print the time.

    Args:
        img: image as read by OpenCV.
        trt_ssd: detector providing ``detect(img, conf_th)``.
        conf_th: confidence threshold passed to the detector.
        vis: visualiser providing ``draw_bboxes(img, boxes, confs, clss)``.
    """
    # time.clock() was removed in Python 3.8 and measured CPU time on Unix;
    # perf_counter() gives wall-clock time.
    tic = time.perf_counter()
    # Detect, then write the annotated image to result.jpg.
    boxes, confs, clss = trt_ssd.detect(img, conf_th)
    elapsed = time.perf_counter() - tic

    img = vis.draw_bboxes(img, boxes, confs, clss)
    cv2.imwrite("result.jpg",img)
    print("time: "+str(elapsed)+"(sec)")
18. Define the main() function to detect a single image
- You can upload images to the current folder by yourself, and change the filename to the name of the image you want to test
- face refers to a face without a mask, face_mask refers to a face with a mask, mask_weared_incorrect refers to a face with a mask that is worn incorrectly
#18-1
def main_one():
    """Detect masks in ``mask.jpg`` and save the annotated image.

    Raises:
        FileNotFoundError: if ``mask.jpg`` cannot be read. ``cv2.imread``
            returns ``None`` in that case rather than raising.
    """
    filename = "mask.jpg"
    model_name = "ssd_mobilenet_v1_face2"

    img = cv2.imread(filename)
    if img is None:
        raise FileNotFoundError(f"Could not read image {filename!r}")

    # The class dictionary is looked up by the last part of the model name.
    cls_dict = get_cls_dict(model_name.split('_')[-1])
    trt_ssd = TrtSSD(model_name, INPUT_HW)
    vis = BBoxVisualization(cls_dict)
    print("start detection!")
    detect_one(img, trt_ssd, conf_th=0.5, vis=vis)
    cv2.destroyAllWindows()
    print("finish!")
#18-2
# Notebook helper used to show the saved result image inline.
from IPython.display import Image
# Run single-image detection; detect_one() writes the annotated image to result.jpg.
main_one()
# Display the annotated image.
Image("result.jpg")
19. Define the main() function to detect the video
- You can upload the video to the current folder by yourself, and change the filename to the name of the video you want to test
- In the detection video part, since the detection results need to be written to the hard disk, the time will be doubled. If you want to get data similar to the single detection, you can comment out the read and write statements.
- face refers to a face without a mask, face_mask refers to a face with a mask, and mask_weared_incorrect refers to a face with a mask that is worn incorrectly
#19-1
def main_loop():
    """Detect masks in ``face_mask_test_video.mp4`` frame by frame.

    Raises:
        IOError: if the video file cannot be opened.
    """
    filename = "face_mask_test_video.mp4"
    result_file_name = str(filename)
    model_name = "ssd_mobilenet_v1_face2"

    video = cv2.VideoCapture(filename)
    try:
        # An unopened capture yields no frames, so the run would otherwise
        # "finish" silently without processing any.
        if not video.isOpened():
            raise IOError(f"Could not open video {filename!r}")

        # The class dictionary is looked up by the last part of the model name.
        cls_dict = get_cls_dict(model_name.split('_')[-1])
        trt_ssd = TrtSSD(model_name, INPUT_HW)
        vis = BBoxVisualization(cls_dict)
        print("start detection!")
        detect_video(video, trt_ssd, conf_th=0.8, vis=vis, result_file_name=result_file_name)
    finally:
        # Release the capture even when detection fails.
        video.release()
    cv2.destroyAllWindows()
    print("\nfinish!")
#19-2
# Run video detection; detect_video() writes the annotated video to result.AVI.
main_loop()
20. Transcode the resulting video to be able to view it in Jupyter Notebook
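- ffmpeg is used here to convert the output video to MP4 (H.264) format so that it can be viewed in the notebook. Note that the command below uses the libx264 software encoder; GPU-accelerated transcoding, which is significantly faster than simply using the CPU, requires a hardware encoder instead.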
#20
!rm result-ffmpeg4.mp4
!ffmpeg -i result.AVI -vcodec libx264 -f mp4 result-ffmpeg4.mp4
21. View the resulting video
#21
# Notebook helper used to embed the transcoded result video.
from IPython.display import Video
# Display the transcoded video.
Video("result-ffmpeg4.mp4")