Keeping up with Kaiming: recording the entire process of detecting recurring behavior with the SlowFast model (with detailed code).

Authors丨Fu Huihui, Zhou Yuchen
Editor丨Jishi Platform

Preface

In recent years there has been more and more research on deep-learning-based human action recognition. The SlowFast model proposes a two-pathway (slow and fast) network that performs very well on action recognition datasets. This article covers SlowFast data preparation, training, and ONNX inference, and focuses on SlowFast inference with TensorRT, person tracking with YOLOv5 and DeepSORT, and deployment in C++.

1. Data preparation

1.1 Cropping the videos

Prepare several sets of video data, where IN_DATA_DIR is the directory containing the original videos and OUT_DATA_DIR is the directory where the target videos are stored. This step ensures that all videos have the same length.

IN_DATA_DIR="/project/train/src_repo/data/video"
OUT_DATA_DIR="/project/train/src_repo/data/splitvideo"
str="_"
if [[ ! -d "${OUT_DATA_DIR}" ]]; then
  echo "${OUT_DATA_DIR} doesn't exist. Creating it.";
  mkdir -p ${OUT_DATA_DIR}
fi

for video in $(ls -A1 -U ${IN_DATA_DIR}/*)
do 
    for i in {0..10}
    do 
      index=$(expr $i \* 10)
      out_name="${OUT_DATA_DIR}/${i}${str}${video##*/}"
      if [ ! -f "${out_name}" ]; then
        ffmpeg -ss ${index} -t 80 -i "${video}" "${out_name}"
      fi
    done
done

1.2 Extracting keyframes

Keyframe extraction takes one frame from every second of the video. IN_DATA_DIR is the directory of the videos obtained in step 1.1 and OUT_DATA_DIR is the directory where the extracted keyframes are stored.

# Extract images, 1 frame per second
IN_DATA_DIR="/project/train/src_repo/data/splitvideo/"
OUT_DATA_DIR="/project/train/src_repo/data/splitimages/"
 
if [[ ! -d "${OUT_DATA_DIR}" ]]; then
  echo "${OUT_DATA_DIR} doesn't exist. Creating it.";
  mkdir -p ${OUT_DATA_DIR}
fi
 
for video in $(ls -A1 -U ${IN_DATA_DIR}/*)
do
  video_name=${video##*/}
 
  if [[ $video_name = *".webm" ]]; then
    video_name=${video_name::-5}
  else
    video_name=${video_name::-4}
  fi
 
  out_video_dir=${OUT_DATA_DIR}/${video_name}/
  mkdir -p "${out_video_dir}"
 
  out_name="${out_video_dir}/${video_name}_%06d.jpg"
 
  ffmpeg -i "${video}" -r 1 -q:v 1 "${out_name}"
done
 

1.3 Splitting the videos

Use ffmpeg to split the videos generated in step 1 into images at 30 frames per second; IN_DATA_DIR is the video directory and OUT_DATA_DIR is the directory where the results are stored.

IN_DATA_DIR="/project/train/src_repo/video"
OUT_DATA_DIR="/project/train/src_repo/spiltvideo"

if [[ ! -d "${OUT_DATA_DIR}" ]]; then
  echo "${OUT_DATA_DIR} doesn't exist. Creating it.";
  mkdir -p ${OUT_DATA_DIR}
fi

for video in $(ls -A1 -U ${IN_DATA_DIR}/*)
do
  out_name="${OUT_DATA_DIR}/${video##*/}"
  if [ ! -f "${out_name}" ]; then
    ffmpeg -ss 0 -t 100 -i "${video}" "${out_name}"
  fi
done

1.4 Directory layout

ava  # top-level folder holding the video information
—person_box_67091280_iou90 # second-level folder holding the object-detection files
——ava_detection_train_boxes_and_labels_include_negative_v2.2.csv # object-detection information used for training
——ava_detection_val_boxes_and_labels.csv # object-detection information used for testing
—ava_action_list_v2.2_for_activitynet_2019.pbtxt # file under the top-level folder holding the label information
—ava_val_excluded_timestamps_v2.2.csv # file under the top-level folder listing frames without a person; these frames are discarded during training
—ava_train_v2.2.csv # file under the top-level folder holding the training data (keyframe information)
—ava_val_v2.2.csv  # file under the top-level folder holding the validation data (keyframe information)

frame_lists  # top-level folder holding the paths of the images generated in 1.3
—train.csv
—val.csv

frames  # top-level folder holding the images generated in 1.3
—A
——A_000001.jpg
——A_0000012.jpg
——A_000090.jpg
—B
——B_000001.jpg
——B_0000012.jpg
——B_000090.jpg
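To make the two CSV flavors concrete, here is a rough, made-up sketch of what individual rows typically look like in the AVA-style format; the exact column order and header names should be checked against the official SlowFast AVA data preparation docs, and the values below are illustrative only.

# ava_train_v2.2.csv: video_id, keyframe timestamp (s), x1, y1, x2, y2 (normalized to [0,1]), action_id, person_id
A,904,0.077,0.151,0.283,0.811,12,0
# frame_lists/train.csv: original_video_id video_id frame_id path labels (space separated)
A 0 0 A/A_000001.jpg ""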

2. Environment setup

2.1 Installing dependencies

pip install iopath
pip install fvcore
pip install simplejson
pip install pytorchvideo

2.2 Installing detectron2

!python -m pip install pyyaml==5.1
import sys, os, distutils.core
# Note: This is a faster way to install detectron2 in Colab, but it does not include all functionalities.
# See https://detectron2.readthedocs.io/tutorials/install.html for full installation instructions
!git clone 'https://github.com/facebookresearch/detectron2'
dist = distutils.core.run_setup("./detectron2/setup.py")
!python -m pip install {' '.join([f"'{x}'" for x in dist.install_requires])}
sys.path.insert(0, os.path.abspath('./detectron2'))

3. SlowFast training

3.1 Training

python tools/run_net.py --cfg configs/AVA/SLOWFAST_32x2_R50_SHORT.yaml

SLOWFAST_32x2_R50_SHORT.yaml

TRAIN:
  ENABLE: True
  DATASET: ava
  BATCH_SIZE: 8 #64
  EVAL_PERIOD: 5
  CHECKPOINT_PERIOD: 1
  AUTO_RESUME: True
  CHECKPOINT_FILE_PATH: '/content/SLOWFAST_32x2_R101_50_50.pkl'  # path to the pretrained model
  CHECKPOINT_TYPE: pytorch
DATA:
  NUM_FRAMES: 32
  SAMPLING_RATE: 2
  TRAIN_JITTER_SCALES: [256, 320]
  TRAIN_CROP_SIZE: 224
  TEST_CROP_SIZE: 224
  INPUT_CHANNEL_NUM: [3, 3]
  PATH_TO_DATA_DIR: '/content/ava'
DETECTION:
  ENABLE: True
  ALIGNED: True
AVA:
  FRAME_DIR: '/content/ava/frames'   # directory generated during data preparation
  FRAME_LIST_DIR: '/content/ava/frame_lists'
  ANNOTATION_DIR: '/content/ava/annotations'
  DETECTION_SCORE_THRESH: 0.5
  FULL_TEST_ON_VAL: True
  TRAIN_PREDICT_BOX_LISTS: [
    "ava_train_v2.2.csv",
    "person_box_67091280_iou90/ava_detection_train_boxes_and_labels_include_negative_v2.2.csv",
  ]
  TEST_PREDICT_BOX_LISTS: [
    "person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"]
  
  
SLOWFAST:
  ALPHA: 4
  BETA_INV: 8
  FUSION_CONV_CHANNEL_RATIO: 2
  FUSION_KERNEL_SZ: 7
RESNET:
  ZERO_INIT_FINAL_BN: True
  WIDTH_PER_GROUP: 64
  NUM_GROUPS: 1
  DEPTH: 50
  TRANS_FUNC: bottleneck_transform
  STRIDE_1X1: False
  NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]]
  SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [2, 2]]
  SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [1, 1]]
NONLOCAL:
  LOCATION: [[[], []], [[], []], [[], []], [[], []]]
  GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]]
  INSTANTIATION: dot_product
  POOL: [[[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]]]
BN:
  USE_PRECISE_STATS: False
  NUM_BATCHES_PRECISE: 20
SOLVER:
  BASE_LR: 0.1
  LR_POLICY: steps_with_relative_lrs
  STEPS: [0, 10, 15, 20]
  LRS: [1, 0.1, 0.01, 0.001]
  MAX_EPOCH: 20
  MOMENTUM: 0.9
  WEIGHT_DECAY: 1e-7
  WARMUP_EPOCHS: 5.0
  WARMUP_START_LR: 0.000125
  OPTIMIZING_METHOD: sgd
MODEL:
  NUM_CLASSES: 1
  ARCH: slowfast
  MODEL_NAME: SlowFast
  LOSS_FUNC: bce
  DROPOUT_RATE: 0.5
  HEAD_ACT: sigmoid
TEST:
  ENABLE: False
  DATASET: ava
  BATCH_SIZE: 8
DATA_LOADER:
  NUM_WORKERS: 0
  PIN_MEMORY: True
NUM_GPUS: 1
NUM_SHARDS: 1
RNG_SEED: 0
OUTPUT_DIR: .

3.2 Common errors during training

1. In slowfast/datasets/ava_helper.py, change AVA_VALID_FRAMES to match the length of your videos (see the sketch after this list)

2. pytorchvideo.layers.distributed import error

from pytorchvideo.layers.distributed import ( # noqa
ImportError: cannot import name 'cat_all_gather' from 'pytorchvideo.layers.distributed' 
(/site-packages/pytorchvideo/layers/distributed.py)

3. pytorchvideo.losses import error

File "SlowFast/slowfast/models/losses.py", line 11, in
from pytorchvideo.losses.soft_target_cross_entropy import (
ModuleNotFoundError: No module named 'pytorchvideo.losses'

Errors 2 and 3 can be fixed by following reference link 1.
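For error 1, a minimal sketch of the change, assuming the constant keeps its default name in the SlowFast repository; the default range covers the original AVA movies (seconds 902 to 1798), and for short custom clips it has to be shifted to the timestamps that actually occur in your ava_train_v2.2.csv.

# slowfast/datasets/ava_helper.py (sketch; adjust the bounds to your own clips)
# default for the original AVA movies:
# AVA_VALID_FRAMES = range(902, 1799)
# for custom clips of, e.g., 100 seconds:
AVA_VALID_FRAMES = range(0, 100)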

4. SlowFast inference

Option 1: use the official script for inference

python tools/run_net.py --cfg demo/AVA/SLOWFAST_32x2_R101_50_50.yaml

Option 2: because of detectron2 installation problems and a series of deployment issues, you can use YOLOv5 plus SlowFast for inference

Let's first walk through the SlowFast inference process (SlowFast inference flowchart).

Step 1: continuously read 64 frames and check whether 64 frames have been collected

while was_read:
    frames=[]
    seq_length=64
    while was_read and len(frames) < seq_length:
        was_read, frame =cap.read()
        frames.append(frame)

Step 2: use YOLOv5 for person detection

1. YOLOv5 inference code; change the sys.path.insert path and the weights path

import argparse
import os
import platform
import shutil
import time
from pathlib import Path
import sys
import json
sys.path.insert(1, '/content/drive/MyDrive/yolov5/')
import cv2
import torch
import torch.backends.cudnn as cudnn
import numpy as np
import argparse
import time
import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
from models.common import DetectMultiBackend
from utils.augmentations import letterbox
from utils.general import check_img_size, non_max_suppression, scale_coords, set_logging
from utils.torch_utils import select_device
# ####### parameter settings
conf_thres = 0.6
iou_thres = 0.5
#######
imgsz = 640
weights = "/content/yolov5l.pt"
device = '0'
stride = 32
names = ["person"]
import os
def init():
    # Initialize
    global imgsz, device, stride
    set_logging()
    device = select_device('0')
    half = device.type != 'cpu'  # half precision only supported on CUDA
    model = DetectMultiBackend(weights, device=device, dnn=False)
    stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine
    imgsz = check_img_size(imgsz, s=stride)  # check img_size
    model.half()  # to FP16
    model.eval()
    return model

def process_image(model, input_image=None, args=None, **kwargs):
    img0 = input_image
    img = letterbox(img0, new_shape=imgsz, stride=stride, auto=True)[0]
    img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
    img = np.ascontiguousarray(img)

    img = torch.from_numpy(img).to(device)
    img = img.half()
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if len(img.shape) == 3:
        img = img[None]
    pred = model(img, augment=False, val=True)[0]
    pred = non_max_suppression(pred, conf_thres, iou_thres, agnostic=False)
    result=[]
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(img0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
        if det is not None and len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()
            for *xyxy, conf, cls in det:
                if cls==0:
                    result.append([float(xyxy[0]),float(xyxy[1]),float(xyxy[2]),float(xyxy[3])])
    if len(result)==0:
      return None
    return torch.from_numpy(np.array(result))

2. bbox preprocessing

def scale_boxes(size, boxes, height, width):
    """
    Scale the short side of the box to size.
    Args:
        size (int): size to scale the image.
        boxes (ndarray): bounding boxes to peform scale. The dimension is
        `num boxes` x 4.
        height (int): the height of the image.
        width (int): the width of the image.
    Returns:
        boxes (ndarray): scaled bounding boxes.
    """
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return boxes

    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
        boxes *= float(new_height) / height
    else:
        new_width = int(math.floor((float(width) / height) * size))
        boxes *= float(new_width) / width
    return boxes

Step 3: image preprocessing

1. Resize the image

def scale(size, image):
    """
    Scale the short side of the image to size.
    Args:
        size (int): size to scale the image.
        image (array): image to perform short side scale. Dimension is
            `height` x `width` x `channel`.
    Returns:
        (ndarray): the scaled image with dimension of
            `height` x `width` x `channel`.
    """
    height = image.shape[0]
    width = image.shape[1]
    # print(height,width)
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return image
    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
    else:
        new_width = int(math.floor((float(width) / height) * size))
    img = cv2.resize(
        image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
    )
    # print(new_width, new_height)
    return img.astype(np.float32)

2. Normalization

def tensor_normalize(tensor, mean, std, func=None):
    """
    Normalize a given tensor by subtracting the mean and dividing the std.
    Args:
        tensor (tensor): tensor to normalize.
        mean (tensor or list): mean value to subtract.
        std (tensor or list): std to divide.
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float()
        tensor = tensor / 255.0
    if type(mean) == list:
        mean = torch.tensor(mean)
    if type(std) == list:
        std = torch.tensor(std)
    if func is not None:
        tensor = func(tensor)
    tensor = tensor - mean
    tensor = tensor / std
    return tensor

3. Build the slow and fast inputs

The main idea is to select 32 of the 64 frames of image data as the fast input, then select 8 of those frames as the slow input, and permute T H W C -> C T H W. The final fast_pathway dimension is (b,3,32,h,w) and the slow_pathway dimension is (b,3,8,h,w).

def process_cv2_inputs(frames):
    """
    Normalize and prepare inputs as a list of tensors. Each tensor
    correspond to a unique pathway.
    Args:
        frames (list of array): list of input images (correspond to one clip) in range [0, 255].
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    inputs = torch.from_numpy(np.array(frames)).float() / 255
    inputs = tensor_normalize(inputs, [0.45,0.45,0.45], [0.225,0.225,0.225])
    # T H W C -> C T H W.
    inputs = inputs.permute(3, 0, 1, 2)
    # Sample frames for num_frames specified.
    index = torch.linspace(0, inputs.shape[1] - 1, 32).long()
    print(index)
    inputs = torch.index_select(inputs, 1, index)
    fast_pathway = inputs
    slow_pathway = torch.index_select(
            inputs,
            1,
            torch.linspace(
                0, inputs.shape[1] - 1, inputs.shape[1] // 4
            ).long(),
        )
    frame_list = [slow_pathway, fast_pathway]
    print(np.shape(frame_list[0]))
    inputs = [inp.unsqueeze(0) for inp in frame_list]
    return inputs

5. SlowFast ONNX inference

5.1 Exporting the ONNX file

import os
import sys
from collections import OrderedDict
import torch
import argparse
work_root = os.path.split(os.path.realpath(__file__))[0]
from slowfast.config.defaults import get_cfg
import slowfast.utils.checkpoint as cu
from slowfast.models import build_model


def parser_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cfg",
        dest="cfg_file",
        type=str,
        default=os.path.join(
            work_root, "/content/drive/MyDrive/SlowFast/demo/AVA/SLOWFAST_32x2_R101_50_50.yaml"),
        help="Path to the config file",
    )
    parser.add_argument(
        '--half',
        type=bool,
        default=False,
        help='use half mode',
    )
    parser.add_argument(
        '--checkpoint',
        type=str,
        default=os.path.join(work_root,
                             "/content/SLOWFAST_32x2_R101_50_50.pkl"),
        help='test model file path',
    )
    parser.add_argument(
        '--save',
        type=str,
        default=os.path.join(work_root, "/content/SLOWFAST_head.onnx"),
        help='save model file path',
    )
    return parser.parse_args()


def main():
    args = parser_args()
    print(args)
    cfg_file = args.cfg_file
    checkpoint_file = args.checkpoint
    save_checkpoint_file = args.save
    half_flag = args.half
    cfg = get_cfg()
    cfg.merge_from_file(cfg_file)
    cfg.TEST.CHECKPOINT_FILE_PATH = checkpoint_file
    print(cfg.DATA)
    print("export pytorch model to onnx!\n")
    device = "cuda:0"
    with torch.no_grad():
        model = build_model(cfg)
        model = model.to(device)
        model.eval()
        cu.load_test_checkpoint(cfg, model)
        if half_flag:
            model.half()
        fast_pathway= torch.randn(1, 3, 32, 256, 455)
        slow_pathway= torch.randn(1, 3, 8, 256, 455)
        bbox=torch.randn(32,5).to(device)
        fast_pathway = fast_pathway.to(device)
        slow_pathway = slow_pathway.to(device)
        inputs = [slow_pathway, fast_pathway]
        for p in model.parameters():
        	p.requires_grad = False
        torch.onnx.export(model, (inputs,bbox), save_checkpoint_file, input_names=['slow_pathway','fast_pathway','bbox'],output_names=['output'], opset_version=12)
        onnx_check()


def onnx_check():
    import onnx
    args = parser_args()
    print(args)
    onnx_model_path = args.save
    model = onnx.load(onnx_model_path)
    onnx.checker.check_model(model)


if __name__ == '__main__':
    main()

5.2 ONNX inference

import torch
import math
import onnxruntime
from torchvision.ops import roi_align
import argparse
import os
import platform
import shutil
import time
from pathlib import Path
import sys
import json
sys.path.insert(1, '/content/drive/MyDrive/yolov5/')
import cv2
import torch
import torch.backends.cudnn as cudnn
import numpy as np
import argparse
import time
import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
from models.common import DetectMultiBackend
from utils.augmentations import letterbox
from utils.general import check_img_size, non_max_suppression, scale_coords, set_logging
from utils.torch_utils import select_device
# ####### parameter settings
conf_thres = 0.6
iou_thres = 0.5
#######
imgsz = 640
weights = "/content/yolov5l.pt"
device = '0'
stride = 32
names = ["person"]
import os
def init():
    # Initialize
    global imgsz, device, stride
    set_logging()
    device = select_device('0')
    half = device.type != 'cpu'  # half precision only supported on CUDA
    model = DetectMultiBackend(weights, device=device, dnn=False)
    stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine
    imgsz = check_img_size(imgsz, s=stride)  # check img_size
    model.half()  # to FP16
    model.eval()
    return model

def process_image(model, input_image=None, args=None, **kwargs):
    img0 = input_image
    img = letterbox(img0, new_shape=imgsz, stride=stride, auto=True)[0]
    img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
    img = np.ascontiguousarray(img)

    img = torch.from_numpy(img).to(device)
    img = img.half()
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if len(img.shape) == 3:
        img = img[None]
    pred = model(img, augment=False, val=True)[0]
    pred = non_max_suppression(pred, conf_thres, iou_thres, agnostic=False)
    result=[]
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(img0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
        if det is not None and len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()
            for *xyxy, conf, cls in det:
                if cls==0:
                    result.append([float(xyxy[0]),float(xyxy[1]),float(xyxy[2]),float(xyxy[3])])
    if len(result)==0:
      return None
    for i in range(32-len(result)):
      result.append([float(0),float(0),float(0),float(0)])
    return torch.from_numpy(np.array(result))
def scale(size, image):
    """
    Scale the short side of the image to size.
    Args:
        size (int): size to scale the image.
        image (array): image to perform short side scale. Dimension is
            `height` x `width` x `channel`.
    Returns:
        (ndarray): the scaled image with dimension of
            `height` x `width` x `channel`.
    """
    height = image.shape[0]
    width = image.shape[1]
    # print(height,width)
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return image
    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
    else:
        new_width = int(math.floor((float(width) / height) * size))
    img = cv2.resize(
        image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
    )
    # print(new_width, new_height)
    return img.astype(np.float32)
def tensor_normalize(tensor, mean, std, func=None):
    """
    Normalize a given tensor by subtracting the mean and dividing the std.
    Args:
        tensor (tensor): tensor to normalize.
        mean (tensor or list): mean value to subtract.
        std (tensor or list): std to divide.
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float()
        tensor = tensor / 255.0
    if type(mean) == list:
        mean = torch.tensor(mean)
    if type(std) == list:
        std = torch.tensor(std)
    if func is not None:
        tensor = func(tensor)
    tensor = tensor - mean
    tensor = tensor / std
    return tensor
def scale_boxes(size, boxes, height, width):
    """
    Scale the short side of the box to size.
    Args:
        size (int): size to scale the image.
        boxes (ndarray): bounding boxes to peform scale. The dimension is
        `num boxes` x 4.
        height (int): the height of the image.
        width (int): the width of the image.
    Returns:
        boxes (ndarray): scaled bounding boxes.
    """
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return boxes

    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
        boxes *= float(new_height) / height
    else:
        new_width = int(math.floor((float(width) / height) * size))
        boxes *= float(new_width) / width
    return boxes
def process_cv2_inputs(frames):
    """
    Normalize and prepare inputs as a list of tensors. Each tensor
    correspond to a unique pathway.
    Args:
        frames (list of array): list of input images (correspond to one clip) in range [0, 255].
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    inputs = torch.from_numpy(np.array(frames)).float() / 255
    inputs = tensor_normalize(inputs, [0.45,0.45,0.45], [0.225,0.225,0.225])
    # T H W C -> C T H W.
    inputs = inputs.permute(3, 0, 1, 2)
    # Sample frames for num_frames specified.
    index = torch.linspace(0, inputs.shape[1] - 1, 32).long()
    print(index)
    inputs = torch.index_select(inputs, 1, index)
    fast_pathway = inputs
    slow_pathway = torch.index_select(
            inputs,
            1,
            torch.linspace(
                0, inputs.shape[1] - 1, inputs.shape[1] // 4
            ).long(),
        )
    frame_list = [slow_pathway, fast_pathway]
    print(np.shape(frame_list[0]))
    inputs = [inp.unsqueeze(0) for inp in frame_list]
    return inputs
# load the models
yolov5=init()
slowfast = onnxruntime.InferenceSession('/content/SLOWFAST_32x2_R101_50_50.onnx')
# load the data and start inference
cap = cv2.VideoCapture("/content/atm_125.mp4")
was_read=True
while was_read:
    frames=[]
    seq_length=64
    while was_read and len(frames) < seq_length:
        was_read, frame =cap.read()
        frames.append(frame)
    
    bboxes = process_image(yolov5,frames[64//2])
    if bboxes is not None:
      frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames]
      frames = [scale(256, frame) for frame in frames]
      inputs = process_cv2_inputs(frames)
      if bboxes is not None:
          bboxes = scale_boxes(256,bboxes,1080,1920)
          index_pad = torch.full(
              size=(bboxes.shape[0], 1),
              fill_value=float(0),
              device=bboxes.device,
          )
          # Pad frame index for each box.
          bboxes = torch.cat([index_pad, bboxes], axis=1)
      for i in range(len(inputs)):
        inputs[i] = inputs[i].numpy()
      if bboxes is not None:
          outputs = slowfast.run(None, {'slow_pathway': inputs[0], 'fast_pathway': inputs[1], 'bbox': bboxes.numpy().astype(np.float32)})
          for i in range(80):
            if outputs[0][0][i]>0.3:
              print(i)
          print(np.shape(outputs[0]))
    else:
        print("没有检测到任何人物")

6. SlowFast Python TensorRT inference

6.1 Exporting to TensorRT

Now for the main innovation of this article.

At first, this article tried to export the ONNX model directly to TensorRT, but the export failed. The reason is that roi_align has not yet been implemented in TensorRT (roi_align will be implemented in the next TensorRT release).

If you look at the exported ONNX graph, you will find that roi_align is only used in the head part (see the RoiAlign node in the graph).

We therefore propose the following idea, shown in the figure below: split off the roi_align module and run it separately, without TensorRT acceleration, and split SlowFast into two networks, where the main (body) network extracts features and the head network is responsible for the action classification. (network flowchart)
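As a minimal sketch of how the two exported ONNX files could be converted to the engines used in the inference code below, the following uses the TensorRT 8.x Python API (builder, OnnxParser, the deprecated but still available build_engine and max_workspace_size); the file names for the body and head models are assumptions chosen to match the paths used later, and the same conversion can also be done with the trtexec command-line tool.

import tensorrt as trt

def build_engine(onnx_path, engine_path):
    # parse the ONNX file and serialize a TensorRT engine (sketch, TensorRT 8.x API)
    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError('failed to parse ' + onnx_path)
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30
    engine = builder.build_engine(network, config)
    with open(engine_path, 'wb') as f:
        f.write(engine.serialize())

build_engine('/content/SLOWFAST_32x2_R101_50_50.onnx', '/content/SLOWFAST_32x2_R101_50_50.engine')
build_engine('/content/SLOWFAST_head.onnx', '/content/SLOWFAST_head.engine')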

6.2 TensorRT inference code

import ctypes
import os
import numpy as np
import cv2
import random
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import threading
import time


class TrtInference():
    _batch_size = 1
    def __init__(self, model_path=None, cuda_ctx=None):
        self._model_path = model_path
        if self._model_path is None:
            print("please set trt model path!")
            exit()
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx is None:
            self.cuda_ctx = cuda.Device(0).make_context()
        if self.cuda_ctx:
            self.cuda_ctx.push()
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        self._load_plugins()
        self.engine = self._load_engine()
        try:
            self.context = self.engine.create_execution_context()
            self.stream = cuda.Stream()
            for index, binding in enumerate(self.engine):
                if self.engine.binding_is_input(binding):
                    batch_shape = list(self.engine.get_binding_shape(binding)).copy()
                    batch_shape[0] = self._batch_size
                    self.context.set_binding_shape(index, batch_shape)
            self.host_inputs, self.host_outputs, self.cuda_inputs, self.cuda_outputs, self.bindings = self._allocate_buffers()
        except Exception as e:
            raise RuntimeError('fail to allocate CUDA resources') from e
        finally:
            if self.cuda_ctx:
                self.cuda_ctx.pop()

    def _load_plugins(self):
        pass

    def _load_engine(self):
        with open(self._model_path, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):
        host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings = \
            [], [], [], [], []
        for index, binding in enumerate(self.engine):
            size = trt.volume(self.context.get_binding_shape(index)) * \
                   self.engine.max_batch_size
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(cuda_mem))
            if self.engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        return host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings

    def destroy(self):
        """Free CUDA memories and context."""
        del self.cuda_outputs
        del self.cuda_inputs
        del self.stream
        if self.cuda_ctx:
            self.cuda_ctx.pop()
            del self.cuda_ctx

    def inference(self, inputs):
        np.copyto(self.host_inputs[0], inputs[0].ravel())
        np.copyto(self.host_inputs[1], inputs[1].ravel())
        if self.cuda_ctx:
            self.cuda_ctx.push()
        cuda.memcpy_htod_async(
            self.cuda_inputs[0], self.host_inputs[0], self.stream)
        cuda.memcpy_htod_async(
            self.cuda_inputs[1], self.host_inputs[1], self.stream)
        self.context.execute_async(
            batch_size=1,
            bindings=self.bindings,
            stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(
            self.host_outputs[0], self.cuda_outputs[0], self.stream)
        cuda.memcpy_dtoh_async(
            self.host_outputs[1], self.cuda_outputs[1], self.stream)
        self.stream.synchronize()
        if self.cuda_ctx:
            self.cuda_ctx.pop()
        output = [self.host_outputs[0],self.host_outputs[1]]
        return output


class TrtInference_head():
    _batch_size = 1
    def __init__(self, model_path=None, cuda_ctx=None):
        self._model_path = model_path
        if self._model_path is None:
            print("please set trt model path!")
            exit()
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx is None:
            self.cuda_ctx = cuda.Device(0).make_context()
        if self.cuda_ctx:
            self.cuda_ctx.push()
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        self._load_plugins()
        self.engine = self._load_engine()
        try:
            self.context = self.engine.create_execution_context()
            self.stream = cuda.Stream()
            for index, binding in enumerate(self.engine):
                if self.engine.binding_is_input(binding):
                    batch_shape = list(self.engine.get_binding_shape(binding)).copy()
                    batch_shape[0] = self._batch_size
                    self.context.set_binding_shape(index, batch_shape)
            self.host_inputs, self.host_outputs, self.cuda_inputs, self.cuda_outputs, self.bindings = self._allocate_buffers()
        except Exception as e:
            raise RuntimeError('fail to allocate CUDA resources') from e
        finally:
            if self.cuda_ctx:
                self.cuda_ctx.pop()

    def _load_plugins(self):
        pass

    def _load_engine(self):
        with open(self._model_path, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):
        host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings = \
            [], [], [], [], []
        for index, binding in enumerate(self.engine):
            size = trt.volume(self.context.get_binding_shape(index)) * \
                   self.engine.max_batch_size
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(cuda_mem))
            if self.engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        return host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings

    def destroy(self):
        """Free CUDA memories and context."""
        del self.cuda_outputs
        del self.cuda_inputs
        del self.stream
        if self.cuda_ctx:
            self.cuda_ctx.pop()
            del self.cuda_ctx

    def inference(self, inputs):
        np.copyto(self.host_inputs[0], inputs[0].ravel())
        np.copyto(self.host_inputs[1], inputs[1].ravel())
        if self.cuda_ctx:
            self.cuda_ctx.push()
        cuda.memcpy_htod_async(
            self.cuda_inputs[0], self.host_inputs[0], self.stream)
        cuda.memcpy_htod_async(
            self.cuda_inputs[1], self.host_inputs[1], self.stream)
        self.context.execute_async(
            batch_size=1,
            bindings=self.bindings,
            stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(
            self.host_outputs[0], self.cuda_outputs[0], self.stream)
        self.stream.synchronize()
        if self.cuda_ctx:
            self.cuda_ctx.pop()
        output = self.host_outputs[0]
        return output

import torch
import math
from torchvision.ops import roi_align
import argparse
import os
import platform
import shutil
import time
from pathlib import Path
import sys
import json
sys.path.insert(1, '/content/drive/MyDrive/yolov5/')
import cv2
import torch
import torch.backends.cudnn as cudnn
import numpy as np
import argparse
import time
import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
from models.common import DetectMultiBackend
from utils.augmentations import letterbox
from utils.general import check_img_size, non_max_suppression, scale_coords, set_logging
from utils.torch_utils import select_device
# ####### parameter settings
conf_thres = 0.89
iou_thres = 0.5
#######
imgsz = 640
weights = "/content/yolov5l.pt"
device = '0'
stride = 32
names = ["person"]
import os
def init():
    # Initialize
    global imgsz, device, stride
    set_logging()
    device = select_device('0')
    half = device.type != 'cpu'  # half precision only supported on CUDA
    model = DetectMultiBackend(weights, device=device, dnn=False)
    stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine
    imgsz = check_img_size(imgsz, s=stride)  # check img_size
    model.half()  # to FP16
    model.eval()
    return model

def process_image(model, input_image=None, args=None, **kwargs):
    img0 = input_image
    img = letterbox(img0, new_shape=imgsz, stride=stride, auto=True)[0]
    img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
    img = np.ascontiguousarray(img)

    img = torch.from_numpy(img).to(device)
    img = img.half()
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if len(img.shape) == 3:
        img = img[None]
    pred = model(img, augment=False, val=True)[0]
    pred = non_max_suppression(pred, conf_thres, iou_thres, agnostic=False)
    result=[]
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(img0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
        if det is not None and len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()
            for *xyxy, conf, cls in det:
                if cls==0:
                    result.append([float(xyxy[0]),float(xyxy[1]),float(xyxy[2]),float(xyxy[3])])
    if len(result)==0:
      return None
    for i in range(32-len(result)):
      result.append([float(0),float(0),float(0),float(0)])
    return torch.from_numpy(np.array(result))
def scale(size, image):
    """
    Scale the short side of the image to size.
    Args:
        size (int): size to scale the image.
        image (array): image to perform short side scale. Dimension is
            `height` x `width` x `channel`.
    Returns:
        (ndarray): the scaled image with dimension of
            `height` x `width` x `channel`.
    """
    height = image.shape[0]
    width = image.shape[1]
    # print(height,width)
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return image
    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
    else:
        new_width = int(math.floor((float(width) / height) * size))
    img = cv2.resize(
        image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
    )
    # print(new_width, new_height)
    return img.astype(np.float32)
def tensor_normalize(tensor, mean, std, func=None):
    """
    Normalize a given tensor by subtracting the mean and dividing the std.
    Args:
        tensor (tensor): tensor to normalize.
        mean (tensor or list): mean value to subtract.
        std (tensor or list): std to divide.
    """
    if tensor.dtype == torch.uint8:
        tensor = tensor.float()
        tensor = tensor / 255.0
    if type(mean) == list:
        mean = torch.tensor(mean)
    if type(std) == list:
        std = torch.tensor(std)
    if func is not None:
        tensor = func(tensor)
    tensor = tensor - mean
    tensor = tensor / std
    return tensor
def scale_boxes(size, boxes, height, width):
    """
    Scale the short side of the box to size.
    Args:
        size (int): size to scale the image.
        boxes (ndarray): bounding boxes to peform scale. The dimension is
        `num boxes` x 4.
        height (int): the height of the image.
        width (int): the width of the image.
    Returns:
        boxes (ndarray): scaled bounding boxes.
    """
    if (width <= height and width == size) or (
        height <= width and height == size
    ):
        return boxes

    new_width = size
    new_height = size
    if width < height:
        new_height = int(math.floor((float(height) / width) * size))
        boxes *= float(new_height) / height
    else:
        new_width = int(math.floor((float(width) / height) * size))
        boxes *= float(new_width) / width
    return boxes
def process_cv2_inputs(frames):
    """
    Normalize and prepare inputs as a list of tensors. Each tensor
    correspond to a unique pathway.
    Args:
        frames (list of array): list of input images (correspond to one clip) in range [0, 255].
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    inputs = torch.from_numpy(np.array(frames)).float() / 255
    inputs = tensor_normalize(inputs, [0.45,0.45,0.45], [0.225,0.225,0.225])
    # T H W C -> C T H W.
    inputs = inputs.permute(3, 0, 1, 2)
    # Sample frames for num_frames specified.
    index = torch.linspace(0, inputs.shape[1] - 1, 32).long()
    print(index)
    inputs = torch.index_select(inputs, 1, index)
    fast_pathway = inputs
    slow_pathway = torch.index_select(
            inputs,
            1,
            torch.linspace(
                0, inputs.shape[1] - 1, inputs.shape[1] // 4
            ).long(),
        )
    frame_list = [slow_pathway, fast_pathway]
    print(np.shape(frame_list[0]))
    inputs = [inp.unsqueeze(0) for inp in frame_list]
    return inputs
# load the models
yolov5=init()
slowfast = TrtInference('/content/SLOWFAST_32x2_R101_50_50.engine',None)
head = TrtInference_head('/content/SLOWFAST_head.engine',None)

# load the data and start inference
cap = cv2.VideoCapture("/content/atm_125.mp4")
was_read=True
while was_read:
    frames=[]
    seq_length=64
    while was_read and len(frames) < seq_length:
        was_read, frame =cap.read()
        frames.append(frame)
    
    bboxes = process_image(yolov5,frames[64//2])
    if bboxes is not None:
      frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames]
      frames = [scale(256, frame) for frame in frames]
      inputs = process_cv2_inputs(frames)
      print(bboxes)
      if bboxes is not None:
          bboxes = scale_boxes(256,bboxes,1080,1920)
          index_pad = torch.full(
              size=(bboxes.shape[0], 1),
              fill_value=float(0),
              device=bboxes.device,
          )
          # Pad frame index for each box.
          bboxes = torch.cat([index_pad, bboxes], axis=1)
      for i in range(len(inputs)):
        inputs[i] = inputs[i].numpy()
      if bboxes is not None:
          outputs=slowfast.inference(inputs)
          outputs[0]=outputs[0].reshape(1,2048,16,29)
          outputs[1]=outputs[1].reshape(1,256,16,29)
          outputs[0]=torch.from_numpy(outputs[0])
          outputs[1]=torch.from_numpy(outputs[1])
          outputs[0]=roi_align(outputs[0],bboxes.to(dtype=outputs[0].dtype),7,1.0/16,0,True)
          outputs[1]=roi_align(outputs[1],bboxes.to(dtype=outputs[1].dtype),7,1.0/16,0,True)
          outputs[0] = outputs[0].numpy()
          outputs[1] = outputs[1].numpy()
          prd=head.inference(outputs)
          prd=prd.reshape(32,80)
          for i in range(80):
            if prd[0][i]>0.3:
              print(i)
    else:
        print("没有检测到任何人物")

Reading through the code above (Python TensorRT flowchart): slow_pathway and fast_pathway are passed through the SlowFast main (body) model, the outputs are reshaped to the dimensions required by roi_align, roi_align is applied with the bboxes and the corresponding parameters, and the result provides the inputs required by the head model, which produces the final prediction.

7. SlowFast C++ TensorRT deployment

7.1 YOLOv5 C++ object detection

YOLOv5 itself is not covered in this article; I directly use the YOLOv5 TensorRT code that ships with the platform:

https://github.com/ExtremeMart/ev_sdk_demo4.0_pedestrian_intrusion_yolov5

7.2 DeepSORT C++ object tracking

This article uses the following DeepSORT code:

https://github.com/RichardoMrMu/deepsort-tensorrt

Since this part is not the focus of this article, you only need to know how to use this code: write the CMakeLists file and call it in your code as follows:

#include "deepsort.h" 
/**
	DeepSortBox holds the results detected by yolov5
	DeepSortBox structure
	{
		x1,
		y1,
		x2,
		y2,
		score,
		label,
		trackID
	}
	img is the original input image
	the final results are stored in DeepSortBox
*/
DS->sort(img, DeepSortBox); 

7.3 SlowFast C++ action recognition

Runtime environment:

Tensorrt8.4

opencv4.1.1

cudnn8.0

cuda11.1

Files to prepare:

body.onnx

head.onnx

(SlowFast inference flowchart)

We implement the TensorRT inference code following the flowchart above.

Visualize the ONNX inputs and outputs of body.onnx (body.onnx graph)

Inputs and outputs of head.onnx (head.onnx graph)
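Since the graph screenshots are not reproduced here, a small sketch like the following can be used to print the input and output tensor names and static shapes of the two ONNX files (using the onnx package; the file paths are the ones assumed above, and dynamic dimensions print as 0).

import onnx

def print_io(path):
    # print the name and static shape of every graph input and output
    model = onnx.load(path)
    for kind, values in (("input", model.graph.input), ("output", model.graph.output)):
        for v in values:
            dims = [d.dim_value for d in v.type.tensor_type.shape.dim]
            print(kind, v.name, dims)

print_io("body.onnx")
print_io("head.onnx")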

Step 1: load the models

body.onnx and head.onnx are loaded and parsed by TensorRT, and the TensorRT inference workspace is created. The code is as follows:

void loadheadOnnx(const std::string strModelName)
{
    Logger gLogger;
    // build the network following the TensorRT pipeline
    IBuilder* builder = createInferBuilder(gLogger);
    builder->setMaxBatchSize(1);
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);  
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
    parser->parseFromFile(strModelName.c_str(), static_cast<int>(ILogger::Severity::kWARNING));
    IBuilderConfig* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1ULL << 30);    
    m_CudaheadEngine = builder->buildEngineWithConfig(*network, *config);    

    std::string strTrtName = strModelName;
    size_t sep_pos = strTrtName.find_last_of(".");
    strTrtName = strTrtName.substr(0, sep_pos) + ".trt";
    IHostMemory *gieModelStream = m_CudaheadEngine->serialize();
    std::string serialize_str;
    std::ofstream serialize_output_stream;
    serialize_str.resize(gieModelStream->size());   
    memcpy((void*)serialize_str.data(),gieModelStream->data(),gieModelStream->size());
    serialize_output_stream.open(strTrtName.c_str());
    serialize_output_stream<<serialize_str;
    serialize_output_stream.close();
    m_CudaheadContext = m_CudaheadEngine->createExecutionContext();
    parser->destroy();
    network->destroy();
    config->destroy();
    builder->destroy();
}

Step 2: allocate space for the input and output data

The inputs of body.onnx are slow_pathway and fast_pathway with dimensions (B,C,T,H,W), where T is 8 for slow_pathway and 32 for fast_pathway; the corresponding outputs have dimensions (B,2048,16,29) and (B,256,16,29). The inputs of head.onnx are (32,2048,7,7) and (32,256,7,7), and its output is (32,80). The specific code implementation is as follows:

	slow_pathway_InputIndex = m_CudaslowfastEngine->getBindingIndex(slow_pathway_NAME);
    fast_pathway_InputIndex = m_CudaslowfastEngine->getBindingIndex(fast_pathway_NAME);
    slow_pathway_OutputIndex = m_CudaslowfastEngine->getBindingIndex(slow_pathway_OUTPUT);
    fast_pathway_OutputIndex = m_CudaslowfastEngine->getBindingIndex(fast_pathway_OUTPUT); 
    dims_i = m_CudaslowfastEngine->getBindingDimensions(slow_pathway_InputIndex);
    SDKLOG(INFO)<<slow_pathway_InputIndex<<" "<<fast_pathway_InputIndex<<" "<<slow_pathway_OutputIndex<<" "<<fast_pathway_OutputIndex;
    SDKLOG(INFO) << "slow_pathway dims " << dims_i.d[0] << " " << dims_i.d[1] << " " << dims_i.d[2] << " " << dims_i.d[3]<< " " << dims_i.d[4];
    size = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3]* dims_i.d[4];
    cudaMalloc(&slowfast_ArrayDevMemory[slow_pathway_InputIndex], size * sizeof(float));
    slowfast_ArrayHostMemory[slow_pathway_InputIndex] = malloc(size * sizeof(float));
    slowfast_ArraySize[slow_pathway_InputIndex]=size* sizeof(float);
    
    dims_i = m_CudaslowfastEngine->getBindingDimensions(fast_pathway_InputIndex);
    SDKLOG(INFO) << "fast_pathway dims " << dims_i.d[0] << " " << dims_i.d[1] << " " << dims_i.d[2] << " " << dims_i.d[3]<< " " << dims_i.d[4];
    size = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3]* dims_i.d[4];
    cudaMalloc(&slowfast_ArrayDevMemory[fast_pathway_InputIndex], size * sizeof(float));
    slowfast_ArrayHostMemory[fast_pathway_InputIndex] = malloc(size * sizeof(float));
    slowfast_ArraySize[fast_pathway_InputIndex]=size* sizeof(float);
    
    
    dims_i = m_CudaslowfastEngine->getBindingDimensions(slow_pathway_OutputIndex);
    SDKLOG(INFO) << "slow_out dims " << dims_i.d[0] << " " << dims_i.d[1] << " " << dims_i.d[2] << " " << dims_i.d[3];
    size = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3];
    cudaMalloc(&slowfast_ArrayDevMemory[slow_pathway_OutputIndex], size * sizeof(float));
    slowfast_ArrayHostMemory[slow_pathway_OutputIndex] = malloc(size * sizeof(float));
    slowfast_ArraySize[slow_pathway_OutputIndex]=size* sizeof(float);
    
    
    
    dims_i = m_CudaslowfastEngine->getBindingDimensions(fast_pathway_OutputIndex);
    SDKLOG(INFO) << "fast_out dims " << dims_i.d[0] << " " << dims_i.d[1] << " " << dims_i.d[2] << " " << dims_i.d[3];
    size = dims_i.d[0] * dims_i.d[1] * dims_i.d[2] * dims_i.d[3];
    cudaMalloc(&slowfast_ArrayDevMemory[fast_pathway_OutputIndex], size * sizeof(float));
    slowfast_ArrayHostMemory[fast_pathway_OutputIndex] = malloc(size * sizeof(float));
    slowfast_ArraySize[fast_pathway_OutputIndex]=size* sizeof(float);
    
    
    
    size=32*2048*7*7;
    cudaMalloc(&ROIAlign_ArrayDevMemory[0], size * sizeof(float));
    ROIAlign_ArrayHostMemory[0] = malloc(size * sizeof(float));
    ROIAlign_ArraySize[0]=size* sizeof(float);
    
    size=32*256*7*7;
    cudaMalloc(&ROIAlign_ArrayDevMemory[1], size * sizeof(float));
    ROIAlign_ArrayHostMemory[1] = malloc(size * sizeof(float));
    ROIAlign_ArraySize[1]=size* sizeof(float);
    
    
    size=32*80;
    cudaMalloc(&ROIAlign_ArrayDevMemory[2], size * sizeof(float));
    ROIAlign_ArrayHostMemory[2] = malloc(size * sizeof(float));
    ROIAlign_ArraySize[2]=size* sizeof(float);
    size=32*5;
    boxes_data= malloc(size * sizeof(float));
    dims_i = m_CudaheadEngine->getBindingDimensions(0);

Step 3: preprocess the input data

Since the exported ONNX file does not use dynamic shapes, the size of the input image is fixed first: size = 256*455 (this is a proportional scaling of 1080*1920, so that the short side becomes 256). SlowFast expects RGB images, so the BGR image must first be converted to RGB and then resized to 256*455. The code is implemented as follows:

		cv::Mat framesimg = img.clone();
        cv::cvtColor(framesimg, framesimg, cv::COLOR_BGR2RGB);
        int height = framesimg.rows;
        int width = framesimg.cols;
        // preprocess the image
        // cv2.COLOR_BGR2RGB
        int size=256;
        int new_width = width;
        int new_height = height;
        if ((width <= height && width == size) || (height <= width and height == size)){
            
        }
        else{
            new_width = size;
            new_height = size;
            if(width<height){
                new_height = int((float(height) / width) * size);
            }else{  
                new_width = int((float(width) / height) * size);
            }
            cv::resize(framesimg, framesimg, cv::Size{new_width,new_height},cv::INTER_LINEAR);
        } 

Next, the image is normalized and arranged in C,T,H,W order, where C is the channel, T the frame index, H the image height and W the image width. Since SlowFast has two inputs, one input, fast_pathway, is a 32-frame clip with dimension (b,c,T,h,w) where T is 32, so image data is added to fast_pathway every two frames; the other input, slow_pathway, is an 8-frame clip with dimension (b,c,T,h,w) where T is 8, so image data is added to slow_pathway every four frames. The specific code is as follows:

		float *data=(float *)slowfast_ArrayHostMemory[fast_pathway_InputIndex];
        new_width =  framesimg.cols;
        new_height = framesimg.rows;
        for (size_t c = 0; c < 3; c++)
        {
            for (size_t  h = 0; h < new_height; h++)
            {
                for (size_t w = 0; w < new_width; w++)
                {
                    float v=((float)framesimg.at<cv::Vec3b>(h, w)[c]) / 255.0f;
                    v -= 0.45;
                    v /= 0.225;
                    data[c*32*256*455+fast_index* new_width * new_height + h * new_width + w] =v;
                }
            }
        }
        fast_index++;
        if(frames==0||frames==8||frames==16||frames==26||frames==34||frames==44||frames==52||frames==63){
            data=(float *)slowfast_ArrayHostMemory[slow_pathway_InputIndex];
            for (size_t c = 0; c < 3; c++)
            {
                for (size_t  h = 0; h < new_height; h++)
                {
                    for (size_t w = 0; w < new_width; w++)
                    {
                       float v=((float)framesimg.at<cv::Vec3b>(h, w)[c]) / 255.0f;
                        v -= 0.45;
                        v /= 0.225;
                        data[c*8*256*455+slow_index* new_width * new_height + h * new_width + w] =v;
                    }
                }
            }  
            slow_index++;
        }

Step 4: implementing roi_align

As described in the previous section, roi_align is not implemented in the current TensorRT version, but it is implemented in torchvision.ops and can be called directly from the Python inference code. The C++ code, however, has to implement roi_align itself. The underlying principle is not explained in detail here; you can simply think of roi_align as a crop-and-resize process: the features corresponding to each bbox are extracted from the feature map and resized to 7*7. The specific code is implemented as follows:

void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois,
                     const int height, const int width, const int channels,
                     const int aligned_height, const int aligned_width, const float * bottom_rois,
                     float* top_data)
{
    const int output_size = num_rois * aligned_height * aligned_width * channels;

    int idx = 0;
    for (idx = 0; idx < output_size; ++idx)
    {
        int pw = idx % aligned_width;
        int ph = (idx / aligned_width) % aligned_height;
        int c = (idx / aligned_width / aligned_height) % channels;
        int n = idx / aligned_width / aligned_height / channels;  

        float roi_batch_ind = 0; 
        float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
        float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
        float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
        float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 
        float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
        float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
        float bin_size_h = roi_height / (aligned_height - 1.);
        float bin_size_w = roi_width / (aligned_width - 1.);

        float h = (float)(ph) * bin_size_h + roi_start_h;
        float w = (float)(pw) * bin_size_w + roi_start_w;

        int hstart = fminf(floor(h), height - 2); 
        int wstart = fminf(floor(w), width - 2);

        int img_start = roi_batch_ind * channels * height * width; 
        if (h < 0 || h >= height || w < 0 || w >= width)  
        {
            top_data[idx] = 0.; 
        }
        else
        {
            float h_ratio = h - (float)(hstart); 
            float w_ratio = w - (float)(wstart);
            int upleft = img_start + (c * height + hstart) * width + wstart;
            
            int upright = upleft + 1;
            int downleft = upleft + width; 
            int downright = downleft + 1; 

            top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio)
                + bottom_data[upright] * (1. - h_ratio) * w_ratio
                + bottom_data[downleft] * h_ratio * (1. - w_ratio)
                + bottom_data[downright] * h_ratio * w_ratio;  
        }
    }
}

Step 5: inference

First, the data prepared in Step 3 is fed to the body model for inference; the inference results are then passed to the roi_align from Step 4 to extract the features corresponding to each bbox; finally, the extracted features are fed into the head model to obtain the output. The specific code is implemented as follows:

cudaMemcpyAsync(slowfast_ArrayDevMemory[slow_pathway_InputIndex], slowfast_ArrayHostMemory[slow_pathway_InputIndex], slowfast_ArraySize[slow_pathway_InputIndex], cudaMemcpyHostToDevice, m_CudaStream);
    cudaMemcpyAsync(slowfast_ArrayDevMemory[fast_pathway_InputIndex], slowfast_ArrayHostMemory[fast_pathway_InputIndex], slowfast_ArraySize[fast_pathway_InputIndex], cudaMemcpyHostToDevice, m_CudaStream);
    m_CudaslowfastContext->enqueueV2(slowfast_ArrayDevMemory , m_CudaStream, nullptr);    
   cudaMemcpyAsync(slowfast_ArrayHostMemory[slow_pathway_OutputIndex], slowfast_ArrayDevMemory[slow_pathway_OutputIndex], slowfast_ArraySize[slow_pathway_OutputIndex], cudaMemcpyDeviceToHost, m_CudaStream);
    cudaMemcpyAsync(slowfast_ArrayHostMemory[fast_pathway_OutputIndex], slowfast_ArrayDevMemory[fast_pathway_OutputIndex], slowfast_ArraySize[fast_pathway_OutputIndex], cudaMemcpyDeviceToHost, m_CudaStream);
    cudaStreamSynchronize(m_CudaStream);  
    data=(float*)slowfast_ArrayHostMemory[fast_pathway_OutputIndex];
    ROIAlignForwardCpu((float*)slowfast_ArrayHostMemory[slow_pathway_OutputIndex], 0.0625, 32,16,29, 2048,7, 7, (float*)boxes_data,       (float*)ROIAlign_ArrayHostMemory[0]);
    ROIAlignForwardCpu((float*)slowfast_ArrayHostMemory[fast_pathway_OutputIndex], 0.0625, 32,16,29, 256,7, 7, (float*)boxes_data,       (float*)ROIAlign_ArrayHostMemory[1]);
    data=(float*)ROIAlign_ArrayHostMemory[0];
    cudaMemcpyAsync(ROIAlign_ArrayDevMemory[0], ROIAlign_ArrayHostMemory[0], ROIAlign_ArraySize[0], cudaMemcpyHostToDevice, m_CudaStream);
    cudaMemcpyAsync(ROIAlign_ArrayDevMemory[1], ROIAlign_ArrayHostMemory[1], ROIAlign_ArraySize[1], cudaMemcpyHostToDevice, m_CudaStream);
    m_CudaheadContext->enqueueV2(ROIAlign_ArrayDevMemory, m_CudaStream, nullptr); 
    cudaMemcpyAsync(ROIAlign_ArrayHostMemory[2], ROIAlign_ArrayDevMemory[2], ROIAlign_ArraySize[2], cudaMemcpyDeviceToHost, m_CudaStream);
    cudaStreamSynchronize(m_CudaStream); 

Reference links

1. https://blog.csdn.net/y459541195/article/details/126278476
2. https://blog.csdn.net/WhiffeYF/article/details/115581800
3. https://github.com/facebookresearch/SlowFast

Reprinted from: blog.csdn.net/Extremevision/article/details/127800673