YOLOv7 instance segmentation

environment

  • ubuntu 18.04 64bit

  • python 3.8

  • pytorch1.8.2 + cu111

Introduction

The previous YOLOv7 mentioned that in addition to target detection, YOLOv7it will also , but the author only opened the pose estimation model at that time. The good news is that recently, YOLOv7the author has released a model for instance segmentation. In this article, we will take a look at the performance YOLOv7in instance segmentation.

practice

The code for instance segmentation is stored maskin and is still being improved. We directly download the latest code on the branch

git clone https://github.com/WongKinYiu/yolov7.git -b mask yolov7-mask
cd yolov7-mask

The split part depends facebookon detectron2, and detectron2requires torcha version greater than 1.8. Since I have been using 1.7.1 before, a new environment needs to be created here

# 创建新的虚拟环境
conda create -n pytorch1.8 python=3.8
conda activate pytorch1.8

# 安装 torch 1.8.2
pip install torch==1.8.2 torchvision==0.9.2 torchaudio===0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu111

# 修改requirements.txt,将其中的torch和torchvision注释掉
pip install -r requirements.txt

# 安装detectron2
git clone https://github.com/facebookresearch/detectron2
cd detectron2
python setup.py install
cd ..

Then download the instance segmentation model https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-mask.pt, put the model into the source code directory

The example code for splitting is in tools/instance.ipynband can be run directly jupyter notebookin

If you need to convert ipynbthe file into pythona file, just execute

jupyter nbconvert --to python tools/instance.ipynb

8ba48b0e7ff2f3a489647a1d212eca4c.png

tools/instance.pyCopy the generated to the root directory of the source code, and then modify the last display part of the file to save the result picture

import matplotlib.pyplot as plt
import torch
import cv2
import yaml
from torchvision import transforms
import numpy as np

from utils.datasets import letterbox
from utils.general import non_max_suppression_mask_conf

from detectron2.modeling.poolers import ROIPooler
from detectron2.structures import Boxes
from detectron2.utils.memory import retry_if_cuda_oom
from detectron2.layers import paste_masks_in_image

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
with open('data/hyp.scratch.mask.yaml') as f:
    hyp = yaml.load(f, Loader=yaml.FullLoader)
weigths = torch.load('yolov7-mask.pt')
model = weigths['model']
model = model.half().to(device)
_ = model.eval()

image = cv2.imread('inference/images/horses.jpg')  # 504x378 image
image = letterbox(image, 640, stride=64, auto=True)[0]
image_ = image.copy()
image = transforms.ToTensor()(image)
image = torch.tensor(np.array([image.numpy()]))
image = image.to(device)
image = image.half()

output = model(image)

inf_out, train_out, attn, mask_iou, bases, sem_output = output['test'], output['bbox_and_cls'], output['attn'], output['mask_iou'], output['bases'], output['sem']

bases = torch.cat([bases, sem_output], dim=1)
nb, _, height, width = image.shape
names = model.names
pooler_scale = model.pooler_scale
pooler = ROIPooler(output_size=hyp['mask_resolution'], scales=(pooler_scale,), sampling_ratio=1, pooler_type='ROIAlignV2', canonical_level=2)

output, output_mask, output_mask_score, output_ac, output_ab = non_max_suppression_mask_conf(inf_out, attn, bases, pooler, hyp, conf_thres=0.25, iou_thres=0.65, merge=False, mask_iou=None)

pred, pred_masks = output[0], output_mask[0]
base = bases[0]
bboxes = Boxes(pred[:, :4])
original_pred_masks = pred_masks.view(-1, hyp['mask_resolution'], hyp['mask_resolution'])
pred_masks = retry_if_cuda_oom(paste_masks_in_image)( original_pred_masks, bboxes, (height, width), threshold=0.5)
pred_masks_np = pred_masks.detach().cpu().numpy()
pred_cls = pred[:, 5].detach().cpu().numpy()
pred_conf = pred[:, 4].detach().cpu().numpy()
nimg = image[0].permute(1, 2, 0) * 255
nimg = nimg.cpu().numpy().astype(np.uint8)
nimg = cv2.cvtColor(nimg, cv2.COLOR_RGB2BGR)
nbboxes = bboxes.tensor.detach().cpu().numpy().astype(np.int)
pnimg = nimg.copy()

for one_mask, bbox, cls, conf in zip(pred_masks_np, nbboxes, pred_cls, pred_conf):
    if conf < 0.25:
        continue
    color = [np.random.randint(255), np.random.randint(255), np.random.randint(255)]


    pnimg[one_mask] = pnimg[one_mask] * 0.5 + np.array(color, dtype=np.uint8) * 0.5
    pnimg = cv2.rectangle(pnimg, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
    #label = '%s %.3f' % (names[int(cls)], conf)
    #t_size = cv2.getTextSize(label, 0, fontScale=0.5, thickness=1)[0]
    #c2 = bbox[0] + t_size[0], bbox[1] - t_size[1] - 3
    #pnimg = cv2.rectangle(pnimg, (bbox[0], bbox[1]), c2, color, -1, cv2.LINE_AA)  # filled
    #pnimg = cv2.putText(pnimg, label, (bbox[0], bbox[1] - 2), 0, 0.5, [255, 255, 255], thickness=1, lineType=cv2.LINE_AA)

cv2.imwrite("instance_result.jpg", pnimg)

I found 2 test pictures to see the effect

4c4b7f64af9330a95e5d27237dde0d52.jpeg

61289a2968583b6948a2911dd2e0b7ae.jpeg

Similarly, a code for video file or camera detection is also provided here

import torch
import cv2
import yaml
from torchvision import transforms
import numpy as np

from utils.datasets import letterbox
from utils.general import non_max_suppression_mask_conf

from detectron2.modeling.poolers import ROIPooler
from detectron2.structures import Boxes
from detectron2.utils.memory import retry_if_cuda_oom
from detectron2.layers import paste_masks_in_image


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
with open('data/hyp.scratch.mask.yaml') as f:
    hyp = yaml.load(f, Loader=yaml.FullLoader)
weigths = torch.load('yolov7-mask.pt')
model = weigths['model']
model = model.half().to(device)
_ = model.eval()

cap = cv2.VideoCapture('vehicle_test.mp4')
if (cap.isOpened() == False):
    print('open failed.')
    exit(-1)

# 分辨率
frame_width = int(cap.get(3))
frame_height = int(cap.get(4))

# 图片缩放
vid_write_image = letterbox(cap.read()[1], (frame_width), stride=64, auto=True)[0]
resize_height, resize_width = vid_write_image.shape[:2]

# 保存结果视频
out = cv2.VideoWriter("result_instance.mp4",
                      cv2.VideoWriter_fourcc(*'mp4v'), 30,
                      (resize_width, resize_height))

while(cap.isOpened):
    flag, image = cap.read()
    if flag:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = letterbox(image, frame_width, stride=64, auto=True)[0]
        image_ = image.copy()
        image = transforms.ToTensor()(image)
        image = torch.tensor(np.array([image.numpy()]))
        image = image.to(device)
        image = image.half()

        with torch.no_grad():
            output = model(image)
        
        inf_out, train_out, attn, mask_iou, bases, sem_output = output['test'], output['bbox_and_cls'], output['attn'], output['mask_iou'], output['bases'], output['sem']
    
        bases = torch.cat([bases, sem_output], dim=1)
        nb, _, height, width = image.shape
        names = model.names
        pooler_scale = model.pooler_scale
        pooler = ROIPooler(output_size=hyp['mask_resolution'], scales=(pooler_scale,), sampling_ratio=1, pooler_type='ROIAlignV2', canonical_level=2)

        output, output_mask, output_mask_score, output_ac, output_ab = non_max_suppression_mask_conf(inf_out, attn, bases, pooler, hyp, conf_thres=0.25, iou_thres=0.65, merge=False, mask_iou=None)

        pred, pred_masks = output[0], output_mask[0]
        base = bases[0]
        bboxes = Boxes(pred[:, :4])
        original_pred_masks = pred_masks.view(-1, hyp['mask_resolution'], hyp['mask_resolution'])
        pred_masks = retry_if_cuda_oom(paste_masks_in_image)( original_pred_masks, bboxes, (height, width), threshold=0.5)
        pred_masks_np = pred_masks.detach().cpu().numpy()
        pred_cls = pred[:, 5].detach().cpu().numpy()
        pred_conf = pred[:, 4].detach().cpu().numpy()
        nimg = image[0].permute(1, 2, 0) * 255
        nimg = nimg.cpu().numpy().astype(np.uint8)
        nimg = cv2.cvtColor(nimg, cv2.COLOR_RGB2BGR)
        nbboxes = bboxes.tensor.detach().cpu().numpy().astype(np.int)
        pnimg = nimg.copy()

        for one_mask, bbox, cls, conf in zip(pred_masks_np, nbboxes, pred_cls, pred_conf):
            if conf < 0.25:
                continue
            
            color = [np.random.randint(255), np.random.randint(255), np.random.randint(255)]

            pnimg[one_mask] = pnimg[one_mask] * 0.5 + np.array(color, dtype=np.uint8) * 0.5
            pnimg = cv2.rectangle(pnimg, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)

        cv2.imshow('YOLOv7 mask', pnimg)
        out.write(pnimg)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    else:
        break
    

cap.release()
cv2.destroyAllWindows()

References

  • https://github.com/WongKinYiu/yolov7

  • https://xugaoxiang.com/2022/07/21/yolov7

Guess you like

Origin blog.csdn.net/djstavaV/article/details/126357677