YOLOv5 4.0 model compression

Version 4.0 of YOLOv5 has been out for a while now; the relevant changes, from the release notes, are as follows:

nn.SiLU() activations replace nn.LeakyReLU(0.1) and nn.Hardswish() activations throughout the model, simplifying the architecture as we now only have one single activation function used everywhere rather than the two types before.

In general the changes result in smaller models (89.0M params -> 87.7M YOLOv5x), faster inference times (6.9ms -> 6.0ms), and improved mAP (49.2 -> 50.1) for all models except YOLOv5s, which reduced mAP slightly (37.0 -> 36.8). In general the largest models benefit the most from this update. YOLOv5x in particular is now above 50.0 mAP at --img-size 640, which may be the first time this is possible at 640 resolution for any architecture I’m aware of (correct me if I’m wrong though).
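For reference, SiLU (also known as the swish activation) is simply x * sigmoid(x). A quick standalone sanity check (not part of the export script below):

import torch
import torch.nn as nn

x = torch.linspace(-3, 3, 7)
assert torch.allclose(nn.SiLU()(x), x * torch.sigmoid(x))  # SiLU(x) = x * sigmoid(x)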

In short, SiLU replaces LeakyReLU and Hardswish, which simplifies the network structure, but the trained model file still ends up being large: after 109 epochs on 26771 samples with 15 categories, the resulting checkpoint is 667.2 MB. Obviously, this is still a fairly big file.
Therefore, the model is compressed by converting it to FP16 (half precision) and re-exporting it; note that the Conv class in the export script must use the same activation function as the network being loaded. The script is as follows:

import os

import torch
import torch.nn as nn


def autopad(k, p=None):  
    # Pad to 'same'
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p


class Conv(nn.Module):
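    # standard YOLOv5 convolution block: Conv2d -> BatchNorm2d -> activation (SiLU by default)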
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p),
                              groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act else nn.Identity()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        return self.act(self.conv(x))


class Ensemble(nn.ModuleList):
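    # container for one or more loaded models; their outputs are concatenated at inference time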
    def __init__(self):
        super(Ensemble, self).__init__()

    def forward(self, x, augment=False):
        y = []
        for module in self:
            y.append(module(x, augment)[0])
        y = torch.cat(y, 1)
        return y, None



def attempt_load(weights, map_location=None):
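    # load one or more checkpoints into an Ensemble; each entry is the 'model' key of a
    # training checkpoint, converted to FP32, with Conv and BN layers fused, in eval mode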
    model = Ensemble()
    for w in weights if isinstance(weights, list) else [weights]:
        # attempt_download(w)
        model.append(torch.load(w, map_location=map_location)['model'].float().fuse().eval())  # load FP32 model

    for m in model.modules():
        if type(m) in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6]:
            m.inplace = True  # pytorch 1.7.0 compatibility
        elif type(m) is Conv:
            m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility

    if len(model) == 1:
        return model[-1]  # return model
    else:
        print('Ensemble created with %s\n' % weights)
        for k in ['names', 'stride']:
            setattr(model, k, getattr(model[-1], k))
        return model  # return ensemble


def select_device(device='', batch_size=None):
    # device = 'cpu' or '0' or '0,1,2,3'
    cpu_request = device.lower() == 'cpu'
    if device and not cpu_request:  # if device requested other than 'cpu'
        os.environ['CUDA_VISIBLE_DEVICES'] = device  # set environment variable
        assert torch.cuda.is_available(
        ), 'CUDA unavailable, invalid device %s requested' % device  # check availablity

    cuda = False if cpu_request else torch.cuda.is_available()
    if cuda:
        c = 1024 ** 2  # bytes to MB
        ng = torch.cuda.device_count()
        if ng > 1 and batch_size:  # check that batch_size is compatible with device_count
            assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (
                batch_size, ng)
        x = [torch.cuda.get_device_properties(i) for i in range(ng)]
        s = f'Using torch {torch.__version__} '
        for i in range(0, ng):
            if i == 1:
                s = ' ' * len(s)
            print('%sCUDA:%g (%s, %dMB)' % (s, i, x[i].name, x[i].total_memory / c))
    else:
        print(f'Using torch {torch.__version__} CPU')

    return torch.device('cuda:0' if cuda else 'cpu')


if __name__ == '__main__':

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--in_weights', type=str,
                        default='./last.pt', help='initial weights path')
    parser.add_argument('--out_weights', type=str,
                        default='quantification.pt', help='output weights path')
    parser.add_argument('--device', type=str, default='0', help='device')
    opt = parser.parse_args()

    device = select_device(opt.device)
    model = attempt_load(opt.in_weights, map_location=device)
    model.to(device).eval()
    model.half()  # convert all weights to FP16 (half precision)

    torch.save(model, opt.out_weights)  # saves the model object itself, not a {'model': ...} checkpoint dict
    print('done.')

    print('-[INFO] before: {:.1f} MB, after: {:.1f} MB'.format(
        os.path.getsize(opt.in_weights) / 1024 ** 2,
        os.path.getsize(opt.out_weights) / 1024 ** 2))
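Assuming the script above is saved as export_fp16.py (the filename is arbitrary) in the yolov5 project root, it can be run like this:

python export_fp16.py --in_weights ./last.pt --out_weights quantification.pt --device 0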

Using version 4.0 with self.act = nn.SiLU(), the exported model shrinks from 667.2 MB to 166 MB after quantization and compression (the reduction is more than the 2x expected from FP16 alone, presumably because the original last.pt also stores optimizer state, which this export drops). Running it with the official detect.py, the inference time is 0.460 s.
Using the original model, the inference time is 0.553 s.
Using version 4.0 with self.act = nn.Hardswish(), the exported model likewise shrinks from 667.2 MB to 166 MB after quantization and compression. Running it with the official detect.py, the inference time is 0.415 s.
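As a sanity check, the exported file can also be loaded and run directly. A minimal sketch, assuming a CUDA device, a 640x640 input, and that it is run from inside the yolov5 repo so the pickled model classes can be resolved (the timing is rough since there is no warm-up):

import time
import torch

device = torch.device('cuda:0')
model = torch.load('quantification.pt', map_location=device)  # the file stores the model object itself (fused, FP16, eval mode)
model.eval()

img = torch.zeros(1, 3, 640, 640, device=device).half()  # dummy FP16 input
with torch.no_grad():
    t0 = time.time()
    pred = model(img)[0]  # [batch, num_predictions, 5 + num_classes]
print('inference: %.3fs, output shape: %s' % (time.time() - t0, tuple(pred.shape)))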
Note:
The attempt_load function in models/experimental.py in the project folder must be modified, otherwise loading the exported weights will raise an error (the exported file stores the model object directly, not a checkpoint dict with a 'model' key):

def attempt_load(weights, map_location=None):
    # Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a
    model = Ensemble()
    for w in weights if isinstance(weights, list) else [weights]:
        attempt_download(w)
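        # note: no ['model'] indexing here -- the exported file now stores the model object directly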
        model.append(torch.load(w, map_location=map_location).float().fuse().eval()) 

Origin: blog.csdn.net/weixin_43269994/article/details/114883418