PyTorch Trick Highlights

-1. Dataset and DataLoader

1. Use a built-in dataset

The torch.utils.data.DataLoader and torch.utils.data.Dataset provided by PyTorch let you use pre-downloaded datasets or make your own. A Dataset stores samples and their corresponding labels, and a DataLoader wraps a Dataset in an iterator for easy access to samples.

The following loads the FashionMNIST dataset, which includes 60,000 training samples and 10,000 test samples. Each example consists of a 28×28 grayscale image (the features) and a label from one of 10 classes.

import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor,Lambda
import matplotlib.pyplot as plt
import numpy as np

training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
    target_transform = Lambda(lambda y: torch.zeros(
    10, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y), value=1))
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
    target_transform = Lambda(lambda y: torch.zeros(
    10, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y), value=1))
)
# When run, this downloads and extracts the training/test sets into data/FashionMNIST/raw
## root is the path where the training/test data is stored
## train selects the training set or the test set
## download=True downloads the dataset from the internet if it is not found under root
## transform and target_transform specify the transforms applied to the features and the labels respectively

All TorchVision datasets take two transform parameters: transform modifies the features (images) and target_transform modifies the labels. The torchvision.transforms module provides several commonly used transforms, such as the ToTensor() and Lambda used below.

FashionMNIST features are in the PIL image format and labels are integers. For training, we need to normalize the features into tensors and represent the labels as one-hot encoded tensors. To do these transformations we use ToTensor and Lambda.

ToTensor()
ToTensor converts a PIL image or NumPy ndarray to a floating point tensor (FloatTensor) and scales the image's pixel values into the range [0, 1].
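A quick sanity check of that behavior, using a dummy grayscale image:

import numpy as np
from PIL import Image
from torchvision.transforms import ToTensor

# a dummy 28x28 grayscale PIL image with values in 0..255
pil_img = Image.fromarray(np.random.randint(0, 256, (28, 28), dtype=np.uint8))
t = ToTensor()(pil_img)
print(t.dtype, t.shape)             # torch.float32 torch.Size([1, 28, 28])
print(t.min() >= 0, t.max() <= 1)   # tensor(True) tensor(True)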

Lambda Transforms
Lambda transforms apply any user-defined lambda function. Here, we define a function that converts an integer label into a one-hot encoded tensor: it first creates a zero tensor of size 10 (the number of classes in the dataset) and then calls scatter_, which writes a 1 at the index given by the label y.
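Pulling the lambda's body out on its own makes the mechanics easy to see:

import torch

y = 3   # an integer class label
one_hot = torch.zeros(10, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y), value=1)
print(one_hot)
# tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])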

2. Build your own dataset

The example dataset contains 10 images: 5 fake (labeled 0) and 5 real (labeled 1).

Link: https://pan.baidu.com/s/1xGm6IMhq8zBQYZZCwwf3Aw Extraction code: 1111


import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, ToPILImage, Lambda
import matplotlib.pyplot as plt
import os
import pandas as pd
from PIL import Image

img_path = './zoro'
label_path = './label_zoro.xlsx'


class ZoroDataset(Dataset):
    def __init__(self, label_file, img_dir, transform=None, target_transform=None):
        # read the label file
        self.labels = pd.read_excel(label_file)
        # image directory
        self.img_dir = img_dir
        # transforms for the features and labels
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        '''Return the number of samples in the dataset'''
        return len(self.labels)

    def __getitem__(self, index):
        '''Fetch one sample; used together with DataLoader'''
        # build the image path; column 0 of the Excel file holds the file name
        img_path = os.path.join(self.img_dir, self.labels.iloc[index, 0])
        # load the image
        image = Image.open(img_path)
        # read the label; column 1 of the Excel file holds the label
        y_label = int(self.labels.iloc[index, 1])
        # apply the transforms if they were provided
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            y_label = self.target_transform(y_label)
        return image, y_label


def img_show(img):
    '''Convert img to a PIL image and display it'''
    to_pil_image = ToPILImage()
    img = to_pil_image(img)
    plt.imshow(img)


# the one-hot encoding of the labels here is only for demonstration and carries no practical meaning
dataset = ZoroDataset(label_file=label_path, img_dir=img_path, transform=ToTensor(),
                      target_transform=Lambda(lambda y: torch.zeros(
                          2, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y), value=1)))

train_features, train_labels = dataset[0]
# img_show(train_features)
# print('one-hot label:', train_labels)
# plt.show()
batch_size = 2
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
print('number of samples in the dataset:', len(dataset))

for epoch in range(2):
    data_iter = iter(train_dataloader)
    for i, (train_features, train_labels) in enumerate(data_iter):
        plt.figure(i)
        for j in range(batch_size):
            plt.subplot(int(f"1{batch_size}{j + 1}"))
            # print(train_features.size())
            img = train_features[j]
            label = train_labels[j]
            img_show(img)
            print(f"Label: {label}")
        plt.show()

Iterate over and visualize the dataset

labels_map = {
    0: "T-Shirt",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle Boot",
}
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    img, label = training_data[sample_idx]
    figure.add_subplot(rows, cols, i)
    print('one-hot:',label)
    index = torch.nonzero(label)[0][0]   # convert the one-hot label back to an integer
    plt.title(labels_map[index.item()])
    plt.axis("off")
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()


0. Computing model Params and FLOPs

Install thop library: pip install thop

import torch
from torchvision.models import resnet50
from thop import profile

model = resnet50()
input = torch.randn(1, 3, 224, 224)
flops, params = profile(model, inputs=(input, ))
print(f"FLOPs: {flops / 1e9:.2f} G, params: {params / 1e6:.2f} M")

# another way to count the parameters
total = sum(param.nelement() for param in model.parameters())

1. Multi-card training (specify GPU number)

Restrict the GPUs in use to device 0 only (the device is then named /gpu:0):

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Restrict the GPUs in use to devices 0 and 1 (named /gpu:0 and /gpu:1 in turn):

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" 

The order matters: device 0 is used first, then device 1.
The command that selects GPUs must be placed before any operations related to the neural network.
Multi-GPU computing is divided into single-machine multi-card and multi-machine multi-card. Their implementations in PyTorch differ, because multiple machines require communication protocols and other inter-machine settings.

Single-machine multi-card is very easy in PyTorch. The basic principle: suppose one batch of data we read in has size [16, 10, 5], and we have four cards available. The computation then follows these steps:

1. Assume we have 4 GPUs available; PyTorch first replicates the model onto all 4 GPUs.
2. The data is split into 4 parts and placed, in order, into the model replica on each GPU; each part has size [4, 10, 5].
3. Each GPU performs its forward computation independently.
4. After the forward pass, PyTorch gathers the results from the four GPUs (each [4, 10, 5]), concatenates them back in order into [16, 10, 5], and computes the loss.
The whole process is: synchronize model parameters → forward in parallel → compute the loss → backpropagate the gradients.

import torch
import torch.nn as nn

model = Model()  # your network (assumed defined)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# assume a single batch of data
data = torch.rand([16, 10, 5])

# the forward pass expects the data to be placed on GPU 0
# device = torch.device('cuda:0')
# data = data.to(device)
data = data.cuda()

# replicate the network across multiple GPUs
model_p = torch.nn.DataParallel(model.cuda(), device_ids=[0, 1, 2, 3])
logits = model_p(data)

# then compute the loss (crit is the loss function and target the labels; both assumed defined)
loss = crit(logits, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()

2. View the output of each layer of the model

Keras has a neat API for viewing the output dimensions of each layer of a model, which is very useful when debugging a network. The same is now possible in PyTorch with the torchsummary package.
It is very simple to use, as follows:

from torchsummary import summary
summary(your_model, input_size=(channels, H, W))

input_size is set according to the input size of your own network model.
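For example, with a torchvision resnet18 (assuming torchsummary has been installed via pip install torchsummary):

from torchsummary import summary
from torchvision.models import resnet18

model = resnet18()
# prints every layer's output shape and parameter count, plus the totals;
# device="cpu" because torchsummary defaults to CUDA
summary(model, input_size=(3, 224, 224), device="cpu")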

3. Gradient Clipping

import torch.nn as nn

outputs = model(data)
loss= loss_fn(outputs, target)
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)
optimizer.step()

Parameters of nn.utils.clip_grad_norm_:
parameters – an iterable of Tensors whose gradients will be normalized
max_norm – the maximum norm of the gradients
norm_type – the type of norm to use; defaults to the L2 norm

import torch
import torch.nn as nn

net = nn.Linear(1, 1)                      # minimal stand-in layer for the demo
net.weight.grad = torch.tensor([[120.0]])  # pretend backward produced this gradient
nn.utils.clip_grad_value_(net.weight, clip_value=1.1)
print("grad after clip:" + str(net.weight.grad))
# grad after clip:tensor([[1.1000]])

The output is as expected: the gradient is 120 before clipping and 1.1 after clipping.
A commenter pointed out that gradient clipping adds considerable extra computation time on some tasks.

4. Extend tensor dimension

During training the data generally has shape (batch_size, c, h, w), but at test time only a single image is fed in, so its dimensions need to be expanded. There are several ways to do this:

import cv2
import torch

image = cv2.imread(img_path)
image = torch.tensor(image)
print(image.size())

img = image.unsqueeze(dim=0)  
print(img.size())

img = img.squeeze(dim=0)
print(img.size())

tensor.unsqueeze(dim): Dimension expansion, dim specifies which dimension to expand.
tensor.squeeze(dim): removes the dimension specified by dim if its size is 1; if that dimension's size is greater than 1, squeeze() does nothing. When dim is not given, all dimensions of size 1 are removed. Alternatively:

import cv2
import torch

image = cv2.imread(img_path)
image = torch.tensor(image)
print(image.size())

img = image.view(1, *image.size())
print(img.size())

# output:
# torch.Size([h, w, c])
# torch.Size([1, h, w, c])

or

import cv2
import numpy as np

image = cv2.imread(img_path)
print(image.shape)
img = image[np.newaxis, :, :, :]
print(img.shape)

# output:
# (h, w, c)
# (1, h, w, c)

5. One-hot encoding

1. PyTorch's built-in one_hot function
PyTorch 1.2+ provides a one-hot function, which is really convenient: use torch.nn.functional.one_hot directly.

import torch.nn.functional as F
import torch

tensor =  torch.arange(0, 5) % 3  # tensor([0, 1, 2, 0, 1])
one_hot = F.one_hot(tensor)

# output:
# tensor([[1, 0, 0],
#         [0, 1, 0],
#         [0, 0, 1],
#         [1, 0, 0],
#         [0, 1, 0]])

You can also specify the number of categories yourself:

tensor =  torch.arange(0, 5) % 3  # tensor([0, 1, 2, 0, 1])
one_hot = F.one_hot(tensor, num_classes=5)

# output:
# tensor([[1, 0, 0, 0, 0],
#         [0, 1, 0, 0, 0],
#         [0, 0, 1, 0, 0],
#         [1, 0, 0, 0, 0],
#         [0, 1, 0, 0, 0]])

Command to upgrade PyTorch (CPU version): conda install pytorch torchvision -c pytorch

2. Manual one-hot
PyTorch's cross-entropy loss takes integer class indices directly, so there is no need to convert labels to one-hot by hand; a loss such as MSE, however, does require manually one-hot encoded targets.

import torch
class_num = 8
batch_size = 4

def one_hot(label):
    """
    Convert a 1-D label tensor to one-hot encoding
    """
    label = label.resize_(batch_size, 1)
    m_zeros = torch.zeros(batch_size, class_num)
    # take values from `value` and write them at the positions given by dim and index
    onehot = m_zeros.scatter_(1, label, 1)  # (dim, index, value)

    return onehot.numpy()  # Tensor -> Numpy

label = torch.LongTensor(batch_size).random_() % class_num  # random labels in [0, class_num)
print(one_hot(label))

# output:
[[0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]]
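As noted above, cross-entropy needs no such step; a minimal check that nn.CrossEntropyLoss consumes integer class indices directly:

import torch
import torch.nn as nn

logits = torch.randn(4, 8)             # batch of 4 samples, 8 classes
targets = torch.tensor([3, 4, 2, 1])   # integer class indices, not one-hot
loss = nn.CrossEntropyLoss()(logits, targets)
print(loss)  # a scalar tensor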

6. Prevent memory explosion when validating the model

No gradients are needed when validating a model, so autograd can be turned off: this improves speed and saves memory. If it is left on, it may exhaust the GPU memory.

with torch.no_grad():
    # prediction code using model goes here
    pass
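A typical validation loop combines this with model.eval(), which switches layers like dropout and batch norm to inference behavior (a sketch, assuming model and val_loader are already defined):

model.eval()  # inference behavior for dropout / batch norm
with torch.no_grad():
    for x, y in val_loader:
        pred = model(x)  # no graph is built, so memory use stays flat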

During PyTorch training, more and more stale temporary variables may accumulate and cause out-of-memory errors; torch.cuda.empty_cache() can be used to clean these up.
PyTorch's caching allocator pre-allocates some fixed GPU memory; even when tensors are not actually using all of it, that memory cannot be used by other applications. The allocation is triggered by the first CUDA memory access.
The official explanation: torch.cuda.empty_cache() releases all unoccupied cached memory currently held by the caching allocator, so that it can be used by other GPU applications and becomes visible in nvidia-smi. Note that this command does not release the memory occupied by live tensors.
For data variables that are no longer referenced, PyTorch reclaims the corresponding GPU memory automatically.
For more details, see guides on optimizing GPU memory usage and utilization.
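Usage is a one-liner; a common pattern is to call it right after dropping large intermediate tensors (big_tensor here is a hypothetical name):

del big_tensor            # drop the last reference so PyTorch can reclaim it
torch.cuda.empty_cache()  # hand the cached-but-unoccupied memory back to the GPU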

7. Learning rate decay


import torch.optim as optim
from torch.optim import lr_scheduler

# initialization before training
optimizer = optim.Adam(net.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer, 10, 0.1)  # multiply the learning rate by 0.1 every 10 epochs

# during training
for n in range(n_epoch):
    ...
    scheduler.step()  # step once per epoch, after the optimizer updates

The current learning rate can be checked at any time with optimizer.param_groups[0]['lr'].
There are other ways to update the learning rate:
1. A custom update formula:
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1/(epoch+1))
2. Updating the learning rate independently of the epoch count:
lr_scheduler.ReduceLROnPlateau()
dynamically decreases the learning rate based on some measured value during training; its parameters are documented everywhere.
One reminder: set mode='min' or 'max' according to whether a loss or an accuracy is being tracked, i.e. scheduler.step(loss) or scheduler.step(acc).
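A sketch of the plateau scheduler in use (train() and validate() stand in for your own training and evaluation code):

scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
for epoch in range(n_epoch):
    train()                    # hypothetical training step
    val_loss = validate()      # hypothetical evaluation returning a loss
    scheduler.step(val_loss)   # LR is multiplied by factor once val_loss stops improving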

8. Freeze the parameters of some layers

Reference: https://www.zhihu.com/question/311095447/answer/589307812

8-1. Basic method

When loading a pre-trained model, we sometimes want to freeze the first few layers so that their parameters do not change during training.
We first need to know each layer's name, which can be printed with the following code:


net = Network()  # your custom network structure
for name, value in net.named_parameters():
    print('name: {0},\t grad: {1}'.format(name, value.requires_grad))

Assume the information printed for the first few layers is as follows:


name: cnn.VGG_16.convolution1_1.weight,	 grad: True
name: cnn.VGG_16.convolution1_1.bias,	 grad: True
name: cnn.VGG_16.convolution1_2.weight,	 grad: True
name: cnn.VGG_16.convolution1_2.bias,	 grad: True
name: cnn.VGG_16.convolution2_1.weight,	 grad: True
name: cnn.VGG_16.convolution2_1.bias,	 grad: True
name: cnn.VGG_16.convolution2_2.weight,	 grad: True
name: cnn.VGG_16.convolution2_2.bias,	 grad: True

Then we define a list of layers to freeze:


no_grad = [
    'cnn.VGG_16.convolution1_1.weight',
    'cnn.VGG_16.convolution1_1.bias',
    'cnn.VGG_16.convolution1_2.weight',
    'cnn.VGG_16.convolution1_2.bias'
]

The freezing method is as follows:


net = Net.CTPN()  # get the network structure
for name, value in net.named_parameters():
    if name in no_grad:
        value.requires_grad = False
    else:
        value.requires_grad = True

After freezing, we print the information of each layer:

name: cnn.VGG_16.convolution1_1.weight,	 grad: False
name: cnn.VGG_16.convolution1_1.bias,	 grad: False
name: cnn.VGG_16.convolution1_2.weight,	 grad: False
name: cnn.VGG_16.convolution1_2.bias,	 grad: False
name: cnn.VGG_16.convolution2_1.weight,	 grad: True
name: cnn.VGG_16.convolution2_1.bias,	 grad: True
name: cnn.VGG_16.convolution2_2.weight,	 grad: True
name: cnn.VGG_16.convolution2_2.bias,	 grad: True

It can be seen that the requires_grad of the weight and bias of the first two layers are both False, indicating that they are not trainable.
Finally, when defining the optimizer, pass only the parameters whose requires_grad is True:

optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.01)

8-2. Fast version (vgg16 as an example)

import torchvision.models as models
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self, model):
        super(Net, self).__init__()
        # [:-2] drops the model's last two children (avgpool and classifier)
        self.vgg_layer = nn.Sequential(*list(model.children())[:-2])
        # freeze the whole pre-trained backbone in one go
        for p in self.vgg_layer.parameters():
            p.requires_grad = False
        # vgg16's convolutional features output 512 channels
        self.transion_layer = nn.ConvTranspose2d(512, 512, kernel_size=14, stride=3)
        self.pool_layer = nn.MaxPool2d(32)
        self.Linear_layer = nn.Linear(512, 8)

    def forward(self, x):
        x = self.vgg_layer(x)
        x = self.transion_layer(x)
        x = self.pool_layer(x)
        # flatten to one row per sample; -1 lets view infer the number of columns
        x = x.view(x.size(0), -1)
        x = self.Linear_layer(x)
        return x

vgg = models.vgg16(pretrained=True)
model = Net(vgg)

8-3. Fine-tuned version

import torchvision.models as models
from torchvision.models.resnet import Bottleneck
import torch
import torch.nn as nn

# CNN is your own ResNet-style network; its definition (mirroring torchvision's
# ResNet built from Bottleneck blocks) is omitted here
class CNN(nn.Module):
    ...

# load the pre-trained model
resnet50 = models.resnet50(pretrained=True)
# 3 4 6 3 are the numbers of Bottleneck blocks in layer1/2/3/4; for res101 it is 3 4 23 3
cnn = CNN(Bottleneck, [3, 4, 6, 3])
# read the parameters
pretrained_dict = resnet50.state_dict()
model_dict = cnn.state_dict()
# drop the keys in pretrained_dict that do not exist in model_dict
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# update the existing model_dict
model_dict.update(pretrained_dict)
# load the state_dict we actually need
cnn.load_state_dict(model_dict)
# print(resnet50)
# print(cnn)

input = torch.rand(2, 3, 512, 512)
out = cnn(input)

9. Use different learning rates for different layers

We use different learning rates for different layers of the model.
Still using this model as an example:


net = Network()  # your custom network structure
for name, value in net.named_parameters():
    print('name: {}'.format(name))

# output:
# name: cnn.VGG_16.convolution1_1.weight
# name: cnn.VGG_16.convolution1_1.bias
# name: cnn.VGG_16.convolution1_2.weight
# name: cnn.VGG_16.convolution1_2.bias
# name: cnn.VGG_16.convolution2_1.weight
# name: cnn.VGG_16.convolution2_1.bias
# name: cnn.VGG_16.convolution2_2.weight
# name: cnn.VGG_16.convolution2_2.bias

To set different learning rates for convolution1 and convolution2, first separate them, that is, put them in different lists:


conv1_params = []
conv2_params = []

for name, parms in net.named_parameters():
    if "convolution1" in name:
        conv1_params += [parms]
    else:
        conv2_params += [parms]

# then configure the optimizer as follows:
optimizer = optim.Adam(
    [
        {"params": conv1_params, 'lr': 0.01},
        {"params": conv2_params, 'lr': 0.001},
    ],
    weight_decay=1e-3,
)

We split the model's parameters into two lists; each corresponds to one dictionary above, and each dictionary sets its own learning rate. Options shared by both parts go outside the list as global settings, like weight_decay above.
A global learning rate can also be set outside the list: each group uses its local learning rate when one is given, and otherwise falls back to the global one, as sketched below.
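A minimal sketch of that fallback:

optimizer = optim.Adam(
    [
        {"params": conv1_params, 'lr': 0.01},  # local learning rate takes precedence
        {"params": conv2_params},              # no local lr: falls back to the global lr below
    ],
    lr=0.001,          # global learning rate
    weight_decay=1e-3, # global weight decay, shared by both groups
)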

10. Model-related operations (weight file cropping)

There is more to this topic; I wrote an article about it: https://zhuanlan.zhihu.com/p/73893187
Weight cropping: sometimes you only need part of someone else's model and want to load just that part of the pre-trained weights; this requires cropping the weights file.

import torch
from thop import profile
# load your own model
from swin0 import model, Trans_init_weights
from collections import OrderedDict


trans = model(224, 4)

# load the original weights file
ckpt = torch.load('/home/xzz/桌面/cascade_mask_rcnn_swin_base_patch4_window7.pth')['state_dict']
swin_base_backbone = {}

# first find the indices of the required layers, e.g. layers 4--349:
for i in range(4, 349):
    keys = list(ckpt.keys())[i].replace('backbone.', '')
    swin_base_backbone[keys] = ckpt[list(ckpt.keys())[i]]

# build the new weights
swin_base_tinydict = OrderedDict(swin_base_backbone)
# load the new weights
trans.load_state_dict(swin_base_tinydict)
# save the new weights
torch.save(swin_base_tinydict, 'home/f/new.pth')


# ckpt2 = trans.state_dict()
# x = torch.rand(2,128,80,80)
# print('# generator parameters:', sum(param.numel() for param in trans.parameters()))
# out = trans(x)

11. Network parameter initialization

The initialization of a neural network is an important basic step of training; it has a significant impact on the model's performance, whether it converges, and how fast it converges.
Two commonly used initialization approaches are described below.

(1) For more flexible initialization schemes, numpy can be used.
For custom initialization, tensors are sometimes not as powerful and flexible as numpy, so you can implement the initialization in numpy and then convert the result to a tensor for use.


for layer in net1.modules():
    if isinstance(layer, nn.Linear):  # check whether this is a linear layer
        param_shape = layer.weight.shape
        # normal distribution with mean 0 and standard deviation 0.5;
        # .float() converts numpy's float64 to torch's default float32
        layer.weight.data = torch.from_numpy(np.random.normal(0, 0.5, size=param_shape)).float()

(2) Use pytorch's built-in torch.nn.init method.
(2-1) Xavier uniform distribution
Samples weights from U(−a, a) with a = gain × sqrt(6 / (fan_in + fan_out)).

torch.nn.init.xavier_uniform_(tensor, gain=1)

Also known as Glorot initialization.

>>> w = torch.empty(3, 5)
>>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))

(2-2) Xavier normal distribution
Samples weights from N(0, std²) with std = gain × sqrt(2 / (fan_in + fan_out)).

torch.nn.init.xavier_normal_(tensor, gain=1)

Also known as Glorot initialization.
(2-3) Kaiming uniform distribution

torch.nn.init.kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')

Samples weights from U(−bound, bound) with bound = sqrt(6 / ((1 + a²) × fan_mode)).
The default mode is fan_in: fan_in preserves the magnitude of the variance of the weights in the forward pass, while fan_out preserves it in the backward pass.

>>> w = torch.empty(3, 5)
>>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')

(2-4) Kaiming normal distribution

torch.nn.init.kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')

Samples weights from N(0, std²) with std = sqrt(2 / ((1 + a²) × fan_mode)).

w = torch.empty(3, 5)
nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')

Example:

for name, m in self.named_modules():
    if any(map(lambda x: isinstance(m, x), [nn.Linear, nn.Conv1d, nn.Conv2d])):
        nn.init.kaiming_uniform_(m.weight, mode='fan_in')
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

Example of a linear layer:

class LinearNet(nn.Module):
    def __init__(self, features_in=1, features_out=1):
        super().__init__()
        self.linear = nn.Linear(features_in, features_out)
        self._init_weight()

    def forward(self, x):
        return self.linear(x)

    # _init_weight initializes w to 1 and b to 0
    def _init_weight(self):
        nn.init.constant_(self.linear.weight, val=1)
        nn.init.constant_(self.linear.bias, val=0)
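Another common pattern (shown here as a sketch) is to write a single init function and let Module.apply walk every submodule:

def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

net = LinearNet()
net.apply(init_weights)  # applied recursively to every submodule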

12. Load the built-in pre-trained model

The submodules of the torchvision.models module contain the following models:
AlexNet
VGG
ResNet
SqueezeNet
DenseNet
The method of importing these models is:


import torchvision.models as models
resnet18 = models.resnet18(pretrained=True)
alexnet = models.alexnet()
vgg16 = models.vgg16()

There is a very important parameter, pretrained, which defaults to False: only the model's structure is imported and the weights are randomly initialized. If pretrained is True, the imported model comes pre-trained on the ImageNet dataset.
More models can be viewed: https://pytorch-cn.readthedocs.io/zh/latest/torchvision/torchvision-models/

13. Other commonly used functions

1 CPU and GPU conversion

Use .to(device) as little as possible; when you need a new tensor matching an existing tensor's shape, dtype and device, use the *_like constructors instead:

a = torch.zeros_like(b)
torch.ones_like()
torch.rand_like()
torch.randn_like()
torch.randint_like()
torch.empty_like()
torch.full_like()
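The point of the *_like constructors is that device and dtype are inherited in one call (this example assumes a CUDA device is available):

b = torch.rand(4, 4, device='cuda', dtype=torch.half)
a = torch.zeros_like(b)
print(a.device, a.dtype)  # cuda:0 torch.float16 -- no extra .to(device) needed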

2 Register Buffer

Sometimes a model or loss function needs parameters that are set up front and used in forward, for example a "weights" tensor that scales the loss, or some fixed tensor that never changes yet is used on every call.

class ModuleWithCustomValues(nn.Module):
    def __init__(self, weights, alpha):
        super().__init__()
        self.register_buffer("weights", torch.tensor(weights))
        self.register_buffer("alpha", torch.tensor(alpha))
    
    def forward(self, x):
        return x * self.weights + self.alpha

m = ModuleWithCustomValues(
    weights=[1.0, 2.0], alpha=1e-4
)
m(torch.tensor([1.23, 4.56]))
tensor([1.2301, 9.1201])
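Unlike plain attributes, registered buffers move with the module across devices and are included in its state_dict (the .cuda() call assumes a GPU is available):

m = m.cuda()                 # buffers follow the module
print(m.weights.device)      # cuda:0
print(list(m.state_dict()))  # ['weights', 'alpha'] -- buffers are serialized too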

3 vector distance

Euclidean distance between two sets of points: torch.cdist


points1 = torch.rand(3, 2)
points2 = torch.rand(4, 2)
torch.cdist(points1, points2, p=2.0)  # pairwise distances, shape torch.Size([3, 4])

Cosine similarity: F.cosine_similarity

import torch.nn.functional as F
batch_of_vectors = torch.rand((4, 64))
similarity_matrix = F.cosine_similarity(batch_of_vectors.unsqueeze(1), batch_of_vectors.unsqueeze(0), dim=2)

# similarity_matrix has shape torch.Size([4, 4])

4 Normalize (F.normalize)

batch = torch.rand((4, 64))
normalized_batch = F.normalize(batch, p=2.0, dim=1)
# shape is unchanged, torch.Size([4, 64]); each row now has unit L2 norm

5 Linear layer + chunk trick (torch.chunk)

Create a single linear layer and split its output into N chunks, instead of creating N separate linear layers. This usually gives better performance:

d = 1024
batch = torch.rand((8, d))
layers = nn.Linear(d, 128, bias=False), nn.Linear(d, 128, bias=False), nn.Linear(d, 128, bias=False)
one_layer = nn.Linear(d, 128 * 3, bias=False)
%%timeit
o1 = layers[0](batch)
o2 = layers[1](batch)
o3 = layers[2](batch) 

289 µs ± 30.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Replaced by:

%%timeit
o1, o2, o3 = torch.chunk(one_layer(batch), 3, dim=1)

202 µs ± 8.09 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

6 Masked filter tensor

The traditional method uses: torch.masked_select

data = torch.rand((3, 3)).requires_grad_()
mask = data > data.mean()
torch.masked_select(data, mask)

mask: tensor([[False,  True,  True],
        [ True, False,  True],
        [False, False, False]])
tensor([0.7170, 0.7713, 0.9458, 0.6711], grad_fn=<MaskedSelectBackward>)

You can also use data[mask] or data * mask directly (the latter fills unselected positions with 0):

data[mask]
tensor([0.7170, 0.7713, 0.9458, 0.6711], grad_fn=<IndexBackward>)


data * mask
tensor([[0.0000, 0.7170, 0.7713],
        [0.9458, 0.0000, 0.6711],
        [0.0000, 0.0000, 0.0000]], grad_fn=<MulBackward0>)

7 torch.where

Combines two tensors element-wise: where the condition is true, the element comes from the first tensor; where it is false, from the second.

x = torch.tensor([1., 2., 3., 4., 5.], requires_grad=True)     # must be floating point to require grad
y = torch.tensor([-1., -2., -3., -4., -5.], requires_grad=True)
condition_or_mask = x <= 3.0
torch.where(condition_or_mask, x, y)
tensor([ 1.,  2.,  3., -4., -5.], grad_fn=<SWhereBackward>)

8 Tensor.scatter

Fills a tensor with values from another tensor at the given positions. A one-dimensional example:

data = torch.tensor([1, 2, 3, 4, 5])
index = torch.tensor([0, 1, 4])
values = torch.tensor([-1, -2, -3, -4, -5])
data.scatter(0, index, values)
tensor([-1, -2,  3,  4, -3])

A two-dimensional example: index has the same shape as values, and each entry of index gives the position in data (along dim) where the corresponding entry of values is written.

data = torch.zeros((4, 4)).float()
index = torch.tensor([ [0, 1],
    [2, 3],
    [0, 3],
    [1, 2]])
values = torch.arange(1, 9).float().view(4, 2)
values, data.scatter(1, index, values)
(tensor([[1., 2.],
        [3., 4.],
        [5., 6.],
        [7., 8.]]),
tensor([[1., 2., 0., 0.],
        [0., 0., 3., 4.],
        [5., 0., 0., 6.],
        [0., 7., 8., 0.]]))

9 Image interpolation (F.interpolate)

import torch.nn.functional as F
from torchvision.transforms.functional import to_tensor
from PIL import Image

img = Image.open("./cat.jpg")
F.interpolate(to_tensor(img).unsqueeze(0),  # batch of size 1
              mode="bilinear",
              scale_factor=2.0,
              align_corners=False)

10 Make the image into a grid (torchvision.utils.make_grid)


from torchvision.utils import make_grid
from torchvision.transforms.functional import to_tensor, to_pil_image
from PIL import Image
img = Image.open("./cat.jpg")
to_pil_image(
    make_grid(
        [to_tensor(i) for i in [img, img, img]],
         nrow=2, # number of images in single row
         padding=5 # "frame" size
     )
)

14 Weight file trimming

Used to extract the needed part of a weights file and rename its keys.

import collections
import torch
ckpt = torch.load('/home/ubuntu/YOLOX-main/yolox_l_backbone.pth')
ckpt_keys = list(ckpt)
# ckpt_keys = list(ckpt['state_dict'])
new_dict = collections.OrderedDict()
num_layer = len(ckpt_keys)
# for i in range(321,349):
for i in range(0,num_layer):    
    # new_dict[ckpt_keys[i]] = ckpt['model'][ckpt_keys[i]]
    new_dict[ckpt_keys[i].replace('backbone.','')] = ckpt[ckpt_keys[i]]
    # new_dict[ckpt_keys[i][9:]] = ckpt['model'][ckpt_keys[i]]

torch.save(new_dict,'/home/ubuntu/YOLOX-main/yolox_l_backbone2.pth')

15 sklearn for clustering

1. K-means clustering

The k-means clustering algorithm uses distance as its similarity metric: the closer two objects are, the more similar they are. The steps are:

① First choose a value k: the number of clusters we want the dataset grouped into.
② Randomly pick k data points from the dataset as centroids.
③ For each point in the dataset, compute its distance to each centroid (e.g. Euclidean distance) and assign it to the set of the closest centroid.
④ Once every point is assigned, there are k sets; recompute the centroid (mean of the data) of each set.
⑤ If the distance between each new centroid and the old one is below a set threshold (meaning the recomputed centroids barely move, i.e. the process has stabilized or converged), we consider clustering to have reached the desired result, and the algorithm terminates.
⑥ If the new centroids moved a lot relative to the old ones, iterate steps 3-5.

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans

# 1. function: load the data file
def load_data(input_file):
    X = []
    with open(input_file, 'r') as f:
        for line in f.readlines():
            data = [float(x) for x in line.split(',')]
            X.append(data)
    return np.array(X)

# 2. load the data
data = load_data('data_multivar.txt')

# 3. initial number of clusters
num_clusters = 4
# 4. plot the raw data
plt.figure()
plt.scatter(data[:,0], data[:,1],marker='o', facecolors='none', edgecolors='k', s=30)
x_min, x_max = min(data[:, 0]) - 1, max(data[:,0]) + 1
y_min, y_max = min(data[:, 1]) - 1, max(data[:,1]) + 1
plt.title('Input data')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

# 5. fit the model
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(data)
plt.scatter(data[:,0], data[:,1], marker='o', facecolors='none', edgecolors='k', s=30)
centroids = kmeans.cluster_centers_  # get the centroids
# 6. plot the centroids
plt.scatter(centroids[:,0], centroids[:,1], marker='o', s=200,linewidths=3, color='k', zorder=10, facecolors='black')
x_min, x_max = min(data[:, 0]) - 1, max(data[:, 0]) + 1
y_min, y_max = min(data[:, 1]) - 1, max(data[:, 1]) + 1
plt.title('Centroids and boundaries obtained using KMeans')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

(1) Advantages
1. The principle is simple, implementation is easy, and convergence is fast.
2. It works well when the resulting clusters are dense and clearly separated from each other.
3. The main parameter to tune is the number of clusters k.
(2) Disadvantages
1. The value of K must be given in advance, and in many cases it is very hard to estimate.
2. K-Means is sensitive to the initially chosen centroids; different random seeds can give completely different clusterings, which strongly affects the result.
3. It is relatively sensitive to noise and outliers (which, conversely, can be used to detect outliers).
4. Being iterative, it may reach only a local optimum rather than the global optimum.

To determine the best value of k, sweep over candidate values and compare an evaluation score:

scores = []
range_values = np.arange(2, 10)  # candidate cluster counts in [2, 9]
for i in range_values:
    # fit the model
    kmeans = KMeans(init='k-means++', n_clusters=i, n_init=10)
    kmeans.fit(data)
    score = metrics.silhouette_score(data, kmeans.labels_, metric='euclidean', sample_size=len(data))
    print("\nnumber of clusters =", i)
    print("\nsilhouette score =", score)
    scores.append(score)

# plot the scores as a bar chart
plt.figure()
plt.bar(range_values, scores, width=0.6, color='k', align='center')
plt.title('Silhouette score')


2. Mean Shift Algorithm

The key operation of the Mean Shift algorithm is to compute a drift vector for the current center point from the change of data density within a region of interest, then move the center point accordingly for the next iteration, until it reaches the density maximum (the center point stops moving).
This procedure can be started from every data point, and along the way the number of times each data point appears inside the region of interest is counted; this count is used at the end as the basis for classification.

Algorithm implementation:

① Randomly pick an unvisited point as the initial center point center.
② Find all data points inside the region of radius `radius` around center and consider them members of cluster C. At the same time, add 1 to the visit count recorded for each of these data points in this cluster.
③ Compute the vector from center to each element of the collected set M, and sum these vectors to obtain the vector shift.
④ center = center + shift, i.e. move the center along shift by distance ||shift||.
⑤ Repeat steps 2-4 until shift is very small (i.e. until convergence), and remember the center at that point. All points encountered during this iteration belong to cluster C.
⑥ If at convergence the center of the current cluster C is closer than a threshold to the center of an existing cluster C2, merge C and C2 (their visit counts are merged as well); otherwise keep C as a new cluster.
⑦ Repeat steps 1-6 until all points are marked as visited.
⑧ Classification: each point is assigned to the cluster that visited it most often.

import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs

# 1. generate sample data
X, _ = make_blobs(n_samples=500, cluster_std=0.6)

# 2. create a MeanShift object
ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
print("centroids:\n", cluster_centers)
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of distinct clusters: %d \n" % n_clusters_)
# 3. plot
import matplotlib.pyplot as plt
plt.scatter(X[:,0], X[:,1], marker='o', facecolors='none', edgecolors='k', s=30)
plt.scatter(cluster_centers[:,0], cluster_centers[:,1], marker='o', 
        s=150, linewidths=3, color='k', zorder=10, facecolors='blue')
x_min, x_max = min(X[:, 0]) - 1, max(X[:, 0]) + 1
y_min, y_max = min(X[:, 1]) - 1, max(X[:, 1]) + 1
plt.title('Centroids obtained using MeanShift')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

(1) Advantages
1. Unlike K-Means, the Mean Shift algorithm determines the number of clusters automatically.
2. It is not affected by outliers.
3. There is no local-minimum problem.
(2) Disadvantages
1. It does not perform well on high-dimensional data.
2. The number of clusters cannot be specified.

A manual implementation of the algorithm described above:

def mean_shift(data, radius=2.5):
    clusters = []
    for i in range(len(data)):
        # treat each data point as an initial cluster centroid
        cluster_centroid = data[i]
        # visit count of every data point for this cluster
        cluster_frequency = np.zeros(len(data))
        while True:
            temp_data = []
            for j in range(len(data)):  # scan all points on every iteration
                v = data[j]
                # np.linalg.norm computes the Euclidean distance
                if np.linalg.norm(v - cluster_centroid) <= radius:
                    # gather every point within the radius
                    temp_data.append(v)
                    cluster_frequency[j] += 1  # record the visit of this point
            old_centroid = cluster_centroid
            # new centroid: the average of all vectors inside the window
            new_centroid = np.average(temp_data, axis=0)
            cluster_centroid = new_centroid
            # stop iterating once the centroid no longer moves
            if np.array_equal(new_centroid, old_centroid):
                break
        # merge with an existing cluster if the two centroids are within the radius
        has_same_cluster = False
        for cluster in clusters:
            if np.linalg.norm(cluster['centroid'] - cluster_centroid) <= radius:
                has_same_cluster = True
                cluster['frequency'] = cluster['frequency'] + cluster_frequency
                break
        # otherwise keep it as a new cluster
        if not has_same_cluster:
            clusters.append({
                'centroid': cluster_centroid,
                'frequency': cluster_frequency})
    print("clusters (", len(clusters), '): ', clusters)
    clustering(data, clusters)

# cluster the data by visit frequency: each point goes to the cluster that visited it most
def clustering(data, clusters):
    t = []
    for cluster in clusters:
        cluster['data'] = []
        t.append(cluster['frequency'])
    t = np.array(t)
    for i in range(len(data)):
        column_frequency = t[:, i]
        cluster_index = np.where(column_frequency == np.max(column_frequency))[0][0]
        clusters[cluster_index]['data'].append(data[i])

16. Python basics

# 1. swap variable values
a, b = 5, 10
a, b = b, a

# 2. join list elements into a string
a = ['python', 'is', 'good']
print("".join(a))

# 3. find the most frequent value in a list
a = [1,2,3,1,2,3,3,4,5,1]
print(max(set(a), key=a.count))   # returns 1

from collections import Counter
cnt = Counter(a)
print(cnt.most_common(3))       # returns [(1, 3), (3, 3), (2, 2)]

# 4. check whether two strings have the same letters in a different order
from collections import Counter
Counter(str1) == Counter(str2)

# 5. reverse a string
a = 'ahbgkssjf'
print(a[::-1])

for char in reversed(a):
    print(char)

num = 123456789
print(int(str(num)[::-1]))

# 6. reverse a list
a = [5,4,3,2,1]
print(a[::-1])

for ele in reversed(a):
    print(ele)

# 7. transpose a 2-D array
orig = [ ['a','b'], ['c','d'], ['e','f'] ]
transpose = zip(*orig)
print(list(transpose))

# 8. chained comparison
b = 6
print(4 < b < 7)      # True
print(1 == b < 20)    # False

# 9. chained function call
def product(a, b):
    return a * b
def add(a, b):
    return a + b

b = True
print((product if b else add)(5, 7))   # 35

# 10. copies: reference vs shallow vs deep
b = a       # no copy at all: b is an alias, so b[0]=10 modifies both a and b
b = a[:]    # shallow copy: b[0]=10 modifies only b

a = [1,2,3]
b = a.copy()   # shallow copy

from copy import deepcopy
b = deepcopy(a)   # deep copy: nested objects are copied as well

# 11. remove duplicate elements from a list
li = [2,2,3,4,5]
new = list(set(li))

from collections import OrderedDict   # preserves order
item = ['foo','bar', 'bar','foo']
print(list(OrderedDict.fromkeys(item).keys()))
# returns ['foo', 'bar']

# 12. merge dictionaries
d1 = {'a': 1}
d2 = {'b': 2}
print({**d1, **d2})                   # method 1
dict(d1.items() | d2.items())         # method 2
d1.update(d2)                         # method 3
 
# 13. index of the min/max value in a list
ab = [40,10,20,30]
def minIndex(ab):
    return min(range(len(ab)), key=ab.__getitem__)

def maxIndex(ab):
    return max(range(len(ab)), key=ab.__getitem__)

# 14. join a list with a separator
it = ['ab','gh','hj']
print(','.join(it))      # 'ab,gh,hj'
''.join(it)              # returns 'abghhj'

data = [1,3,'abc',5]
''.join(map(str, data))  # returns '13abc5'

# 15. dict.get: look up with a default
d = {'b': 2, 'a': 1}
d.get('c', 3)   # -> 3 (key missing, default returned)
d.get('a', 3)   # -> 1

# 16. sort a dict by value
dd = {'apple': 10, 'banana': 5, 'tomato': 20}
print(sorted(dd.items(), key=lambda x: x[1]))
# returns: [('banana', 5), ('apple', 10), ('tomato', 20)]

17. for else

A for loop's else clause runs only when the loop finishes without hitting break, which makes it handy for search loops.
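A minimal sketch:

# the else branch runs only if the loop completed without `break`
for n in [2, 3, 5, 7]:
    if n % 4 == 0:
        print('found a multiple of 4:', n)
        break
else:
    print('no multiple of 4 found')   # printed here, since no break occurred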
Origin blog.csdn.net/qq_45752541/article/details/108530836