Table of contents
Single model training skills - CosineAnnealingWarmRestarts
Tasks and Datasets
Task
Objective - Image Classification
1. Solve image classification with convolutional neural networks.
2. Improve the performance with data augmentations.
3. Understand popular image model techniques such as residual connections.
data set
● The images are collected from the food-11 dataset classified into 11 classes.
● Training set: 9866 labeled images
● Validation set: 3430 labeled images
● Testing set: 3347 images
Download address: ML2022Spring-HW3 | Kaggle
Baseline
Simple : 0.50099
Medium : 0.73207 Training Augmentation + Train Longer
Strong : 0.81872 Training Augmentation + Model Design + Train Looonger (+
Cross Validation + Ensemble)
Boss : 0.88446 Training Augmentation + Model Design +Test Time
Augmentation + Train Looonger (+ Cross Validation + Ensemble)
Guide package
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
# "ConcatDataset" and "Subset" are possibly useful when doing semi-supervised learning.
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, VisionDataset
# This is for the progress bar.
from tqdm.auto import tqdm
from d2l import torch as d2l
import random
def same_seeds(seed):
    """Seed every random number generator this file relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and, when available, all CUDA devices).
    """
    # Python's built-in RNG is used elsewhere in this file (random.shuffle),
    # so it has to be seeded as well for runs to be repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
data processing
Transforms
# Evaluation pipeline: no augmentation, only a fixed resize followed by
# conversion of the PIL image to a tensor.
test_tfm = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])
# Training pipeline. It can also be applied to test images to produce several
# randomized views for test-time augmentation / ensembling.
train_tfm = transforms.Compose([
    # Resize the image into a fixed shape (height = width = 128)
    transforms.Resize((128, 128)),
    # Augmentation transforms can be added here.
    # ToTensor() should be the last one of the transforms.
    transforms.ToTensor(),
])
Datasets
class FoodDataset(Dataset):
    """Image dataset whose class label is the integer prefix of the file name.

    File names are parsed as ``<label>_<rest>``; when the prefix is not an
    integer (e.g. unlabeled test images) the label is ``-1``.

    Args:
        path: Directory scanned for ``.jpg`` files. When falsy, ``files`` is
            used instead.
        tfm: Callable applied to each opened PIL image.
        files: Explicit sequence of image paths, used only when ``path`` is
            not given.
    """

    def __init__(self, path=None, tfm=test_tfm, files=None):
        # The original ``super(FoodDataset).__init__()`` only initialised a
        # throwaway super object and never ran the base-class initialiser.
        super().__init__()
        self.path = path
        if path:
            self.files = sorted(
                os.path.join(path, x) for x in os.listdir(path) if x.endswith(".jpg")
            )
        else:
            self.files = files
        self.transform = tfm

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for the file at ``idx``."""
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        try:
            # basename() honours the host OS path separator, so no manual
            # "/" vs "\\" switch is needed.
            label = int(os.path.basename(fname).split("_")[0])
        except ValueError:
            label = -1  # test has no label
        return im, label
data loading function
def loadData(dataset_dir, batch_size, num_workers, train_tfm, test_tfm):
    """Build the training and validation DataLoaders.

    ``dataset_dir`` must contain ``training`` and ``validation``
    sub-directories. Both loaders shuffle and drop the final incomplete batch.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    loader_kwargs = dict(
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True,
    )
    train_set = FoodDataset(os.path.join(dataset_dir, "training"), tfm=train_tfm)
    train_loader = DataLoader(train_set, **loader_kwargs)
    valid_set = FoodDataset(os.path.join(dataset_dir, "validation"), tfm=test_tfm)
    valid_loader = DataLoader(valid_set, **loader_kwargs)
    # Report dataset sizes and the (fractional) number of batches per epoch.
    for template, subset in (
        ('训练集总长度是 {:d}, batch数量是 {:.2f}', train_set),
        ('验证集总长度是 {:d}, batch数量是 {:.2f}', valid_set),
    ):
        print(template.format(len(subset), len(subset) / batch_size))
    return train_loader, valid_loader
classification model
The model is similar to VGG: both use 3×3 convolution kernels and pooling, repeatedly double the number of channels, and end with linear layers. The difference is that the model here is shallower and uses BatchNorm.
class Classifier(nn.Module):
    """VGG-style CNN mapping a [3, 128, 128] image to 11 class logits."""

    def __init__(self):
        super().__init__()
        # Every stage is Conv(3x3, stride 1, pad 1) -> BatchNorm -> ReLU ->
        # MaxPool(2), so the spatial size halves per stage:
        # 128 -> 64 -> 32 -> 16 -> 8 -> 4.
        widths = (3, 64, 128, 256, 512, 512)
        stages = []
        for in_ch, out_ch in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Conv2d(in_ch, out_ch, 3, 1, 1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
                nn.MaxPool2d(2, 2, 0),
            ]
        # A flat Sequential keeps the parameter names cnn.0, cnn.1, ... intact.
        self.cnn = nn.Sequential(*stages)
        self.fc = nn.Sequential(
            nn.Linear(512 * 4 * 4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 11),
        )

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        features = self.cnn(x)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)
train
training function
def trainer(train_loader, val_loader, model, config, devices):
    """Train ``model`` with AdamW + CosineAnnealingWarmRestarts and checkpoint it.

    With a validation loader, the weights are saved to ``config['model_path']``
    whenever validation accuracy beats ``config['best_acc']`` and training
    stops early after more than ``patience`` epochs without improvement.
    Without one, the weights are saved once after the final epoch.

    Args:
        train_loader: Iterable of ``(data, labels)`` training batches.
        val_loader: Iterable of validation batches, or ``None``.
        model: Module to optimise; expected to already be on ``devices[0]``.
        config: Dict read for ``learning_rate``, ``weight_decay``, ``T_0``,
            ``T_mult``, ``eta_min_ratio``, ``num_epoch``, ``patience``,
            ``show_num`` and ``model_path``. ``best_acc`` is read and updated
            in place.
        devices: Sequence of devices; only ``devices[0]`` is used.
    """
    device = devices[0]
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config['weight_decay'],
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=config['T_0'],
        T_mult=config['T_mult'],
        eta_min=config['learning_rate'] / config['eta_min_ratio'],
    )
    n_epochs, patience = config['num_epoch'], config['patience']
    num_batches = len(train_loader)
    # Report every `show_batches` batches; clamp to 1 so the modulo below can
    # never divide by zero when show_num exceeds the number of batches.
    show_batches = max(1, num_batches // config['show_num'])

    # Create the checkpoint directory (including nested parents) if needed.
    save_dir = os.path.dirname(config['model_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    legend = ['train loss', 'train acc']
    if val_loader is not None:
        legend.append('valid loss')
        legend.append('valid acc')
    animator = d2l.Animator(xlabel='epoch', xlim=[0, n_epochs], legend=legend)

    # Epochs since the last improvement. Must exist before the loop: when
    # resuming with a non-zero config['best_acc'] the first epoch may already
    # take the "no improvement" branch.
    stale = 0
    for epoch in range(n_epochs):
        # training
        model.train()
        run_correct, run_loss, run_samples, run_batches = 0, 0.0, 0, 0
        for i, (data, labels) in enumerate(train_loader):
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            _, train_pred = torch.max(outputs, 1)  # index of the highest-scoring class
            run_correct += (train_pred.detach() == labels.detach()).sum().item()
            run_loss += loss.item()
            # Count real samples instead of assuming every batch has the size
            # of the most recent one.
            run_samples += labels.size(0)
            run_batches += 1
            if (i + 1) % show_batches == 0:
                train_acc = run_correct / run_samples
                train_loss = run_loss / run_batches
                print('train_acc {:.3f}, train_loss {:.3f}'.format(train_acc, train_loss))
                animator.add(epoch + (i + 1) / num_batches, (train_loss, train_acc, None, None))
                run_correct, run_loss, run_samples, run_batches = 0, 0.0, 0, 0
        scheduler.step()

        # validation
        if val_loader is not None:
            model.eval()
            val_correct, val_loss_sum, val_samples, val_batches = 0, 0.0, 0, 0
            with torch.no_grad():
                for data, labels in val_loader:
                    data, labels = data.to(device), labels.to(device)
                    outputs = model(data)
                    loss = criterion(outputs, labels)
                    _, val_pred = torch.max(outputs, 1)
                    val_correct += (val_pred == labels).sum().item()
                    val_loss_sum += loss.item()
                    val_samples += labels.size(0)
                    val_batches += 1
            val_acc = val_correct / max(val_samples, 1)
            val_loss = val_loss_sum / max(val_batches, 1)
            print('val_acc {:.3f}, val_loss {:.3f} '.format(val_acc, val_loss))
            animator.add(epoch + 1, (None, None, val_loss, val_acc))

            # if the model improves, save a checkpoint at this epoch
            if val_acc > config['best_acc']:
                config['best_acc'] = val_acc
                torch.save(model.state_dict(), config['model_path'])
                stale = 0
            else:
                stale += 1
            if stale > patience:
                print(f"No improvment {patience} consecutive epochs, early stopping")
                break

    # if not validating, save the last epoch
    if val_loader is None:
        torch.save(model.state_dict(), config['model_path'])
Download Data
# Build the training / validation loaders from the food-11 directory.
batch_size = 256
num_workers= 4
dataset_dir = "data/food11"
train_loader, val_loader = loadData(dataset_dir, batch_size, num_workers, train_tfm, test_tfm)
train
devices = d2l.try_all_gpus()
print(f'DEVICE: {devices}')
# fix random seed
seed = 0 # random seed
same_seeds(seed)
config = {
    # training parameters
    'num_epoch': 30, # the number of training epochs
    'learning_rate': 1e-4, # learning rate passed to the optimizer
    'weight_decay': 1e-4,
    # best validation accuracy so far; trainer() updates it in place
    'best_acc': 0.0,
    # CosineAnnealingWarmRestarts arguments; eta_min = learning_rate / eta_min_ratio
    'T_0': 2,
    'T_mult': 2,
    'eta_min_ratio':20,
    # early stopping: epochs without validation improvement that are tolerated
    'patience': 300,
    'show_num': 1 # how many times per epoch the training loss is printed
}
config['model_path'] = './models/foldmodel' + str(config['learning_rate']) # the path where the checkpoint will be saved
model = Classifier().to(devices[0])
trainer(train_loader, val_loader, model, config, devices)
predict
# Single-model inference
def pred(test_loader, model, devices):
    """Return the predicted class index for every sample in ``test_loader``.

    Args:
        test_loader: Iterable of batches whose first element is the input
            tensor.
        model: Trained module, expected to already be on ``devices[0]``.
        devices: Sequence of devices; only ``devices[0]`` is used.

    Returns:
        1-D NumPy array of predicted class indices, in loader order.
    """
    batch_preds = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader):
            features = batch[0].to(devices[0])
            outputs = model(features)
            _, test_pred = torch.max(outputs, 1)  # index of the highest-scoring class
            batch_preds.append(test_pred.cpu())
    return torch.cat(batch_preds, dim=0).numpy()
# Test loader: shuffle=False keeps predictions in the dataset's (sorted) file order.
batch_size = 512
test_set = FoodDataset(os.path.join(dataset_dir,"test"), tfm=test_tfm)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2)
model_best = Classifier().to(devices[0])
# NOTE(review): this checkpoint name differs from the config['model_path']
# written by the training cell above; confirm it points at the intended weights.
model_best.load_state_dict(torch.load('models/model0.0005Fold_0_best'))
prediction = pred(test_loader, model_best, devices)
output
#create test csv
def pad4(i):
    """Left-pad the text form of ``i`` with zeros to at least four characters."""
    return str(i).rjust(4, "0")
df = pd.DataFrame()
# Ids run from 1 to len(test_set), zero-padded to four characters.
df["Id"] = [pad4(i) for i in range(1,len(test_set)+1)]
df["Category"] = prediction
df.to_csv("submission best0.csv",index = False)
answer
data augmentation
With data augmentation, the same sample is presented to the model differently in each epoch during training; the purpose is to prevent the model from overfitting.
# Evaluation pipeline: no augmentation, only a fixed resize followed by
# conversion of the PIL image to a tensor.
test_tfm = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])
# Training pipeline with random augmentations. It can also be applied to test
# images to produce several randomized views for test-time augmentation.
train_tfm = transforms.Compose([
    # Take a random crop (scale 0.5-1, aspect ratio 0.5-2) and resize it to 128 x 128.
    transforms.RandomResizedCrop((128, 128), scale=(0.5, 1), ratio=(0.5, 2)),
    # Random geometric and photometric augmentations.
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomVerticalFlip(0.5),
    transforms.RandomGrayscale(0.2),
    transforms.RandomRotation(30),
    transforms.RandomAffine(30),
    transforms.RandomSolarize(threshold=192.0, p=0.2),
    transforms.ColorJitter(brightness=0.4,contrast=0.4, saturation=0.4),
    # ToTensor() should be the last one of the transforms.
    transforms.ToTensor(),
])
The following is a more complex version. In fact, some data augmentation methods are not so commonly used.
# Evaluation pipeline: no augmentation, only a fixed resize followed by
# conversion of the PIL image to a tensor.
test_tfm = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])
# Heavier training pipeline. It can also be applied to test images to produce
# several randomized views for test-time augmentation.
train_tfm = transforms.Compose([
    # Take a random crop (scale 0.5-1, aspect ratio 0.5-2) and resize it to 128 x 128.
    transforms.RandomResizedCrop((128, 128), scale=(0.5, 1), ratio=(0.5, 2)),
    # Random geometric and photometric augmentations.
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomVerticalFlip(0.5),
    transforms.RandomGrayscale(0.2),
    transforms.RandomRotation(180),
    transforms.ColorJitter(brightness=0.4,contrast=0.4, saturation=0.4),
    # NOTE(review): RandomGrayscale is applied a second time here; confirm it is intended.
    transforms.RandomGrayscale(p=0.2),
    transforms.AutoAugment(transforms.AutoAugmentPolicy.IMAGENET),
    transforms.RandomInvert(p=0.2),
    transforms.RandomAffine(30),
    transforms.RandomSolarize(threshold=192.0, p=0.2),
    transforms.RandomPosterize(bits=2),
    transforms.RandomEqualize(p=0.2),
    #transforms.RandomApply(torch.nn.ModuleList([]))
    # ToTensor() should be the last one of the transforms.
    transforms.ToTensor(),
])
Use source code model
Directly use the classification model given by the original code, plus data augmentation, careful training, no other techniques are used, the score is 0.83764, easily surpassing the strong line
# Rebuild the loaders so that the augmented train_tfm is used for training.
batch_size = 256
num_workers= 4
dataset_dir = "data/food11"
train_loader, val_loader = loadData(dataset_dir, batch_size, num_workers, train_tfm, test_tfm)
devices = d2l.try_all_gpus()
print(f'DEVICE: {devices}')
# fix random seed
seed = 0 # random seed
same_seeds(seed)
config = {
    # training parameters
    'num_epoch': 30, # the number of training epochs
    'learning_rate': 1e-4, # learning rate passed to the optimizer
    'weight_decay': 1e-4,
    # best validation accuracy so far; trainer() updates it in place
    'best_acc': 0.0,
    # CosineAnnealingWarmRestarts arguments; eta_min = learning_rate / eta_min_ratio
    'T_0': 2,
    'T_mult': 2,
    'eta_min_ratio':20,
    # early stopping: epochs without validation improvement that are tolerated
    'patience': 300,
    'show_num': 1 # how many times per epoch the training loss is printed
}
config['model_path'] = './models/foldmodel' + str(config['learning_rate']) # the path where the checkpoint will be saved
model = Classifier().to(devices[0])
trainer(train_loader, val_loader, model, config, devices)
Ensemble+TTA
Model Integration Ensemble
For each sample batch, the prediction matrix of each model is calculated and summed to obtain a prediction matrix. For each sample, the category with the highest score is selected as the predicted category. Save the prediction results of each batch in the same list, this is the final result
Several ways of integration:
● Average of logits or probability : Need to save verbose output, less ambiguous
● Voting : Easier to implement, need to break ties
● Coding : basic math operations with numpy or torch
# Model ensemble: sum the outputs of every model and take the arg-max class.
prediction = []
with torch.no_grad():
    for data in test_loader:
        # Move the batch to the device once instead of once per model.
        inputs = data[0].to(devices[0])
        test_preds = []
        for model_best in models:
            test_preds.append(model_best(inputs).cpu().numpy())
        test_preds = sum(test_preds)
        test_label = np.argmax(test_preds, axis=1)
        # argmax over axis 1 is already 1-D. The former squeeze() collapsed a
        # one-sample batch to a scalar, whose tolist() is not a list.
        prediction += test_label.tolist()
Test Time Augmentation
Create one test set using test_tfm and 5 test sets using train_tfm, and collect the list preds, which contains 6 elements. The first is the prediction matrix obtained with test_tfm on the test set; the other five are the prediction matrices obtained with train_tfm on the test set. Each matrix has shape (3347, 11), where 3347 is the number of samples and 11 is the number of categories.
# TTA (test-time augmentation)
batch_size = 512
num_tta = 5  # number of randomly augmented passes over the test set
test_set = FoodDataset(os.path.join(dataset_dir,"test"), tfm=test_tfm)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2)
# One loader per augmented pass; train_tfm is random, so each pass sees
# different views of the same images.
test_loaders = []
for i in range(num_tta):
    test_set_i = FoodDataset(os.path.join(dataset_dir,"test"), tfm=train_tfm)
    test_loader_i = DataLoader(test_set_i, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    test_loaders.append(test_loader_i)
model_best = Classifier().to(devices[0])
model_best.load_state_dict(torch.load('models23/model0.0005Fold_0_best'))
model_best.eval()
# preds[0]: outputs for the un-augmented test set; preds[1:]: one list per
# augmented pass. Each list holds one output row per sample.
preds = [[] for _ in range(num_tta + 1)]
with torch.no_grad():
    for k, loader in enumerate([test_loader] + test_loaders):
        for data, _ in loader:
            batch_preds = model_best(data.to(devices[0])).cpu().numpy()
            preds[k].extend(batch_preds)
Take the weighted average of the prediction results of the 6 test sets to obtain a (3347, 11) matrix. It is best not to overwrite preds but to save the result in a new variable, so that different weight coefficients can be tried conveniently to get better results.
# Stack the per-view outputs into one numeric array (views, samples, classes).
# A float array avoids the slow element-wise Python arithmetic of dtype=object.
preds_np = np.asarray(preds, dtype=np.float64)
print(preds_np.shape)
# One weight per view: index 0 is the un-augmented pass, the rest are the
# augmented passes. Edit this list to try other coefficients.
tta_weights = [0.6, 0.1, 0.1, 0.1, 0.1, 0.1]
bb = sum(w * view for w, view in zip(tta_weights, preds_np))
print(bb.shape)
prediction = np.argmax(bb, axis=1)
Ensemble+TTA
write these two pieces together
batch_size = 256
num_tta = 5  # number of randomly augmented passes over the test set


def _ensemble_outputs(loader, models, device):
    """Return one row per sample holding the outputs summed over ``models``."""
    rows = []
    with torch.no_grad():
        for data, _ in loader:
            inputs = data.to(device)
            batch_preds = sum(m(inputs).cpu().numpy() for m in models)
            # Keep the (batch, classes) layout: squeezing a one-sample batch
            # would spread its row over separate scalar entries.
            rows.extend(batch_preds.tolist())
    return rows


test_loaders = []
for i in range(num_tta):
    test_set_i = FoodDataset(os.path.join(dataset_dir,"test"), tfm=train_tfm)
    test_loader_i = DataLoader(test_set_i, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    test_loaders.append(test_loader_i)
test_set = FoodDataset(os.path.join(dataset_dir,"test"), tfm=test_tfm)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
# preds[0]: ensemble outputs on the un-augmented test set; preds[1:]: one
# entry per augmented pass.
preds = [
    _ensemble_outputs(loader, models, devices[0])
    for loader in [test_loader] + test_loaders
]
The result is very close to the boss line
Cross Validation
Pool the training set and the validation set into one list of samples, total_files, and perform 5-fold cross-validation: use the np.array_split function to split it into 5 parts, and in each of the 5 rounds use one part as the validation set and the remaining parts as the training set. Use config['model_path'] to record the save path, and use config['best_accs'] to record the best performance of each model on the validation set.
def trainer_k_folds(config, dataset_dir, batch_size, train_tfm, test_tfm, devices, num_workers=4):
    """Train one model per fold of a K-fold split of training + validation images.

    The ``training`` and ``validation`` directories under ``dataset_dir`` are
    pooled, shuffled and split into ``config['num_folds']`` parts. Each fold's
    checkpoint is saved to ``config['model_path'] + "Fold_<i>_best"`` and its
    best validation accuracy is appended to ``config['best_accs']``.

    Args:
        config: Training configuration forwarded to ``trainer``; also read for
            ``num_folds`` and ``model_path`` and appended to via ``best_accs``.
        dataset_dir: Root directory of the dataset.
        batch_size: Batch size for both loaders.
        train_tfm: Transform for the training part of each fold.
        test_tfm: Transform for the held-out part of each fold.
        devices: Sequence of devices; the model is placed on ``devices[0]``.
        num_workers: Worker count for both DataLoaders.
    """
    train_dir = os.path.join(dataset_dir, "training")
    val_dir = os.path.join(dataset_dir, "validation")
    # Sort first: os.listdir returns entries in arbitrary order, so a fixed
    # seed alone would not give a reproducible split.
    train_files = sorted(os.path.join(train_dir, x) for x in os.listdir(train_dir) if x.endswith('.jpg'))
    val_files = sorted(os.path.join(val_dir, x) for x in os.listdir(val_dir) if x.endswith('.jpg'))
    total_files = np.array(train_files + val_files)
    # Shuffle with NumPy's RNG (which same_seeds() seeds) rather than
    # random.shuffle, which is not meant for ndarrays.
    np.random.shuffle(total_files)

    num_folds = config['num_folds']
    fold_indices = np.array_split(np.arange(len(total_files)), num_folds)
    base_path = config['model_path']
    for i in range(num_folds):
        print(f'\n\nStarting Fold: {i} ********************************************')
        # Filtering the list works whether or not the folds have equal length;
        # np.delete on an object array of equal-length folds would flatten it.
        train_idx = np.concatenate([fold for j, fold in enumerate(fold_indices) if j != i])
        train_data = total_files[train_idx]
        val_data = total_files[fold_indices[i]]

        train_set = FoodDataset(tfm=train_tfm, files=train_data)
        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                                  num_workers=num_workers, pin_memory=True, drop_last=True)
        valid_set = FoodDataset(tfm=test_tfm, files=val_data)
        valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True,
                                  num_workers=num_workers, pin_memory=True, drop_last=True)
        print('训练集总长度是 {:d}, batch数量是 {:.2f}'.format(len(train_set), len(train_set)/ batch_size))
        print('验证集总长度是 {:d}, batch数量是 {:.2f}'.format(len(valid_set), len(valid_set)/ batch_size))

        config['model_path'] = base_path + f"Fold_{i}_best"
        config['best_acc'] = 0.0
        model = Classifier().to(devices[0])
        # Optional warm start from a checkpoint pre-trained for a few epochs,
        # which may speed up every fold:
        # model.load_state_dict(torch.load('models/foldmodel0.0001'))
        try:
            trainer(train_loader, valid_loader, model, config, devices)
        finally:
            # Always restore the base path so a failed fold does not leave a
            # fold suffix behind in the shared config.
            config['model_path'] = base_path
        config['best_accs'].append(config['best_acc'])
train
# Settings for the 5-fold cross-validation run.
batch_size = 256
dataset_dir = "data/food11"
devices = d2l.try_all_gpus()
print(f'DEVICE: {devices}')
# fix random seed
seed = 0 # random seed
same_seeds(seed)
config = {
    # training parameters
    'num_epoch': 300, # the number of training epochs
    'learning_rate': 5e-4, # learning rate passed to the optimizer
    'weight_decay': 1e-4,
    # best validation accuracy so far; reset per fold by trainer_k_folds()
    'best_acc': 0.0,
    # CosineAnnealingWarmRestarts arguments; eta_min = learning_rate / eta_min_ratio
    'T_0': 16,
    'T_mult': 1,
    'eta_min_ratio':50,
    # early stopping: epochs without validation improvement that are tolerated
    'patience': 32,
    'num_folds':5,
    'show_num': 1,
    # filled by trainer_k_folds() with the best accuracy of each fold
    'best_accs': []
}
config['model_path'] = './models22/model' + str(config['learning_rate']) # the path where the checkpoint will be saved
# Initialize a model, and put it on the device specified.
# NOTE(review): this instance is not passed to trainer_k_folds(), which builds
# its own model for every fold.
model = Classifier().to(devices[0])
trainer_k_folds(config, dataset_dir, batch_size, train_tfm, test_tfm, devices)
After cross-validating 5 models, perform model integration and TTA, the result is shown in the figure, which is worse than the boss line
discuss
no data augmentation
Tried various learning rates, and trained without data augmentation, the results are similar to the figure below.
It can be seen that the training error quickly drops to 0 while the validation error stays at a high level. Because the training error is 0, the back-propagated gradient is also 0, so the model cannot continue to improve. This is typical overfitting.
Regularization can prevent over-fitting of the model. CS231n lists data augmentation as one of the means of regularization (refer to 2022 Cs231n PPT notes-training CNN_iwill323 blog-CSDN blog ), at first I did not understand why data augmentation also It belongs to regularization, and now I understand it after finishing this homework. After a series of random changes to the training images, similar but different training samples are generated, in effect increasing the size of the training set. Randomly changing training samples can reduce the model's dependence on certain properties, thereby improving the generalization ability of the model.
Single model training skills - CosineAnnealingWarmRestarts
Use learning_rate = 1e-4, weight_decay = 5e-3, T_0 = 2, T_mult = 2, and train the classifier() of the original code. The initial training curve is as shown below.
You can see the obvious wave shape, this is because CosineAnnealingWarmRestarts adjusts the learning rate according to the interval of the geometric sequence, that is, according to the parameter setting of T_0 = 2, T_mult = 2, every 2, 4, 8, 16... epoch cycle adjustment learning rate.
Let me talk about my understanding of CosineAnnealingWarmRestarts:
The wavy learning curve is actually problematic. The loss dropped in the previous cycle, in the next cycle, because the learning rate returns to a high point, the loss suddenly rises, and the next new cycle will take a lot of epochs to drop to the loss level of the previous cycle, which is undoubtedly very low efficient. So, why interrupt the decline process of loss and let the loss fluctuate, instead of keeping the loss on a downward trend?
The reason is that the downward trend of the loss cannot actually be maintained. If the learning rate is not adjusted in time, the loss will eventually enter a "plateau" in which it declines only very slowly. During training it is difficult to know which learning rate to use at which stage. The CosineAnnealingWarmRestarts algorithm continuously "stirs" the learning rate, so that training keeps trying new learning-rate cycles and is more likely to reach a good optimization result.
Of course, we still want the loss to decline in a straight line, not in waves. If the loss jumps too high at the start of a new cycle and takes a long time to "recover", the maximum learning rate may be too large and can be reduced; if the loss drops too gently at the end of each cycle, the minimum learning rate is probably too small and eta_min can be increased.
Continue training and find that the loss curve is not wavy and shows a downward trend, which is ideal. Save the checkpoint and submit the prediction result with a score of 0.80478.
From the end of the above figure, it can be found that the loss can continue to decline, so the learning rate is adjusted to 5e-5 to continue training. At the beginning, the initial learning rate returns to a high point, so the loss increases. The final result is good, it seems that there is still room for improvement
During the training process, checkpoints are saved at different points, and the scores are as shown in the figure below
Model | Score |
model2 | 0.81872 |
model3 | 0.83764 |
model5 | 0.78685 |
model6 | 0.76792 |
model7 | 0.80478 |
model8 | 0.80577 |
model5e-05 | 0.83167 |
Model3 nevertheless has the highest score. This is because every time training is resumed from a checkpoint, best_acc starts at 0, so the saved model is overwritten after the first epoch, and the wavy nature of CosineAnnealingWarmRestarts cannot guarantee that later results will be better. So before every resume from a checkpoint, set config['best_acc'] to the best accuracy reached so far.
ResNet overfitting
class Residual_Block(nn.Module):
    """Basic residual block: two 3x3 conv + BatchNorm layers plus a shortcut.

    Args:
        ic: Number of input channels.
        oc: Number of output channels.
        stride: Stride of the first convolution (and of the shortcut projection).
    """

    def __init__(self, ic, oc, stride=1):
        super().__init__()
        first_conv = nn.Conv2d(ic, oc, kernel_size=3, padding=1, stride=stride)
        self.conv1 = nn.Sequential(first_conv, nn.BatchNorm2d(oc), nn.ReLU(inplace=True))
        second_conv = nn.Conv2d(oc, oc, kernel_size=3, padding=1)
        self.conv2 = nn.Sequential(second_conv, nn.BatchNorm2d(oc))
        self.relu = nn.ReLU(inplace=True)
        # A 1x1 projection is needed whenever the shortcut's shape would not
        # match the main path. (Author's note: for resnet18 the stride check
        # alone is not required.)
        if stride == 1 and ic == oc:
            self.conv3 = None
        else:
            projection = nn.Conv2d(ic, oc, kernel_size=1, stride=stride)
            self.conv3 = nn.Sequential(projection, nn.BatchNorm2d(oc))

    def forward(self, X):
        """Return ``relu(main_path(X) + shortcut(X))``."""
        Y = self.conv2(self.conv1(X))
        shortcut = X if self.conv3 is None else self.conv3(X)
        Y += shortcut
        return self.relu(Y)
class ResNet(nn.Module):
    """ResNet-style classifier: conv stem, four residual stages, pooled linear head.

    Args:
        block: Residual block class, called as ``block(ic, oc, stride)``.
        num_layers: Number of blocks in each of the four stages.
        num_classes: Size of the output logit vector.
    """

    # An immutable tuple default avoids sharing one mutable list between calls.
    def __init__(self, block=Residual_Block, num_layers=(2, 2, 2, 2), num_classes=11):
        super().__init__()
        self.preconv = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        self.layer0 = self.make_residual(block, 64, 64, num_layers[0])
        self.layer1 = self.make_residual(block, 64, 128, num_layers[1], stride=2)
        self.layer2 = self.make_residual(block, 128, 256, num_layers[2], stride=2)
        self.layer3 = self.make_residual(block, 256, 512, num_layers[3], stride=2)
        self.postliner = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(512, num_classes),
        )

    def make_residual(self, block, ic, oc, num_layer, stride=1):
        """Build one stage: a (possibly strided) ``ic -> oc`` block, then ``oc -> oc`` blocks."""
        layers = [block(ic, oc, stride)]
        layers.extend(block(oc, oc) for _ in range(1, num_layer))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch ``x`` (shape comments assume 128x128 input)."""
        out = self.preconv(x)
        out = self.layer0(out)  # [64, 32, 32]
        out = self.layer1(out)  # [128, 16, 16]
        out = self.layer2(out)  # [256, 8, 8]
        out = self.layer3(out)  # [512, 4, 4]
        out = self.postliner(out)
        return out
I wrote a ResNet18 by hand. For details, please refer to Li Mu's "Dive into Deep Learning". With learning_rate = 1e-4, you can see that learning is relatively normal at the beginning.
As the learning progressed, it became abnormal, and the training error continued to decrease, but the verification error remained basically unchanged. In the end, the verification error "flattened", which was a bit overfitting.
dropout
Li Hongyi's 2022 Machine Learning HW3 Analysis_Machine Learning Craftsman's Blog-CSDN Blog adopts a drop out design at the end of the residual network.
class Residual_Block(nn.Module):
    """Basic residual block: two 3x3 conv + BatchNorm layers plus a shortcut.

    Args:
        ic: Number of input channels.
        oc: Number of output channels.
        stride: Stride of the first convolution (and of the shortcut projection).
    """

    def __init__(self, ic, oc, stride=1):
        # Signatures for reference:
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        # torch.nn.MaxPool2d(kernel_size, stride, padding)
        super().__init__()
        first_conv = nn.Conv2d(ic, oc, kernel_size=3, stride=stride, padding=1)
        self.conv1 = nn.Sequential(first_conv, nn.BatchNorm2d(oc), nn.ReLU(inplace=True))
        second_conv = nn.Conv2d(oc, oc, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Sequential(second_conv, nn.BatchNorm2d(oc))
        self.relu = nn.ReLU(inplace=True)
        # Project the shortcut with a 1x1 conv only when its shape would
        # otherwise differ from the main path.
        shape_changes = stride != 1 or ic != oc
        if shape_changes:
            projection = nn.Conv2d(ic, oc, kernel_size=1, stride=stride)
            self.downsample = nn.Sequential(projection, nn.BatchNorm2d(oc))
        else:
            self.downsample = None

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.conv2(self.conv1(x))
        residual = x if self.downsample is None else self.downsample(x)
        out += residual
        return self.relu(out)
class Classifier(nn.Module):
    """Residual CNN with a dropout-regularised fully connected head.

    The head's first linear layer expects ``512 * 4 * 4`` features, which
    corresponds to a [3, 128, 128] input after five stride-2 reductions.

    Args:
        block: Residual block class, called as ``block(ic, oc, stride)``.
        num_layers: Number of blocks in each of the four stages.
        num_classes: Size of the output logit vector.
    """

    # An immutable tuple default avoids sharing one mutable list between calls.
    def __init__(self, block=Residual_Block, num_layers=(2, 2, 2, 2), num_classes=11):
        super().__init__()
        self.preconv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )
        self.layer0 = self.make_residual(block, 32, 64, num_layers[0], stride=2)
        self.layer1 = self.make_residual(block, 64, 128, num_layers[1], stride=2)
        self.layer2 = self.make_residual(block, 128, 256, num_layers[2], stride=2)
        self.layer3 = self.make_residual(block, 256, 512, num_layers[3], stride=2)
        self.fc = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(512 * 4 * 4, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes),
        )

    def make_residual(self, block, ic, oc, num_layer, stride=1):
        """Build one stage: a (possibly strided) ``ic -> oc`` block, then ``oc -> oc`` blocks."""
        layers = [block(ic, oc, stride)]
        layers.extend(block(oc, oc) for _ in range(1, num_layer))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch ``x`` (shape comments assume 128x128 input)."""
        # [3, 128, 128]
        out = self.preconv(x)   # [32, 64, 64]
        out = self.layer0(out)  # [64, 32, 32]
        out = self.layer1(out)  # [128, 16, 16]
        out = self.layer2(out)  # [256, 8, 8]
        out = self.layer3(out)  # [512, 4, 4]
        out = self.fc(out.view(out.size(0), -1))
        return out
There was no significant improvement in training
transfer learning
Use the resnet18 that comes with torch, and use the pre-trained weights, and then train as a whole on the training set of this job. The whole process is easy and efficient. The power of transfer learning is reflected. It seems that if resnet18 adopts some training techniques, it can be used in this Successfully trained on the job's training set.
TTA
Do TTA for each model (training 5 train_tfm test sets), most of them have improved significantly.
Model | Score | TTA |
model2 | 0.81872 | 0.82569 |
model3 | 0.83764 | 0.83565 |
model5 | 0.78685 | 0.81274 |
model6 | 0.76792 | 0.7998 |
model7 | 0.80478 | 0.80876 |
model8 | 0.80577 | 0.8227 |
After model 3 has done TTA, its performance is basically unchanged, probably because its performance is already better, and the potential of TTA is not great.
assembly
Model ensembles do not always improve model performance. Integrate model5e-5 with other models, with a score of 0.82569, and integrate model3 with other models, with a score of 0.83366, both of which are not as good as before integration, so it is not possible to integrate excellent models with models with a large gap, and let models with similar performance be integrated Only then will it be effective.
- Integration of the same model
The four models from model5 to model8 are integrated, and the score is 0.8237, which can be said to be greatly improved. It should be noted that these are actually the same model on the same training set: even different checkpoints of the same training run benefit from being combined, which shows the power of ensembling.
The four models from model5 to model8 are integrated + TTA, compared with only integration, there is little change. It may be that the weight coefficient of TTA is not set properly.
- Integration across models
The resnet model trained from scratch (score 0.71414) and model6 (score 0.76792) are integrated across models, and the result is 0.78685, which is more obvious.
Continue to do TTA on the basis of integration, and only make a test set of train_tfm, which has a great improvement effect. The following is the score of different ratios of preds_np[0] and preds_np[1]:
Proportion | 1:1 | 2:1 | 6:1 | 9:1 |
Score | 0.77788 | 0.80776 | 0.81274 | 0.81573 |
- Final Result - Big Brother's Integration
According to the above experience, the three models with the best effect (model3, score 0.83565; model5e-05, score 0.83175; model migration learning, score 0.84163) were integrated and TTA.
Just do the integration:
Integration + TTA
It can be seen that the integration effect has been greatly improved, and TTA can further improve the effect. If it is fine-tuned, it is not a problem to exceed the boss line 0.88446.
Cross-validation
5-fold cross-validation was done, and 5 models were obtained. The results of each model are as follows:
Integrating these 5 models + TTA (5 train_tfm test sets were done), the scores are as follows. The coefficients of preds_np[0] are 1.5, 0.9, 0.6 respectively.
It can be seen that the score is not too high, not as good as the above "no-name army". This is because the scores of several models are not high, so the improvement of the ensemble is limited. It seems that model integration follows the law of "I am a hero and my son is a hero".
Why does the cross-validation model above not score well? This is because the model training process is automatic, and the hyperparameters are stopped without careful tuning, so the score is not high
Considering that the score of model0 is relatively low, remove him and use the remaining 4 models to do integration + TTA, and there is a certain improvement
Then remove the model1 with a lower score, the integrated score is 0.86254, no improvement.
Next, transform the coefficient of preds_np[0] in TTA, and the model score is as follows. It can be found that there is no obvious regularity and the variation of the results is limited. Combined with the previous example, 0.9 seems to be better than 0.6.
Coefficient of preds_np[0] | 0.2 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1.0 | 1.1 |
Score | 0.8645 | 0.8695 | 0.8665 | 0.8625 | 0.8635 | 0.8655 | 0.8655 | 0.8635 | 0.8645 |
num_workers
I didn't pay attention to the impact of num_workers on the calculation speed of the model before. Using num_workers=0 for calculation, it was found that the GPU utilization rate was 0, while the CPU utilization rate was extremely high. The following results are the running time of training the classification model of the original code on the Nvidia 3090 graphics card with different num_workers. Every time after restarting the kernel, test the next num_workers
num_workers | Time-consuming when epoch=5 (s) | Time-consuming when epoch=15 (s) |
0 | 1236.3717732429504 | |
1 | 312.47620368003845 | |
2 | 168.62319421768188 | |
4 | 107.80014061927795 | 324.39282393455505 |
8 | 89.33517622947693 | 294.2875111103058 |
16 | 123.81631231307983 | fail |
32 | 172.18559384346008 | |
The figure below shows GPU and CPU activity while computing 5 epochs with different num_workers. When num_workers=0, the GPU is not utilized and memory utilization is also very low; in fact, the machine is doing the work on the CPU. When num_workers=1, CPU utilization plummets, and GPU utilization rises a bit but is still not high and falls back to 0 after each short burst of computation. This is because the GPU finishes its work quickly but is limited by the data transfer speed: the data supply cannot keep up. As num_workers increases, memory utilization increases, the CPU works harder to feed data to the GPU, and GPU utilization rises. When num_workers is 4, 8 or 16, GPU utilization is high and computation time is low. When num_workers is 32, the CPU uses so many workers to transfer data that coordinating them alone takes a lot of time, so GPU utilization drops again; and when the run is lengthened (15 epochs), the kernel dies outright.
The use of video memory is relatively stable from beginning to end, mainly affected by batch size.
Kaggle was also tested and calculated for 5 epochs:
num_workers | 0 | 2 | 4 | 8 |
Time-consuming (s) | 774 | 475 | 434 | 467 |
When using more than 2 num_workers, both kaggle and colab will report a reminder:
This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
That is, they recommend using num_workers=2.
Functions that need attention
np.array_split
Can be used to split training set and validation set in K-fold
import numpy as np
# 23 samples do not divide evenly into 5 folds, so the folds differ in length.
X_train = np.arange(23)
num_folds = 5
X_train_folds = np.array_split(X_train, num_folds)
# dtype=object stores the unequal-length folds as a 1-D array of arrays.
X_train_folds = np.array(X_train_folds, dtype=object)
print(X_train_folds)
[array([0, 1, 2, 3, 4]) array([5, 6, 7, 8, 9]) array([10, 11, 12, 13, 14]) array([15, 16, 17, 18]) array([19, 20, 21, 22])]
# Drop the fold at index 3 and merge the remaining folds into one index array.
a = np.delete(X_train_folds, 3)
print(a)
concat = np.concatenate(a)
print(concat)
[array([0, 1, 2, 3, 4]) array([5, 6, 7, 8, 9]) array([10, 11, 12, 13, 14]) array([19, 20, 21, 22])]
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 19 20 21 22]