Example of a Deep Learning Project

#!/usr/bin/python
                                                                    
import os
import itertools
import argparse
import random
from tqdm import tqdm

import torch
import torch.nn  as nn
import torch.optim as optim

from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.optim import Adam, lr_scheduler


import torchvision
from torchvision.transforms import Compose, Normalize, ToTensor

import numpy as np
import pandas as pd

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# load external modules

# from nets.network_pitch_vit import  PitchTransformer as create_model
#from nets.network_TimeFrequency_vit import  TimeFrequencyTransformerS1 as create_model
from sklearn.metrics import confusion_matrix, accuracy_score
from datetime import  datetime

from  torch.utils.tensorboard import SummaryWriter
import sys

# from config import *
from image_dataloader_v2 import *
from nets.net_Temporal35GroupTrans import temporal_group_Trans as create_model




print ("Train import done successfully")
# input argmuments
parser = argparse.ArgumentParser(description='Temporal_convolution_transformer: Lung Sound Classification')
parser.add_argument('--lr', default=1e-3, type=float, help='learning rate')
parser.add_argument('--weight_decay', default=0.0005, type=float, help='weight decay value')
parser.add_argument('--gpu_ids', default=[0], help='a list of gpus')
parser.add_argument('--num_worker', default=4, type=int, help='number of workers')
parser.add_argument('--batch_size', default=4, type=int, help='batch size')
parser.add_argument('--epochs', default=10, type=int, help='epochs')
parser.add_argument('--start_epochs', default=0, type=int, help='start epochs')

parser.add_argument('--data_dir', type=str, help='data directory')
parser.add_argument('--split_method', default=1, type=int, help='0: official 6-4 split; 1: five-fold split')
parser.add_argument('--folds_file', type=str, help='folds text file')
parser.add_argument('--test_fold', default=4, type=int, help='Test Fold ID')
parser.add_argument('--stetho_id', default=-1, type=int, help='Stethoscope device id')
parser.add_argument('--aug_scale', default=None, type=int, help='Augmentation multiplier')
parser.add_argument('--model_path',type=str, help='model saving directory')
parser.add_argument('--checkpoint', default=None, type=str, help='load checkpoint')

args = parser.parse_args()

################################MIXUP#####################################
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    '''Returns mixed inputs, pairs of targets, and lambda'''
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    if use_cuda:
        index = torch.randperm(batch_size).cuda()
    else:
        index = torch.randperm(batch_size)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
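
# A minimal usage sketch of the two mixup helpers above. The shapes and the call to
# nn.CrossEntropyLoss are illustrative assumptions; demo_mixup() mirrors the commented-out
# mixup lines inside Trainer.train() and is never called by the training code itself.
def demo_mixup():
    criterion = nn.CrossEntropyLoss()
    x = torch.randn(4, 2, 96, 428)     # fake batch of 2-channel time-frequency inputs
    y = torch.randint(0, 4, (4,))      # fake labels for the 4 classes
    mixed_x, y_a, y_b, lam = mixup_data(x, y, alpha=0.5, use_cuda=False)
    logits = torch.randn(4, 4)         # stand-in for self.net(mixed_x)
    return mixup_criterion(criterion, logits, y_a, y_b, lam)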

##############################################################################
def get_score(hits, counts, pflag=False):
    se = (hits[1] + hits[2] + hits[3]) / (counts[1] + counts[2] + counts[3])  # sensitivity: correctly predicted abnormal samples over all abnormal samples
    sp = hits[0] / counts[0]   # specificity: correctly predicted normal samples over all normal samples
    sc = (se+sp) / 2.0       # ICBHI score: the average of sensitivity and specificity

    if pflag:
        print("*************Metrics******************")
        print("Se: {}, Sp: {}, Score: {}".format(se, sp, sc))
        print("Normal: {}, Crackle: {}, Wheeze: {}, Both: {}".format(hits[0]/counts[0], hits[1]/counts[1], 
            hits[2]/counts[2], hits[3]/counts[3]))
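
# A worked example of get_score with assumed toy numbers (not real results):
# hits = [8, 3, 2, 1], counts = [10, 5, 4, 2]
# => Sp = 8/10 = 0.80, Se = (3+2+1)/(5+4+2) = 6/11 ≈ 0.545, Score = (Se+Sp)/2 ≈ 0.673
def demo_get_score():
    get_score([8.0, 3.0, 2.0, 1.0], [10.0, 5.0, 4.0, 2.0], pflag=True)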
    
class Trainer:
    def __init__(self):
        self.args = args
        print(" the current test_fold: \t  epochs \n :", self.args.test_fold,  self.args.epochs)
        print("Compute the mean and std on the whole dataset:",datetime.now())


        # mean = [-37.1571, -38.5051, -38.3118, -38.1477]  # nfft = 512
        # std = [27.2312, 28.2945, 28.2287, 28.1789]

        # nfft = 1024
        # MEAN tensor([0.2208, -57.0988, -49.5006, -39.9595])
        # STD tensor([7.6824, 14.2498, 22.3357, 11.5099])


        mean = [0.2193, -40.3485]
        std = [ 7.5823, 14.1366]
        # mean, std = get_mean_and_std_2channel(image_loader_TGT2channel(self.args.data_dir, self.args.split_method,  self.args.folds_file,
        #     self.args.test_fold, True,  Compose([ToTensor()]),  self.args.aug_scale, self.args.stetho_id,))
        print("MEAN",  mean, "STD", std)

        # note: Compose chains the ToTensor and Normalize transforms into a single input transform
        print("\n Preparing the train dataset: \n", datetime.now())
        self.input_transform = Compose([ToTensor(), Normalize(mean, std)])

        # build every training sample (as a spectrogram-style image) together with its label;
        # image_loader_TGT2channel subclasses torch.utils.data.Dataset and overrides __getitem__(),
        # which returns each sample as a (data, label) tuple
        train_dataset = image_loader_TGT2channel(self.args.data_dir, self.args.split_method, self.args.folds_file, self.args.test_fold,
                True,  self.input_transform, self.args.aug_scale, self.args.stetho_id,)



        test_dataset = image_loader_TGT2channel(self.args.data_dir,self.args.split_method,  self.args.folds_file, self.args.test_fold,
                False,  self.input_transform, self.args.aug_scale, self.args.stetho_id,)

        self.test_ids = np.array(test_dataset.identifiers)  # ndarray of str: '<filename>_<cycle index>_<label>' for each test sample
        self.test_paths = test_dataset.filenames_with_labels  # list of str: the same '<filename>_<cycle index>_<label>' identifiers

        # loading checkpoint


        self.net = create_model(num_class=4, ).cuda()  # move the model onto the GPU
        if self.args.checkpoint is not None:
            checkpoint = torch.load(self.args.checkpoint)
            self.net.load_state_dict(checkpoint)
            # uncomment in case of fine-tuning, and specify the block layer:
            # all layers before block_layer will be frozen during training
            # self.net.fine_tune(block_layer=5)
            print("Pre-trained Model Loaded:", self.args.checkpoint)
        # before wrapping with DataParallel, the module's parameters and buffers must live on device_ids[0]
        self.net = nn.DataParallel(self.net, device_ids=self.args.gpu_ids)

        # weighted sampler

        # note: handle class imbalance by drawing samples according to per-sample weights
        # (a standalone sketch of this mechanism follows the Trainer class below);
        # samples from a rare class get a large sampling weight, so rare classes are drawn more often
        reciprocal_weights = []  # the class probability associated with each sample
        for idx in range(len(train_dataset)):
            # labels[idx]: the class label of this sample;
            # class_probs[i]: the fraction of training samples that belong to class i
            reciprocal_weights.append(train_dataset.class_probs[train_dataset.labels[idx]])
            # reciprocal_weights ends up with one entry per sample, e.g. [0.3, 0.2, ..., 0.1]

        # per-sample sampling weight = 1 / (probability of that sample's class)
        sample_weights = (1 / torch.Tensor(reciprocal_weights))
        # WeightedRandomSampler draws len(train_dataset) sample indices according to sample_weights
        sampler_index = torch.utils.data.sampler.WeightedRandomSampler(sample_weights, len(train_dataset))

        # the DataLoader yields one batch at a time;
        # when a sampler is given, the indices come from that sampler and shuffle keeps its default value False;
        # with shuffle=False and no sampler, a SequentialSampler reads the samples in order
        self.train_data_loader = DataLoader(train_dataset, num_workers=self.args.num_worker,
                batch_size=self.args.batch_size, sampler=sampler_index)   # the training set is split into roughly len(train_dataset)/batch_size batches
        self.val_data_loader = DataLoader(test_dataset, num_workers=self.args.num_worker,
                batch_size=self.args.batch_size, shuffle=False)
        print("DATA LOADED")



        params_to_update = []
        for name,param in self.net.named_parameters():
            if param.requires_grad == True:
                params_to_update.append(param)
                # print("\n ", name, param.size())

        # Observe that all parameters are being optimized
        #self.optimizer = optim.SGD(params_to_update, lr=self.args.lr, momentum=0.9, weight_decay=self.args.weight_decay)
        self.optimizer = optim.Adam(params_to_update, lr=self.args.lr, weight_decay=self.args.weight_decay)

        # Decay LR by a factor
        #self.exp_lr_scheduler = lr_scheduler.StepLR(self.optimizer, step_size=20, gamma=0.33)  # decay the lr every 20 epochs
        #self.exp_lr_scheduler = lr_scheduler.StepLR(self.optimizer, step_size=15, gamma=0.33)  # superseded by the cosine schedule below
        self.exp_lr_scheduler = lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=150, eta_min=0)

        # weights for the loss function
        weights = torch.tensor([3.0, 1.0, 1.0, 1.0], dtype=torch.float32)
        #weights = torch.tensor(train_dataset.class_probs, dtype=torch.float32)
        weights = weights / weights.sum()
        weights = 1.0 / weights
        weights = weights / weights.sum()
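        # with the default [3, 1, 1, 1] above: normalize -> [0.5, 1/6, 1/6, 1/6], take reciprocals -> [2, 6, 6, 6],
        # normalize again -> [0.1, 0.3, 0.3, 0.3]; each abnormal class gets three times the weight of the normal class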
        weights = weights.cuda()

        # weight assigns each class its own loss weight
        self.loss_func = nn.CrossEntropyLoss(weight=weights)
        # reduction='none': no averaging, returns the loss of every individual sample in the batch
        self.loss_nored = nn.CrossEntropyLoss(reduction='none')


    def train(self):
        train_losses = []
        test_losses = []
        test_acc = []
        best_acc = -1

        tb_writer = SummaryWriter()


        # loop over the epochs
        for epoch in range(self.args.start_epochs, self.args.epochs):
            losses = []
            class_hits = [0.0, 0.0, 0.0, 0.0]
            class_counts = [0.0+1e-7, 0.0+1e-7, 0.0+1e-7, 0.0+1e-7]
            running_corrects = 0.0
            denom = 0.0
            classwise_train_losses = [[], [], [], []]  # per-class losses
                
            # read one batch at a time from the dataloader and feed it through the network
            for i, (image,label) in enumerate(tqdm(self.train_data_loader,  desc=' training process')):
                image, label = image.cuda().float(), label.cuda().float()  # image, label : torch.float32
                # in case using mixup, uncomment 2 lines below
                # image, label_a, label_b, lam = mixup_data(image, label, alpha=0.5)
                # image, label_a, label_b = map(Variable, (image, label_a, label_b))

                # print("the input data image shape: \n",image.shape) # torch.Size([1, 4, 96, 428])
                output = self.net(image)
                # output  : torch.float32,   cuda:0;
                # calculate loss from output
                # in case using mixup, uncomment line below and comment the next line
                #loss = mixup_criterion(self.loss_func, output, label_a, label_b, lam)
                label = label.long()   # torch.float32 --> torch.int64
                loss = self.loss_func(output, label)  # weighted cross-entropy over the batch (reduction='mean'); the per-class weights defined above are applied
                loss_nored = self.loss_nored(output, label)  # reduction='none': the loss of every sample in the batch

                _, preds = torch.max(output, 1)  # max over dim 1 (the class dimension): preds holds the index of the highest-scoring class for each sample
                running_corrects += torch.sum(preds == label.data)  # number of correct predictions in this batch, accumulated over the epoch
                denom += len(label.data)  # number of samples seen so far; counted explicitly because the weighted sampler repeats (or skips) samples, so it can differ from len(train_dataset)

                # note: per-class counts are re-counted each epoch instead of taken from the dataset,
                # because the weighted sampler over-samples the rare classes
                for idx in range(preds.shape[0]):   # loop over the predictions of the current batch
                    class_counts[label[idx].item()] += 1.0  # count, per true class, how many samples of that class were seen this epoch
                    if preds[idx].item() == label[idx].item():  # the prediction matches the label
                        class_hits[label[idx].item()] += 1.0  # count, per class, how many samples were predicted correctly this epoch
                    classwise_train_losses[label[idx].item()].append(loss_nored[idx].item())  # collect each sample's loss under its true class

                self.optimizer.zero_grad()   # clear the gradients stored in the optimizer
                loss.backward()  # back-propagate the batch loss from the output layer to the input layer
                self.optimizer.step()  # update the weights using the freshly computed gradients



                losses.append(loss.data.cpu().numpy())  # collect the loss of every batch in a list
                # validation is triggered on the last batch of the epoch, i.e. the model is evaluated once per epoch
                if i % 10000 == len(self.train_data_loader) - 1:  # i is the batch index; this fires on the last batch (len(self.train_data_loader) is the number of batches, assumed to be below 10000)
                    print(" \n =============================================")
                    print("epoch {} iter {}/{} Train Total loss: {}".format(epoch,
                        i, len(self.train_data_loader), np.mean(losses)))  # mean of the per-batch losses accumulated in this epoch
                    print("Train Accuracy: {}".format(running_corrects.double() / denom))  # correct predictions over all predictions
                    print("Classwise_Losses Normal: {}, Crackle: {}, Wheeze: {}, Both: {}".format(np.mean(classwise_train_losses[0]),  # per-class mean loss over the epoch
                        np.mean(classwise_train_losses[1]), np.mean(classwise_train_losses[2]), np.mean(classwise_train_losses[3])))
                    get_score(class_hits, class_counts, True)

                    print("testing......")
                    acc, test_loss = self.evaluate(self.net, epoch, i)


                    train_acc = running_corrects.double() / denom
                    train_loss = np.mean(losses)

                    tags = ["train_acc", "train_loss", "val_acc", "val_loss", "learning_rate" ]
                    tb_writer.add_scalar(tags[0], train_acc,  epoch)
                    tb_writer.add_scalar(tags[1], train_loss, epoch)
                    tb_writer.add_scalar(tags[2], acc, epoch)
                    tb_writer.add_scalar(tags[3], test_loss, epoch)
                    tb_writer.add_scalar(tags[4], self.optimizer.param_groups[0]['lr'], epoch)


                    if best_acc < acc:
                        best_acc = acc
                        # torch.save(self.net.module.state_dict(), args.model_path+'/pitch_lab2_6ch_'+'fold_'+str(self.args.test_fold)+str(epoch)+'_'+str(self.args.stetho_id)+'.pkl')
                        # torch.save(self.net.module.state_dict(),args.model_path + '/pitch_lab2_6ch_'+'fold_' + str(self.args.test_fold) +'epoch_'+ str(epoch)+'bestAcc_'+str(best_acc) + '.pkl')
                        torch.save(self.net.module.state_dict(), args.model_path + '/lab2_TGT' +'epoch_'+ str(epoch)+'bestAcc_'+str(best_acc)  +'.pkl')
                        print("Best ACC achieved......", best_acc.item())
                    print("BEST ACCURACY TILL NOW", best_acc)

                    train_losses.append(np.mean(losses))  # record this epoch's mean training loss
                    test_losses.append(test_loss)
                    test_acc.append(acc)
            self.exp_lr_scheduler.step()  # one epoch has finished; step the learning-rate scheduler



    def evaluate(self, net, epoch, iteration):

        self.net.eval()
        test_losses = []
        class_hits = [0.0, 0.0, 0.0, 0.0]  # normal, crackle, wheeze, both
        class_counts = [0.0, 0.0, 0.0 + 1e-7, 0.0 + 1e-7]  # normal, crackle, wheeze, both
        running_corrects = 0.0
        denom = 0.0

        classwise_test_losses = [[], [], [], []]
        conf_label, conf_pred = [], []

        # for i, (image, label) in tqdm(enumerate(self.val_data_loader)):  # note: the test set is traversed in its original order
        for i, (image, label) in enumerate(tqdm(self.val_data_loader, desc='testing process ')):
            image, label = image.cuda().float(), label.cuda().float()
            output = self.net(image)

            label = label.long()
            # calculate loss from output
            loss = self.loss_func(output, label)  # weighted cross-entropy over the batch (reduction='mean')
            loss_nored = self.loss_nored(output, label)  # reduction='none': the loss of every individual sample
            test_losses.append(loss.data.cpu().numpy())  # collect the loss of every validation batch

            _, preds = torch.max(output, 1)  # index of the highest-scoring class for each sample in the batch
            running_corrects += torch.sum(preds == label.data)  # number of correct predictions in this batch
            # updating denom
            denom += len(label.data)   # total number of test samples that took part in the prediction

            # per-class statistics
            for idx in range(preds.shape[0]):
                class_counts[label[idx].item()] += 1.0  # count, per true class, how many test samples were evaluated
                conf_label.append(label[idx].item())  # collect the true label of every test sample, in inference order
                conf_pred.append(preds[idx].item())  # collect the predicted label of every test sample, in the same order
                if preds[idx].item() == label[idx].item():  # count, per class, how many samples were predicted correctly
                    class_hits[label[idx].item()] += 1.0
                # loss_nored holds one loss value per sample; collect it under the sample's true class
                classwise_test_losses[label[idx].item()].append(loss_nored[idx].item())

        print("Val Accuracy: {}".format(running_corrects.double() / denom))  # 测试集上,预测正确的比率;
        print("epoch {}, Validation BCE loss: {}".format(epoch, np.mean(test_losses)))
        # print("Classwise_Losses Normal: {}, Crackle: {}, Wheeze: {}, Both: {}".format(np.mean(classwise_test_losses[0]),
        #    np.mean(classwise_test_losses[1]), np.mean(classwise_test_losses[2]), np.mean(classwise_test_losses[3])))
        # get_score(class_hits, class_counts, True)

        # aggregating chunks that share the same id via majority voting
        conf_label = np.array(conf_label)  # list -> ndarray: the true labels of all test samples, in inference order
        conf_pred = np.array(conf_pred)  # list -> ndarray: the predicted labels of all test samples, in the same order
        y_pred, y_true = [], []
        for pt in self.test_paths:   # iterate over every test cycle identifier: '<filename>_<cycle index>_<label>'
            v1 = np.where(self.test_ids == pt)   # indices in self.test_ids whose identifier matches the current cycle pt
            v2 = conf_pred[v1]    # the predicted labels stored at those positions (values 0..3)
            v3 = np.bincount(v2)  # number of votes received by each class
            v4 = np.argmax(v3)    # the class with the most votes becomes the prediction for pt

            y_pred.append(v4)

            # y_pred.append(np.argmax(np.bincount(conf_pred[np.where(self.test_ids == pt)])))  # the same majority vote in one line
            y_true.append(int(pt.split('_')[-1]))  # the true label is encoded at the end of the identifier

        conf_matrix = confusion_matrix(y_true, y_pred)
        acc = accuracy_score(y_true, y_pred)
        print("Confusion Matrix", conf_matrix)
        print("Accuracy Score", acc)
        conf_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
        print("Classwise Scores", conf_matrix.diagonal())
        self.net.train()

        return acc, np.mean(test_losses)
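
# A standalone sketch of the weighted sampling used in Trainer.__init__ (referenced there).
# The labels and class probabilities below are assumed toy values, not the real dataset statistics.
def demo_weighted_sampler():
    from collections import Counter
    labels = [0] * 90 + [1] * 10                       # imbalanced toy labels: 90 "normal", 10 "rare"
    class_probs = {0: 0.9, 1: 0.1}                     # stand-in for the dataset's class_probs
    sample_weights = torch.Tensor([1.0 / class_probs[l] for l in labels])
    sampler = torch.utils.data.WeightedRandomSampler(sample_weights, num_samples=len(labels))
    drawn = [labels[i] for i in sampler]               # draws len(labels) indices, with replacement
    print(Counter(drawn))                              # roughly balanced between the two classes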


if __name__ == "__main__":

    '''
    for test_id in range(0, 5):
        args.test_fold =  test_id
        args.epochs = 30
        args.lr = 3e-3
        args.aug_scale = 1
        args.checkpoint = './models_out/pitch_lab2_6ch_best_acc.pkl'
    '''
    trainer = Trainer()
    trainer.train()


'''
torch.save(self.net.module.state_dict(), args.model_path + '/pitch_lab2_6ch_' +'best_acc'+'.pkl')
python  train_lab3_mask_11  --data_dir ./data/ICBHI_final_database   --folds_file ./data/patient_list_foldwise.txt --model_path models_out --lr 1e-2 --batch_size 8 --num_worker 10 --start_epochs 0 --epochs 150 --test_fold 4 --aug_scale 1
python  train_lab4  --data_dir ./data/ICBHI_final_database   --folds_file ./data/patient_list_foldwise.txt --model_path models_out --lr 1e-2 --batch_size 8 --num_worker 10 --start_epochs 0 --epochs 150 --test_fold 4  
'''


'''
np.where(self.test_ids == pt)
finds the indices where the stored identifiers equal the current file/cycle name;
the two arrays were filled in the same order, so the indices line up one-to-one.

conf_pred: the predicted class labels stored at those index positions.

np.bincount([0, 0, 1]) -> [2, 1]
the values act as class labels: class 0 occurs twice and class 1 occurs once.

np.argmax(array, axis) returns the index of the maximum value of a numpy array;
if the maximum occurs several times, the index of the first occurrence is returned.
'''
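
# A small sketch of the majority-voting aggregation explained above
# (assumed toy identifiers and predictions; not part of the training pipeline).
def demo_majority_vote():
    test_ids = np.array(['rec1_0_1', 'rec1_0_1', 'rec1_0_1', 'rec2_3_0'])  # one entry per predicted chunk
    conf_pred = np.array([1, 2, 1, 0])                                     # per-chunk predicted labels
    y_pred, y_true = [], []
    for pt in ['rec1_0_1', 'rec2_3_0']:                  # unique '<file>_<cycle>_<label>' identifiers
        votes = conf_pred[np.where(test_ids == pt)]      # predictions of all chunks belonging to pt
        y_pred.append(np.argmax(np.bincount(votes)))     # majority vote; ties go to the smaller label
        y_true.append(int(pt.split('_')[-1]))            # the true label is encoded in the identifier
    return y_true, y_pred                                # here: ([1, 0], [1, 0])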


Reposted from blog.csdn.net/chumingqian/article/details/129650993