vs2017 VGG16处理cifar-10数据集的PyTorch实现

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_36556893/article/details/86608963

这是针对博客《vs2017安装和使用教程(详细)》的VGG16-CIFAR10新建项目示例


目录

一、说明

二、代码

三、结果

四、注意事项


一、说明

1.网络框架搭建教程请参看博主博客:PyTorch 入门实战(四)——利用Torch.nn构建卷积神经网络

2.这里主要展示博主的代码运行结果,希望可以帮助到正在学习PyTorch的人们

二、代码

1.nn_module_sample.py里面是VGG-16(带有BatchNorm层)的网络,注意classifier分类器部分(全连接部分)的输入大小由输入图像的尺寸决定(与batch大小无关):CIFAR-10的32×32图像经过5次2×2池化后特征图为512×1×1,所以第一个全连接层的输入大小是512

import torch.nn as nn

class VGG16(nn.Module):
    """VGG-16 with batch normalisation, sized for 32x32 RGB inputs.

    ``features`` holds 13 Conv-BatchNorm-ReLU groups arranged in five stages,
    each stage closed by a 2x2 max-pool.  ``classifier`` is the three-layer
    fully connected head.  The head's first layer takes 512 inputs, so the
    flattened feature map must hold 512 values per sample; a 32x32 image meets
    this because the five stride-2 pools shrink it to 1x1 with 512 channels.

    Args:
        num_classes: Width of the final linear layer (number of class scores).
    """

    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()

        # Output channels of every conv layer, in order; 'M' marks a max-pool.
        layout = (
            64, 64, 'M',
            128, 128, 'M',
            256, 256, 256, 'M',
            512, 512, 512, 'M',
            512, 512, 512, 'M',
        )

        stack = []
        in_ch = 3  # the first conv consumes a 3-channel image
        for item in layout:
            if item == 'M':
                stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            # 3x3 conv with padding 1 keeps the spatial size unchanged.
            stack += [
                nn.Conv2d(in_ch, item, kernel_size=3, padding=1),
                nn.BatchNorm2d(item),
                nn.ReLU(True),
            ]
            in_ch = item
        # 1x1 average pool with stride 1: kept so the module list (and its
        # indices) stays the same, although it does not alter the values.
        stack.append(nn.AvgPool2d(kernel_size=1, stride=1))
        self.features = nn.Sequential(*stack)

        hidden = 4096
        head = [
            nn.Linear(in_ch, hidden),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(hidden, hidden),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(hidden, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Run the conv stack, flatten per sample, and return the class scores."""
        feature_map = self.features(x)
        flattened = feature_map.view(feature_map.size(0), -1)
        return self.classifier(flattened)


class testNet(nn.Module):
    """Small three-stage CNN used as a template for a custom network.

    The three stages are built in three different ways (plain attributes,
    ``add_module`` on an empty ``Sequential``, and a ``Sequential`` literal);
    the attribute names are kept as they are because they determine the
    ``state_dict`` keys.

    The conv stages reduce the spatial size by 4 and end with 128 channels.
    The classifier expects exactly 128 features per sample, so the feature map
    is averaged down to 1x1 before flattening.  Without that step only 4x4
    inputs matched the classifier; 32x32 CIFAR-10 images produced 8192
    features and failed in the first linear layer.

    Args:
        num_classes: Width of the final linear layer (number of class scores).
    """

    def __init__(self, num_classes=10):
        super(testNet, self).__init__()
        # Stage 1: layers registered one by one as attributes.
        self.conv1 = nn.Conv2d(3,64,kernel_size=3,padding=1)
        self.BN1 = nn.BatchNorm2d(64)
        self.relu1 = nn.ReLU(True)
        self.pool1 = nn.MaxPool2d(kernel_size=2,stride=2)

        # Stage 2: a Sequential filled through add_module with explicit names.
        layer2 = nn.Sequential()
        layer2.add_module('conv2', nn.Conv2d(64,64,kernel_size=3,padding=1))
        layer2.add_module('BN2',nn.BatchNorm2d(64))
        layer2.add_module('relu2',nn.ReLU(True))
        layer2.add_module('pool2',nn.MaxPool2d(kernel_size=2,stride=2))
        self.layer2 = layer2

        # Stage 3: a Sequential built from positional modules (no pooling).
        self.layer3 = nn.Sequential(
            nn.Conv2d(64,128,kernel_size=3,padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            )

        # Parameter-free global average pool: collapses any HxW map to 1x1 so
        # the flattened size is always 128.  It is the identity on a 1x1 map
        # and adds no entries to the state_dict.
        self.global_pool = nn.AdaptiveAvgPool2d(1)

        self.classifier = nn.Sequential(
            nn.Linear(128,256),
            nn.ReLU(True),
            nn.Dropout(),

            nn.Linear(256, 256),
            nn.ReLU(True),
            nn.Dropout(),

            nn.Linear(256,num_classes),
            )

    def forward(self,x):
        """Return class scores of shape (batch, num_classes) for images ``x``."""
        out = self.conv1(x)
        out = self.BN1(out)
        out = self.relu1(out)
        out = self.pool1(out)

        out = self.layer2(out)
        out = self.layer3(out)

        # Reduce to (batch, 128, 1, 1) so the classifier input size does not
        # depend on the image resolution.
        out = self.global_pool(out)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

if __name__ == '__main__':
    # Quick self-check when the file is run directly: build the network on the
    # best available device and print its layer structure.
    import torch

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    net = VGG16()
    net = net.to(device)
    print(net)

2.train.py:包含参数设定、图像预处理、数据集读取、网络创建、损失和优化、训练和测试部分

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

import os
import argparse

from tensorboardX import SummaryWriter

from nn_module_sample import VGG16
from torch.autograd import Variable

def _str2bool(value):
    """Convert a command-line token such as ``true``/``false`` to a bool.

    argparse hands every command-line value over as a string, and any
    non-empty string (including ``"False"``) is truthy, so boolean options
    need an explicit converter.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


# Command-line options.
# Every option carries an explicit ``type`` so that values given on the
# command line have the same Python type as the defaults (without it they
# arrive as strings and break the optimizer, range() and truth tests).
parser = argparse.ArgumentParser(description='cifar10')
parser.add_argument('--lr', type=float, default=1e-2,help='learning rate')
# NOTE: the batch size is not configurable here; it is hard-coded where the
# DataLoaders are created.
parser.add_argument('--epoch', type=int, default=15,help='time for ergodic')
parser.add_argument('--pre_epoch', type=int, default=0,help='begin epoch')
# Folder that receives the per-epoch model files.
parser.add_argument('--outf', type=str, default='./model/', help='folder to output images and model checkpoints')
# Whether to resume from the saved checkpoint instead of training from scratch.
parser.add_argument('--pre_model', type=_str2bool, default=True,help='use pre-model')
args = parser.parse_args()

# Device selection: run on CUDA when it is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Per-channel mean/std handed to Normalize in both pipelines.  The original
# author also left an alternative pair commented out:
# (0.4914, 0.4822, 0.4465) / (0.2023, 0.1994, 0.2010).
norm_mean = (0.485, 0.456, 0.406)
norm_std = (0.229, 0.224, 0.225)

# Training pipeline: augmentation first, then tensor conversion + normalisation.
train_steps = [
    # Pad 4 pixels on every side, then take a random 32x32 crop.
    transforms.RandomCrop(32, padding=4),
    # Mirror the image horizontally half of the time.
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
transform_train = transforms.Compose(train_steps)

# Test pipeline: no augmentation, same tensor conversion + normalisation.
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
transform_test = transforms.Compose(test_steps)

# CIFAR-10 is downloaded into ./data on first use.
data_root = './data'
trainset = torchvision.datasets.CIFAR10(
    root=data_root, train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=0)

testset = torchvision.datasets.CIFAR10(
    root=data_root, train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=0)

# Human-readable CIFAR-10 label names.
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Model: VGG16 placed on the selected device.
net = VGG16()
net = net.to(device)

# Loss: cross entropy over the class scores.
criterion = nn.CrossEntropyLoss()
# Optimiser: mini-batch SGD with momentum; weight_decay adds L2 regularisation.
optimizer = optim.SGD(
    net.parameters(),
    lr=args.lr,
    momentum=0.9,
    weight_decay=5e-4,
)

# Optionally resume from the checkpoint written by a previous run.
if args.pre_model:
    print("Resume from checkpoint...")
    ckpt_path = './checkpoint/ckpt.t7'
    # Explicit check rather than `assert` (asserts are stripped under
    # `python -O`), and on the file itself rather than only its folder.
    if not os.path.isfile(ckpt_path):
        raise FileNotFoundError('Error: no checkpoint found at %s' % ckpt_path)
    # map_location remaps stored tensors onto the current device, so a
    # checkpoint saved on a GPU can be restored on a CPU-only machine.
    state = torch.load(ckpt_path, map_location=device)
    net.load_state_dict(state['state_dict'])
    # The stored accuracy may be a 0-dim tensor; keep plain Python numbers so
    # later comparisons and logging do not depend on a device.
    best_test_acc = float(state['acc'])
    pre_epoch = int(state['epoch'])
else:
    # Start from scratch: no best test accuracy recorded yet.
    best_test_acc = 0
    # int() guards against the value arriving as a string from the command line.
    pre_epoch = int(args.pre_epoch)
# Training / evaluation loop (runs only when the file is executed as a script).
if __name__ == "__main__":

    # Command-line values may arrive as strings; range() needs integers.
    first_epoch = int(pre_epoch)
    last_epoch = int(args.epoch)
    # Number of batches per epoch, used for the running iteration counter.
    steps_per_epoch = len(trainloader)
    # Number of epochs finished so far (counting resumed ones); reported at the end.
    epochs_done = first_epoch

    # torch.save() does not create missing folders, so make sure the folder
    # for the per-epoch model files exists before training starts.
    os.makedirs(args.outf, exist_ok=True)

    writer = SummaryWriter(log_dir='./log')
    print("Start Training, VGG-16...")
    # The architecture does not change between epochs, so print it once.
    print(net)
    try:
        with open("acc.txt", "w") as acc_f, open("log.txt", "w") as log_f:
            for epoch in range(first_epoch, last_epoch):
                print('\nEpoch: %d' % (epoch + 1))

                # ---- training pass ----
                net.train()
                sum_loss = 0.0   # accumulated batch losses
                correct = 0      # correctly classified training samples
                total = 0        # training samples seen

                for i, (inputs, labels) in enumerate(trainloader):
                    inputs, labels = inputs.to(device), labels.to(device)

                    # Gradients accumulate across backward() calls, so clear
                    # them before every batch.
                    optimizer.zero_grad()
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    # Running statistics, kept as plain Python numbers.
                    sum_loss += loss.item()
                    # Index of the highest score in each row = predicted class.
                    _, predicted = torch.max(outputs.detach(), 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                    step = i + 1 + epoch * steps_per_epoch
                    mean_loss = sum_loss / (i + 1)
                    train_acc = 100. * correct / total

                    print('[epoch:%d, iter:%d] Loss: %.03f | Acc: %.3f%% '
                          % (epoch + 1, step, mean_loss, train_acc))

                    # Mirror the progress line into the log file.
                    log_f.write('[epoch:%d, iter:%d] |Loss: %.03f | Acc: %.3f%% '
                                % (epoch + 1, step, mean_loss, train_acc))
                    log_f.write('\n')
                    log_f.flush()

                # Per-epoch training curves for TensorBoard; max() avoids a
                # division by zero if the loader yielded nothing.
                writer.add_scalar('loss/train', sum_loss / max(steps_per_epoch, 1), epoch)
                writer.add_scalar('accuracy/train', 100. * correct / max(total, 1), epoch)

                # ---- evaluation pass ----
                print("Waiting for test...")
                net.eval()  # switch once per pass, not once per batch
                correct = 0
                total = 0
                # No gradients are needed while measuring accuracy.
                with torch.no_grad():
                    for images, labels in testloader:
                        images, labels = images.to(device), labels.to(device)
                        outputs = net(images)
                        _, predicted = torch.max(outputs, 1)
                        total += labels.size(0)
                        # .item() yields a Python int, so the percentage below
                        # is a true float and is not tied to any device.
                        correct += (predicted == labels).sum().item()

                acc = 100. * correct / max(total, 1)
                print('测试准确率为: %.3f%%' % acc)
                writer.add_scalar('accuracy/test', acc, epoch)

                # Save this epoch's weights; %03d gives net_001.pth rather
                # than a space-padded file name.
                print('Saving model...')
                torch.save(net.state_dict(),
                           os.path.join(args.outf, 'net_%03d.pth' % (epoch + 1)))
                acc_f.write("epoch = %03d, accuracy = %.3f%%" % (epoch + 1, acc))
                acc_f.write('\n')
                acc_f.flush()

                # Keep a separate checkpoint of the best model seen so far.
                if acc > best_test_acc:
                    print('Saving Best Model...')
                    state = {
                        'state_dict': net.state_dict(),
                        'acc': acc,
                        'epoch': epoch + 1,
                    }
                    os.makedirs('checkpoint', exist_ok=True)
                    torch.save(state, './checkpoint/ckpt.t7')
                    best_test_acc = acc
                    writer.add_scalar('best_accuracy/test', best_test_acc, epoch)

                epochs_done = epoch + 1

        # Uses a counter instead of the loop variable, which is undefined when
        # no epoch runs and is one less than the number of epochs otherwise.
        print("Training Finished, Total Epoch = %d" % epochs_done)
    finally:
        # Flush and close the TensorBoard writer even if training is interrupted.
        writer.close()



三、结果

1.打开cmd或者是Anaconda Prompt输入指令,找到你的log目录

tensorboard --logdir 你的文件夹目录/log

例如博主的是这样的

                          

然后打开最后一行的网址http://DESKTOP-xxxxxx:6006(这里每个电脑是不一样的),例如博主的是这样的

最终训练准确率89%左右,测试准确率87%左右~

2.在训练过程中还会生成data、model、checkpoint文件夹

                          

四、注意事项

1.代码里参数设置部分pre_model是用来继续训练的,读取的是之前训练中测试准确率最高时保存的checkpoint(./checkpoint/ckpt.t7),设置为True即可继续训练,否则从头开始训练;注意它的默认值是True,第一次训练时还没有checkpoint,需要先把代码里的默认值改为False

2.代码里参数设置部分的lr(学习率):如果在训练过程中准确率变化缓慢,可以适当减小

3.注意:没有gpu时下面这段代码会自动把device设为cpu,xx.to(device)在cpu上同样可以正常运行,所以一般不需要修改代码;如果仍然想手动关闭gpu,可以在代码里注销这个部分

#使用gpu
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

并且所有的xx.to(device)都需要删除;

或者不注销上面的gpu使用,在每一个xx.to(device)之前加一句话

if use_cuda:

例如:

#模型定义 VGG16
if use_cuda:
    net = VGG16().to(device)
else:
    net = VGG16()

返回至原博客:vs2017安装和使用教程(详细)

猜你喜欢

转载自blog.csdn.net/qq_36556893/article/details/86608963