paddle深度学习基础之模型加载及恢复训练

前言

前面几节,我们从各个方面对模型进行了优化,也实现了将模型保存下来。但是在日常训练工作中我们会遇到一些突发情况,导致训练过程主动或被动的中断。如果训练一个模型需要花费几天的训练时间,中断后从初始状态重新训练是不可接受的。别着急,这一节咱们就是讨论这个事情。

保存模型

看过前面几篇博客的同学,肯定已经知道如何保存模型。这里还需要强调一下,我们不仅可以保存模型的参数,还可以保存优化器的参数。比如我们这次测试代码使用的是动态学习率的优化器,训练的次数不同,学习率也不一样,所以,我们也需要把这个信息存储下来。

# Path prefix shared by the model and optimizer checkpoints.
model_save_path="model/mnist-model/dygraph-mnist"
# Save the model parameters.
fluid.save_dygraph(model.state_dict(),model_save_path)
# Save the optimizer state under the same prefix so that training can be
# resumed with the optimizer exactly where it left off.
fluid.save_dygraph(optimizer.state_dict(),model_save_path)

加载模型

# Read back both state dicts stored under the checkpoint prefix.
params_dict, opt_dict = fluid.load_dygraph(model_save_path)
# Rebuild the network and restore its parameters.
model = MNIST()
model.load_dict(params_dict)
# Rebuild the optimizer over the restored parameters, then restore its state.
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001,parameter_list=model.parameters())
optimizer.set_dict(opt_dict)
  • params_dict :模型的参数

  • opt_dict :优化器的参数

完整代码

import paddle
import numpy as np
import matplotlib.pyplot as plt
import gzip
import json
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.nn import Conv2D,Pool2D
from tb_paddle import SummaryWriter
from PIL import Image
import os
# This script trains a small convolutional network on MNIST, saves the model
# and optimizer state after every epoch, and then resumes training from one
# of the saved checkpoints.

# Decompress and parse the dataset. The archive holds JSON, so it is read with
# ``json`` (pandas would work as well). The context manager closes the file
# handle as soon as parsing is done.
with gzip.open('data/mnist.json.gz') as mnistdata:
    data = json.load(mnistdata)

# Split into the training, validation and test sets.
train_data,val_data,test_data = data
# Image height and width used when reshaping samples.
IMG_ROWS=28
IMG_COLS=28
# Shuffle one dataset split and serve it in mini-batches.
def data_loader(dataname='train',batch_size=20):
    """Build a generator function that yields shuffled mini-batches.

    Args:
        dataname: Split to read: ``'train'``, ``'test'`` or ``'val'``.
        batch_size: Number of samples per batch; must be positive. The last
            batch of a pass may hold fewer samples.

    Returns:
        A zero-argument generator function. Every call reshuffles the split
        and yields ``(images, labels)`` pairs of numpy arrays: images are
        ``float32`` with shape ``[n, 1, IMG_ROWS, IMG_COLS]`` and labels are
        ``int64`` with shape ``[n, 1]``.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``dataname`` is not
            one of the supported splits.
        AssertionError: If the split has a different number of images and
            labels.
    """
    # Reject sizes that could never fill a batch (0 used to fail with a
    # modulo-by-zero deep inside the generator).
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    if dataname == 'train':
        img, label = train_data[0], train_data[1]
    elif dataname == 'test':
        img, label = test_data[0], test_data[1]
    elif dataname == 'val':
        img, label = val_data[0], val_data[1]
    else:
        raise ValueError("data only can be one of ['train','test','val']")

    # Sanity-check the split before building batches from it.
    assert len(img)==len(label),'the lenth of img must be the  same as the length of label'
    indices = [i for i in range(len(img))]

    def data_genergator():
        # Shuffle at the start of every pass so that each epoch sees the
        # samples in a fresh order rather than replaying a single permutation.
        np.random.shuffle(indices)
        batch_imgs = []
        batch_labels = []
        for i in indices:
            # Convert each sample to the layout and dtype the network expects.
            batch_imgs.append(np.reshape(img[i],[1,IMG_ROWS,IMG_COLS]).astype('float32'))
            batch_labels.append(np.reshape(label[i],[1]).astype('int64'))
            if len(batch_imgs) == batch_size:
                yield np.array(batch_imgs),np.array(batch_labels)
                batch_imgs = []
                batch_labels = []
        # Emit the trailing partial batch, if any.
        if batch_imgs:
            yield np.array(batch_imgs),np.array(batch_labels)

    return data_genergator
# Network definition.
class MNIST(fluid.dygraph.Layer):
    """Two conv + max-pool stages followed by a softmax linear classifier."""

    def __init__(self):
        super(MNIST, self).__init__()
        # 5x5 convolutions with padding 2 and stride 1, each followed by a
        # 2x2 max-pool with stride 2.
        self.conv1 = Conv2D(num_channels=1, num_filters=20, filter_size=5, stride=1, padding=2, act='relu')
        self.pool1 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        self.conv2 = Conv2D(num_channels=20, num_filters=20, filter_size=5, stride=1, padding=2, act='relu')
        self.pool2 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        # 980 = 20 channels * 7 * 7, i.e. the flattened pool2 output when the
        # input images are 28x28 (assumed from IMG_ROWS/IMG_COLS).
        self.linear = Linear(input_dim=980, output_dim=10, act='softmax')

    def forward(self, inputs,label=None,check_shape=False,check_content=False):
        """Run the network and optionally compute accuracy.

        Args:
            inputs: Batch of images fed to the first convolution.
            label: Ground-truth labels. When given, accuracy is computed and
                returned as well; when ``None``, only predictions are returned.
            check_shape: If true, print layer hyper-parameters and the shape
                of every intermediate tensor.
            check_content: If true, print a sample of the kernel weights and
                of the intermediate activations.

        Returns:
            ``(outputs, acc)`` when ``label`` is not ``None``, otherwise
            ``outputs`` alone.
        """
        conv1 = self.conv1(inputs)
        pool1 = self.pool1(conv1)
        conv2 = self.conv2(pool1)
        pool2 = self.pool2(conv2)
        # Flatten everything but the batch dimension for the linear layer.
        pool21 = fluid.layers.reshape(pool2, [pool2.shape[0], -1])
        outputs = self.linear(pool21)
        if(check_shape):
            print("\n------------打印各个层设置的网络超参数的尺寸 -------------")
            print("conv1-- kernel_size:{}, padding:{}, stride:{}".format(self.conv1.weight.shape, self.conv1._padding, self.conv1._stride))
            print("conv2-- kernel_size:{}, padding:{}, stride:{}".format(self.conv2.weight.shape, self.conv2._padding, self.conv2._stride))
            print("pool1-- pool_type:{}, pool_size:{}, pool_stride:{}".format(self.pool1._pool_type, self.pool1._pool_size, self.pool1._pool_stride))
            print("pool2-- pool_type:{}, pool_size:{}, pool_stride:{}".format(self.pool2._pool_type, self.pool2._pool_size, self.pool2._pool_stride))
            print("liner-- weight_size:{}, bias_size_{}, activation:{}".format(self.linear.weight.shape, self.linear.bias.shape, self.linear._act))

            print("\n------------打印各个层的形状 -------------")
            print("inputs_shape: {}".format(inputs.shape))
            print("outputs1_shape: {}".format(conv1.shape))
            print("outputs2_shape: {}".format(pool1.shape))
            print("outputs3_shape: {}".format(conv2.shape))
            print("outputs4_shape: {}".format(pool2.shape))
            print("outputs5_shape: {}".format(outputs.shape))

        if check_content:
            # Kernel weights are numerous, so only one slice per layer is shown.
            print("\n########## print convolution layer's kernel ###############")
            print("conv1 params -- kernel weights:", self.conv1.weight[0][0])
            print("conv2 params -- kernel weights:", self.conv2.weight[0][0])

            # Pick one random channel from each convolution's own output.
            idx1 = np.random.randint(0, conv1.shape[1])
            idx2 = np.random.randint(0, conv2.shape[1])
            # Only the first image of the batch is printed.
            print("\nThe {}th channel of conv1 layer: ".format(idx1), conv1[0][idx1])
            print("The {}th channel of conv2 layer: ".format(idx2), conv2[0][idx2])
            print("The output of last layer:", outputs[0], '\n')
        if label is not None:
            acc = fluid.layers.accuracy(input=outputs,label=label)
            return outputs,acc
        else:
            return outputs
# Train from scratch, saving a checkpoint after every epoch.
with fluid.dygraph.guard():
    model = MNIST()
    model.train()
    train_loader = data_loader()
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001,parameter_list=model.parameters())
    place = fluid.CPUPlace()
    traindata_loader = fluid.io.DataLoader.from_generator(capacity=5, return_list=True)
    traindata_loader.set_batch_generator(train_loader, places=place)
    EPOCH_NUM = 3
    # Scalar log writer for loss/accuracy curves.
    data_writer = SummaryWriter(logdir="log/data")
    model_save_path="model/mnist-model/dygraph-mnist"
    # Batch counter that keeps growing across epochs. Logging against
    # batch_id alone would restart at 0 every epoch and overlay the curves.
    global_step = 0
    try:
        for epoch_id in range(EPOCH_NUM):
            for batch_id,data in enumerate(traindata_loader()):
                image_data, label_data = data
                image = fluid.dygraph.to_variable(image_data)
                label = fluid.dygraph.to_variable(label_data)
                predict,acc = model(image,label)
                loss = fluid.layers.cross_entropy(predict,label)
                avg_loss = fluid.layers.mean(loss)
                # Report every 100 batches, skipping the very first one.
                if batch_id !=0 and batch_id %100 ==0:
                    data_writer.add_scalar("train/loss",avg_loss.numpy(),global_step)
                    data_writer.add_scalar("train/accuracy",acc.numpy(),global_step)
                    print("epoch:{},batch:{},loss is:{},acc is :{}".format(epoch_id,batch_id,avg_loss.numpy(),acc.numpy()))
                avg_loss.backward()
                optimizer.minimize(avg_loss)
                model.clear_gradients()
                global_step += 1
            print("保存模型")
            # One checkpoint prefix per epoch; the model and optimizer state
            # are both written under it (save_dygraph is expected to give them
            # distinct file suffixes - see the Paddle docs).
            checkpoint_path = model_save_path+str(epoch_id)
            fluid.save_dygraph(model.state_dict(), checkpoint_path)
            fluid.save_dygraph(optimizer.state_dict(), checkpoint_path)
    finally:
        # Flush and release the log writer even if training is interrupted.
        data_writer.close()
# Resume training from a saved checkpoint.
print("接着训练")
with fluid.dygraph.guard():
    model = MNIST()
    model_save_path="model/mnist-model/dygraph-mnist"
    # Epoch whose checkpoint is restored; training continues with the next
    # epoch, so the suffix and the loop start can never drift apart.
    resume_epoch = 0
    params_dict, opt_dict = fluid.load_dygraph(model_save_path+str(resume_epoch))
    model.load_dict(params_dict)
    # Match the initial training run, which switches to train mode explicitly.
    model.train()
    train_loader = data_loader()
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001,parameter_list=model.parameters())
    # Restore the optimizer state saved with the checkpoint.
    optimizer.set_dict(opt_dict)
    place = fluid.CPUPlace()
    traindata_loader = fluid.io.DataLoader.from_generator(capacity=5, return_list=True)
    traindata_loader.set_batch_generator(train_loader, places=place)
    EPOCH_NUM = 3
    for epoch_id in range(resume_epoch+1,EPOCH_NUM):
        for batch_id,data in enumerate(traindata_loader()):
            image_data, label_data = data
            image = fluid.dygraph.to_variable(image_data)
            label = fluid.dygraph.to_variable(label_data)
            predict,acc = model(image,label)
            loss = fluid.layers.cross_entropy(predict,label)
            avg_loss = fluid.layers.mean(loss)
            # Report every 100 batches, skipping the very first one.
            if batch_id !=0 and batch_id %100 ==0:
                print("epoch:{},batch:{},loss is:{},acc is :{}".format(epoch_id,batch_id,avg_loss.numpy(),acc.numpy()))
            avg_loss.backward()
            optimizer.minimize(avg_loss)
            model.clear_gradients()

总结

截至这篇博客,整个基础系列就总结完了。这些资源都是百度AI Studio提供的免费课程,全程听完并动手实践后,真的收获很多。也很感谢制作这些课程的工作人员,同样希望这一系列基础课程能够给大家带来一些帮助。

发布了87 篇原创文章 · 获赞 76 · 访问量 24万+

猜你喜欢

转载自blog.csdn.net/lzx159951/article/details/105255150