How TensorFlow Loads Data


As a symbolic programming framework, TensorFlow first builds a dataflow graph, then reads in data, and only then runs training. TensorFlow provides the following three ways to load data:

  • Preloading (preloaded data): define constants or variables in the TensorFlow graph that hold all of the data
  • Feeding: let Python generate the data and feed it to the backend at run time
  • Reading from file: read the data from files through an input pipeline (queue runners or tf.data)

Each of the three methods is described in detail below, together with its advantages and drawbacks; the minimal sketch that follows contrasts them first on a toy computation.
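This sketch is only an orientation, not one of the article's examples: the same three numbers flow through the graph in each of the three styles, and from_tensor_slices stands in for a real file-based source such as tf.data.TFRecordDataset.

import tensorflow as tf

# 1) Preloading: the data lives inside the graph itself.
x1 = tf.constant([1.0,2.0,3.0])

# 2) Feeding: the graph only holds a placeholder; the data arrives at run time.
x2 = tf.placeholder(tf.float32,shape=[3])

# 3) Reading from file: the graph holds an input pipeline; from_tensor_slices
#    is used here as a stand-in for a real file reader.
dataset = tf.data.Dataset.from_tensor_slices([1.0,2.0,3.0]).batch(3)
x3 = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    print(sess.run(x1))                               # preloaded
    print(sess.run(x2,feed_dict={x2:[1.0,2.0,3.0]}))  # fed
    print(sess.run(x3))                               # read by the pipeline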

I. Preloading data

Below are two examples that use preloaded data in TensorFlow for handwritten-digit (MNIST) recognition.

1. Preloading with constants

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.examples.tutorials.mnist import mnist
import time


class Config(object):
    # directory that holds the training data
    train_dir = "data/MNIST_data"
    # whether to use fake_data for unit testing
    fake_data = False
    # batch size
    batch_size = 100
    # learning rate
    learning_rate = 0.01
    # number of epochs to train for
    num_epochs = 2
    # number of units in the first hidden layer
    hidden1 = 128
    # number of units in the second hidden layer
    hidden2 = 128
    # print a result every this many steps
    print_step = 100
    # save a checkpoint every this many steps
    save_checkpoint_step = 1000
    # directory where checkpoints are saved
    save_model_path = "model/"
    # directory where TensorBoard logs are saved
    save_tensorboard_path = "log/"

# network configuration
config = Config()
'''
Train the model
'''
def run_training():
    # load the data set
    data_set = input_data.read_data_sets(config.train_dir)
    # set the default graph
    with tf.Graph().as_default():
        with tf.name_scope("input"):
            # hold the entire training set in graph constants
            input_images = tf.constant(data_set.train.images)
            input_labels = tf.constant(data_set.train.labels)
        # produce one (image, label) example at a time, for num_epochs epochs
        image,label = tf.train.slice_input_producer([input_images,input_labels],
                                                    num_epochs=config.num_epochs)
        # convert the label dtype
        label = tf.cast(label,tf.int32)
        # assemble examples into mini-batches
        images,labels = tf.train.batch([image,label],batch_size=config.batch_size)
        # build the graph that computes predictions
        logits = mnist.inference(images,config.hidden1,config.hidden2)
        # compute the loss
        loss = mnist.loss(logits,labels)
        # training op: gradient descent
        train_op = mnist.training(loss,config.learning_rate)
        # op that counts correct predictions
        eval_correct = mnist.evaluation(logits,labels)
        # merge all summaries for TensorBoard
        summary_op = tf.summary.merge_all()
        # saver for checkpoints
        saver = tf.train.Saver()
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        # create the session
        sess = tf.Session()
        # initialize the variables
        sess.run(init_op)
        # write the graph for TensorBoard
        summary_writer = tf.summary.FileWriter(config.save_tensorboard_path,sess.graph)

        # start the enqueueing threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess,coord=coord)

        # start training
        try:
            step = 0
            # check whether the threads have been asked to stop
            while not coord.should_stop():
                # record the start time
                start_time = time.time()
                # run one training step and fetch the loss
                _,loss_value = sess.run([train_op,loss])
                # time taken by this step
                duration = time.time() - start_time
                if step % config.print_step == 0:
                    print("step:%d,loss:%.3f,consume time:%.3f sec"%(step,loss_value,duration))
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str,step)
                # save a checkpoint
                if (step+1)%config.save_checkpoint_step == 0:
                    saver.save(sess,config.save_model_path,global_step=step)
                step += 1
        except tf.errors.OutOfRangeError:
            saver.save(sess,config.save_model_path,global_step=step)
        finally:
            # ask the threads to stop
            coord.request_stop()
        # wait for the threads to finish
        coord.join(threads)
        sess.close()

if __name__ == "__main__":
    run_training()

2. Preloading with variables

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.examples.tutorials.mnist import mnist
import time

'''
Network configuration
'''
class Config(object):
    # directory that holds the training data
    train_dir = "data/MNIST_data"
    # whether to use fake_data for unit testing
    fake_data = False
    # batch size
    batch_size = 100
    # learning rate
    learning_rate = 0.01
    # number of epochs to train for
    num_epochs = 2
    # number of units in the first hidden layer
    hidden1 = 128
    # number of units in the second hidden layer
    hidden2 = 128
    # print a result every this many steps
    print_step = 100
    # save a checkpoint every this many steps
    save_checkpoint_step = 1000
    # directory where checkpoints are saved
    save_model_path = "model/"
    # directory where TensorBoard logs are saved
    save_tensorboard_path = "log/"

# network configuration
config = Config()


def run_training():
    # load the data set
    data_sets = input_data.read_data_sets(config.train_dir,config.fake_data)
    # set the default graph
    with tf.Graph().as_default():
        with tf.name_scope("input"):
            # placeholders used only to initialize the data variables
            images_initializer = tf.placeholder(dtype=data_sets.train.images.dtype,
                                                shape=data_sets.train.images.shape)
            labels_initializer = tf.placeholder(dtype=data_sets.train.labels.dtype,
                                                shape=data_sets.train.labels.shape)
            # collections=[] keeps these variables out of GraphKeys.GLOBAL_VARIABLES,
            # so the Saver does not checkpoint the raw data
            input_images = tf.Variable(images_initializer,trainable=False,collections=[])
            input_labels = tf.Variable(labels_initializer,trainable=False,collections=[])
            image,label = tf.train.slice_input_producer([input_images,input_labels],num_epochs=config.num_epochs)
            label = tf.cast(label,tf.int32)
            images,labels = tf.train.batch([image,label],batch_size=config.batch_size)
        # build the network
        logits = mnist.inference(images,config.hidden1,config.hidden2)
        # compute the loss
        loss = mnist.loss(logits,labels)
        # minimize the loss with gradient descent
        train_op = mnist.training(loss,config.learning_rate)
        # op that counts correct predictions
        eval_correct = mnist.evaluation(logits,labels)
        # merge all summaries for TensorBoard
        summary_op = tf.summary.merge_all()
        # saver for checkpoints
        saver = tf.train.Saver()
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        # create the session
        sess = tf.Session()
        # initialize the variables
        sess.run(init_op)
        # feed the data once to initialize the data variables
        sess.run(input_images.initializer,feed_dict={images_initializer:data_sets.train.images})
        sess.run(input_labels.initializer,feed_dict={labels_initializer:data_sets.train.labels})
        # write the graph for TensorBoard
        summary_writer = tf.summary.FileWriter(config.save_tensorboard_path,sess.graph)
        # start the enqueueing threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess,coord=coord)
        # start training
        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()
                _,loss_value = sess.run([train_op,loss])
                # time taken by this step
                duration = time.time() - start_time
                if step % config.print_step == 0:
                    print("step:%s,loss:%.3f,consume time:%.3f sec"%(step,loss_value,duration))
                    # write the summaries to TensorBoard
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str,step)

                # save a checkpoint
                if (step+1)%config.save_checkpoint_step == 0:
                    saver.save(sess,config.save_model_path,global_step=step)
                step += 1
        except tf.errors.OutOfRangeError:
            saver.save(sess,config.save_model_path,global_step=step)
        finally:
            # ask the threads to stop
            coord.request_stop()
        # wait for the threads to finish
        coord.join(threads)
        sess.close()

if __name__ == "__main__":
    run_training()

The drawback of preloading is that the data is embedded directly into the dataflow graph: constants are serialized into the GraphDef, whose protobuf representation is capped at 2 GB, so a large training set bloats the graph and consumes a great deal of memory. The variable variant keeps the data out of the GraphDef by feeding it once at initialization time, but both variants still hold the entire data set in memory.
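To see the first point concretely, the following small check (an illustration using MNIST-sized dimensions, not code from the examples above) measures how much a single preloaded constant inflates the serialized graph:

import numpy as np
import tensorflow as tf

g = tf.Graph()
with g.as_default():
    # the whole array is serialized into the graph definition
    tf.constant(np.zeros((55000,784),dtype=np.float32))

# prints roughly 55000*784*4 bytes, i.e. about 165 MB for MNIST alone
print(g.as_graph_def().ByteSize())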

II. Feeding

Feeding uses the feed_dict argument of sess.run() to hand data generated in Python to the backend.

import tensorflow as tf

if __name__ == "__main__":
    # define the inputs
    a = tf.placeholder(tf.float32)
    b = tf.placeholder(tf.float32)
    # define the op
    c = tf.add(a,b)
    # generate the data in Python
    x1 = [1.0,2.0,3.0]
    x2 = [5.0,6.0,7.0]
    # create the session
    with tf.Session() as sess:
        print(sess.run(c,feed_dict={a:x1,b:x2}))
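The toy example above feeds everything at once; in practice feeding is done per mini-batch. Below is a minimal sketch of that pattern on MNIST; the tiny linear classifier exists only so that there is something to train and is not the mnist.inference model from the earlier examples:

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

data_sets = input_data.read_data_sets("data/MNIST_data")

# the graph carries no data, only entry points for it
images_pl = tf.placeholder(tf.float32,shape=[None,784])
labels_pl = tf.placeholder(tf.int32,shape=[None])

# a deliberately tiny linear classifier, just to have something to train
logits = tf.layers.dense(images_pl,10)
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=labels_pl,logits=logits))
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
        # every step copies a fresh NumPy batch from Python into the runtime
        batch_images,batch_labels = data_sets.train.next_batch(100)
        _,loss_value = sess.run([train_op,loss],
                                feed_dict={images_pl:batch_images,
                                           labels_pl:batch_labels})
        if step % 100 == 0:
            print("step:%d,loss:%.3f"%(step,loss_value))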

Feeding shares the drawback of high memory consumption on large data sets, and the intermediate steps, such as data-type conversion and the Python-to-runtime copy, add further overhead. In that case the third loading method is the better choice: define the file-reading ops in the graph, let TensorFlow read the data from files itself, and decode the records into training data.

III. Reading data from files

Reading data from files consists of two main steps:

1. Write the samples into a binary TFRecords file

import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets import mnist
import os

# directory where the TFRecords files are saved
TFRecords_dir = "tfRecords/"
# directory that holds the raw data
data_dir = "data/MNIST_data"

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

'''
Convert a data set into a TFRecords file
'''
def convert_to(data_sets,name):
    images = data_sets.images
    labels = data_sets.labels
    num_examples = data_sets.num_examples
    if images.shape[0] != num_examples:
        raise ValueError("images size %d does not match label size %d."
                         %(images.shape[0],num_examples))
    # image height
    rows = images.shape[1]
    # image width
    cols = images.shape[2]
    # image depth (number of channels)
    depth = images.shape[3]
    # path of the TFRecords file to write
    save_TFRecords_path = os.path.join(TFRecords_dir,name+".tfrecords")
    # write the data into the tfrecords file
    with tf.python_io.TFRecordWriter(save_TFRecords_path) as writer:
        for index in range(num_examples):
            # serialize the image data into a byte string
            image_raw = images[index].tostring()
            # write the data into a protocol buffer: height, width and depth
            # as well as the label are encoded as int64, the image as raw bytes
            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                    "height":_int64_feature(rows),
                    "width":_int64_feature(cols),
                    "depth":_int64_feature(depth),
                    "label":_int64_feature(int(labels[index])),
                    "image_raw":_bytes_feature(image_raw)
                }))
            # serialize the protocol buffer to a string and write it to the file
            writer.write(example.SerializeToString())

if __name__ == "__main__":
    # load the data as uint8 images with shape (N,28,28,1)
    data_sets = mnist.read_data_sets(data_dir,dtype=tf.uint8,reshape=False,validation_size=5000)
    # make sure the output directory exists before writing
    if not os.path.exists(TFRecords_dir):
        os.makedirs(TFRecords_dir)
    # convert the training set into a tfrecords file
    convert_to(data_sets.train,"train")
    # convert the validation set into a tfrecords file
    convert_to(data_sets.validation,"validation")
    # convert the test set into a tfrecords file
    convert_to(data_sets.test,"test")
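Before building the training pipeline it can be worth sanity-checking the file that was just written. A quick sketch, assuming train.tfrecords was produced by the script above:

import tensorflow as tf

# read the first serialized record back and inspect its fields
record = next(tf.python_io.tf_record_iterator("tfRecords/train.tfrecords"))
example = tf.train.Example()
example.ParseFromString(record)
feature = example.features.feature
print(feature["height"].int64_list.value[0],   # 28
      feature["width"].int64_list.value[0],    # 28
      feature["depth"].int64_list.value[0],    # 1
      feature["label"].int64_list.value[0])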

2. Read the records back and decode them into training data (this example builds the input pipeline with the tf.data API)

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import mnist
import os,time

# paths of the TFRecords files
train_tfrecords_path = "tfRecords/train.tfrecords"
test_tfrecords_path = "tfRecords/test.tfrecords"
validation_tfrecords_path = "tfRecords/validation.tfrecords"

'''
Parse a serialized example and decode the image
'''
def decode(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
        })
    image = tf.decode_raw(features['image_raw'], tf.uint8)
    image.set_shape((mnist.IMAGE_PIXELS,))
    label = tf.cast(features['label'], tf.int32)
    return image, label

'''
Placeholder for data augmentation (currently a no-op)
'''
def augment(image, label):
    return image, label

'''
Normalize the image pixels
'''
def normalize(image, label):
    # map the pixel values from [0,255] to [-0.5,0.5]
    image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
    return image, label

'''
Build the input pipeline
'''
def inputs(tfrecords_path,batch_size,num_epochs):
    if not num_epochs:
        num_epochs = None
    with tf.name_scope("input"):
        # read the tfrecords file
        dataset = tf.data.TFRecordDataset(tfrecords_path)
        # decode the tfrecords data
        dataset = dataset.map(decode)
        dataset = dataset.map(augment)
        dataset = dataset.map(normalize)
        # shuffle the order of the examples
        dataset = dataset.shuffle(1000 + 3 * batch_size)
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

'''
Network configuration
'''
class Config(object):
    # directory that holds the training data
    train_dir = "data/MNIST_data"
    # whether to use fake_data for unit testing
    fake_data = False
    # batch size
    batch_size = 100
    # learning rate
    learning_rate = 0.01
    # number of epochs to train for
    num_epochs = 2
    # number of units in the first hidden layer
    hidden1 = 128
    # number of units in the second hidden layer
    hidden2 = 128
    # print a result every this many steps
    print_step = 100
    # save a checkpoint every this many steps
    save_checkpoint_step = 1000
    # directory where checkpoints are saved
    save_model_path = "model/"
    # directory where TensorBoard logs are saved
    save_tensorboard_path = "log/"

# network configuration
config = Config()

def run_training():
    with tf.Graph().as_default():
        # get a batch of images and labels from the input pipeline
        image_batch,label_batch = inputs(train_tfrecords_path,batch_size=config.batch_size,num_epochs=config.num_epochs)
        # build the network
        logits = mnist.inference(image_batch,config.hidden1,config.hidden2)
        # compute the loss
        loss = mnist.loss(logits,label_batch)
        # training op: gradient descent
        train_op = mnist.training(loss,config.learning_rate)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        with tf.Session() as sess:
            sess.run(init_op)
            try:
                step = 0
                # the iterator raises OutOfRangeError after num_epochs epochs
                while True:
                    start_time = time.time()
                    _,loss_value = sess.run([train_op,loss])
                    duration_time = time.time() - start_time
                    if step % config.print_step == 0:
                        print("step:%s,loss:%.3f,consume time:%.3f sec"%(step,loss_value,duration_time))
                    step += 1
            except tf.errors.OutOfRangeError:
                print("Done training for %d epochs,%d steps."%(config.num_epochs,step))

if __name__ == "__main__":
    run_training()
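Because inputs() is parameterized by the file path, the same pipeline can also drive evaluation. Below is a minimal sketch that reuses inputs(), mnist.evaluation and the paths defined above; it assumes a checkpoint was saved under model/ by a tf.train.Saver during training (which the training loop above, unlike the earlier examples, does not do by itself):

def run_evaluation():
    with tf.Graph().as_default():
        # one pass over the validation set, using the same pipeline
        image_batch,label_batch = inputs(validation_tfrecords_path,
                                         batch_size=config.batch_size,num_epochs=1)
        logits = mnist.inference(image_batch,config.hidden1,config.hidden2)
        # number of correct predictions in the current batch
        eval_correct = mnist.evaluation(logits,label_batch)
        # actual size of the current batch (the last one may be smaller)
        batch_count = tf.size(label_batch)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            # restore the trained weights (assumes checkpoints under model/)
            saver.restore(sess,tf.train.latest_checkpoint("model/"))
            correct,total = 0,0
            try:
                while True:
                    c,n = sess.run([eval_correct,batch_count])
                    correct += c
                    total += n
            except tf.errors.OutOfRangeError:
                print("validation accuracy: %.3f"%(float(correct)/total))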
