DL学习笔记-TFRecord输入数据格式

一、TFRecord录入格式转换

TFRecord的录入格式是确定的：每个属性只能是整数列表、实数列表或二进制（字符串）列表。

# Build an integer-typed feature
def _int64_feature(value):
    """Wrap a single integer in a tf.train.Feature backed by an Int64List."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)
# Build a byte-string-typed feature
def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

二、TFRecord文件写入

将MNIST数据的相关信息以每张图片为单位写入同一个TFRecord文件中。

import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import os

# Build an integer-typed feature
def _int64_feature(value):
    """Wrap a single integer in a tf.train.Feature backed by an Int64List."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)
# Build a byte-string-typed feature
def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

# Load MNIST; the call requests uint8 images and one-hot encoded labels.
mnist = input_data.read_data_sets('F:/Python/files/tydstudy/tensorflowstudy/MNIST_data', dtype=tf.uint8, one_hot=True)
images = mnist.train.images
# Ground-truth answers for the training data; stored as a feature in the TFRecord
labels = mnist.train.labels
# Image resolution of the training data; also stored as a feature
pixels = images.shape[1]  #784
# Number of images
num_examples = mnist.train.num_examples  #55000

# Path of the output TFRecord file
filename = 'file/mnist_output.tfrecords'

# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs('file/', exist_ok=True)

# Create a writer for the TFRecord file; the context manager closes it even
# if building or writing an example raises part-way through the loop.
with tf.python_io.TFRecordWriter(filename) as writer:
    for index in range(num_examples):
        # Serialize the image array to raw bytes
        # (tobytes replaces the deprecated ndarray.tostring alias).
        image_raw = images[index].tobytes()
        # Pack all information about one sample into an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature={
            'pixels': _int64_feature(pixels),
            # argmax turns the one-hot label row into its class index
            'label': _int64_feature(np.argmax(labels[index])),
            'image_raw': _bytes_feature(image_raw)
        }))

        # Write the serialized Example to the TFRecord file
        writer.write(example.SerializeToString())

当数据量较大时，也可以将数据写入多个TFRecord文件。

import tensorflow as tf

# Number of files to write (with large datasets, writing several files can speed things up)
num_shards = 2
# Number of examples to put into each file
instances_per_shard = 2

def _int64_feature(value):
    """Wrap a single integer in a tf.train.Feature backed by an Int64List."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)
def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


# Write num_shards files, each holding instances_per_shard examples.
# NOTE(review): the 'file/' directory is not created here; it must already exist.
for i in range(num_shards):
    # File name pattern: data.tfrecords-<shard index>-of-<shard count>
    filename = 'file/data.tfrecords-{}-of-{}'.format(i, num_shards)
    # The context manager closes the writer even if writing an example raises.
    with tf.python_io.TFRecordWriter(filename) as writer:
        for j in range(instances_per_shard):
            # Each example records its shard index (i), its position in the
            # shard (j), and a fixed three-byte payload.
            example = tf.train.Example(features=tf.train.Features(feature={
                'i': _int64_feature(i),
                'j': _int64_feature(j),
                'list': _bytes_feature(bytes([1, 2, 3]))
            }))
            writer.write(example.SerializeToString())

三、TFRecord文件读取

import tensorflow as tf

# Create a reader that pulls serialized examples out of TFRecord files
reader = tf.TFRecordReader()

# Create a queue that maintains the list of input files
filename_queue = tf.train.string_input_producer(['file/mnist_output.tfrecords'])

# reader.read(filename_queue) would read a single example; read_up_to is
# used here to read up to 10 examples per call.
_, serialized_example = reader.read_up_to(filename_queue, 10)

# Several examples are parsed at once, so parse_example is used; it must match
# the read call above (tf.parse_single_example is the single-example variant).
features = tf.parse_example(serialized_example,
                            features={
                                'image_raw': tf.FixedLenFeature([], tf.string),
                                'pixels': tf.FixedLenFeature([], tf.int64),
                                'label': tf.FixedLenFeature([], tf.int64)
                            })

# Decode the raw byte strings as uint8 values
images = tf.decode_raw(features['image_raw'], tf.uint8)
labels = tf.cast(features['label'], tf.int32)
pixels = tf.cast(features['pixels'], tf.int32)

# Read in batches
batch_size = 2
capacity = 1000 + 3 * batch_size  # queue capacity

# Declare static shapes for the 10-example chunks before batching.
images.set_shape([10, 784])
labels.set_shape(10)
pixels.set_shape(10)
# enqueue_many is left at its default, so each 10-example chunk is enqueued as
# one element and a batch has shape (batch_size, 10, 784).
image_batch, label_batch, pixel_batch = tf.train.batch([images, labels, pixels],
                                                       batch_size=batch_size,
                                                       capacity=capacity)

# The context manager releases the session's resources when the script is done.
with tf.Session() as sess:
    # Start the threads that feed the input queues
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        # Each run dequeues one batch. Per the original author's note, once
        # all examples have been read this script starts over from the beginning.
        for i in range(10):
            image, label, pixel = sess.run([image_batch, label_batch, pixel_batch])
            print(image.shape, label, pixel)
    finally:
        # Always ask the queue threads to stop, even if a run fails.
        coord.request_stop()
    coord.join(threads)

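batch_size设为2，而read_up_to每次读取10个样例。由于tf.train.batch默认enqueue_many=False，每次读取到的10个样例会被当作一个整体入队，因此这里的batch_size实际对应的是读取的次数：

一次循环会取出两次读取的结果，每次10个样例，所以输出形状为(2, 10, 784)；如果希望以单个样例为单位组成batch，可以设置enqueue_many=True。队列的填充由后台多线程完成。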

(2, 10, 784)

[[7 3 4 6 1 8 1 0 9 8] [0 3 1 2 7 0 2 9 6 0]]

[[784 784 784 784 784 784 784 784 784 784] [784 784 784 784 784 784 784 784 784 784]]

(2, 10, 784)

[[1 6 7 1 9 7 6 5 5 8] [8 3 4 4 8 7 3 6 4 6]]

[[784 784 784 784 784 784 784 784 784 784] [784 784 784 784 784 784 784 784 784 784]]

也可以多线程读取多个文件

import tensorflow as tf
import glob

# Collect every shard file matching the pattern
files = tf.train.match_filenames_once('file/data.tfrecords-*')
# The file list is shuffled and limited to 2 epochs, so the input is finite.
filename_queue = tf.train.string_input_producer(files, num_epochs=2, shuffle=True)

reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)

features = tf.parse_single_example(serialized_example,
                                   features={
                                       'i': tf.FixedLenFeature([], tf.int64),
                                       'j': tf.FixedLenFeature([], tf.int64),
                                       'list': tf.FixedLenFeature([], tf.string)
                                   })

example, label, array = features['i'], features['j'], features['list']

# Number of examples in one batch
batch_size = 3
# Maximum number of examples the queue can hold
capacity = 1000 + 3 * batch_size

shuffer = True
# Author's note: the batch op effectively stands for the data reading and
# preprocessing steps.
if shuffer:
    # min_after_dequeue is the minimum number of elements left in the queue
    # after a dequeue; if there are too few, the dequeue waits for more
    # elements to be enqueued before it completes.
    example_batch, label_batch, array_batch = tf.train.shuffle_batch([example, label, array],
                                                                     batch_size=batch_size,
                                                                     capacity=capacity,
                                                                     min_after_dequeue=30)
else:
    example_batch, label_batch, array_batch = tf.train.batch([example, label, array],
                                                             batch_size=batch_size,
                                                             capacity=capacity)

with tf.Session() as sess:
    # match_filenames_once and the num_epochs counter use local variables,
    # which must be initialized before the queue runners start.
    sess.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        for i in range(2):
            cur_example_batch, cur_label_batch, cur_array_batch = sess.run([example_batch, label_batch, array_batch])
            print(cur_example_batch, cur_label_batch, cur_array_batch)
    except tf.errors.OutOfRangeError:
        # Raised once the epoch limit is reached and the queues are drained.
        print('Input exhausted: epoch limit reached.')
    finally:
        # Always ask the queue threads to stop, even if a run fails.
        coord.request_stop()
    coord.join(threads)

从结果可以看出，每次循环取出一个batch，其中包含3个样例（batch_size=3）：

[0 1 0] [0 1 0] [b'\x01\x02\x03' b'\x01\x02\x03' b'\x01\x02\x03']

[0 1 0] [1 0 1] [b'\x01\x02\x03' b'\x01\x02\x03' b'\x01\x02\x03']

单一文件多线程，一般选用tf.train.batch(需要打乱样本，就用对应的tf.train.shuffle_batch)

多线程多文件的情况，一般选用tf.train.batch_join来获取样本（打乱样本同样也有tf.train.shuffle_batch_join）

DL学习笔记-TFRecord输入数据格式

猜你喜欢