深度学习框架 TensorFlow 2 实战（TFRecords 的使用）

把一大堆不同格式的数据进行统一处理。

TFRecords制作

为了高效地读取数据，可以将数据进行序列化存储，这样也便于网络流式读取数据。TFRecord 是一种比较常用的存储二进制序列数据的方法。
tf.Example 类是一种将数据表示为 {"string": value} 形式的 message 类型，TensorFlow 经常使用 tf.Example 来写入、读取 TFRecord 数据。
通常情况下，tf.Example中可以使用以下几种格式：
tf.train.BytesList: 可以使用的类型包括 string和byte
tf.train.FloatList: 可以使用的类型包括 float和double
tf.train.Int64List: 可以使用的类型包括 enum,bool, int32, uint32, int64

初始化常见的转换函数

import warnings
warnings.filterwarnings("ignore")
import numpy as np
import tensorflow as tf

def _bytes_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` holding a ``BytesList``.

    If ``value`` has the same type as ``tf.constant(0)``, its ``.numpy()``
    result is stored instead, because ``BytesList`` won't unpack a string
    from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    payload = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap one float/double in a ``tf.train.Feature`` holding a ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap one bool/enum/int/uint in a ``tf.train.Feature`` holding an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

# tf.train.BytesList: a bytes literal and a UTF-8 encoded str
for sample in (b'test_string', 'test_string'.encode('utf8')):
    print(_bytes_feature(sample))

# tf.train.FloatList
print(_float_feature(np.exp(1)))

# tf.train.Int64List: a bool and a plain int
for sample in (True, 1):
    print(_int64_feature(sample))

整合多种不同类型输入

def serialize_example(feature0, feature1, feature2, feature3):
    """Build a ``tf.train.Example`` from four values and serialize it.

    ``feature0`` and ``feature1`` are stored as int64 features, ``feature2``
    as a bytes feature and ``feature3`` as a float feature, each under the
    key matching its parameter name.

    Returns:
        The result of ``SerializeToString()`` on the assembled example.
    """
    # Convert every input to the matching tf.train.Feature type
    features = tf.train.Features(feature={
        'feature0': _int64_feature(feature0),
        'feature1': _int64_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _float_feature(feature3),
    })
    # Wrap the features in an Example and emit its binary string form
    return tf.train.Example(features=features).SerializeToString()

模拟测试

# Number of synthetic observations
n_observations = int(1e4)

# Boolean feature
feature0 = np.random.choice([False, True], n_observations)

# Integer feature
feature1 = np.random.randint(0, 5, n_observations)

# String feature: each integer in feature1 selects one of the byte strings
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]

# Float feature
feature3 = np.random.randn(n_observations)

filename = 'tfrecord'  # output file for the serialized examples

# Walk the four arrays in lockstep and write one serialized example per row
with tf.io.TFRecordWriter(filename) as writer:
    for row in zip(feature0, feature1, feature2, feature3):
        writer.write(serialize_example(*row))

读取tfrecord文件

filenames = [filename]

# Build a TFRecordDataset over the file written above
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

图像实战

import os
import glob
from datetime import datetime

import tensorflow as tf
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

image_path = './数据增强/input/input2/'
images = glob.glob(image_path + '*.jpg')

# Draw every matched .jpg on its own figure
for fname in images:
    image = mpimg.imread(fname)
    fig, ax = plt.subplots(1, 1, figsize=(8, 8))
    fig.subplots_adjust(hspace=.2, wspace=.05)

    ax.set_title('Image', fontsize=20)
    ax.imshow(image)

# Maps a class name to the integer label stored alongside the image
image_labels = {
    'dog': 0,
    'pear': 1,
}

# Read the image file in binary mode; the context manager closes the handle
# as soon as the bytes are in memory instead of leaving it open.
with open('./数据增强/input/input2/dog.jpg', 'rb') as image_file:
    image_string = image_file.read()
label = image_labels['dog']

def _bytes_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` holding a ``BytesList``.

    If ``value`` has the same type as ``tf.constant(0)``, its ``.numpy()``
    result is stored instead, because ``BytesList`` won't unpack a string
    from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    payload = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap one float/double in a ``tf.train.Feature`` holding a ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap one bool/enum/int/uint in a ``tf.train.Feature`` holding an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

# 创建图像数据的Example
def image_example(image_string, label):
    """Build a ``tf.train.Example`` describing one encoded image.

    The image bytes are decoded with ``tf.image.decode_jpeg`` only to read
    the resulting shape; the first three shape entries are stored as
    'height', 'width' and 'depth'. The original encoded bytes are stored
    under 'image_raw' and ``label`` under 'label'.

    Returns:
        The assembled (not yet serialized) ``tf.train.Example``.
    """
    decoded_shape = tf.image.decode_jpeg(image_string).shape
    height, width, depth = decoded_shape[0], decoded_shape[1], decoded_shape[2]

    features = tf.train.Features(feature={
        'height': _int64_feature(height),
        'width': _int64_feature(width),
        'depth': _int64_feature(depth),
        'label': _int64_feature(label),
        'image_raw': _bytes_feature(image_string),
    })
    return tf.train.Example(features=features)

image_example_proto = image_example(image_string, label)

# Print only the first 15 lines of the proto's text form, then an ellipsis
preview_lines = str(image_example_proto).split('\n')[:15]
print('\n'.join(preview_lines))
print('...')

# Build `images.tfrecord`: one serialized tf.train.Example per .jpg file.

image_path = './数据增强/input/input2/'
images = glob.glob(image_path + '*.jpg')
record_file = 'images.tfrecord'
counter = 0

with tf.io.TFRecordWriter(record_file) as writer:
    for fname in images:
        # Keep the image file open only for the read itself.
        with open(fname, 'rb') as f:
            image_string = f.read()

        # The label key is the file name without its extension. splitext
        # strips only the trailing extension, whereas replace('.jpg', '')
        # would also remove a '.jpg' occurring in the middle of the name.
        # A name missing from image_labels still raises KeyError.
        label_key = os.path.splitext(os.path.basename(fname))[0]
        label = image_labels[label_key]

        # Build the `tf.Example` for this image
        tf_example = image_example(image_string, label)

        # Serialize the example to a binary string and append it to the file
        writer.write(tf_example.SerializeToString())

        counter += 1
        print('Processed {:d} of {:d} images.'.format(
            counter, len(images)))

print(' Wrote {} images to {}'.format(counter, record_file))

# Load the TFRecord file produced above
raw_train_dataset = tf.data.TFRecordDataset('images.tfrecord')
raw_train_dataset

# The parsing spec has to match the features written when the examples were
# created: four scalar int64 fields plus the encoded image as a scalar string.
image_feature_description = {
    name: tf.io.FixedLenFeature([], tf.int64)
    for name in ('height', 'width', 'depth', 'label')
}
image_feature_description['image_raw'] = tf.io.FixedLenFeature([], tf.string)


def parse_tf_example(example_proto):
    """Parse a single serialized example into an ``(image, label)`` pair.

    The record is parsed with ``image_feature_description``. The
    'image_raw' field is decoded as a 3-channel JPEG, resized to 416x416
    and divided by 255; the 'label' field is returned unchanged.

    Returns:
        A tuple ``(x_train, y_train)``.
    """
    # Parse the raw record into its named fields
    fields = tf.io.parse_single_example(example_proto, image_feature_description)

    # Preprocessing: decode, resize, scale
    decoded = tf.image.decode_jpeg(fields['image_raw'], channels=3)
    resized = tf.image.resize(decoded, (416, 416))
    x_train = resized / 255.

    y_train = fields['label']

    return x_train, y_train

# parse_tf_example handles one record; Dataset.map applies it to every record
train_dataset = raw_train_dataset.map(parse_tf_example)
train_dataset


num_epochs = 10

# Shuffle, group into batches of 2, then repeat the stream num_epochs times
train_ds = (
    train_dataset
    .shuffle(buffer_size=10000)
    .batch(2)
    .repeat(num_epochs)
)
train_ds

# Inspect every batch: its index, the image batch shape and the labels
for batch, (x, y) in enumerate(train_ds):
    print(batch, x.shape, y)


# Train a minimal classifier: flatten the image, then a 2-unit softmax layer.
# NOTE(review): train_ds already has .repeat(num_epochs) applied, so calling
# fit with epochs=num_epochs passes over the data num_epochs * num_epochs
# times in total - confirm this is intended.
classifier_layers = [
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(2, activation='softmax'),
]
model = tf.keras.Sequential(classifier_layers)

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

model.fit(train_ds, epochs=num_epochs)