tf.data: Dataset construction and preprocessing

Dataset Object Creation

A tf.data.Dataset consists of an iterable sequence of elements, each of which contains one or more tensors. For example, in a dataset of images, each element can be an image tensor of shape height × width × channels, or a tuple consisting of an image tensor and a label tensor.

The most basic way to create a tf.data.Dataset is tf.data.Dataset.from_tensor_slices(), which is suitable when the amount of data is small (i.e., it fits entirely in memory):

import tensorflow as tf
import numpy as np

X = tf.constant([2013, 2014, 2015, 2016, 2017])
Y = tf.constant([12000, 14000, 15000, 16500, 17500])

# NumPy arrays can be used instead; the effect is the same
# X = np.array([2013, 2014, 2015, 2016, 2017])
# Y = np.array([12000, 14000, 15000, 16500, 17500])

dataset = tf.data.Dataset.from_tensor_slices((X, Y))

for x, y in dataset:
    print(x.numpy(), y.numpy())

Similarly, the MNIST training set can be loaded as a Dataset; a channel dimension is added so that each image has shape [28, 28, 1]:

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

(train_data, train_label), (_, _) = tf.keras.datasets.mnist.load_data()
train_data = np.expand_dims(train_data.astype(np.float32) / 255.0, axis=-1)      # [60000, 28, 28, 1]
mnist_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_label))

for image, label in mnist_dataset:
    plt.title(label.numpy())
    plt.imshow(image.numpy()[:, :, 0])
    plt.show()
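
The loop above iterates over all 60,000 training images; to preview only a few of them, Dataset.take() can limit the iteration (a small sketch, not part of the original code):

# show only the first 3 images instead of the whole training set
for image, label in mnist_dataset.take(3):
    plt.title(label.numpy())
    plt.imshow(image.numpy()[:, :, 0])
    plt.show()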

Preprocessing of Dataset Objects

The tf.data.Dataset class provides a variety of dataset preprocessing methods. The most commonly used are:

  • Dataset.map(f): applies the function f to each element of the dataset, producing a new dataset (this is often combined with tf.io for reading and decoding files and tf.image for image processing);

  • Dataset.shuffle(buffer_size): shuffles the dataset (it sets up a fixed-size buffer, fills it with the first buffer_size elements, draws elements from the buffer at random, and replaces each drawn element with the next element from the dataset);

  • Dataset.batch(batch_size): splits the dataset into batches, i.e., every batch_size consecutive elements are merged along a new 0th dimension with tf.stack() to form a single element; a minimal sketch combining all three follows this list.
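
A minimal sketch of how these three methods compose, reusing the small year/price toy data from the first example (the y * 2 transform is purely illustrative):

import tensorflow as tf

X = tf.constant([2013, 2014, 2015, 2016, 2017])
Y = tf.constant([12000, 14000, 15000, 16500, 17500])
dataset = tf.data.Dataset.from_tensor_slices((X, Y))

dataset = dataset.map(lambda x, y: (x, y * 2))   # map: transform every (x, y) element
dataset = dataset.shuffle(buffer_size=5)         # shuffle: draw randomly from a 5-element buffer
dataset = dataset.batch(2)                       # batch: stack every 2 elements along dimension 0

for x_batch, y_batch in dataset:
    print(x_batch.numpy(), y_batch.numpy())      # each batch has shape [2] (the last one [1])

The fuller example below applies the same methods to MNIST, rotating every image by 90 degrees with map() before shuffling and batching: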

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
(train_data, train_label), (_, _) = tf.keras.datasets.mnist.load_data()
train_data = np.expand_dims(train_data.astype(np.float32) / 255.0, axis=-1)      # [60000, 28, 28, 1]
mnist_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_label))

def rot90(image, label):
    image = tf.image.rot90(image)
    return image, label

mnist_dataset = mnist_dataset.map(rot90)

# (optional) preview the rotated images one at a time:
# for image, label in mnist_dataset:
#     plt.title(label.numpy())
#     plt.imshow(image.numpy()[:, :, 0])
#     plt.show()

mnist_dataset = mnist_dataset.shuffle(buffer_size=10000).batch(4)

for images, labels in mnist_dataset:
    fig, axs = plt.subplots(1, 4)
    for i in range(4):
        axs[i].set_title(labels.numpy()[i])
        axs[i].imshow(images.numpy()[i, :, :, 0])
    plt.show()

Using tf.data's parallelization strategies to improve training efficiency

The tf.data.Dataset object provides the Dataset.prefetch() method, which prefetches a number of elements during training so that the CPU can prepare the next batch of data while the GPU is busy training, improving the efficiency of the training process:

mnist_dataset = mnist_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
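
As a side note, newer TensorFlow 2 releases also expose this constant as tf.data.AUTOTUNE (a non-experimental alias; whether it is available depends on your TF version), so the call can equivalently be written as:

mnist_dataset = mnist_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)  # same effect, newer alias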

Similarly, Dataset.map() can use multiple CPU cores to transform data elements in parallel via its num_parallel_calls argument, which also improves efficiency:

mnist_dataset = mnist_dataset.map(map_func=rot90, num_parallel_calls=2)
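
Rather than hard-coding the number of parallel calls, the same AUTOTUNE constant can be passed here so that the runtime chooses the degree of parallelism (a small variant of the line above, assuming the same rot90 and mnist_dataset):

mnist_dataset = mnist_dataset.map(map_func=rot90, num_parallel_calls=tf.data.experimental.AUTOTUNE)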

Obtaining and using dataset elements

After a Dataset is built, its elements can be obtained simply by iterating over it:

dataset = tf.data.Dataset.from_tensor_slices((A, B, C, ...))
for a, b, c, ... in dataset:
    # operate on the tensors a, b, c, ..., e.g. feed them into a model for training
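
A Dataset can also be consumed one element at a time through Python's iter()/next() protocol (a minimal sketch on a throwaway dataset):

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(tf.range(5))
it = iter(dataset)           # build a Python iterator over the dataset
print(next(it).numpy())      # 0
print(next(it).numpy())      # 1

The following complete example combines tf.data with tf.io and tf.image to read cat and dog images from disk, build an input pipeline, and train a small convolutional classifier: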
import tensorflow as tf
import os
import matplotlib.pyplot as plt
num_epochs = 1
batch_size = 4
learning_rate = 0.001
data_dir = './catsdogs'
train_cats_dir = data_dir + '/train/Cat/'
train_dogs_dir = data_dir + '/train/Dog/'
test_cats_dir = data_dir + '/test/Cat/'
test_dogs_dir = data_dir + '/test/Dog/'

def _decode_and_resize(filename, label):
    image_string = tf.io.read_file(filename)            # read the raw image file
    image_decoded = tf.image.decode_image(image_string, expand_animations=False)  # decode the image (forces a 3-D tensor)
    # image_decoded = tf.image.decode_jpeg(image_string)
    image_resized = tf.image.resize(image_decoded, [256, 256]) / 255.0            # resize to 256x256 and scale to [0, 1]
    return image_resized, label

if __name__ == '__main__':
    # build the training dataset
    train_cat_filenames = tf.constant([train_cats_dir + filename for filename in os.listdir(train_cats_dir)])
    train_dog_filenames = tf.constant([train_dogs_dir + filename for filename in os.listdir(train_dogs_dir)])
    train_filenames = tf.concat([train_cat_filenames, train_dog_filenames], axis=-1)
    train_labels = tf.concat([
        tf.zeros(train_cat_filenames.shape, dtype=tf.int32),
        tf.ones(train_dog_filenames.shape, dtype=tf.int32)],
        axis=-1)

    train_dataset = tf.data.Dataset.from_tensor_slices((train_filenames, train_labels))
    # (optional) preview the decoded images before building the pipeline:
    # for filename, label in train_dataset:
    #     image, label = _decode_and_resize(filename, label)
    #     plt.title(label.numpy())
    #     plt.imshow(image.numpy())
    #     plt.show()
    train_dataset = train_dataset.map(
        map_func=_decode_and_resize,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # fill a buffer with the first buffer_size elements, sample from it at random, and replace each sampled element with subsequent data
    train_dataset = train_dataset.shuffle(buffer_size=1000)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)


    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(256, 256, 3)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(32, 5, activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=[tf.keras.metrics.sparse_categorical_accuracy]
    )

    model.fit(train_dataset, epochs=num_epochs)

    # build the test dataset
    test_cat_filenames = tf.constant([test_cats_dir + filename for filename in os.listdir(test_cats_dir)])
    test_dog_filenames = tf.constant([test_dogs_dir + filename for filename in os.listdir(test_dogs_dir)])
    test_filenames = tf.concat([test_cat_filenames, test_dog_filenames], axis=-1)
    test_labels = tf.concat([
        tf.zeros(test_cat_filenames.shape, dtype=tf.int32),
        tf.ones(test_dog_filenames.shape, dtype=tf.int32)],
        axis=-1)

    test_dataset = tf.data.Dataset.from_tensor_slices((test_filenames, test_labels))
    test_dataset = test_dataset.map(_decode_and_resize)
    test_dataset = test_dataset.batch(batch_size)

    print(model.metrics_names)
    print(model.evaluate(test_dataset))
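
As an optional follow-up (not part of the original script), the trained model can be run on a single batch of the test set to compare its predictions with the true labels, assuming the model and test_dataset defined above:

for images, labels in test_dataset.take(1):        # take one batch of images
    probs = model.predict(images)                  # class probabilities, shape [batch_size, 2]
    print("predicted:", probs.argmax(axis=-1))
    print("true:     ", labels.numpy())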

Reading images with TensorFlow

import tensorflow as tf
import os
import matplotlib.pyplot as plt

def decode_and_resize(filename):
    image_string = tf.io.read_file(filename)            # read the raw image file
    image_decoded = tf.image.decode_jpeg(image_string)  # decode the JPEG image
    image_resized = tf.image.resize(image_decoded, [256, 256]) / 255.0
    return image_resized

image=decode_and_resize('./catsdogs/test/Cat/85.jpg')
plt.imshow(image.numpy())
plt.show()
print(image)

 

