TF2_403 tfrecord file generation and reading

1. tfrecord format description

1.1 Generate the tfrecord format with tf.train.Example

  • 1. tf.train.Features: {"key": tf.train.Feature}
    • tf.train.Feature: tf.train.BytesList / FloatList / Int64List
      (string type / floating-point type / integer type)
  • 2. Use Features to create an Example
    • Serialize the Example to compress it and reduce its size
  • 3. Save the Example to a file
    • Create a new folder
    • Use TFRecordWriter to open the file and write the Example
  • 4. Use tf.data.TFRecordDataset to read and parse the tfrecord file
import os
import pprint

import numpy as np
import tensorflow as tf
from tensorflow import keras

# tfrecord file format
# -> tf.train.Example
#    -> tf.train.Features -> {"key": tf.train.Feature}
#       -> tf.train.Feature -> tf.train.BytesList/FloatList/Int64List

favorite_books = [name.encode('utf-8')  # encode strings to UTF-8 bytes
                  for name in ["machine learning", "cc150"]]
favorite_books_bytelist = tf.train.BytesList(value = favorite_books)  # BytesList holds byte strings
print(favorite_books_bytelist)

hours_floatlist = tf.train.FloatList(value = [15.5, 9.5, 7.0, 8.0])  # FloatList holds floating-point values
print(hours_floatlist)

age_int64list = tf.train.Int64List(value = [42])  # Int64List holds integers
print(age_int64list)

features = tf.train.Features(  # build a Features map from the three lists
    feature = {
        "favorite_books": tf.train.Feature(bytes_list = favorite_books_bytelist),
        "hours": tf.train.Feature(float_list = hours_floatlist),
        "age": tf.train.Feature(int64_list = age_int64list),
    }
)
print(features)
# build an Example from the Features
example = tf.train.Example(features=features)
print(example)

# serialize the Example to compress it and reduce its size
serialized_example = example.SerializeToString()
print(serialized_example)
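
Before writing the bytes to disk, a quick sanity check: the protobuf classmethod tf.train.Example.FromString parses the serialized bytes back into an Example, so the round trip can be verified directly:

# round-trip check: parse the serialized bytes back into an Example proto
restored_example = tf.train.Example.FromString(serialized_example)
print(restored_example.features.feature["favorite_books"])
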
# save the serialized Examples to a file

output_dir = 'tfrecord_basic'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
filename = "test.tfrecords"
filename_fullpath = os.path.join(output_dir, filename)

# open the file with TFRecordWriter and write the Example three times
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)
# read the tfrecord file with tf.data.TFRecordDataset

dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:  # each element is a serialized Example tensor
    print(serialized_example_tensor)

# parse the serialized Examples

expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype = tf.string),  # VarLenFeature: variable-length feature
    "hours": tf.io.VarLenFeature(dtype = tf.float32),
    "age": tf.io.FixedLenFeature([], dtype = tf.int64),
}

# read the dataset and parse each record
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(   # parse one serialized Example
        serialized_example_tensor,
        expected_features)
    books = tf.sparse.to_dense(example["favorite_books"],
                               default_value=b"")
    for book in books:
        print(book.numpy().decode("UTF-8"))
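
parse_single_example handles one record at a time. Records can also be batched first and parsed together with tf.io.parse_example, which is usually faster; a minimal sketch:

# batch the serialized records, then parse the whole batch at once
batched_dataset = tf.data.TFRecordDataset([filename_fullpath]).batch(2)
for serialized_batch in batched_dataset:
    parsed = tf.io.parse_example(serialized_batch, expected_features)
    print(parsed["age"])  # dense tensor of shape (batch_size,)
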

1.2 Generate a compressed tfrecord file

  • Steps 1 and 2 are the same as above
  • 3. Write the tfrecord data to a compressed file
    • Use tf.io.TFRecordOptions with compression type "GZIP"
  • 4. Read the compressed file with tf.data.TFRecordDataset, adding the parameter compression_type="GZIP" to indicate that the file being read is compressed
# write the tfrecord data to a GZIP-compressed file

filename_fullpath_zip = filename_fullpath + '.zip'  # the '.zip' suffix is only a name; the codec is GZIP
options = tf.io.TFRecordOptions(compression_type = "GZIP")
with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:
    for i in range(3):
        writer.write(serialized_example)
# read the compressed file

dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip], 
                                      compression_type= "GZIP")
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    books = tf.sparse.to_dense(example["favorite_books"],
                               default_value=b"")
    for book in books:
        print(book.numpy().decode("UTF-8"))
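
To confirm that GZIP actually shrinks the record file, the two files can be compared on disk with os.path.getsize:

# compare sizes on disk: the GZIP file should be noticeably smaller
print(os.path.getsize(filename_fullpath))
print(os.path.getsize(filename_fullpath_zip))
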

2. Using tfrecord with real data (reading the csv files from TF2_402 and converting them to tfrecord)

Previous blog: the csv files were generated in TF2_402. The filename lists train_filenames, valid_filenames, and test_filenames come from that post; a sketch for rebuilding them is shown below.
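
If those lists are not already in memory, they can be rebuilt with glob. A minimal sketch, assuming the TF2_402 shards live in a "generate_csv" directory with train_/valid_/test_ prefixes (both the directory name and the prefixes are assumptions based on the previous post):

import glob

# hypothetical directory and shard prefixes -- adjust to the actual TF2_402 output
source_dir = "generate_csv"
train_filenames = sorted(glob.glob(os.path.join(source_dir, "train_*.csv")))
valid_filenames = sorted(glob.glob(os.path.join(source_dir, "valid_*.csv")))
test_filenames = sorted(glob.glob(os.path.join(source_dir, "test_*.csv")))
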

2.1 Read csv file

# read the csv files

def parse_csv_line(line, n_fields = 9):
    # parse one csv line into 8 input fields and 1 label
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y

def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    # read n_readers files in parallel, skipping each file's header line
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    dataset = dataset.shuffle(shuffle_buffer_size)  # shuffle returns a new dataset; the result must be assigned
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

batch_size = 32
train_set = csv_reader_dataset(train_filenames,
                               batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,
                               batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,
                              batch_size = batch_size)

2.2 Traverse the data, convert each sample to tf.train.Example, and serialize it

def serialize_example(x, y):
    """Converts x, y to tf.train.Example and serializes it."""
    input_features = tf.train.FloatList(value = x)
    label = tf.train.FloatList(value = y)
    features = tf.train.Features(
        feature = {
            "input_features": tf.train.Feature(
                float_list = input_features),
            "label": tf.train.Feature(float_list = label)
        }
    )
    example = tf.train.Example(features = features)
    return example.SerializeToString()
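
A quick smoke test, serializing one sample pulled from the csv dataset built in 2.1:

# serialize the first sample of one batch as a smoke test
for x_batch, y_batch in train_set.take(1):
    print(serialize_example(x_batch[0], y_batch[0]))
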

2.3 Convert the csv files to tfrecord files

# convert csv files to tfrecord files
def csv_dataset_to_tfrecords(base_filename, 
                             dataset,
                             n_shards, # number of output shard files
                             steps_per_shard, # batches to write into each shard
                             compression_type = None): # optional compression method
    # define the compression options first
    options = tf.io.TFRecordOptions(
        compression_type = compression_type)
    all_filenames = []

    # build each shard's output filename
    for shard_id in range(n_shards):
        filename_fullpath = '{}_{:05d}-of-{:05d}'.format(
            base_filename, shard_id, n_shards)
        # open the file and write the serialized examples
        with tf.io.TFRecordWriter(filename_fullpath, options) as writer:
            for x_batch, y_batch in dataset.take(steps_per_shard):
                for x_example, y_example in zip(x_batch, y_batch):
                    writer.write(
                        serialize_example(x_example, y_example))

        # collect all output filenames
        all_filenames.append(filename_fullpath)
    return all_filenames


# run the conversion
n_shards = 20 # split into 20 shards

# compute how many batches go into each shard
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3870 // batch_size // n_shards
test_steps_per_shard = 5160 // batch_size // n_shards

output_dir = "generate_tfrecords"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# define the base filenames
train_basename = os.path.join(output_dir, "train")
valid_basename = os.path.join(output_dir, "valid")
test_basename = os.path.join(output_dir, "test")

# call the csv -> tfrecord conversion function
train_tfrecord_filenames = csv_dataset_to_tfrecords(
    train_basename, train_set, n_shards, train_steps_per_shard, None)
valid_tfrecord_filenames = csv_dataset_to_tfrecords(
    valid_basename, valid_set, n_shards, valid_steps_per_shard, None)
test_tfrecord_filenames = csv_dataset_to_tfrecords(
    test_basename, test_set, n_shards, test_steps_per_shard, None)

2.4 Convert the csv files to tfrecord files and compress them

# generate compressed tfrecord files; each compressed file is noticeably smaller

n_shards = 20
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3870 // batch_size // n_shards
test_steps_per_shard = 5160 // batch_size // n_shards

output_dir = "generate_tfrecords_zip"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

train_basename = os.path.join(output_dir, "train")
valid_basename = os.path.join(output_dir, "valid")
test_basename = os.path.join(output_dir, "test")

train_tfrecord_filenames = csv_dataset_to_tfrecords(
    train_basename, train_set, n_shards, train_steps_per_shard,
    compression_type = "GZIP")
valid_tfrecord_filenames = csv_dataset_to_tfrecords(
    valid_basename, valid_set, n_shards, valid_steps_per_shard,
    compression_type = "GZIP")
test_tfrecord_filenames = csv_dataset_to_tfrecords(
    test_basename, test_set, n_shards, test_steps_per_shard,
    compression_type = "GZIP")

pprint.pprint(train_tfrecord_filenames)
pprint.pprint(valid_tfrecord_filenames)
pprint.pprint(test_tfrecord_filenames)

2.5 Read the tfrecord files

  • Define the expected features first
  • Parse the data
  • Write a tfrecord reader function
expected_features = {
    "input_features": tf.io.FixedLenFeature([8], dtype=tf.float32),  # FixedLenFeature: fixed-length feature
    "label": tf.io.FixedLenFeature([1], dtype=tf.float32)
}

# parse the data
def parse_example(serialized_example):
    example = tf.io.parse_single_example(serialized_example,
                                         expected_features)
    return example["input_features"], example["label"]

# tfrecord reader function
def tfrecords_reader_dataset(filenames, n_readers=5,
                             batch_size=32, n_parse_threads=5,
                             shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TFRecordDataset(  # read each file in TFRecordDataset format
            filename, compression_type = "GZIP"),
        cycle_length = n_readers
    )
    dataset = dataset.shuffle(shuffle_buffer_size)  # shuffle returns a new dataset; the result must be assigned
    dataset = dataset.map(parse_example,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

# build the datasets used for training

batch_size = 32
tfrecords_train_set = tfrecords_reader_dataset(
    train_tfrecord_filenames, batch_size = batch_size)
tfrecords_valid_set = tfrecords_reader_dataset(
    valid_tfrecord_filenames, batch_size = batch_size)
tfrecords_test_set = tfrecords_reader_dataset(
    test_tfrecord_filenames, batch_size = batch_size)


# test the reader function
tfrecords_train = tfrecords_reader_dataset(train_tfrecord_filenames,
                                           batch_size = 3)
for x_batch, y_batch in tfrecords_train.take(2):
    print(x_batch)
    print(y_batch)

2.6 Train a neural network on the dataset read from tfrecord files

# train a neural network on the datasets read from the tfrecord files

model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',
                       input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5, min_delta=1e-2)]

history = model.fit(tfrecords_train_set,
                    validation_data = tfrecords_valid_set,
                    steps_per_epoch = 11610 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)
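
After training, the model can be evaluated on the test split read from tfrecord in the same way. Because the reader dataset repeats indefinitely, steps must be passed explicitly; a minimal sketch:

# evaluate on the tfrecord test set; steps is required since the dataset repeats
model.evaluate(tfrecords_test_set, steps = 5160 // batch_size)
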

Origin: blog.csdn.net/qq_44783177/article/details/105900572