TF2_402 tf.data_generate_csv

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

Import the data: the California housing dataset

from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

1. Generate the CSV files

1.1 The save_to_csv function

output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def save_to_csv(output_dir, data, name_prefix,
                header=None, n_parts=10):
    # Filename pattern for each part; os.path.join() joins path components,
    # while str.join() concatenates strings.
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    
    for file_idx, row_indices in enumerate(
        np.array_split(np.arange(len(data)), n_parts)):
        
        # Fill in the two name fields: [name_prefix]_[file_idx].csv
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        
        # Open the file and write the data
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:        # write the header first
                f.write(header + "\n")
            for row_index in row_indices: # write the rows assigned to this part
                f.write(",".join(
                    [repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames

train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir, train_data, "train",
                              header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",
                              header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",
                             header_str, n_parts=10)

1.2 Print the saved file names

import pprint
print("train filenames:")
pprint.pprint(train_filenames)
print("valid filenames:")
pprint.pprint(valid_filenames)
print("test filenames:")
pprint.pprint(test_filenames)
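
As a quick sanity check, one of the generated parts can be read back with pandas; the shape should show 9 columns and the header should match header_str (a small illustrative snippet):

part = pd.read_csv(train_filenames[0])
print(part.shape)
print(part.head(3))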

Notes on the syntax used above

  1. os.path.join(): joins path components into a single path.

  2. str.join(): concatenates a sequence of strings with the given separator.
    sep = "-"; seq = ("a", "b", "c")  # a sequence of strings
    print(sep.join(seq))
    [output:] a-b-c

  3. repr(): returns the printable string representation of an object.
    s = 'rand'
    repr(s)
    [output:] "'rand'"
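
Put together, these are exactly the pieces save_to_csv uses to build a file path and one CSV row (a small illustrative snippet):

path_format = os.path.join("generate_csv", "{}_{:02d}.csv")
print(path_format.format("train", 3))           # generate_csv/train_03.csv (on POSIX)
print(",".join([repr(v) for v in [1.5, 2.0]]))  # 1.5,2.0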

2. Read the CSV files

2.1 filename -> dataset

2.2 read file -> dataset -> datasets -> merge

2.3 parse csv
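
Before everything is wrapped into a single function, the first two steps can be sketched on their own (the parse step is the parse_csv_line function shown below). This is a minimal sketch; filename_dataset and line_dataset are names used only here:

# 2.1 filename -> dataset: a dataset whose elements are the CSV file paths
filename_dataset = tf.data.Dataset.list_files(train_filenames)

# 2.2 read file -> dataset -> datasets -> merge: each filename becomes a
# TextLineDataset (skipping its header row), and interleave merges them
# into one dataset of text lines
line_dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length=5)

for line in line_dataset.take(3):
    print(line.numpy())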

Notes on the syntax used above
Common [tf.data.Dataset] operations (a short sketch follows this list):

  1. tf.data.TextLineDataset: reads data from text files. You only supply one or more filenames; the interface builds a dataset whose elements are the lines of the file(s), each one a string tensor.
  2. from_tensor_slices: builds a dataset by slicing a tensor along its first dimension.
  3. dataset = dataset.map(fn): the most common transformation; it applies the given parser function to every element and wraps the results in a new dataset. It is most often used together with a lambda.
  4. dataset = dataset.shuffle(buffer_size): keeps buffer_size elements in an in-memory buffer; each time an element is requested, one is drawn at random from the buffer. A larger buffer gives better randomness but uses more memory.
  5. dataset = dataset.batch(batch_size): groups consecutive elements into batches of batch_size.
  6. dataset = dataset.repeat(N): repeats the dataset N times (indefinitely if N is omitted).
  7. skip(N): skips the first N elements of the dataset.
  8. zip(): combines several datasets element-wise into one.
  9. tf.io.decode_csv(line, record_defaults): converts a CSV line (a string tensor) into a list of tensors; record_defaults specifies the type and default value of each column.
  10. tf.stack(): stacks a list of tensors into a single tensor.
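
To see a few of these operations in isolation, here is a minimal sketch on an arbitrary small tensor; the variable names and numbers are only illustrative:

demo = tf.data.Dataset.from_tensor_slices(np.arange(10))  # elements 0..9
demo = demo.repeat(2)               # two passes over the data
demo = demo.shuffle(buffer_size=5)  # shuffle within a 5-element buffer
demo = demo.map(lambda x: x * 2)    # transform every element
demo = demo.batch(4)                # group into batches of 4
for batch in demo:
    print(batch.numpy())

With tf.io.decode_csv and tf.stack, the per-line parse function then looks like this:
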
def parse_csv_line(line, n_fields = 9):
    # one default (NaN float) per column; this also fixes each column's dtype
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])  # first 8 columns -> feature vector
    y = tf.stack(parsed_fields[-1:])   # last column -> label
    return x, y
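
A quick way to check the parser is to call it on a hand-written line with 9 comma-separated values (the numbers here are arbitrary):

sample_line = b"1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0"  # 8 features + 1 label
x_sample, y_sample = parse_csv_line(sample_line, n_fields=9)
print(x_sample)  # float32 tensor of shape (8,)
print(y_sample)  # float32 tensor of shape (1,)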

# Define the complete function that performs the whole pipeline:
# 1. filename -> dataset
# 2. read file -> dataset -> datasets -> merge
# 3. parse csv

def csv_reader_dataset(filenames, 
                       n_readers=5,
                       batch_size=32, 
                       n_parse_threads=5,   # parallelism when parsing
                       shuffle_buffer_size=10000):
    
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()  # with no argument, repeat indefinitely
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),  # read each file, skipping its header row
        cycle_length = n_readers
    )
    dataset = dataset.shuffle(shuffle_buffer_size)  # shuffle the lines
    
    # parse each line into (x, y) with a one-to-one map
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

train_set = csv_reader_dataset(train_filenames, batch_size=3)

for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)

batch_size = 32
train_set = csv_reader_dataset(train_filenames,
                               batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,
                               batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,
                              batch_size = batch_size)
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',
                       input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5, min_delta=1e-2)]

history = model.fit(train_set,
                    validation_data = valid_set,
                    steps_per_epoch = 11610 // batch_size,  # x_train has 11610 rows
                    validation_steps = 3870 // batch_size,  # x_valid has 3870 rows
                    epochs = 100,
                    callbacks = callbacks)
                    
                    
model.evaluate(test_set, steps = 5160 // batch_size)
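
To inspect how training went, the recorded history can be plotted with the matplotlib/pandas imports from the top of the notebook; a minimal sketch:

def plot_learning_curves(history):
    # plot the loss and val_loss recorded by model.fit
    pd.DataFrame(history.history).plot(figsize=(8, 5))
    plt.grid(True)
    plt.gca().set_ylim(0, 2)
    plt.show()

plot_learning_curves(history)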


Reposted from blog.csdn.net/qq_44783177/article/details/105896582