Keras(13):csvファイルを生成する

一、データを準備する

sklearnデータセットからデータを抽出します

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Report the versions of the Python libraries this script relies on.
print(tf.__version__)
print(sys.version_info)
modules = [mpl, np, pd, sklearn, tf, keras]
for module in modules:
    print(module.__name__, module.__version__)
    
# 1. Download and load sklearn's built-in "fetch_california_housing" dataset.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

# 2. Split the dataset into training, validation and test subsets.
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)
print("x_train:", x_train.shape, y_train.shape)
print("x_valid:", x_valid.shape, y_valid.shape)
print("x_test:", x_test.shape, y_test.shape)

# 3. Standardize the train/valid/test features before they reach a model.
#    The scaler is fitted on the training split only, then reused for the
#    validation and test splits so no information leaks from them.
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
x_train_scaled = std_scaler.fit_transform(x_train)
x_valid_scaled = std_scaler.transform(x_valid)
x_test_scaled = std_scaler.transform(x_test)

二、csvファイルを保存する方法を定義する

# 4. Create the "generate_csv" output folder.
output_dir = "generate_csv"
# makedirs(..., exist_ok=True) is race-free, unlike the exists()/mkdir()
# pair, and also creates intermediate directories should the path nest.
os.makedirs(output_dir, exist_ok=True)

# 5. Helper that shards an in-memory 2-D array into several csv files.
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Split *data* row-wise into *n_parts* csv files under *output_dir*.

    Args:
        output_dir: Directory the csv files are written to (created if absent).
        data: 2-D array-like of numbers; each row becomes one csv line.
        name_prefix: Files are named "<name_prefix>_<part>.csv".
        header: Optional header line written at the top of every part.
        n_parts: Number of csv files to shard the rows into.

    Returns:
        List of the file paths written, in part order.
    """
    # Be robust if the caller has not created the directory yet.
    os.makedirs(output_dir, exist_ok=True)
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []

    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                # Convert to Python float before repr(): on NumPy >= 2.0,
                # repr of a NumPy scalar is "np.float64(...)" which would
                # corrupt the csv; repr(float(...)) keeps full precision
                # and is stable across NumPy versions.
                f.write(",".join(repr(float(col)) for col in data[row_index]))
                f.write("\n")
    return filenames

三、メモリ上のデータをcsvファイルとして保存する

# 6. Column-stack features and targets row-by-row for train/valid/test sets,
#    so each csv row carries the 8 feature values followed by the target.
# Option 1: merge with np.c_[] (kept for reference)
# train_data = np.c_[x_train_scaled, y_train]
# valid_data = np.c_[x_valid_scaled, y_valid]
# test_data = np.c_[x_test_scaled, y_test]

# Option 2: merge with np.column_stack()
train_data = np.column_stack((x_train_scaled, y_train))
valid_data = np.column_stack((x_valid_scaled, y_valid))
test_data = np.column_stack((x_test_scaled, y_test))

# 7. Append the target column name and join all names into one header string.
# NOTE(review): "MidianHouseValue" looks like a misspelling of "Median", but
# it is written into the csv header that downstream readers parse — confirm
# before renaming.
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

# 8. Persist the in-memory arrays as sharded csv files (20/10/10 parts).
train_filenames = save_to_csv(output_dir, train_data, "train",header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",header_str, n_parts=10)

四、コードのまとめ

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Report the versions of the Python libraries this script relies on.
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
    
# 1. Download and load sklearn's built-in "fetch_california_housing" dataset.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

# 2. Split the dataset into training, validation and test subsets.
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state = 11)
print("x_train:",x_train.shape, y_train.shape)
print("x_valid:",x_valid.shape, y_valid.shape)
print("x_test:",x_test.shape, y_test.shape)

# 3. Standardize the train/valid/test features before they reach a model.
#    The scaler is fitted on the training split only and reused for the rest,
#    so no information leaks from validation/test data.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

# 4. Create the "generate_csv" output folder.
output_dir = "generate_csv"
# makedirs(..., exist_ok=True) is race-free, unlike the exists()/mkdir()
# pair, and also creates intermediate directories should the path nest.
os.makedirs(output_dir, exist_ok=True)

# 5. Helper that shards an in-memory 2-D array into several csv files.
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Split *data* row-wise into *n_parts* csv files under *output_dir*.

    Args:
        output_dir: Directory the csv files are written to (created if absent).
        data: 2-D array-like of numbers; each row becomes one csv line.
        name_prefix: Files are named "<name_prefix>_<part>.csv".
        header: Optional header line written at the top of every part.
        n_parts: Number of csv files to shard the rows into.

    Returns:
        List of the file paths written, in part order.
    """
    # Be robust if the caller has not created the directory yet.
    os.makedirs(output_dir, exist_ok=True)
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []

    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                # Convert to Python float before repr(): on NumPy >= 2.0,
                # repr of a NumPy scalar is "np.float64(...)" which would
                # corrupt the csv; repr(float(...)) keeps full precision
                # and is stable across NumPy versions.
                f.write(",".join(repr(float(col)) for col in data[row_index]))
                f.write("\n")
    return filenames

# 6. Column-stack features and targets row-by-row for train/valid/test sets,
#    so each csv row carries the 8 feature values followed by the target.
# Option 1: merge with np.c_[] (kept for reference)
# train_data = np.c_[x_train_scaled, y_train]
# valid_data = np.c_[x_valid_scaled, y_valid]
# test_data = np.c_[x_test_scaled, y_test]

# Option 2: merge with np.column_stack()
train_data = np.column_stack((x_train_scaled, y_train))
valid_data = np.column_stack((x_valid_scaled, y_valid))
test_data = np.column_stack((x_test_scaled, y_test))

# 7. Append the target column name and join all names into one header string.
# NOTE(review): "MidianHouseValue" looks like a misspelling of "Median", but
# it is written into the csv header that downstream readers parse — confirm
# before renaming.
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

# 8. Persist the in-memory arrays as sharded csv files (20/10/10 parts).
train_filenames = save_to_csv(output_dir, train_data, "train",header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",header_str, n_parts=10)

公式アカウント「NLPレクチャーホール」のフォローを歓迎します。人工知能、アルゴリズム、自然言語処理の知識や最先端技術をいち早くお届けします。
(画像:元記事に掲載されていた画像はここでは省略)

おすすめ

転載: blog.csdn.net/TFATS/article/details/111043351