Keras (17): using feature_column, and converting a Keras model to a tf.estimator

This article will introduce:

  • Load Titanic data set
  • Use feature_column for data processing and convert it to tf.data.dataset type data
  • keras_to_estimator (converting a Keras model to a tf.estimator)

One, load the Titanic data set

1. Download the Titanic data set, use pandas to read and parse the data set
# Download the data from the following two URLs
# https://storage.googleapis.com/tf-datasets/titanic/train.csv
# https://storage.googleapis.com/tf-datasets/titanic/eval.csv

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Print the version of each Python library in use
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

# Local paths of the training and evaluation CSV files
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

print(train_df.head()) # head() returns the first 5 rows by default
print(eval_df.head())
2. Separate the features from the target (label) values
# pop() removes the 'survived' column from each DataFrame and returns it,
# so train_df / eval_df keep only the feature columns afterwards.
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())
3. Use pandas to compute summary statistics of the numeric fields
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
print(train_df.describe())

# ---output------
              age  n_siblings_spouses       parch        fare
count  627.000000          627.000000  627.000000  627.000000
mean    29.631308            0.545455    0.379585   34.385399
std     12.511818            1.151090    0.792999   54.597730
min      0.750000            0.000000    0.000000    0.000000
25%     23.000000            0.000000    0.000000    7.895800
50%     28.000000            0.000000    0.000000   15.045800
75%     35.000000            1.000000    0.000000   31.387500
max     80.000000            8.000000    5.000000  512.329200
4. View the dimensions (shape) of the training set and the evaluation set
# (rows, columns) of the training and evaluation feature tables
print(train_df.shape, eval_df.shape)

# ---output------
(627, 9) (264, 9)
5. Use the matplotlib-based plotting built into pandas to draw charts and understand the data more intuitively
1) Statistics-visual map of age
train_df.age.hist(bins = 50)# bins is the number of intervals the data is divided into
2) Statistics-Gender Visual Map
# value_counts() --> groups identical values together and counts each group
train_df.sex.value_counts().plot(kind = 'barh') # "barh" is a horizontal bar chart; "bar" is a vertical bar chart
3) Statistics - how many passengers are in each ticket class
# 'class' is a Python keyword, so the column is accessed with [] instead of attribute syntax
train_df['class'].value_counts().plot(kind = 'barh')
4) Statistics - on the Titanic, the survival rate of men and the survival rate of women
# Re-attach the label column, then average `survived` per sex. The labels
# appear to be 0/1, so the mean is the survival rate of each sex.
# The result is computed once, printed (a bare expression shows nothing when
# run as a script), and then plotted.
survival_rate_by_sex = (
    pd.concat([train_df, y_train], axis=1).groupby('sex').survived.mean()
)
print(survival_rate_by_sex)
survival_rate_by_sex.plot(kind='barh')

Second, use feature_column for data processing and convert it to tf.data.dataset type data

1. Build the feature columns: one-hot encode the "discrete features" and keep the "continuous features" as numeric columns
1) Divide the features into two lists, "discrete features" and "continuous features"
# Discrete features: each takes a small set of distinct values
# (the integer-valued 'n_siblings_spouses' and 'parch' are treated as discrete too).
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class','deck', 'embark_town', 'alone']
# Continuous features
numeric_columns = ['age', 'fare']

# Collects the tf.feature_column definitions for all features
feature_columns = []
2) Use tf.feature_column to process "discrete features"
# One-hot encode every discrete feature; its vocabulary is the set of
# distinct values found in the training data.
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        categorical_column, vocab)
    one_hot_column = tf.feature_column.indicator_column(vocab_column)
    feature_columns.append(one_hot_column)
                
# ---output------
sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']
3) Use tf.feature_column to process "continuous features"
# Continuous features are passed through as float32 numeric columns.
# The loop variable is named for what it holds (a numeric column name)
# rather than reusing the categorical loop's name.
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numeric_column, dtype=tf.float32))
2. Convert the pandas DataFrame data into a batched tf.data.Dataset (BatchDataset)
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,batch_size = 32):
    """Build a batched tf.data.Dataset of (features, label) pairs.

    Args:
        data_df: feature table; converted with dict() into a mapping from
            column name to that column's values.
        label_df: labels, sliced in step with the rows of data_df.
        epochs: number of times the data is repeated.
        shuffle: when true, shuffle with a buffer of 10000 elements.
        batch_size: number of examples per batch.

    Returns:
        The repeated and batched dataset.
    """
    features = dict(data_df)
    ds = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
        ds = ds.shuffle(10000)
    ds = ds.repeat(epochs)
    return ds.batch(batch_size)

train_dataset = make_dataset(train_df, y_train, batch_size = 5)

# Inspect one batch (5 examples) of the converted tf.data.Dataset
for x, y in train_dataset.take(1):
    print(x, y)

# ---output---------
{
    
    'sex': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'female', b'male', b'male', b'male', b'male'], dtype=object)>, 'age': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([32., 28., 44., 28., 28.])>, 'n_siblings_spouses': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 0, 1, 0, 0], dtype=int32)>, 'parch': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 0, 0, 0, 0], dtype=int32)>, 'fare': <tf.Tensor: shape=(5,), dtype=float64, numpy=array([15.5   ,  7.2292, 26.    ,  8.05  ,  7.8958])>, 'class': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Third', b'Third', b'Second', b'Third', b'Third'], dtype=object)>, 'deck': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'unknown', b'unknown', b'unknown', b'unknown', b'unknown'],
      dtype=object)>, 'embark_town': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'Queenstown', b'Cherbourg', b'Southampton', b'Southampton',
       b'Southampton'], dtype=object)>, 'alone': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'n', b'y', b'n', b'y', b'y'], dtype=object)>} tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int32)
3. Use keras.layers.DenseFeatures to transform two fields (age and sex) of one batch through their feature columns
# keras.layers.DenseFeatures
for x, y in train_dataset.take(1):
    # feature_columns holds the 7 categorical columns first, followed by
    # 'age' and 'fare', so index 7 is 'age' and index 0 is 'sex'.
    age_column = feature_columns[7]
    gender_column = feature_columns[0]
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())

# ---output----------
[[28.]
 [50.]
 [27.]
 [28.]
 [32.]]

[[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
4. Use keras.layers.DenseFeatures to transform all fields of one batch through their feature columns
# keras.layers.DenseFeatures applied with every feature column at once
for x, y in train_dataset.take(1):
    print(keras.layers.DenseFeatures(feature_columns)(x).numpy())

Third, keras_to_estimator

1. Define the Keras model; its first layer (DenseFeatures) converts the raw input into feature-column-processed data (one-hot for discrete fields, numeric for continuous fields)
# Two-class classifier: DenseFeatures applies the feature columns to the raw
# input dict, followed by two hidden layers and a 2-way softmax output.
model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])
# The labels are integer class ids rather than one-hot vectors, hence the
# sparse loss. `learning_rate` replaces the deprecated `lr` argument, which
# newer Keras optimizers no longer accept.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=keras.optimizers.SGD(learning_rate=0.01),
              metrics=['accuracy'])
2. Training the model

The training model can use the following two methods:

1) Train with the ordinary Keras model (model.fit)
# Derive the step counts from the data instead of hard-coding them, so they
# stay correct if the batch size or the CSV files change.
batch_size = 32
train_dataset = make_dataset(train_df, y_train, epochs = 100,
                             batch_size = batch_size)
eval_dataset = make_dataset(eval_df, y_eval, epochs = 1, shuffle = False,
                            batch_size = batch_size)
history = model.fit(train_dataset,
                    validation_data = eval_dataset,
                    steps_per_epoch = len(train_df) // batch_size,  # 627 // 32 = 19
                    validation_steps = len(eval_df) // batch_size,  # 264 // 32 = 8
                    epochs = 100)
2) Train with the Keras model converted to an estimator

Note: this approach still runs into bugs in TensorFlow 2 that have not yet been resolved.

estimator = keras.estimator.model_to_estimator(model)
# Requirements on input_fn:
# 1. it must be a function (here a zero-argument lambda)
# 2. it must return either a. a (features, labels) tuple or b. a dataset yielding (feature, label) pairs
estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs=100))

Guess you like

Origin blog.csdn.net/TFATS/article/details/111661361