Keras (18): Using Premade Estimators and Crossed Features in Practice

This article covers:

  • The premade tf.estimator.LinearClassifier model
  • The premade tf.estimator.DNNClassifier model
  • Crossed features in practice

I. Loading the Titanic dataset

1. Download the Titanic dataset and read it with pandas

The data can be downloaded from the following URLs:

https://storage.googleapis.com/tf-datasets/titanic/train.csv
https://storage.googleapis.com/tf-datasets/titanic/eval.csv
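
If the two CSV files are not already on disk, one way to fetch them programmatically is sketched below; the local ./data/titanic/ paths are an assumption, chosen to match the read_csv calls that follow.

import os
import tensorflow as tf

# Download both CSV files into ./data/titanic/ (assumed local layout)
os.makedirs("./data/titanic", exist_ok=True)
for name in ["train.csv", "eval.csv"]:
    tf.keras.utils.get_file(
        fname=os.path.abspath("./data/titanic/" + name),
        origin="https://storage.googleapis.com/tf-datasets/titanic/" + name)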

Read the downloaded data into pandas with the following code:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Print the versions of the Python libraries in use
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

print(train_df.head())
print(eval_df.head())

2. Separate the features from the label

y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())
3. Use pandas to summarize the numeric columns

train_df.describe()

II. Process the data with feature_column and convert it to a tf.data.Dataset

1. One-hot encode the categorical features and collect the numeric features

# Split the features into "categorical" and "numeric" lists
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
# Process the categorical features with tf.feature_column
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# Process the numeric features with tf.feature_column
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))
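
To verify what an indicator column produces, here is an optional sanity check using tf.keras.layers.DenseFeatures on a hand-built batch; the two-value vocabulary is only an illustrative assumption:

# Optional check: indicator_column turns each category into a one-hot vector
demo_column = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list('sex', ['male', 'female']))
demo_layer = tf.keras.layers.DenseFeatures([demo_column])
print(demo_layer({'sex': tf.constant([['male'], ['female']])}))
# one-hot in vocabulary order: [[1. 0.] [0. 1.]]
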
2. Convert the DataFrame into a batched tf.data.Dataset

def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32):
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
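
As a quick check (optional, not part of the original flow), each element of the returned dataset is a (features_dict, labels) batch:

# Peek at one batch: features come back as a dict of column name -> tensor
for features, labels in make_dataset(train_df, y_train, epochs=1, batch_size=2).take(1):
    print(features['sex'], features['age'])
    print(labels)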

III. LinearClassifier

# Create the premade LinearClassifier estimator
linear_output_dir = 'linear_model'
if not os.path.exists(linear_output_dir):
    os.mkdir(linear_output_dir)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)
# Train the LinearClassifier estimator
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate the LinearClassifier estimator
linear_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
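
evaluate() returns a dict of metrics; if you want to inspect them, you can capture the return value (an optional addition, not in the original code):

# evaluate() returns a metrics dict with keys such as accuracy, auc and average_loss
linear_results = linear_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
print(linear_results['accuracy'])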

IV. DNNClassifier

# Create the premade DNNClassifier estimator
dnn_output_dir = './dnn_model'
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    optimizer = 'Adam')
# Train the DNNClassifier estimator
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate the DNNClassifier estimator
dnn_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
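
To get per-passenger predictions (an optional addition, not in the original post), predict() yields one dict per example; this sketch reuses the same make_dataset helper:

# Each prediction dict contains keys such as 'probabilities' and 'class_ids'
pred_iter = dnn_estimator.predict(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
first_pred = next(pred_iter)
print(first_pred['class_ids'], first_pred['probabilities'])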

V. Crossed features in practice

1. Implementing a crossed feature

# Cross several features with tf.feature_column.crossed_column, then one-hot encode the result with indicator_column
# cross feature: age: [1,2,3,4,5], sex: [male, female]
# age_x_sex: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
# A cross with e.g. 100000 distinct values is hashed into hash_bucket_size buckets: hash(value) % 100
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(
            ['age', 'sex'], hash_bucket_size = 100)))
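
The hash_bucket_size comment can be read as follows: a cross of age and sex can produce far more combinations than you would want to one-hot encode directly, so each combination is hashed into a fixed number of buckets. A rough plain-Python illustration of the idea (TensorFlow uses its own fingerprint function, not Python's hash):

# Illustration only: map one (age, sex) combination into one of 100 buckets
hash_bucket_size = 100
combination = ('34.0', 'male')
bucket_id = hash(combination) % hash_bucket_size
print(bucket_id)  # some bucket index in [0, 100)
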
2. Complete code

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 14:03:56 2020

@author: nijiahui
"""
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Print the versions of the Python libraries in use
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

### I. Loading the Titanic dataset ##########################################
# 1. Download the Titanic dataset and read it with pandas
# https://storage.googleapis.com/tf-datasets/titanic/train.csv
# https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

print(train_df.head())
print(eval_df.head())

# 2. Separate the features from the label
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())

# 3. Use pandas to summarize the numeric columns
train_df.describe()

### II. Process the data with feature_column and convert it to a tf.data.Dataset ##############
# 1. One-hot encode the categorical features and collect the numeric features
# Split the features into "categorical" and "numeric" lists
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
# Process the categorical features with tf.feature_column
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# Process the numeric features with tf.feature_column
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))

# Cross several features with tf.feature_column.crossed_column, then one-hot encode the result with indicator_column
# cross feature: age: [1,2,3,4,5], sex: [male, female]
# age_x_sex: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
# A cross with e.g. 100000 distinct values is hashed into hash_bucket_size buckets: hash(value) % 100
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(
            ['age', 'sex'], hash_bucket_size = 100)))

# 2. Convert the DataFrame into a batched tf.data.Dataset
def make_dataset(data_df, label_df, epochs = 10, shuffle = True, batch_size = 32):
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

### III. LinearClassifier
# Create the premade LinearClassifier estimator
linear_output_dir = 'linear_model_new_features'
if not os.path.exists(linear_output_dir):
    os.mkdir(linear_output_dir)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)
# Train the LinearClassifier estimator
linear_estimator.train(input_fn = lambda : make_dataset(train_df, y_train, epochs = 100))

# Evaluate the LinearClassifier estimator
linear_estimator.evaluate(input_fn = lambda : make_dataset(eval_df, y_eval, epochs = 1, shuffle = False))


### IV. DNNClassifier
# Create the premade DNNClassifier estimator
dnn_output_dir = './dnn_model_new_features'
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    # optimizer = 'Adam'
    optimizer = 'SGD'
    )
# Train the DNNClassifier estimator
dnn_estimator.train(input_fn = lambda : make_dataset(train_df, y_train, epochs = 100))

# Evaluate the DNNClassifier estimator
dnn_estimator.evaluate(input_fn = lambda : make_dataset(eval_df, y_eval, epochs = 1, shuffle = False))

Reposted from blog.csdn.net/TFATS/article/details/111694596