Tensorflow Estimator使用（全）

1、泰坦尼克问题引入分析

数据集下载地址：

https://storage.googleapis.com/tf-datasets/titanic/train.csv
https://storage.googleapis.com/tf-datasets/titanic/eval.csv

数据集解析并展示前5条数据：

# Local paths of the downloaded Titanic CSV splits.
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

# Parse each split into its own DataFrame.
train_df, eval_df = pd.read_csv(train_file), pd.read_csv(eval_file)

# head() returns the first 5 rows of a frame.
for split_df in (train_df, eval_df):
    print(split_df.head())

 survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
   survived     sex   age  n_siblings_spouses  parch     fare   class  \
0         0    male  35.0                   0      0   8.0500   Third   
1         0    male  54.0                   0      0  51.8625   First   
2         1  female  58.0                   0      0  26.5500   First   
3         1  female  55.0                   0      0  16.0000  Second   
4         1    male  34.0                   0      0  13.0000  Second   

      deck  embark_town alone  
0  unknown  Southampton     y  
1        E  Southampton     y  
2        C  Southampton     y  
3  unknown  Southampton     y  
4        D  Southampton     y

将标签（survived字段）从数据集里去除掉：

# DataFrame.pop removes the named column from the frame and returns it, so
# after these two lines 'survived' is the label and no longer a feature.
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

# Show the first rows of the feature frames, then of the label series.
for frame_or_labels in (train_df, eval_df, y_train, y_eval):
    print(frame_or_labels.head())

 sex   age  n_siblings_spouses  parch     fare  class     deck  \
0    male  22.0                   1      0   7.2500  Third  unknown   
1  female  38.0                   1      0  71.2833  First        C   
2  female  26.0                   0      0   7.9250  Third  unknown   
3  female  35.0                   1      0  53.1000  First        C   
4    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
      sex   age  n_siblings_spouses  parch     fare   class     deck  \
0    male  35.0                   0      0   8.0500   Third  unknown   
1    male  54.0                   0      0  51.8625   First        E   
2  female  58.0                   0      0  26.5500   First        C   
3  female  55.0                   0      0  16.0000  Second  unknown   
4    male  34.0                   0      0  13.0000  Second        D   

   embark_town alone  
0  Southampton     y  
1  Southampton     y  
2  Southampton     y  
3  Southampton     y  
4  Southampton     y  
0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64
0    0
1    0
2    1
3    1
4    1
Name: survived, dtype: int64

查看数据集中的统计量：

train_df.describe() # summary statistics of the dataset; only the four numeric columns (age, n_siblings_spouses, parch, fare) are reported, the remaining columns are categorical

age n_siblings_spouses parch fare

count 627.000000 627.000000 627.000000 627.000000

mean 29.631308 0.545455 0.379585 34.385399

std 12.511818 1.151090 0.792999 54.597730

min 0.750000 0.000000 0.000000 0.000000

25% 23.000000 0.000000 0.000000 7.895800

50% 28.000000 0.000000 0.000000 15.045800

75% 35.000000 1.000000 0.000000 31.387500

max 80.000000 8.000000 5.000000 512.329200

	age	n_siblings_spouses	parch	fare
count	627.000000	627.000000	627.000000	627.000000
mean	29.631308	0.545455	0.379585	34.385399
std	12.511818	1.151090	0.792999	54.597730
min	0.750000	0.000000	0.000000	0.000000
25%	23.000000	0.000000	0.000000	7.895800
50%	28.000000	0.000000	0.000000	15.045800
75%	35.000000	1.000000	0.000000	31.387500
max	80.000000	8.000000	5.000000	512.329200

# (rows, columns) of the training and evaluation feature frames.
print(train_df.shape, eval_df.shape)

(627, 9) (264, 9)

泰坦尼克号上乘客的年龄符合的分布：

train_df.age.hist(bins = 20)
# .age selects the values of that column, .hist draws a histogram, and bins is the number of intervals the values are split into

不同性别的人各占多少：

train_df.sex.value_counts().plot(kind = 'barh')
# .value_counts() counts how often each value occurs; .plot(kind = 'barh') draws a horizontal bar chart (kind = 'bar' gives the vertical one)

不同舱位的乘客都各有多少：

train_df['class'].value_counts().plot(kind = 'barh')# bracket indexing is required here: `class` is a reserved Python keyword, so train_df.class is a syntax error

统计：男性中有百分之多少获救，女性中有百分之多少获救：

# Put the labels back next to the features (axis = 1 concatenates columns),
# then plot the mean of 'survived' per sex, i.e. the survival rate of each sex.
pd.concat([train_df, y_train], axis = 1).groupby('sex').survived.mean().plot(kind='barh')

2、feature_column使用

离散特征 -> one-hot 编码 -> 向量

连续特征，直接输入

# Categorical (discrete) features: one-hot encoded below.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']

# Continuous features: fed to the model directly as float32 values.
numeric_columns = ['age', 'fare']

feature_columns = []

for column_name in categorical_columns:
    # The vocabulary is every distinct value of the column in the training set.
    vocabulary = train_df[column_name].unique()
    print(column_name, vocabulary)
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, vocabulary)
    # indicator_column wraps the categorical column as a one-hot encoding.
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric columns need no vocabulary; they are appended after the categorical
# ones, in the order listed in numeric_columns.
feature_columns.extend(
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in numeric_columns)

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']

构建dataset：

note:data_df是有多列的一个pandas的dataframe结构，需要变成字典（Key是列名，value是数据值）。

def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32):
    """Build a batched tf.data.Dataset of (features, label) pairs.

    Args:
        data_df: feature DataFrame; it is converted with dict() so that each
            column becomes one entry keyed by the column name.
        label_df: labels sliced in step with the rows of data_df.
        epochs: number of times the data is repeated.
        shuffle: when true, shuffle with a buffer of 10000 elements before
            repeating and batching.
        batch_size: number of elements per batch.

    Returns:
        The resulting tf.data.Dataset.
    """
    features = dict(data_df)
    pipeline = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
        pipeline = pipeline.shuffle(10000)
    pipeline = pipeline.repeat(epochs)
    return pipeline.batch(batch_size)

train_dataset = make_dataset(train_df, y_train, batch_size = 5)
# Each element taken from the dataset is one batch. Inside a batch the values of a field are grouped together, e.g. all 'sex' values are stored under the dict key 'sex'.
for x, y in train_dataset.take(1):
    print(x, y)

{'sex': <tf.Tensor: id=82, shape=(5,), dtype=string, numpy=array([b'female', b'male', b'male', b'male', b'female'], dtype=object)>, 'age': <tf.Tensor: id=74, shape=(5,), dtype=float64, numpy=array([41., 30., 28., 71., 28.])>, 'n_siblings_spouses': <tf.Tensor: id=80, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 1])>, 'parch': <tf.Tensor: id=81, shape=(5,), dtype=int32, numpy=array([2, 0, 0, 0, 0])>, 'fare': <tf.Tensor: id=79, shape=(5,), dtype=float64, numpy=array([20.2125,  7.8958,  8.05  , 34.6542, 15.5   ])>, 'class': <tf.Tensor: id=76, shape=(5,), dtype=string, numpy=array([b'Third', b'Third', b'Third', b'First', b'Third'], dtype=object)>, 'deck': <tf.Tensor: id=77, shape=(5,), dtype=string, numpy=array([b'unknown', b'unknown', b'unknown', b'A', b'unknown'], dtype=object)>, 'embark_town': <tf.Tensor: id=78, shape=(5,), dtype=string, numpy=
array([b'Southampton', b'Southampton', b'Southampton', b'Cherbourg',
       b'Queenstown'], dtype=object)>, 'alone': <tf.Tensor: id=75, shape=(5,), dtype=string, numpy=array([b'n', b'y', b'y', b'y', b'n'], dtype=object)>} tf.Tensor([0 0 0 0 1], shape=(5,), dtype=int32)

dataset与feature_columns结合:

keras.layers.DenseFeatures：DenseFeatures 可以把刚才定义的 feature_columns 应用到 dataset 中去。
feature_columns 本质上是一组对 feature 进行变换的规则，
DenseFeatures 可以把这一组规则应用到 dataset 中的每一条数据上。

1)

# Apply a single feature column to one batch to see what it produces.
for x, y in train_dataset.take(1):
    # The 7 categorical columns occupy indices 0-6 of feature_columns, so
    # index 7 is the first numeric column, 'age'.
    age_column = feature_columns[7]
    # Index 0 is the first categorical column, 'sex'.
    gender_column = feature_columns[0]
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())

[[ 2. ]
 [33. ]
 [28. ]
 [50. ]
 [70.5]]

[[0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]

2)

# keras.layers.DenseFeatures applied with every feature column at once:
# each row of the printed array is the dense vector built for one example of the batch.
for x, y in train_dataset.take(1):
    print(keras.layers.DenseFeatures(feature_columns)(x).numpy())

[[50.     1.     0.     0.     1.     0.     0.     0.     0.     0.
   0.     0.     0.     1.     1.     0.     0.     0.    55.9    1.
   0.     0.     0.     0.     0.     0.     1.     0.     0.     0.
   0.     0.     1.     0.   ]
 [45.     1.     0.     0.     0.     1.     1.     0.     0.     0.
   0.     0.     0.     0.     1.     0.     0.     0.    26.25   1.
   0.     0.     0.     0.     0.     0.     0.     1.     0.     0.
   0.     0.     0.     1.   ]
 [28.     0.     1.     1.     0.     0.     1.     0.     0.     0.
   0.     0.     0.     0.     1.     0.     0.     0.     9.5    0.
   1.     0.     0.     0.     0.     0.     1.     0.     0.     0.
   0.     0.     1.     0.   ]
 [38.     0.     1.     1.     0.     0.     1.     0.     0.     0.
   0.     0.     0.     0.     1.     0.     0.     0.     7.05   0.
   1.     0.     0.     0.     0.     0.     1.     0.     0.     0.
   0.     0.     1.     0.   ]
 [30.     0.     1.     1.     0.     0.     1.     0.     0.     0.
   0.     0.     0.     0.     1.     0.     0.     0.    12.475  0.
   1.     0.     0.     0.     0.     0.     1.     0.     0.     0.
   0.     0.     0.     1.   ]]

3、keras_to_estimator

构造keras模型：

# Feed-forward classifier: DenseFeatures applies feature_columns to the input
# dict, followed by two 100-unit ReLU layers and a 2-way softmax output.
model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])
# The labels are integer class ids (0 / 1), hence the sparse cross-entropy.
# `learning_rate` is the supported argument name; `lr` is only a deprecated
# alias and is rejected by newer Keras releases.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer = keras.optimizers.SGD(learning_rate=0.01),
              metrics = ['accuracy'])

1） model.fit

# Training data is repeated for 100 epochs; evaluation data is a single,
# unshuffled pass. Both use make_dataset's default batch_size of 32.
train_dataset = make_dataset(train_df, y_train, epochs = 100)
eval_dataset = make_dataset(eval_df, y_eval, epochs = 1, shuffle = False)
history = model.fit(train_dataset,
                    validation_data = eval_dataset,
                    steps_per_epoch = 19, # number of training samples / batch_size (627 // 32)
                    validation_steps = 8, # number of evaluation samples / batch_size (264 // 32)
                    epochs = 100)

2） model -> estimator -> train

# Convert the compiled Keras model into an Estimator.
estimator = keras.estimator.model_to_estimator(model)
# input_fn:
# 1. must be a function (hence the lambda; it is passed, not called, here)
# 2. must return either a. (features, labels) or b. a dataset yielding (feature, label)
estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs=100))

4、预定义estimator使用

1）baseline_estimator：

output_dir = 'baseline_model'
# makedirs(exist_ok=True) creates the directory only when it is missing and
# avoids the check-then-create race of os.path.exists followed by os.mkdir.
os.makedirs(output_dir, exist_ok=True)
# The baseline classifier is given no feature_columns; it is the reference
# point the other estimators are compared against.
baseline_estimator = tf.estimator.BaselineClassifier(
    model_dir = output_dir,
    n_classes = 2)
# input_fn must be a callable, so the dataset construction is wrapped in a lambda.
baseline_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Random-guess baseline. This evaluate call currently hits a bug and fails
# with "FailedPreconditionError: GetNext() failed".
baseline_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False, batch_size = 20))

note：FailedPreconditionError: GetNext() failed。官方bug，仍未解决。

2）linear_estimator：

linear_output_dir = 'linear_model'
# makedirs(exist_ok=True) creates the directory only when it is missing and
# avoids the check-then-create race of os.path.exists followed by os.mkdir.
os.makedirs(linear_output_dir, exist_ok=True)
# Linear model over the same feature_columns; output is written under model_dir.
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)
# input_fn must be a callable, so the dataset construction is wrapped in a lambda.
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate with a single, unshuffled pass over the evaluation split.
linear_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))

训练过程中的日志（events 文件）默认保存在 model_dir（此处为 "./linear_model"）下，可以用 TensorBoard 打开该目录（例如 tensorboard --logdir ./linear_model），查看训练的 loss 等信息。

{'accuracy': 0.7878788,
 'accuracy_baseline': 0.625,
 'auc': 0.8367616,
 'auc_precision_recall': 0.7849544,
 'average_loss': 0.46802205,
 'label/mean': 0.375,
 'loss': 0.4526842,
 'precision': 0.7171717,
 'prediction/mean': 0.3789331,
 'recall': 0.7171717,
 'global_step': 1960}

3）dnn_estimator：

dnn_output_dir = './dnn_model'
# makedirs(exist_ok=True) creates the directory only when it is missing and
# avoids the check-then-create race of os.path.exists followed by os.mkdir.
os.makedirs(dnn_output_dir, exist_ok=True)
# Fully connected network with two hidden layers of 128 ReLU units each,
# trained with the Adam optimizer.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    optimizer = 'Adam')
# input_fn must be a callable, so the dataset construction is wrapped in a lambda.
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate with a single, unshuffled pass over the evaluation split.
dnn_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))

{'accuracy': 0.8068182,
 'accuracy_baseline': 0.625,
 'auc': 0.85723907,
 'auc_precision_recall': 0.8250245,
 'average_loss': 0.46943292,
 'label/mean': 0.375,
 'loss': 0.44829544,
 'precision': 0.7222222,
 'prediction/mean': 0.42193753,
 'recall': 0.7878788,
 'global_step': 1960}

5、交叉特征实战

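cross feature：对两个离散特征做笛卡尔积，例如 age: [1,2,3,4,5]，gender: [male, female]
age_x_gender: [(1, male), (2, male), ..., (5, male), (1, female), ..., (5, female)]
hash_bucket_size：交叉后的取值可能非常多（例如 100000 个），直接做 one-hot 维度太大，因此先对每个交叉值做哈希，再对 hash_bucket_size 取模。例如 hash_bucket_size = 100 时，桶编号为 hash(交叉值) % 100，即把 100000 个取值映射到 100 个桶中。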

# Add one more feature: the cross of 'age' and 'sex', hashed into 100 buckets
# and wrapped in indicator_column like the other categorical features.
# This appends to the existing feature_columns list in place.
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(
            ['age', 'sex'], hash_bucket_size = 100)))

dnn_estimator：

# A separate model directory, so this run is kept apart from './dnn_model'.
dnn_output_dir = './dnn_model_new_features'
# makedirs(exist_ok=True) creates the directory only when it is missing and
# avoids the check-then-create race of os.path.exists followed by os.mkdir.
os.makedirs(dnn_output_dir, exist_ok=True)
# Same network as before (two hidden layers of 128 ReLU units, Adam); the only
# difference is whatever feature_columns contains at this point.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    optimizer = 'Adam')
# input_fn must be a callable, so the dataset construction is wrapped in a lambda.
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate with a single, unshuffled pass over the evaluation split.
dnn_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))

兰钧

原创文章 46 获赞 49 访问量 2166

关注私信