Tensorflow Estimator之LinearRegressor

LinearRegressor的基本格式

初始化

# Define the linear regressor.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,  # user-defined feature columns
    optimizer=my_optimizer  # an appropriate optimizer
)

训练

linear_regressor.train(
    input_fn=lambda: my_input_fn(my_feature, targets),  # input function that feeds the data
    steps=100  # number of training steps
)

预测

# Run prediction.
predictions = linear_regressor.predict(
    input_fn=lambda: my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)
)

预测的格式:

{
'predictions': array([0.34],dtype=float32)
}

返回的是一个字典类型的数据。与之前的 DNNClassifier 不同,这里每一项只输出对应的预测值,是一个标量;神经网络的输出则要更复杂一些。

配合Dataset读取数据

之前的神经网络分类器DNNClassifier使用了随机乱序的输入,这里的方式类似,不过使用了数据迭代的方式进行:

def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    '''
    Input function that feeds batches of (features, labels) to the regressor.
    :param features: pandas DataFrame of feature columns
    :param targets: pandas Series of target values
    :param batch_size: number of examples per batch
    :param shuffle: whether to shuffle the data
    :param num_epochs: number of epochs to repeat the data (None = repeat indefinitely)
    :return: a (features, labels) tuple of tensors holding the next batch
    '''
    # dict(features) turns the DataFrame into {column name: column values},
    # which is the per-feature layout Dataset.from_tensor_slices expects.
    ds = Dataset.from_tensor_slices((dict(features), targets))
    ds = ds.batch(batch_size).repeat(num_epochs)
    if shuffle:
        # BUG FIX: Dataset.shuffle returns a NEW dataset; the original code
        # discarded the result, so no shuffling ever happened. (Note this
        # shuffles whole batches, since it runs after .batch().)
        ds = ds.shuffle(buffer_size=10000)
    # One-shot iterator: each get_next() call yields one batch until the
    # dataset is exhausted (or forever when num_epochs is None).
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

使用 make_one_shot_iterator().get_next() 方法,每次 yield 一个批次的数据,直到训练步数完成。

完整版代码:

说明:这里的线性回归只是一个演示用的例子,如果观察绘制的图像或者最小二乘损失,会发现误差很大。不过这仅仅是为了说明一般的使用方法,可以暂时忽略误差。

import math
import os
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Verbose logging so Estimator training progress is printed.
tf.logging.set_verbosity(tf.logging.INFO)
pd.options.display.max_rows = 10
# pd.options.display.float_format = '{:.1f}'.format()

# Load the dataset: download from the public URL unless a local copy exists.
# NOTE(review): nothing in this script ever writes data.csv, so the local
# branch is only taken if the file was placed there by hand — confirm intent.
if not os.path.exists('data.csv'):
    california_housing_dataframe = pd.read_csv(
        "https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")
else:
    california_housing_dataframe = pd.read_csv('data.csv')

# Shuffle the rows so examples are not fed to training in file order.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))

# Rescale median_house_value (to units of $1000) to keep target values small.
california_housing_dataframe['median_house_value'] /= 1000.0
print(california_housing_dataframe.describe())

# Feature data (total_rooms), its feature column, and the regression target.
my_feature = california_housing_dataframe[['total_rooms']]
feature_columns = [tf.feature_column.numeric_column(key='total_rooms')]
targets = california_housing_dataframe['median_house_value']

# Gradient-descent optimizer with gradient-norm clipping to guard against
# exploding gradients.
my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0000001)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)

# Define the linear regressor.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=my_optimizer
)


def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    '''
    Input function that feeds batches of (features, labels) to the regressor.
    :param features: pandas DataFrame of feature columns
    :param targets: pandas Series of target values
    :param batch_size: number of examples per batch
    :param shuffle: whether to shuffle the data
    :param num_epochs: number of epochs to repeat the data (None = repeat indefinitely)
    :return: a (features, labels) tuple of tensors holding the next batch
    '''
    # dict(features) turns the DataFrame into {column name: column values},
    # which is the per-feature layout Dataset.from_tensor_slices expects.
    ds = Dataset.from_tensor_slices((dict(features), targets))
    ds = ds.batch(batch_size).repeat(num_epochs)
    if shuffle:
        # BUG FIX: Dataset.shuffle returns a NEW dataset; the original code
        # discarded the result, so no shuffling ever happened. (Note this
        # shuffles whole batches, since it runs after .batch().)
        ds = ds.shuffle(buffer_size=10000)
    # One-shot iterator: each get_next() call yields one batch until the
    # dataset is exhausted (or forever when num_epochs is None).
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels


# Train: the input_fn repeats the shuffled data indefinitely; training stops
# after `steps` gradient updates.
linear_regressor.train(
    input_fn=lambda: my_input_fn(my_feature, targets),
    steps=100
)

# Predict over the full training set: exactly one epoch, no shuffling, so
# prediction order matches the dataframe's row order.
predictions = linear_regressor.predict(
    input_fn=lambda: my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)
)

# Each item is a dict like {'predictions': array([v], dtype=float32)};
# pull out the scalar value from each.
predictions = np.array([item['predictions'][0] for item in predictions])

# Training-set error metrics.
MSE = metrics.mean_squared_error(predictions, targets)
RMSE = math.sqrt(MSE)
print("Mean Squared Error (on training data): %0.3f" % MSE)
print("Root Mean Squared Error (on training data): %0.3f" % RMSE)

# Compare RMSE against the target's overall range for a sense of scale.
min_house_value = california_housing_dataframe["median_house_value"].min()
max_house_value = california_housing_dataframe["median_house_value"].max()
min_max_difference = max_house_value - min_house_value

print("Min. Median House Value: %0.3f" % min_house_value)
print("Max. Median House Value: %0.3f" % max_house_value)
print("Difference between Min. and Max.: %0.3f" % min_max_difference)
print("Root Mean Squared Error: %0.3f" % RMSE)

# NOTE(review): pd.Series(predictions) gets a fresh 0..n-1 RangeIndex while
# `targets` keeps the shuffled dataframe index, so pandas realigns `targets`
# by index label here — rows of the two columns may not refer to the same
# example. describe() stats are unaffected, but confirm before comparing rows.
calibration_data = pd.DataFrame()
calibration_data["predictions"] = pd.Series(predictions)
calibration_data["targets"] = pd.Series(targets)
print(calibration_data.describe())

# Random sample of rows for the scatter plot.
sample = california_housing_dataframe.sample(n=300)

# x-range endpoints for drawing the fitted line.
x_0 = sample["total_rooms"].min()
x_1 = sample["total_rooms"].max()

# Read the learned weight and bias out of the trained estimator.
weight = linear_regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]
bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')

# Endpoint y-values of the fitted line y = w*x + b.
y_0 = weight * x_0 + bias
y_1 = weight * x_1 + bias

# Draw the fitted regression line.
plt.plot([x_0, x_1], [y_0, y_1], c='r')

# Axis labels.
plt.ylabel("median_house_value")
plt.xlabel("total_rooms")

# Scatter plot of the sampled data points.
plt.scatter(sample["total_rooms"], sample["median_house_value"])

# Display graph.
plt.show()

猜你喜欢

转载自blog.csdn.net/qq_35976351/article/details/80870111