机器学习笔记8:基于TensorFlow的数据预测
本文基于博客文章《预测天朝铁路的客运量》学习整理,代码部分引用自该文,并对其进行了部分修改。
时间序列数据是指在不同时间点上收集到的数据,这类数据反映了某一事物、现象等随时间的变化状态或程度。
铁路客运量历史数据
铁路客运量.csv(2005-2016月度数据)
使用matplotlib画出数据走势
import matplotlib.pyplot as plt
import pandas as pd
import requests
import io
import numpy as np
# Fetch the monthly railway passenger-volume CSV and decode it as UTF-8 text.
url = 'http://blog.topspeedsnail.com/wp-content/uploads/2016/12/铁路客运量.csv'
ass_data = requests.get(url).content
csv_text = ass_data.decode('utf-8')
df = pd.read_csv(io.StringIO(csv_text))  # on Python 2 use StringIO.StringIO instead
data = np.array(df['铁路客运量_当期值(万人)'])

# Standardise to zero mean and unit variance. The plot below shows the raw
# series, not this standardised copy.
normalized_data = (data - data.mean()) / data.std()

plt.figure()
plt.plot(data)
plt.show()
利用TensorFlow进行预测
# coding=utf-8
'''
Author:Chen hao
Description: counter
Date: August 22 , 2017
'''
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import requests
import io
# Load the data: download the monthly railway passenger-volume CSV.
url = 'http://blog.topspeedsnail.com/wp-content/uploads/2016/12/铁路客运量.csv'
response = requests.get(url)
# Fail loudly on an HTTP error instead of handing an error page to the CSV
# parser, which would only surface later as a confusing missing-column error.
response.raise_for_status()
ass_data = response.content
df = pd.read_csv(io.StringIO(ass_data.decode('utf-8')))  # on Python 2 use StringIO.StringIO
data = np.array(df['铁路客运量_当期值(万人)'])

# Normalize to zero mean and unit variance.
normalized_data = (data - np.mean(data)) / np.std(data)

# Each training input is a window of seq_size consecutive values (shaped
# [seq_size, 1]); its target is the same window shifted one step ahead.
seq_size = 3
train_x, train_y = [], []
# The last usable start index is len - seq_size - 1: its target slice ends
# exactly at the final observation. range(len - seq_size) therefore covers
# every valid window (the previous bound skipped the last one).
for i in range(len(normalized_data) - seq_size):
    train_x.append(np.expand_dims(normalized_data[i: i + seq_size], axis=1).tolist())
    train_y.append(normalized_data[i + 1: i + seq_size + 1].tolist())

input_dim = 1
# Graph inputs: X is [batch, seq_size, input_dim], Y is [batch, seq_size].
X = tf.placeholder(tf.float32, [None, seq_size, input_dim])
Y = tf.placeholder(tf.float32, [None, seq_size])
# regression
def ass_rnn(hidden_layer_size=6):
    """Build the LSTM regression graph on the module-level placeholder ``X``.

    An LSTM is unrolled over the input sequence and every time step's hidden
    state is projected to a single value with a shared linear layer.

    Args:
        hidden_layer_size: Number of units in the LSTM cell.

    Returns:
        The projected outputs with all size-1 axes removed. Because
        ``tf.squeeze`` is called without an axis, a batch of one also loses
        its batch axis.
    """
    # Create the projection variables before the cell so the graph keeps the
    # same variable names ('W', 'b') and creation order.
    weights = tf.Variable(tf.random_normal([hidden_layer_size, 1]), name='W')
    bias = tf.Variable(tf.random_normal([1]), name='b')

    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_layer_size)
    outputs, _ = tf.nn.dynamic_rnn(lstm_cell, X, dtype=tf.float32)

    # Replicate the projection matrix once per batch element so that a batched
    # matmul can be applied to the per-step outputs.
    batch_size = tf.shape(X)[0]
    weights_per_example = tf.tile(tf.expand_dims(weights, 0), [batch_size, 1, 1])
    projected = tf.matmul(outputs, weights_per_example) + bias

    return tf.squeeze(projected)
def train_rnn(steps=9000, learning_rate=0.003, log_every=10, model_path='ass.model'):
    """Train the LSTM regressor on the module-level training set and save it.

    Minimises the mean squared error between the network output and ``Y``
    with Adam, feeding the whole of ``train_x`` / ``train_y`` on every step.

    Args:
        steps: Number of optimisation steps to run.
        learning_rate: Learning rate passed to the Adam optimiser.
        log_every: Print the step number and loss every this many steps
            (must be a positive integer).
        model_path: Checkpoint prefix passed to ``Saver.save``. The default
            matches the checkpoint that ``prediction()`` restores from the
            current directory.
    """
    out = ass_rnn()
    loss = tf.reduce_mean(tf.square(out - Y))
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    saver = tf.train.Saver(tf.global_variables())

    # The feed never changes between steps, so convert the nested lists to
    # arrays once instead of on every sess.run call.
    feed = {
        X: np.asarray(train_x, dtype=np.float32),
        Y: np.asarray(train_y, dtype=np.float32),
    }

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(steps):
            _, loss_ = sess.run([train_op, loss], feed_dict=feed)
            if step % log_every == 0:
                # This is the training loss on the full batch; there is no
                # separate held-out test set in this script.
                print(step, loss_)
        # Save once, after the final step.
        print("保存模型: ", saver.save(sess, model_path))
def prediction(steps=12):
    """Restore the trained model and plot a multi-step forecast.

    Starting from the last ``seq_size`` observations, the model is applied
    repeatedly: each predicted value is appended to the window and the oldest
    value is dropped. The normalized history is drawn in blue and the
    forecast in red, immediately after the end of the history.

    Args:
        steps: Number of future points to predict.
    """
    out = ass_rnn()
    saver = tf.train.Saver(tf.global_variables())

    predict = []
    with tf.Session() as sess:
        saver.restore(sess, './ass.model')

        # Seed with the most recent observations. The last training window
        # stops short of the end of the series, so seeding from it would
        # misalign the forecast with the x-axis positions used below.
        prev_seq = np.expand_dims(
            np.asarray(normalized_data[-seq_size:], dtype=np.float32), axis=1)

        for _ in range(steps):
            next_seq = sess.run(out, feed_dict={X: [prev_seq]})
            # The final element of the output is the prediction for the step
            # right after the window; flatten first so this works whether or
            # not the batch axis was squeezed away.
            next_value = np.reshape(next_seq, -1)[-1]
            predict.append(next_value)
            # Slide the window forward by one step.
            prev_seq = np.vstack((prev_seq[1:], next_value))

    history_len = len(normalized_data)
    plt.figure()
    plt.plot(list(range(history_len)), normalized_data, color='b')
    plt.plot(list(range(history_len, history_len + len(predict))), predict, color='r')
    plt.show()
# Two-step workflow: first run with train_rnn() enabled (and prediction()
# commented out) so the checkpoint is written to the current directory, then
# switch back to prediction() to restore it and plot the forecast.
# NOTE(review): both functions call ass_rnn(), so running them in the same
# process presumably clashes on graph variables - confirm before enabling both.
#train_rnn()
prediction()
需要注意的是,首先要运行train_rnn()函数(注释掉prediction()的调用),将训练得到的模型保存到当前文件夹;然后再运行prediction()函数(注释掉train_rnn()的调用),即可获得预测结果。
运行的结果如下图所示:
运行构建的模型在很多情况下并不理想。