Time Series Forecasting with Keras

'''
Multivariate time series forecasting
'''


import os
os.getcwd()
os.chdir('C:\\Users\\87671\\Desktop\\比特魔方')
from pandas import read_csv
from datetime import datetime
from numpy import concatenate
from matplotlib import pyplot
from pandas import concat
from pandas import DataFrame
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# Parse a date string in 'year month day hour' format into a datetime
def parse(x):
    return datetime.strptime(x, '%Y %m %d %H')
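# e.g. parse('2010 1 2 0') -> datetime.datetime(2010, 1, 2, 0, 0)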

dataset = read_csv('raw.csv', parse_dates=[['year','month','day','hour']],index_col=0,date_parser=parse)
# parse_dates lists the columns to parse as dates (the nested list merges them into a single column),
# date_parser supplies the parsing function, and index_col uses that column as the index
dataset.drop('No',axis =1 ,inplace = True)
# Assign descriptive column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
dataset.index.name = 'date'
# Fill missing pollution values with 0
dataset['pollution'].fillna(0, inplace=True)
# Drop the first 24 rows
dataset = dataset[24:]
print(dataset.head(5))
dataset.to_csv('pollution.csv')   # write the cleaned data to pollution.csv; it is read back below

dataset = read_csv("pollution.csv",header = 0,index_col = 0)

# Plot the 5 years of data for each variable
values = dataset.values                                      # extract the underlying numpy array
groups = [0, 1, 2, 3, 5, 6, 7]                               # skip column 4 (wnd_dir, categorical)
i = 1
pyplot.figure()
for group in groups:
    pyplot.subplot(len(groups), 1, i)                        # subplot grid: (rows, cols, position of this panel)
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
    i += 1
pyplot.show()


'''
Framing the supervised problem: use the weather variables x and the pollution y from the
previous hour to predict y for the next hour.
Basics of turning a time series into a supervised learning problem:
1. df['t-1'] = df['t'].shift(1)   # shift down one step; same length, the first value becomes NaN
   df['t+1'] = df['t'].shift(-1)  # shift up one step; the last value becomes NaN
(a small runnable sketch follows below)
'''
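# A minimal runnable sketch of the shift() behaviour described above (toy data, not the pollution set):
lag_demo = DataFrame({'t': [1, 2, 3, 4]})
lag_demo['t-1'] = lag_demo['t'].shift(1)    # lag by one step: the first value becomes NaN
lag_demo['t+1'] = lag_demo['t'].shift(-1)   # lead by one step: the last value becomes NaN
print(lag_demo)
#    t  t-1  t+1
# 0  1  NaN  2.0
# 1  2  1.0  3.0
# 2  3  2.0  4.0
# 3  4  3.0  NaN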


def series_to_supervised(data, n_in=1,n_out=1,dropnan = True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols,names = list(),list()
    # input sequence (t-n, ..., t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ..., t+n-1)
    for i in range(0, n_out):              # range is half-open, so range(0, 1) gives a single step
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put everything together column-wise
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows containing NaN
    if dropnan:
        agg.dropna(inplace=True)
    return agg

'''
Examples
## Univariate (a single time series)
values = [x for x in range(10)]
data = series_to_supervised(values)
print(data)
data = series_to_supervised(values, 3)       # 3 lagged inputs predicting one step
print(data)
data = series_to_supervised(values, 2, 2)    # 2 lagged inputs, 2 forecast steps
print(data)
## Multivariate (two series as an example)
raw = DataFrame()
raw['ob1'] = [x for x in range(10)]
raw['ob2'] = [x for x in range(50, 60)]
values = raw.values
data = series_to_supervised(values)          # both series lagged by one step
print(data)
'''
# Integer-encode the categorical wind-direction column (column index 4)
encoder = preprocessing.LabelEncoder()
values[:, 4] = encoder.fit_transform(values[:, 4])
# ensure all values are floats
values = values.astype('float32')
# normalize every feature to the [0, 1] range
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
###########################################################################timestep=1
# Reframe the series as a supervised learning problem with a single lag step
reframed1 = series_to_supervised(scaled, 1, 1)
# We only need to predict the first variable (pollution), so the time-t columns of the other variables can be dropped
reframed1.drop(reframed1.columns[[9, 10, 11, 12, 13, 14, 15]], axis=1, inplace=True)
print(reframed1.head())  # the first eight columns are the lagged inputs from the original data; the last column is the target y
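# Column layout of reframed1 before the drop (a sketch, given the 8 features above):
#   columns 0-7  : var1(t-1) ... var8(t-1)   -> lagged inputs, kept
#   column  8    : var1(t)                   -> pollution at time t, the target, kept
#   columns 9-15 : var2(t) ... var8(t)       -> removed by the drop above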

'''
Now the data is ready; fit the model
'''
# split into train and test sets (the first year as training, the remaining 4 years as test)
values = reframed1.values
n_train_hours = 365*24
train = values[:n_train_hours,:]
test = values[n_train_hours:,:]
# split into X and y
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]
# reshape X into the 3D format the LSTM expects: [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape,train_y.shape,test_X.shape,test_y.shape)
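# With the standard 5-year Beijing PM2.5 dataset this should print approximately:
# (8760, 1, 8) (8760,) (35039, 1, 8) (35039,)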
# design the network
model = Sequential()
# 50 LSTM units; train_X.shape[1] is the number of timesteps (1), train_X.shape[2] is the number of features (8)
model.add(LSTM(50, input_shape=(train_X.shape[1],train_X.shape[2])))
model.add(Dense(1))
# mean absolute error (MAE) as the loss
model.compile(loss='mae', optimizer='adam')
# fit the network
# 50 epochs with a batch size of 72
history = model.fit(train_X,train_y,epochs=50,batch_size=72,validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'],label='train')
pyplot.plot(history.history['val_loss'],label='test')
pyplot.legend()
pyplot.show()
# from the loss curves the model may be overfitting

'''
Model evaluation
'''
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))  # flatten back to 2D: (samples, 8 features)
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)        # stack yhat with the other 7 features so the array has the 8 columns the scaler was fit on
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, 0]                                    # keep only the first column: the de-scaled yhat
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))            # RMSE = sqrt(MSE); sqrt is imported from math above
print('Test RMSE: %.3f' % rmse)
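# A small optional helper (a sketch, not part of the original post) that wraps the inverse-scaling
# trick used above: pad the 1-column prediction with any 7 scaled feature columns so the 8-column
# MinMaxScaler can be inverted, then keep only the pollution column. The name is hypothetical.
def invert_pollution_scale(scaler, scaled_col, scaled_features):
    # scaled_col: array of shape (n, 1); scaled_features: array of shape (n, >=8) to borrow filler columns from
    padded = concatenate((scaled_col, scaled_features[:, 1:8]), axis=1)  # any 7 scaled columns work as filler
    return scaler.inverse_transform(padded)[:, 0]
# e.g. inv_yhat = invert_pollution_scale(scaler, yhat, test_X)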



################################################################################## what about a time step of 2?

reframed = series_to_supervised(scaled,2,1) 
'''
With two lag steps we can no longer drop the redundant columns by the indices used above
(reframed.drop(reframed.columns[[9, 10, 11, 12, 13, 14, 15]], axis=1, inplace=True));
instead, slice the inputs and the target directly:
train_X, train_y = train[:, 0:n_obs], train[:, -n_features]
(see the column-layout sketch after the slicing below)
'''
'''
Now the data is ready; fit the model
'''
# split into train and test sets (the first year as training, the remaining 4 years as test)
values = reframed.values
n_train_hours = 365*24
train = values[:n_train_hours,:]
test = values[n_train_hours:,:]
# split into X and y
# unlike the timestep=1 case, slice by column counts
n_hours = 2                    # number of timesteps
n_features = 8                 # features per timestep
n_obs = n_hours * n_features   # number of lagged input columns (16)
train_X, train_y = train[:, 0:n_obs], train[:, -n_features]
test_X, test_y = test[:, 0:n_obs], test[:, -n_features]
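# Column layout of reframed for n_in=2, n_out=1 (a sketch, given 8 features):
#   columns  0-7  : var1(t-2) ... var8(t-2)
#   columns  8-15 : var1(t-1) ... var8(t-1)
#   columns 16-23 : var1(t)   ... var8(t)
# train[:, 0:n_obs] keeps the 16 lagged input columns; train[:, -n_features] is column 16,
# i.e. var1(t), the pollution value we want to predict.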
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_hours, n_features))
# you can inspect train_X and reframed here; the shapes match what we need
test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

'''
(for comparison, the timestep=1 version was:)
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]
# reshape X into the LSTM format [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
'''

# design the network
model = Sequential()
# 50 LSTM units; train_X.shape[1] is now 2 timesteps, train_X.shape[2] is 8 features
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
# mean absolute error (MAE) as the loss
model.compile(loss='mae', optimizer='adam')
# fit the network
# 50 epochs with a batch size of 72
history = model.fit(train_X,train_y,epochs=50,batch_size=72,validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'],label='train')
pyplot.plot(history.history['val_loss'],label='test')
pyplot.legend()
pyplot.show()
# from the loss curves the model may be overfitting

'''
Model evaluation
'''

'''
(a univariate-style inverse-scaling approach, kept for reference; it does not apply directly
here because the scaler was fit on 8 columns, not on a single target column)
# make a prediction and convert back to the original units
trainPredict = model.predict(train_X)
testPredict = model.predict(test_X)
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
import math
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0]))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0]))
print('Train RMSE: %.3f' % trainScore)
print('Test RMSE: %.3f' % testScore)
'''


yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]*n_hours))  # flatten back to 2D: (samples, 16 features)
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, 1:8]), axis=1)               # the scaler was fit on 8 columns, so the array passed to inverse_transform must also have 8 columns;
                                                                     # only the first column matters, so columns 1:8 or 8:15 work equally well as filler
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, 0]                                            # keep only the first column: the de-scaled yhat
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:8]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))                     # RMSE = sqrt(MSE)
print('Test RMSE: %.3f' % rmse)                                      # 26.335

# plot the predicted vs. actual curves (only the first 50 points)
pyplot.plot(inv_y[0:50], color='green')
pyplot.plot(inv_yhat[0:50])
pyplot.show()
Reposted from blog.csdn.net/kylin_learn/article/details/85040695