Fund inflow and outflow forecast-time series

1. Time series forecasting based on period factors

  1. Extract periodical characteristics of time series for prediction
    • Determine the period and calculate the period factor
      Method 1: Divide each value by the average of its week, then take the median of these ratios for each weekday (column-wise)
      Method 2: Seasonal index method: compute the average for each weekday (or weekend day) and divide it by the overall average
    • Calculate base
    • Forecast=base*period factor
      ** Inspect the series first: when it shows a clear periodic pattern, the period-factor method makes a good baseline **
  2. How to predict what will happen every day next month
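    • Compute the average value for each day of the month (1-31)
    • Count how often each weekday (Monday to Sunday) falls on each day of the month
    • Use these frequencies to compute a weighted mean of the weekly period factors for each day of the month
    • Forecast from the factors and the daily averages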

2. Specific operation

1. Weekly median forecast

  1. Select time period

    • Training set period: 2014-03-01~2014-08-03
    • Test data set: 2014-08-04~2014-08-31
  2. Import data package

import pandas as pd
import numpy as np
import sklearn as skr
import datetime
import matplotlib as plt
import seaborn as sns
from dateutil import relativedelta
  1. Read data
def load_data(file_path):
    """Read the balance CSV at *file_path* and attach calendar columns.

    The file is parsed with ``pd.read_csv`` and its ``report_date`` column
    is handed to ``add_timestamp``, which adds the ``date``, ``day``,
    ``month``, ``year``, ``week`` and ``weekday`` columns.

    Returns the resulting frame with a fresh 0..n-1 index.
    """
    raw_balance = pd.read_csv(file_path)
    stamped_balance = add_timestamp(raw_balance, "report_date")
    stamped_balance = stamped_balance.reset_index(drop=True)
    return stamped_balance
# Add calendar feature columns derived from a date column.
def add_timestamp(data, date):
    """Return a copy of *data* with calendar columns derived from column *date*.

    Args:
        data: DataFrame containing the column named by *date*.
        date: Name of the column to parse. Values are parsed with the
            ``%Y%m%d`` format (e.g. ``20140301``); a column that already
            holds datetimes is accepted as well.

    Returns:
        A new DataFrame (the input is not modified) with a fresh 0..n-1
        index and the added columns ``date``, ``day``, ``month``, ``year``,
        ``week`` (ISO week number) and ``weekday`` (Monday=0 ... Sunday=6).
    """
    stamped = data.copy()
    stamped["date"] = pd.to_datetime(stamped[date], format="%Y%m%d")
    parts = stamped["date"].dt
    stamped["day"] = parts.day
    stamped["month"] = parts.month
    stamped["year"] = parts.year
    # ``Series.dt.week`` was removed in pandas 2.0; ``isocalendar().week`` is
    # the supported replacement. It yields a nullable UInt32 column, so cast
    # back to the plain dtypes the old accessor produced (float when a
    # missing date is present, int otherwise).
    iso_week = parts.isocalendar().week
    stamped["week"] = iso_week.astype(
        "float64" if iso_week.isna().any() else "int64"
    )
    stamped["weekday"] = parts.weekday
    return stamped.reset_index(drop=True)
# Sum the purchase and redemption amounts for each calendar date.
def total_amt(data, date):
    """Aggregate purchase/redemption totals per day, starting at *date*.

    Args:
        data: DataFrame with ``date``, ``total_purchase_amt`` and
            ``total_redeem_amt`` columns.
        date: Inclusive lower bound; rows whose ``date`` is earlier are
            dropped from the result.

    Returns:
        DataFrame with one row per date (columns ``date``,
        ``total_purchase_amt``, ``total_redeem_amt``) and a fresh index.
    """
    # Select the value columns with a list: selecting groupby columns with a
    # bare tuple (``[...]["a", "b"]``) is rejected by current pandas.
    daily = data.groupby("date", as_index=False)[
        ["total_purchase_amt", "total_redeem_amt"]
    ].sum()
    return daily[daily["date"] >= date].reset_index(drop=True)
# Generate placeholder rows for the test period.
def generate_data(data, start_date, end_date):
    """Append one placeholder row per day in ``[start_date, end_date)``.

    Args:
        data: DataFrame whose first column holds the date; every other
            column is filled with NaN in the appended rows.
        start_date: First day to append (anything ``pd.to_datetime`` parses).
        end_date: Exclusive upper bound; the last appended day is the day
            before *end_date*.

    Returns:
        A new DataFrame (the input is not modified) containing the original
        rows followed by the placeholder rows, with a fresh 0..n-1 index.
        When the range is empty or reversed, only the original rows are
        returned.
    """
    total_balance = data.copy()
    first_day = pd.to_datetime(start_date)
    # The end date is exclusive, so the last generated day is one day earlier.
    last_day = pd.to_datetime(end_date) - pd.Timedelta(days=1)
    # date_range simply yields nothing for a reversed range, unlike a
    # ``while start != end`` loop, which would never terminate.
    days = pd.date_range(first_day, last_day, freq="D")
    if len(days) == 0:
        return total_balance.reset_index(drop=True)

    date_col, *value_cols = total_balance.columns
    test_data = pd.DataFrame({date_col: days})
    for col in value_cols:
        test_data[col] = np.nan
    total_balance = pd.concat([total_balance, test_data], axis=0)
    return total_balance.reset_index(drop=True)
    
user_balance_file_path = r"./Data/user_balance_table.csv"
user_info_file_path = r"./Data/user_profile_table.csv"
# Load the raw balance table and aggregate it to daily totals from 2014-03-01.
data_balance = load_data(user_balance_file_path)
total_balance = total_amt(data_balance, "2014-03-01")
# generate_data stops before its end date, so pass 2014-09-01 to get
# placeholder rows for the whole test period 2014-08-04 ~ 2014-08-31.
total_balance = generate_data(total_balance, "2014-08-04", "2014-09-01")
# Recompute the calendar columns so the placeholder rows get them too.
total_balance = add_timestamp(total_balance, "date")

Insert picture description here

# Rule-based (period factor) time-series forecast for one calendar month.
def generate_base(data, month_index, year=2014, start_date="2014-03-01"):
    """Forecast daily purchase/redemption amounts for one month.

    The forecast combines a weekday factor with a per-day-of-month base:

    1. weekday factor = mean amount on that weekday / overall mean;
    2. for each day of the month (1-31), the weekday factors are averaged,
       weighted by how often each weekday fell on that day in the history;
    3. base = mean amount on that day of the month / weighted factor;
    4. forecast = base * factor of the weekday the target date falls on.

    Args:
        data: DataFrame with a datetime ``date`` column and the
            ``total_purchase_amt`` / ``total_redeem_amt`` columns.
        month_index: Month (1-12) to forecast. Only rows dated before the
            first day of this month are used as history.
        year: Year of the forecast month (default 2014).
        start_date: Inclusive first day of the history window.

    Returns:
        DataFrame with columns ``date``, ``total_purchase_amt`` and
        ``total_redeem_amt``, sorted by date, holding one row per day of
        the target month for which the history provides a base value and
        a weekday factor.
    """
    amount_cols = ["total_purchase_amt", "total_redeem_amt"]
    factor_cols = [col + "_weekdaymean" for col in amount_cols]
    month_start = pd.Timestamp(year, month_index, 1)

    # Select the history window; copy so the column assignments below do not
    # write into a slice of the caller's frame.
    history = data[["date"] + amount_cols]
    in_window = (history["date"] >= pd.Timestamp(start_date)) & (
        history["date"] < month_start
    )
    history = history[in_window].copy()
    history["day"] = history["date"].dt.day
    history["weekday"] = history["date"].dt.weekday

    # Weekday factor: mean per weekday divided by the overall mean.
    weekday_factor = history.groupby("weekday", as_index=False)[amount_cols].mean()
    weekday_factor = weekday_factor.rename(columns=dict(zip(amount_cols, factor_cols)))
    for amount, factor in zip(amount_cols, factor_cols):
        weekday_factor[factor] /= history[amount].mean()

    # Count how often each weekday fell on each day of the month.
    day_weekday = (
        history[["date", "day", "weekday"]]
        .groupby(["day", "weekday"], as_index=False)
        .count()
        .rename(columns={"date": "occurrences"})
    )
    day_weekday = pd.merge(day_weekday, weekday_factor, on="weekday")

    # Weight the weekday factors by those frequencies. The weights are
    # normalised by how many times the day itself occurs, so they sum to 1
    # even for days missing from some months (e.g. the 31st).
    day_total = day_weekday.groupby("day")["occurrences"].transform("sum")
    weight = day_weekday["occurrences"] / day_total
    for factor in factor_cols:
        day_weekday[factor] *= weight
    day_rate = day_weekday.groupby("day", as_index=False)[factor_cols].sum()

    # Base: day-of-month mean with the weekday effect divided out. Each
    # amount column is divided by its own factor column.
    day_mean = history.groupby("day", as_index=False)[amount_cols].mean()
    forecast = pd.merge(day_mean, day_rate, on="day", how="left")
    for amount, factor in zip(amount_cols, factor_cols):
        forecast[amount] /= forecast[factor]

    # Build the target dates, keeping only days that exist in the month.
    forecast = forecast[forecast["day"] <= month_start.days_in_month].copy()
    forecast["date"] = pd.to_timedelta(forecast["day"] - 1, unit="D") + month_start

    # Final forecast: base times the factor of the weekday of each date.
    forecast["weekday"] = forecast["date"].dt.weekday
    forecast = forecast[["date", "weekday"] + amount_cols]
    forecast = pd.merge(forecast, weekday_factor, on="weekday")
    for amount, factor in zip(amount_cols, factor_cols):
        forecast[amount] *= forecast[factor]
    return forecast.sort_values("date")[["date"] + amount_cols]

Guess you like

Origin blog.csdn.net/ava_zhang2017/article/details/108166568
Recommended