FDDC2018_A股上市公司季度营收预测

  去年参加的阿里天池算法大赛,初赛66名,复赛最终11名。当初比赛结束后并未来得及整理,搁置半年有余。今天算是整理了一遍,放上来做个纪念。非常高兴遇到了4个队友,最让人惊奇的是我们参赛前一天才认识,然后吹吹牛要拿什么什么名次,其实当时大家都没个底,没想到初赛过了。中间有个小插曲,差点因为我的失误过不了初赛:初赛截止上传数据当天,我一直在整理结果,错过了时间,好在最开始上传过一版,竟然也过了。复赛的时候我们又增加了两个模型,名次最好的时候是第3名,当时我还有点小惊奇,我们有这么厉害了?当然,在这期间,大家付出都挺多的,每周大概要讨论3次,几乎每个人都会参与讨论。
  这个论坛里面有前三名分享的帖子,看了下他们的经验分享,他们对算法方面的理解还是比我们深得多。自知不足,后来我就去把吴恩达的深度学习微专业学了一遍,真的是非常好的东西,绝对能五星推荐。
  今年的一个目标就是把一些常用的机器学习方法撸一遍。

比赛数据链接:https://pan.baidu.com/s/14QVX3oYXOjDrkcWQtRyUrA
提取码:893g

import pandas as pd
import numpy as np
import zipfile
import time

import statsmodels.api as sm 
from statsmodels.graphics.api import qqplot
from scipy import stats
from statsmodels.tsa.stattools import adfuller
import statsmodels.tsa.stattools as st 
from statsmodels.tsa.arima_model import ARMA
import copy

# Raise pandas display limits so wide/long frames are printed in full.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)   
pd.set_option('display.max_rows', 1000)

# 2. File path definitions ------------------------------------------------------------------------------------
# Stock-pool CSV (default input of get_prd_tickers).
tickers_Path='../data/FDDC_financial_submit_20180524.csv'
# Market-value workbook (read by get_market_data, sheet 'DATA').
Market_Data_Path='../data/500/[Add June July] FDDC_financial_data_20180711/[New] Market Data_20180711.xlsx'            
# The three statement paths below are passed to get_factor_data, which replaces
# the file suffix with 'csv' before reading.
Balance_sh_Path='../data/500/[Add June July] FDDC_financial_data_20180711/[New] Financial Data_20180711/Balance Sheet.xls'                           # balance sheet
Cash_F_st_Path='../data/500/[Add June July] FDDC_financial_data_20180711/[New] Financial Data_20180711/Cashflow Statement.xls'                       # cash-flow statement
Income_st_Path='../data/500/[Add June July] FDDC_financial_data_20180711/[New] Financial Data_20180711/Income Statement.xls'                         # income statement

#3.数据处理-----------------------------------------------------------------------------------------
#功能:读取预测目标股票池股票代码列表;
#参数说明:
#FileName(文件名及路径):股票池csv文件名及路径;
#Header(列名)=csv文件列表列名;
#输出:初赛股票池股票代码的DataFrame数据,包含TICKER_SYMBOL和secID(包含交易场所)两列数据

# 获取待预测股票列表
def get_prd_tickers(FileName=tickers_Path,Header=None):
    """Load the list of stocks whose revenue has to be predicted.

    Args:
        FileName: path of the single-column stock-pool CSV.
        Header: forwarded to ``pandas.read_csv`` as ``header``.

    Returns:
        DataFrame with two columns: ``TICKER_SYMBOL`` (first six characters
        of the code, used as the merge key) and ``secID`` (the code exactly
        as read from the file).
    """
    pool = pd.read_csv(FileName, header=Header)
    pool.columns = ['TICKER_SYMBOL']
    full_code = pool['TICKER_SYMBOL']
    # Keep the untouched code as secID and strip everything after the sixth
    # character from the merge key.
    return pool.assign(TICKER_SYMBOL=full_code.str[:6], secID=full_code)

# 获取报表数据
def get_factor_data(indu_tickers=u'',FileName=Income_st_Path,Factors=None):
    """Read one financial statement and keep one row per stock and period.

    Args:
        indu_tickers: optional object whose ``.index`` holds the ticker
            symbols to keep; anything of length 0 (the default) disables
            the filter.
        FileName: statement path; its suffix is replaced with ``.csv``
            before reading.
        Factors: extra column names to load besides the five key columns.
            ``None`` (default) means no extra columns.

    Returns:
        DataFrame with the key columns plus ``Factors``, restricted to rows
        whose END_DATE month equals FISCAL_PERIOD and de-duplicated on
        (TICKER_SYMBOL, END_DATE).
    """
    import os

    # A fresh list per call avoids sharing a mutable default between calls.
    factors = [] if Factors is None else list(Factors)
    columns = ['TICKER_SYMBOL','PUBLISH_DATE','END_DATE','REPORT_TYPE','FISCAL_PERIOD'] + factors

    # Swap the extension for '.csv'; splitext also copes with names that
    # contain no dot at all.
    csv_path = os.path.splitext(FileName)[0] + '.csv'

    df = pd.read_csv(csv_path, encoding='utf-8')[columns]
    df['TICKER_SYMBOL'] = df['TICKER_SYMBOL'].astype(str).str.zfill(6)  # pad codes to 6 digits
    df['END_DATE'] = pd.to_datetime(df['END_DATE'])
    df['PUBLISH_DATE'] = pd.to_datetime(df['PUBLISH_DATE'])
    # Drop rows whose period end month disagrees with the fiscal period.
    df = df.loc[df['END_DATE'].dt.month == df['FISCAL_PERIOD'].astype(int)]
    # Descending sort + keep='first' retains the row with the latest
    # PUBLISH_DATE for each (stock, period).
    # NOTE(review): the original comment said "keep the first published
    # data", which is the opposite of what this does - confirm the intent.
    df = df.sort_values(
        ['TICKER_SYMBOL', 'END_DATE', 'PUBLISH_DATE'],
        ascending=[False, False, False],
    ).drop_duplicates(['TICKER_SYMBOL', 'END_DATE'], keep='first')
    if len(indu_tickers) > 0:
        df = df.loc[df.TICKER_SYMBOL.isin(indu_tickers.index)]
    return df

# 获取市值数据
def get_market_data(indu_tickers):
    """Read the market-value sheet and give every stock a single industry.

    Args:
        indu_tickers: object whose ``.index`` holds the ticker symbols to
            keep; anything of length 0 disables the filter.

    Returns:
        DataFrame read from sheet 'DATA' of ``Market_Data_Path``, ordered by
        TICKER_SYMBOL and then END_DATE_ descending, with
        ``END_DATE_year&month`` added as a merge key and ``TYPE_NAME_EN``
        overwritten, per stock, with the value of its most recent row.
    """
    df = pd.read_excel(Market_Data_Path, sheet_name=u'DATA')
    df['TICKER_SYMBOL'] = df['TICKER_SYMBOL'].astype(str).str.zfill(6)
    df['END_DATE_'] = pd.to_datetime(df['END_DATE_'])
    df['END_DATE_year&month'] = df['END_DATE_'].dt.year.astype(str) + df['END_DATE_'].dt.month.astype(str)

    # A company may appear under different industries over time; use the
    # industry of its most recent row everywhere. Done with a transform
    # instead of chained assignment (`df[col][1:] = ...`), which is not
    # guaranteed to write through to the frame.
    df = df.sort_values(['TICKER_SYMBOL', 'END_DATE_'], ascending=[True, False]).reset_index(drop=True)
    df['TYPE_NAME_EN'] = df.groupby('TICKER_SYMBOL')['TYPE_NAME_EN'].transform(lambda s: s.iloc[0])

    if len(indu_tickers) > 0:
        df = df.loc[df.TICKER_SYMBOL.isin(indu_tickers.index)]
    return df

# 将三个报表以及市值合并为一个DataFrame
def Factor_Merge(indu_tickers, factors_income, factors_balance, factors_cash,PIT_date='2018-06-30') :
    """Combine the three financial statements and market values in one frame.

    Args:
        indu_tickers: forwarded to get_factor_data / get_market_data.
        factors_income: income-statement columns to load.
        factors_balance: balance-sheet columns to load.
        factors_cash: cash-flow-statement columns to load.
        PIT_date: point-in-time cut-off; statement rows published after it
            are discarded.

    Returns:
        Outer merge of the three statements joined with market data on
        ticker and year+month, restricted to rows that have both a
        FISCAL_PERIOD and a TYPE_NAME_EN.
    """
    statement_keys = ['TICKER_SYMBOL', 'END_DATE', 'FISCAL_PERIOD', 'REPORT_TYPE']
    cutoff = pd.to_datetime(PIT_date)

    def _point_in_time(path, factors):
        # Load one statement, keep what was published by the cut-off and
        # order it newest first (the ordering matters for later back-fills).
        table = get_factor_data(indu_tickers, path, Factors=factors)
        table = table.loc[table['PUBLISH_DATE'] <= cutoff]
        table = table.sort_values(['END_DATE', 'PUBLISH_DATE'], ascending=[False, False])
        return table.drop(columns='PUBLISH_DATE')

    income = _point_in_time(Income_st_Path, factors_income)
    balance = _point_in_time(Balance_sh_Path, factors_balance)
    cash = _point_in_time(Cash_F_st_Path, factors_cash)
    market = get_market_data(indu_tickers)

    statements = income.merge(balance, on=statement_keys, how='outer')
    statements = statements.merge(cash, on=statement_keys, how='outer')
    # Year+month string shared with the market table, used as join key.
    end_date = statements['END_DATE'].dt
    statements['END_DATE_year&month'] = end_date.year.astype(str) + end_date.month.astype(str)

    merged = statements.merge(market, on=['TICKER_SYMBOL', 'END_DATE_year&month'], how='outer')
    # Keep only rows backed by a statement and carrying an industry label.
    has_statement = merged['FISCAL_PERIOD'].notnull()
    has_industry = merged['TYPE_NAME_EN'].notnull()
    return merged[has_statement & has_industry]

# 4.model_1----------------------------------------------------------------------------------
# 缺失值填充,每只股票做填充
def nan_fill(in_df) :
    """Back-fill missing values separately for every stock.

    The fill direction is row order, so the result depends on how ``in_df``
    is sorted. ``in_df`` itself is left untouched.

    Args:
        in_df: DataFrame containing a ``TICKER_SYMBOL`` column.

    Returns:
        Tuple ``(fill0_df, dropna_df)``: the back-filled frame with the
        remaining gaps set to 0, and the back-filled frame with rows that
        still contain a gap removed. Both keep ``TICKER_SYMBOL`` and the
        original column order.
    """
    key = 'TICKER_SYMBOL'
    df = in_df.copy()
    value_cols = [col for col in df.columns if col != key]
    if value_cols:
        # GroupBy.bfill on an explicit column selection replaces the
        # deprecated GroupBy.fillna(method='backfill') and, unlike it, cannot
        # lose the grouping column that later steps group on again.
        df[value_cols] = df.groupby(key)[value_cols].bfill()
    fill0_df = df.fillna(0)
    dropna_df = df.dropna()
    return fill0_df, dropna_df

# 将 REVENUE 移一位,为了实现上期因子对应下期营收
def shift_revenue(in_df) :
    """Attach the previous row's revenue and market value to every row.

    Used to pair one period's factors with the following period's revenue;
    which row counts as "following" is decided by the caller's row order.

    Args:
        in_df: DataFrame with ``REVENUE`` and ``MARKET_VALUE`` columns.

    Returns:
        Copy of ``in_df`` with ``NEXT_REVENUE`` and ``NEXT_MARKET_VALUE``
        appended (each is the source column shifted down by one row).
    """
    return in_df.assign(
        NEXT_REVENUE=in_df['REVENUE'].shift(1),
        NEXT_MARKET_VALUE=in_df['MARKET_VALUE'].shift(1),
    )

# 获取训练集、测试集和目标预测集
def extract_for_model_1(factor_df, test_date) :
    """Split the factor table into training, test and goal sets for model 1.

    Args:
        factor_df: factor table with ``TICKER_SYMBOL``, ``END_DATE``,
            ``REVENUE`` and ``MARKET_VALUE`` columns.
        test_date: period end used as hold-out; expected to be a quarter end
            ('xxxx-03-31', 'xxxx-06-30', 'xxxx-09-30' or 'xxxx-12-31').

    Returns:
        ``(train_df, test_df, goal_df)``, all indexed by TICKER_SYMBOL.
        The goal set is the 2018-03-31 period; the training set is
        everything except the test and goal periods.
    """
    goal_date = '2018-03-31'  # factors of this period feed the final prediction

    per_stock = factor_df.copy().groupby(by='TICKER_SYMBOL').apply(shift_revenue)
    # Zero-fill instead of dropping rows: dropping would also remove the goal
    # period, whose NEXT_REVENUE is unknown by construction.
    table = per_stock.reset_index(drop=True).fillna(0).set_index('TICKER_SYMBOL')

    period = table['END_DATE']
    train_df = table[(period != test_date) & (period != goal_date)]
    test_df = table[period == test_date]
    goal_df = table[period == goal_date]
    return train_df, test_df, goal_df

# 机器学习
def sklearn_model_1(train_df, test_df,goal_df, x_name, y_name):
    """Fit an RBF-kernel SVR on standardised data and predict test/goal sets.

    Args:
        train_df: training rows.
        test_df: hold-out rows (must contain ``y_name``).
        goal_df: rows to produce the final prediction for.
        x_name: list of feature column names.
        y_name: target column name.

    Returns:
        ``(test_result, goal_prd)``. ``test_result`` is a copy of ``test_df``
        with ``y_name`` removed and two columns added: ``NEXT_REVENUE`` (the
        true target after a scale/unscale round trip) and ``REVENUE_prd``
        (prediction in original units). ``goal_prd`` has the index of
        ``goal_df`` and a single ``goal_REVENUE_prd`` column. The inputs are
        not modified.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR

    x_train_in = train_df[x_name].values
    y_train_in = train_df[y_name].values.reshape(-1, 1)
    x_test_in = test_df[x_name].values
    y_test_in = test_df[y_name].values.reshape(-1, 1)
    x_goal_in = goal_df[x_name].values

    # Standardise features and target with statistics of the training set.
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    x_train = ss_x.fit_transform(x_train_in)
    x_test = ss_x.transform(x_test_in)
    y_train = ss_y.fit_transform(y_train_in)
    y_test = ss_y.transform(y_test_in)
    x_goal = ss_x.transform(x_goal_in)

    # SVR.fit expects a 1-D target, hence ravel().
    rbf_svr = SVR(kernel='rbf')
    rbf_svr.fit(x_train, y_train.ravel())
    y_test_prd = rbf_svr.predict(x_test)
    y_goal_prd = rbf_svr.predict(x_goal)

    # Undo the target scaling. The scaler works on 2-D arrays, so the 1-D
    # predictions are reshaped to a column first and flattened afterwards.
    y_test_inverse = ss_y.inverse_transform(y_test).ravel()
    y_test_prd_inverse = ss_y.inverse_transform(y_test_prd.reshape(-1, 1)).ravel()
    y_goal_prd_inverse = ss_y.inverse_transform(y_goal_prd.reshape(-1, 1)).ravel()

    # Work on a copy so the caller's (possibly sliced) frame is not altered.
    test_result = test_df.copy()
    del test_result[y_name]
    test_result['NEXT_REVENUE'] = y_test_inverse
    test_result['REVENUE_prd'] = y_test_prd_inverse

    goal_prd = pd.DataFrame(index=goal_df.index)
    goal_prd['goal_REVENUE_prd'] = y_goal_prd_inverse
    return test_result, goal_prd


# 模型一:
# 利润总额 = 净利润 + 所得税费用
# 营业利润 = 利润总额 - 营业外收入
# 营业收入 = 营业利润 + 营业税金及附加 + 销售费用 + 管理费用 + 财务费用 + 资产减值损失 - 公允价值变动收益 - 投资收益
def model_1(all_test_date,all_industries,factor_df):
    """Run model 1: one SVR per industry and per hold-out period.

    For every date in ``all_test_date`` and every industry in
    ``all_industries`` the factor table is filtered to that industry, sparse
    columns are dropped, gaps are filled (nan_fill), the data is split
    (extract_for_model_1) and a model is fitted (sklearn_model_1). Goal-period
    predictions of all industries are merged, one column per test date, into a
    frame indexed by the stock pool.

    Args:
        all_test_date: iterable of hold-out period ends.
        all_industries: iterable of values matched against ``TYPE_NAME_EN``.
        factor_df: merged factor table.

    Returns:
        ``(everytimes_goal_prd, final_goal_list)``: the per-stock prediction
        frame with ``goal_REVENUE`` (row mean of the merged prediction
        columns) and ``priority`` (constant 3), and the list form of the
        ticker set returned by the last ``quantile_goal`` call.

    NOTE(review): ``final_result`` only exists after at least one loop
    iteration, so an empty ``all_test_date`` raises NameError further down.
    ``quantile_goal`` is defined outside this block.
    """
    
    t1 = time.time()
    
    # Net income, income tax, non-operating income, business tax and surcharges,
    # cost of goods sold, general/administrative expense, revenue, operating profit.
    # (The original note also listed selling expenses, which is not in the list.)
    factors_income  =['N_INCOME', 'INCOME_TAX', 'NOPERATE_INCOME', 'BIZ_TAX_SURCHG', 'COGS','GENL_ADMIN_EXP', 'REVENUE','OPERATE_PROFIT']
    # Total liabilities, total assets, total shareholders' equity.
    factors_balance =['T_LIAB', 'T_ASSETS','T_SH_EQUITY']
    # Net cash flow from operating / investing / financing activities,
    # closing balance of cash and cash equivalents.
    factors_cash    =['N_CF_OPERATE_A', 'N_CF_FR_INVEST_A', 'N_CF_FR_FINAN_A', 'N_CE_END_BAL']
    
    prd_ticker_list = get_prd_tickers().TICKER_SYMBOL
    goal_set = set(prd_ticker_list)
    everytimes_goal_prd = pd.DataFrame(index = prd_ticker_list)
    percent = 0.7        # quantile passed to quantile_goal to keep the better-predicted stocks
    

    for test_date in all_test_date:
        
        all_result_prd_list = []
        all_goal_prd_list = []
    
        print (test_date,'总共%d个行业'%len(all_industries))
        for industry in all_industries:
            df_copy=factor_df.copy()
            df_copy=df_copy.loc[df_copy['TYPE_NAME_EN']==industry]    # train one model per industry: select the industry
            df_copy.dropna(axis=1, thresh=int(len(df_copy)*1.0/10),inplace=True)   # drop factors with fewer than 1/10 non-missing values
            # Fill gaps: fill0_df has remaining NaNs set to 0, dropna_df has incomplete rows removed (unused here).
            fill0_df, dropna_df = nan_fill(df_copy)       
            # Split into training, test and goal sets.
            train_df, test_df, goal_df= extract_for_model_1(fill0_df, test_date)
            # Fit and predict.
            x_name = factors_income + factors_balance + factors_cash  # candidate features of model 1
            x_name = [ j for j in x_name if j in train_df.columns ]   # skip factors dropped above
            y_name = 'NEXT_REVENUE'
            
            result_prd,goal_prd = sklearn_model_1(train_df, test_df,goal_df, x_name, y_name)
    
            # Keep only stocks of the prediction pool.
            result_prd = result_prd[result_prd.index.isin(prd_ticker_list)]
            goal_prd = goal_prd[goal_prd.index.isin(prd_ticker_list)]
            all_result_prd_list.append(result_prd)       # test-period predictions, one entry per industry
            all_goal_prd_list.append(goal_prd)    # goal-period predictions, one entry per industry (original note says 2018-06-31, presumably 2018-06-30)
            
        final_result = pd.concat(all_result_prd_list,axis=0)
        final_result = final_result[['END_DATE','NEXT_REVENUE','REVENUE_prd','NEXT_MARKET_VALUE']]
        
        # Stack the goal predictions of all industries into one frame.
        allindustries_goal_prd_df = pd.concat(all_goal_prd_list,axis=0)
        allindustries_goal_prd_df = allindustries_goal_prd_df[['goal_REVENUE_prd']]
#         allindustries_goal_prd_df.index = allindustries_goal_prd_df.index.astype(str).str.zfill(6) 
#         display(everytimes_goal_prd.head(10))
#         display(allindustries_goal_prd_df.head(10))
        # Adds one prediction column per test date (pandas suffixes repeated names).
        everytimes_goal_prd = everytimes_goal_prd.merge(allindustries_goal_prd_df,left_index=True,right_index=True,how='left')
        
#         plt_scatter_1(final_result)                               # show the prediction result of each quarter
        goal_set = quantile_goal(final_result,goal_set,percent)  # narrow the stock set to the requested quantile
 
    # Average the predictions obtained with the different test dates.
    everytimes_goal_prd['goal_REVENUE'] = everytimes_goal_prd.mean(axis=1)
    everytimes_goal_prd['priority'] =3   # data priority of this model's output
        
#     write_prd_tocsv(everytimes_goal_prd,Flag='model_1',a=1)      # write predictions to CSV; the choice of test quarter should make little difference
#     write_prd_tocsv(final_result,Flag='model_1',a=0)         # write predictions to CSV; the choice of test quarter should make little difference

#     print ('多期指定百分位最优股票交集打分情况:' )   
    final_goal_list = list(goal_set)
    # Test-period rows (last test date only) of the selected stocks; not used further in this function.
    goal_final_result_model_1 = final_result[final_result.index.isin( final_goal_list )]  
    # No retraining needed: the stocks selected above are simply reused for plotting/scoring.
#     plt_scatter_1(goal_final_result_model_1)

    t2 = time.time()
#     print ('运行时间:%d s'%(t2-t1))
    return everytimes_goal_prd,final_goal_list

# 5.model_2----------------------------------------------------------------------------------
# ROE = (EBIT/revenue)*(EBT/EBIT)*(net income/EBT)*(revenue/average asset)*(average assets/average equity)*(1-dividend rate)
# ROE公式计算5个因子,并计算log值
def factor_compute(in_df, base_value, golden_enddate) :
    """Compute the ROE-decomposition factors of one stock and their logarithms.

    Args:
        in_df: rows of a single stock (the caller applies this per
            TICKER_SYMBOL group).
        base_value: base of the logarithm applied to the factors.
        golden_enddate: DataFrame with an ``END_DATE`` column; the stock's
            rows are right-merged onto it.

    Returns:
        DataFrame holding the intermediate quantities, the five ratio factors,
        ROE / NEXT_ROE, ``Arate`` and the ``LOG_*`` columns, restricted to rows
        where every quantity entering a logarithm is strictly positive.

    NOTE(review): shift(-1) / shift(1) mean "next row" / "previous row";
    whether that is the previous / following period depends on the row order
    after the right merge, which this function does not enforce - confirm
    against the caller and the ordering of ``golden_enddate``.
    """
    df = in_df.copy()
    # log2 of the market value, floored at 2e8 (the upper bound is the column's own maximum).
    df['market_value_1'] = np.clip(df['MARKET_VALUE'], 2e8, max(df['MARKET_VALUE']))
    df['market_value_1'] = np.log2(df['market_value_1'])
    # Align the stock with the reference period list; periods it lacks become all-NaN rows.
    df = df.merge(golden_enddate, left_on=['END_DATE'], right_on=['END_DATE'], how='right')
    df['EBIT'] = df['N_INCOME'] + df['C_OUTF_FR_FINAN_A'] + df['INCOME_TAX'] # EBIT = net income + interest + tax (C_OUTF_FR_FINAN_A is used as the interest term)
    df['EBT'] = df['N_INCOME'] + df['INCOME_TAX']
    # Averages of the current row and the next row.
    df['LAST_T_ASSETS'] = df['T_ASSETS'].shift(-1)
    df['average_assets'] = (df['LAST_T_ASSETS'] + df['T_ASSETS']) / 2
    df['LAST_T_SH_EQUITY'] = df['T_SH_EQUITY'].shift(-1)
    df['average_equity'] = (df['LAST_T_SH_EQUITY'] + df['T_SH_EQUITY']) / 2
    df['dividend_rate'] = df['DIV_PAYABLE'] / df['EBIT']
    # Targets are taken from the previous row.
    df['NEXT_REVENUE'] = df['REVENUE'].shift(1)
    df['NEXT_MARKET_VALUE'] = df['MARKET_VALUE'].shift(1)
    #print df[['EBIT','EBT','N_INCOME','REVENUE','average_assets','average_equity']].head().to_html()
    #print df[['average_equity','average_assets']].head().to_html()
    # Ratio of the previous row's log market value to this row's; 1 where undefined.
    df['market_value_2'] = df['market_value_1'].shift(1)
    df['log_cons'] = df['market_value_2'] / df['market_value_1']
    df['log_cons'] = df['log_cons'].fillna(1)
    
    # 1 - dividend rate; 1 where undefined.
    df['1_dividend_rate'] = 1 - df['dividend_rate']
    df['1_dividend_rate'] = df['1_dividend_rate'].fillna(1)
    
    # Remember the sign of N_INCOME * average_equity (x/|x|, NaN when the product is 0),
    # then replace the quantities by their absolute values so logs can be taken.
    df['flag'] = df['N_INCOME'] * df['average_equity']/abs( df['N_INCOME'] * df['average_equity'] ) 
    df['EBIT']=abs(df['EBIT'])
    df['EBT']=abs(df['EBT'])
    df['N_INCOME']=abs(df['N_INCOME'])
    df['REVENUE']=abs(df['REVENUE'])
    df['average_assets']=abs(df['average_assets'])
    df['average_equity']=abs(df['average_equity'])
    df['log_cons']=abs(df['log_cons'])
    df['1_dividend_rate']=abs(df['1_dividend_rate'])
    
    # The five ratio factors.
    df['EBIT/REVENUE'] = df['EBIT'] / df['REVENUE']
    df['EBT/EBIT'] = df['EBT'] / df['EBIT']
    df['N_INCOME/EBT'] = df['N_INCOME'] / df['EBT']
    df['REVENUE/average_assets'] = df['REVENUE'] / df['average_assets']
    df['average_assets/average_equity'] = df['average_assets'] / df['average_equity']
       
    # ROE as the product of the five factors; NEXT_ROE is the previous row's ROE.
    df['ROE'] = df['EBIT/REVENUE'] * df['EBT/EBIT'] * df['N_INCOME/EBT'] * \
    df['REVENUE/average_assets'] * df['average_assets/average_equity']
    df['NEXT_ROE'] = df['ROE'].shift(1)
    
    # Keep only rows where every quantity is strictly positive (NaN rows fail the test too).
    df = df[(df['log_cons']>0)&(df['1_dividend_rate']>0)&(df['EBIT']>0)&(df['EBT']> 0)&(df['N_INCOME'] > 0)&(df['REVENUE']>0)&(df['average_assets']>0)&(df['average_equity']>0)] 
    # df = df[df['log_cons']>0][df['1_dividend_rate']>0][df['EBIT']>0][df['EBT']> 0][df['N_INCOME'] > 0][df['REVENUE']>0][df['average_assets']>0][df['average_equity']>0] 
    df = df[df['average_assets/average_equity'] > 0]#[df['1_dividend_rate'] > 0]#[df['NEXT_ROE'] > 0]    
    
    # Logarithms to base `base_value` (change of base via log10).
    df['LOG_EBIT/REVENUE'] = np.log10(df['EBIT/REVENUE'])/np.log10(base_value)
    df['LOG_EBT/EBIT'] = np.log10(df['EBT/EBIT'])/np.log10(base_value)
    df['LOG_N_INCOME/EBT'] = np.log10(df['N_INCOME/EBT'])/np.log10(base_value)
    df['LOG_REVENUE/average_assets'] = np.log10(df['REVENUE/average_assets'])/np.log10(base_value)
    df['LOG_average_assets/average_equity'] = np.log10(df['average_assets/average_equity'])/np.log10(base_value)
    df['LOG_NEXT_ROE'] = np.log10(df['NEXT_ROE'])/np.log10(base_value)
    df['LOG_log_cons'] = np.log10(df['log_cons'])/np.log10(base_value)
    df['LOG_1_dividend_rate'] = np.log10(df['1_dividend_rate'])/np.log10(base_value)
    
    # Arate = (ROE * (1 - dividend rate) + 1) * log_cons - 1, and its logarithm.
    df['Arate'] = df['ROE'] * df['1_dividend_rate'] + 1
    df['Arate'] = df['Arate'] * df['log_cons'] - 1
    df['LOG_Arate'] = np.log10(df['Arate'])/np.log10(base_value)
    # columns = ['TICKER_SYMBOL','END_DATE','LOG_EBIT/REVENUE','LOG_EBT/EBIT','LOG_N_INCOME/EBT','LOG_REVENUE/average_assets','LOG_average_assets/average_equity','LOG_NEXT_ROE','1_dividend_rate','REVENUE','NEXT_REVENUE','NEXT_MARKET_VALUE','flag', 'LOG_Arate', 'log_cons']
    # df = df[columns]
    return df

# 计算因子,
def extract_for_model_2(in_df, base_value, test_date, golden_enddate) :
    """Compute the model-2 factors per stock and split them into data sets.

    Args:
        in_df: factor table of one industry.
        base_value: logarithm base forwarded to factor_compute.
        test_date: period end used as hold-out; expected to be a quarter end
            ('xxxx-03-31', 'xxxx-06-30', 'xxxx-09-30' or 'xxxx-12-31').
        golden_enddate: reference period list forwarded to factor_compute.

    Returns:
        ``(train_df, test_df, goal_df)``, all indexed by TICKER_SYMBOL. The
        goal set is the 2018-03-31 period; the training set is everything
        except the test and goal periods.
    """
    goal_date = '2018-03-31'

    df = in_df.copy().sort_values('END_DATE', ascending=False)
    df = df.groupby(by='TICKER_SYMBOL').apply(
        lambda x: factor_compute(x, base_value, golden_enddate)
    ).reset_index(drop=True)

    # Rows of the goal period have no target yet, so they are exempt from the
    # NaN filter applied to all other periods.
    goal_rows = df[df['END_DATE'] == goal_date]
    other_rows = df[df['END_DATE'] != goal_date].dropna()
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    df = pd.concat([goal_rows, other_rows])

    df = df.set_index('TICKER_SYMBOL')
    train_df = df[df['END_DATE'] != test_date]
    train_df = train_df[train_df['END_DATE'] != goal_date]  # no y value for this period
    test_df = df[df['END_DATE'] == test_date]
    goal_df = df[df['END_DATE'] == goal_date]  # input of the second-quarter prediction
    return train_df, test_df, goal_df

# 机器学习
def sklearn_model_2(train_df, test_df,goal_df, x_name, y_name): 
    """Fit an RBF-kernel SVR on standardised data and predict test/goal sets.

    Args:
        train_df: training rows.
        test_df: hold-out rows (must contain ``y_name``).
        goal_df: rows to produce the final prediction for.
        x_name: list of feature column names.
        y_name: target column name.

    Returns:
        ``(test_result, goal_result)``: copies of ``test_df`` and ``goal_df``.
        ``test_result`` gains ``test`` (true target after a scale/unscale
        round trip) and ``prd`` (prediction); ``goal_result`` gains
        ``goal_prd``. All values are in the units of ``y_name``. The inputs
        are not modified.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR

    test_result = test_df.copy()
    goal_result = goal_df.copy()

    x_train_in = train_df[x_name].values
    y_train_in = train_df[y_name].values.reshape(-1, 1)
    x_test_in = test_result[x_name].values
    y_test_in = test_result[y_name].values.reshape(-1, 1)
    x_goal_in = goal_result[x_name].values

    # Standardise features and target with statistics of the training set.
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    x_train = ss_x.fit_transform(x_train_in)
    x_test = ss_x.transform(x_test_in)
    y_train = ss_y.fit_transform(y_train_in)
    y_test = ss_y.transform(y_test_in)
    x_goal = ss_x.transform(x_goal_in)

    # SVR.fit expects a 1-D target, hence ravel().
    rbf_svr = SVR(kernel='rbf')
    rbf_svr.fit(x_train, y_train.ravel())
    y_test_prd = rbf_svr.predict(x_test)
    y_goal_prd = rbf_svr.predict(x_goal)

    # Undo the target scaling. The scaler works on 2-D arrays, so the 1-D
    # predictions are reshaped to a column first and flattened afterwards.
    y_test_inverse = ss_y.inverse_transform(y_test).ravel()
    y_test_prd_inverse = ss_y.inverse_transform(y_test_prd.reshape(-1, 1)).ravel()
    y_goal_prd_inverse = ss_y.inverse_transform(y_goal_prd.reshape(-1, 1)).ravel()

    test_result['test'] = y_test_inverse
    test_result['prd'] = y_test_prd_inverse
    goal_result['goal_prd'] = y_goal_prd_inverse
    return test_result, goal_result

# 用模型输出的结果计算出预测营收值
def compute_revenue(in_df, goal_in_df, base_value, y_name) :
    """Turn the model's log-scale output back into predicted revenue.

    The targets were built as ``log(value) / log(base_value)``, i.e. a
    logarithm to base ``base_value``, so the inverse is
    ``base_value ** prediction``.

    Args:
        in_df: test-set frame with ``prd``, ``flag``, ``REVENUE`` and,
            depending on ``y_name``, ``1_dividend_rate`` or ``log_cons``.
        goal_in_df: goal-set frame with ``goal_prd`` and the same helpers.
        base_value: logarithm base used when the target was built.
        y_name: ``'LOG_NEXT_ROE'`` or ``'LOG_Arate'``. Any other value
            returns plain copies without added columns.

    Returns:
        ``(df, goal_df)``: copies of the inputs. ``df`` gains ``prd_ROE`` or
        ``prd_Arate``, ``REVENUE_RATE`` and ``REVENUE_prd``; ``goal_df`` gains
        the same columns prefixed with ``goal_``.
    """
    def _restore(frame, prd_col, prefix):
        # Invert the base-`base_value` logarithm of the prediction.
        restored = np.power(base_value, frame[prd_col])
        if y_name == 'LOG_NEXT_ROE':
            frame[prefix + 'prd_ROE'] = restored
            rate = restored * frame['1_dividend_rate']
        else:
            frame[prefix + 'prd_Arate'] = restored
            # Arate = (rate + 1) * log_cons - 1, solved for rate.
            rate = (restored + 1) / frame['log_cons'] - 1
        # 'flag' re-applies the sign that was removed before taking logs.
        frame[prefix + 'REVENUE_RATE'] = rate * frame['flag']
        frame[prefix + 'REVENUE_prd'] = (1 + frame[prefix + 'REVENUE_RATE']) * frame['REVENUE']

    df = in_df.copy()
    goal_df = goal_in_df.copy()
    if y_name in ('LOG_NEXT_ROE', 'LOG_Arate'):
        _restore(df, 'prd', '')
        _restore(goal_df, 'goal_prd', 'goal_')  # goal period (2018-06-30 forecast)
    return df, goal_df

# 模型二
# ROE = (EBIT/revenue)*(EBT/EBIT)*(net income/EBT)*(revenue/average asset)*(average assets/average equity)*(1-dividend rate)
# 经营活动产生的现金流量净额, 投资活动产生的现金流量净额, 筹资活动产生的现金流量净额, 期末现金及现金等价物余额
def model_2(all_test_date,all_industries,factor_df,percent = 0.9):
    """Run model 2: SVR on the log ROE-decomposition factors, per industry.

    For every date in ``all_test_date`` and every industry in
    ``all_industries`` the factors are computed (extract_for_model_2), a model
    is fitted (sklearn_model_2) and its output is converted to revenue
    (compute_revenue). The current period's REVENUE is then added to the
    predictions, and the goal-period predictions are merged, one column per
    test date, into a frame indexed by the stock pool.

    Args:
        all_test_date: iterable of hold-out period ends.
        all_industries: iterable of values matched against ``TYPE_NAME_EN``.
        factor_df: merged factor table.
        percent: quantile handed to ``quantile_goal``.

    Returns:
        ``(everytimes_goal_prd, final_goal_list)``: the per-stock prediction
        frame with ``goal_REVENUE`` (row mean of the merged prediction
        columns) and ``priority`` (constant 2), and the list form of the
        ticker set returned by the last ``quantile_goal`` call.

    NOTE(review): ``final_result`` only exists after at least one loop
    iteration, so an empty ``all_test_date`` raises NameError further down.
    ``quantile_goal`` is defined outside this block.
    """
    
    t1 = time.time()
    # The three factor lists below are not referenced again in this function.
    factors_cash = ['N_CF_OPERATE_A', 'N_CF_FR_INVEST_A', 'N_CF_FR_FINAN_A', 'N_CE_END_BAL', 'C_OUTF_FR_FINAN_A']
    # Revenue, interest expense, income tax, net income.
    factors_income  = ['REVENUE', 'INT_EXP', 'INCOME_TAX', 'N_INCOME']
    # Dividends payable, total assets, total shareholders' equity.
    factors_balance = ['DIV_PAYABLE', 'T_ASSETS','T_SH_EQUITY']
    
    # Period ends of stock '000001' serve as the reference period list.
    golden_enddate = factor_df[factor_df['TICKER_SYMBOL'] == '000001']['END_DATE']
    golden_enddate = pd.DataFrame(golden_enddate)
    prd_ticker_list = get_prd_tickers().TICKER_SYMBOL
    goal_set = set(prd_ticker_list)
    everytimes_goal_prd = pd.DataFrame(index = prd_ticker_list)
    base_value = 500      # base of the logarithm
    
    for test_date in all_test_date:
        
        all_result_prd_list = []
        all_goal_prd_list = []
    
        print (test_date,'总共%d个行业'%len(all_industries))
        for industry in all_industries:
            df_copy=factor_df.copy()
            df_copy=df_copy.loc[df_copy['TYPE_NAME_EN']==industry]    # select the industry
            df_copy[['INT_EXP', 'DIV_PAYABLE']] = df_copy[['INT_EXP', 'DIV_PAYABLE']].fillna(0)
            df_copy.dropna(axis=1, thresh=int(len(df_copy)*1.0/10),inplace=True)   # drop factors with fewer than 1/10 non-missing values

            # Compute the five factors and their logs; factors of test_date are used to predict the next period's y.
            train_df, test_df,goal_df = extract_for_model_2(df_copy, base_value, test_date, golden_enddate)

            # Fit and predict.
            x_name = ['LOG_EBIT/REVENUE', 'LOG_EBT/EBIT', 'LOG_N_INCOME/EBT', 'LOG_REVENUE/average_assets', 'LOG_average_assets/average_equity']
            y_name = 'LOG_NEXT_ROE'
            # x_name = ['LOG_EBIT/REVENUE', 'LOG_EBT/EBIT', 'LOG_N_INCOME/EBT', 'LOG_REVENUE/average_assets', 'LOG_average_assets/average_equity', 'LOG_log_cons', 'LOG_1_dividend_rate']
            # y_name = 'LOG_Arate'
            result_prd, goal_prd = sklearn_model_2(train_df, test_df,goal_df, x_name, y_name)
            # print '第%d个行业'%i,industry,'预测数量%d'%len(goal_prd)
            # Keep only stocks of the prediction pool, then convert the model output to revenue.
            result_prd = result_prd[result_prd.index.isin(prd_ticker_list)]
            goal_prd = goal_prd[goal_prd.index.isin(prd_ticker_list)]
            
            result_prd, goal_prd = compute_revenue(result_prd, goal_prd, base_value, y_name)
            
            all_result_prd_list.append(result_prd)
            all_goal_prd_list.append(goal_prd)         
               
        final_result = pd.concat(all_result_prd_list,axis=0)
        columns = ['END_DATE','REVENUE','NEXT_REVENUE','REVENUE_prd','NEXT_MARKET_VALUE']
        final_result = final_result[columns]
        
        # REVENUE_prd is the second-quarter revenue forecast; the half-year figure
        # needs the first-quarter revenue added on top.
        
        final_result['REVENUE_prd'] = final_result['REVENUE_prd'] + final_result['REVENUE']
        
        allindustries_goal_prd_df = pd.concat(all_goal_prd_list,axis=0)
        allindustries_goal_prd_df['goal_REVENUE_prd'] = allindustries_goal_prd_df['goal_REVENUE_prd'] + allindustries_goal_prd_df['REVENUE']
        allindustries_goal_prd_df = allindustries_goal_prd_df[['goal_REVENUE_prd']]
        # print goal_df_0631.shape
        # Adds one prediction column per test date (pandas suffixes repeated names).
        everytimes_goal_prd = everytimes_goal_prd.merge(allindustries_goal_prd_df,left_index=True,right_index=True,how='left')      
        
#         plt_scatter_1(final_result)                               # show the prediction result of each quarter
        goal_set = quantile_goal(final_result,goal_set,percent)  # narrow the stock set to the requested quantile
 
    # Average the predictions obtained with the different test dates.
    everytimes_goal_prd['goal_REVENUE'] = everytimes_goal_prd.mean(axis = 1)
    everytimes_goal_prd['priority'] = 2   # data priority of this model's output
    final_goal_list = list(goal_set)  
    # Test-period rows (last test date only) of the selected stocks; not used further in this function.
    goal_final_result_model_2 = final_result[final_result.index.isin(final_goal_list)]  
    # No retraining needed: the stocks selected above are simply reused for plotting/scoring.
#     plt_scatter_1(goal_final_result_model_2)
    t2 = time.time()
#     print ('运行时间:%d s'%(t2-t1))

    return everytimes_goal_prd,final_goal_list
# 6.model_3----------------------------------------------------------------------------------    
####模型三:------------时序模型---8.7-------------------------------------
#--------函数定义----------------------------
#功能:获取数据dataframe连续财务期数'C_PERIOD'和总财务期数'N_PERIOD',原函数data_filter作废
#参数说明:
#df:需处理的数据dataframe;
#输出:
#df:附有连续财务期数'C_PERIOD'和总财务期数'N_PERIOD'的数据dataframe

def get_period(df):
    """Annotate every stock with its number of reported and continuous periods.

    The period ends of stock '000001' are taken as the complete calendar.
    Each stock is outer-merged onto that calendar, so periods it did not
    report show up as rows with missing values.

    Args:
        df: DataFrame with ``TICKER_SYMBOL``, ``END_DATE`` and
            ``FISCAL_PERIOD`` columns. It is not modified.

    Returns:
        DataFrame with two extra columns per stock: ``N_PERIOD`` (number of
        rows the stock had; NaN on calendar-only rows) and ``C_PERIOD``
        (length of the unbroken run of reported periods counted from the most
        recent calendar date, capped at 41). Rows whose FISCAL_PERIOD equals
        0 are removed.
    """
    max_run = 41  # upper bound on the reported run length
    missing_marker = pd.Timestamp('2000-01-01')  # stand-in date for unreported periods

    def merge_DF(df1, df2):
        # df1: rows of one stock; df2: complete calendar (END_DATE only).
        df1 = df1.copy()
        df1['N_PERIOD'] = len(df1)
        merged = (
            df1.merge(df2, on=['END_DATE'], how='outer')
            .sort_values(by='END_DATE', ascending=False)
            .reset_index(drop=True)
        )
        merged['TICKER_SYMBOL'] = merged['TICKER_SYMBOL'].ffill().bfill()
        merged['END_DATE'] = pd.to_datetime(merged['END_DATE'])
        # C_PERIOD carries the stock's own END_DATE; a period is "reported"
        # when it matches the calendar date of the row.
        own_dates = pd.to_datetime(merged['C_PERIOD'].fillna(missing_marker))
        reported = ((merged['END_DATE'] - own_dates).dt.days == 0).astype(int)
        # cumprod turns to 0 at the first gap, so its sum is the length of the
        # leading run; no positional indexing, hence no overrun on short frames.
        merged['C_PERIOD'] = min(max_run, int(reported.cumprod().sum()))
        return merged

    df = df.copy()
    End_date = df.loc[df['TICKER_SYMBOL'] == '000001'].sort_values(by='END_DATE', ascending=False)[['END_DATE']]
    df['C_PERIOD'] = df['END_DATE']
    # Iterating the groups keeps TICKER_SYMBOL inside each piece regardless of
    # how groupby.apply treats grouping columns.
    pieces = [merge_DF(rows, End_date) for _, rows in df.groupby(by='TICKER_SYMBOL')]
    merged_all = pd.concat(pieces) if pieces else df.assign(N_PERIOD=0)
    return merged_all.loc[merged_all['FISCAL_PERIOD'] != 0].reset_index(drop=True)
#营收数据季度化及求营收季度增长率
def REVENUE_q_rate(ff):
    """Derive single-quarter revenue and its quarter-on-quarter growth rate.

    Args:
        ff: DataFrame with ``TICKER_SYMBOL``, ``END_DATE`` (datetime) and
            ``REVENUE``. A ``year`` column is added to it in place.

    Returns:
        DataFrame with ``s_REVENUE``, ``Q_REVENUE``, ``t_REVENUE`` and
        ``Q_REVENUE_R`` added; rows that contain a missing value after the
        growth computation (e.g. each stock's oldest row) are dropped.
    """
    def _quarterly(year_rows):
        # Within one stock and calendar year: reported revenue minus the
        # revenue of the preceding report gives the single-quarter figure.
        newest_first = year_rows.sort_values(by='END_DATE', ascending=False)
        filled = newest_first.assign(s_REVENUE=newest_first['REVENUE'].shift(-1)).fillna(0)
        return filled.assign(Q_REVENUE=filled['REVENUE'] - filled['s_REVENUE'])

    def _growth(stock_rows):
        # Growth of each quarter relative to the quarter before it.
        newest_first = stock_rows.sort_values(by='END_DATE', ascending=False)
        previous = newest_first['Q_REVENUE'].shift(-1)
        with_rate = newest_first.assign(
            t_REVENUE=previous,
            Q_REVENUE_R=(newest_first['Q_REVENUE'] - previous) / previous,
        )
        return with_rate.dropna().reset_index(drop=True)

    ff['year'] = ff['END_DATE'].dt.year
    quarterly = (
        ff.groupby(by=['TICKER_SYMBOL', 'year'], as_index=False)
        .apply(_quarterly)
        .reset_index(drop=True)
    )
    rates = quarterly.groupby(by=['TICKER_SYMBOL'], as_index=False).apply(_growth)
    return rates.reset_index(drop=True)

def continuation_filter(tf):
    """Keep each stock's continuous recent periods and compute growth rates.

    For every run length n from 1 to 42, stocks whose ``C_PERIOD`` equals n
    are reduced to their n most recent rows and passed to REVENUE_q_rate.

    Args:
        tf: DataFrame with ``C_PERIOD``, ``END_DATE`` and ``TICKER_SYMBOL``.

    Returns:
        Concatenation of the per-run-length results.
    """
    mf = pd.DataFrame()
    for run_length in range(1, 43):
        newest_first = tf.loc[tf['C_PERIOD'] == run_length].sort_values(by=['END_DATE'], ascending=False)
        # Only the first `run_length` rows of a stock are known to be gap-free.
        recent = newest_first.groupby(by=['TICKER_SYMBOL'], as_index=False).apply(
            lambda rows: rows[0:run_length]
        )
        if len(recent) != 0:
            recent = REVENUE_q_rate(recent)
        mf = pd.concat([mf, recent])
    return mf

#极值处理
def filter_extreme_percentile(series,min = 0.10,max = 0.90): # 1. percentile method
    """Clip ``series`` to the range between its ``min`` and ``max`` quantiles."""
    lower, upper = series.quantile([min, max])
    return np.clip(series, lower, upper)
 #2.MAD: 中位数去极值。第一步,找出所有因子的中位数 Xmedian;第二步:得到每个因子与中位数的绝对偏差值 Xi−Xmedian;第三步:得到绝对偏差值的中位数 MAD;第四步:确定参数 n ,从而确定合理的范围为 [Xmedian−nMAD,Xmedian nMAD],并针对超出合理 范围的因子值做如下的调整:
def filter_extreme_MAD(series,n=3):
    """Clip ``series`` to median +/- n * MAD.

    MAD is the median of the absolute deviations from the series median.
    """
    center = series.quantile(0.5)
    mad = (series - center).abs().quantile(0.50)
    upper = center + n*mad
    lower = center - n*mad
    return np.clip(series, lower, upper)
def filter_extreme_3sigma(series,n=3): # 3. n-sigma method
    """Clip ``series`` to mean +/- n standard deviations."""
    center = series.mean()
    sigma = series.std()
    upper = center + n*sigma
    lower = center - n*sigma
    return np.clip(series, lower, upper)
def max_replace(df,max=100): # 4. replace growth rates above `max` by the stock's mean
    """Replace quarterly growth rates above ``max`` with the stock's mean rate.

    The mean is taken per TICKER_SYMBOL over the rates that are not above
    ``max``. Only ``Q_REVENUE_R`` is touched; missing values in other columns
    are left as they are. Rows whose rate is missing on input are dropped.

    Args:
        df: DataFrame with ``TICKER_SYMBOL``, ``END_DATE`` and
            ``Q_REVENUE_R``. It is not modified.
        max: largest growth rate considered valid.

    Returns:
        DataFrame sorted by TICKER_SYMBOL (ascending) and END_DATE
        (descending) with a fresh integer index.
    """
    rate = df['Q_REVENUE_R']
    too_large = rate > max
    outliers = df.loc[too_large]
    print ( u'超范围的股票有%d,共%d条。'%(len(outliers['TICKER_SYMBOL'].unique()),len(outliers)))

    # Rows with a missing rate satisfy neither comparison and are left out.
    kept = df.loc[too_large | (rate <= max)].copy()
    # Blank out the outliers so they do not enter the mean.
    kept['Q_REVENUE_R'] = kept['Q_REVENUE_R'].where(kept['Q_REVENUE_R'] <= max)
    kept = kept.sort_values(by=['TICKER_SYMBOL','END_DATE'],ascending=[True,False]).reset_index(drop=True)
    # Fill only the rate column; a frame-wide fillna would also write the
    # mean rate into unrelated columns that happen to contain NaN.
    stock_mean = kept.groupby('TICKER_SYMBOL')['Q_REVENUE_R'].transform('mean')
    kept['Q_REVENUE_R'] = kept['Q_REVENUE_R'].fillna(stock_mean)
    return kept

#模型3数据预处理:计算营收季度增长率,去极值
def model3_data_pre_process(factor_df):
    """Prepare the model-3 input: quarterly revenue growth rates per stock.

    Args:
        factor_df: merged factor table; it is copied, not modified.

    Returns:
        ``(factor, factor_p)``: ``factor`` is the growth-rate table pivoted to
        END_DATE rows (oldest first) by TICKER_SYMBOL columns; ``factor_p`` is
        the long-format data it was built from.
    """
    work = factor_df.copy()
    # Rows without revenue (missing or zero) are useless for growth rates.
    work = work[work['REVENUE'].fillna(0) != 0]
    work = get_period(work)  # adds N_PERIOD / C_PERIOD
    work = work[work['FISCAL_PERIOD'].notnull()]  # drop calendar-only rows

    pool = get_prd_tickers().TICKER_SYMBOL
    work = work[work.TICKER_SYMBOL.isin(pool)]  # stocks of the prediction pool only

    work = continuation_filter(work)  # continuity filter + growth rates
    work = work[work['C_PERIOD'] >= 10]  # need at least 10 continuous periods
    work = max_replace(work, max=3)  # rates above 3 are replaced by the stock mean

    wide = work.pivot_table(index='END_DATE', columns='TICKER_SYMBOL', values='Q_REVENUE_R')
    # Final row order: dates ascending (past to future).
    wide = wide.sort_index(axis=0, ascending=False)
    wide.sort_index(inplace=True, axis=0)
    # Optional outlier treatments that can be combined with max_replace:
    #   wide.apply(filter_extreme_percentile), wide.apply(filter_extreme_MAD, 1),
    #   wide.apply(filter_extreme_3sigma)
    return wide, work

def model3_score(result_series, factor, pred_date):
    """Convert model-3 growth-rate forecasts back to revenue and score them.

    Parameters:
        result_series: forecast growth rates; its index must reset to a
            'TICKER_SYMBOL' column (i.e. the index is named 'TICKER_SYMBOL').
        factor: long-format table with 'TICKER_SYMBOL', 'END_DATE', 'REVENUE',
            'Q_REVENUE' and (for back-tests) 'MARKET_VALUE' columns.
        pred_date: the quarter end being predicted; must be one of the dates
            in the hard-coded list below and not the last of them.

    Returns:
        DataFrame with one row per ticker holding the prior quarter's data,
        the forecast rate and 'REVENUE_prd'. For every date except
        '2018-06-30' it also carries 'NEXT_REVENUE' / 'NEXT_MARKET_VALUE',
        and the result is scored and plotted as a side effect.
    """
    quarter_ends = ['2018-06-30', '2018-03-31', '2017-12-31', '2017-09-30',
                    '2017-06-30', '2017-03-31', '2016-12-31']
    # The list is newest-first, so the next entry is the quarter before pred_date.
    prior_quarter = quarter_ends[quarter_ends.index(pred_date) + 1]
    has_actuals = pred_date != '2018-06-30'

    history = factor.copy()
    rates = pd.DataFrame()
    rates['PRED_REVENUE_rate'] = result_series
    rates = rates.reset_index()

    base_columns = ['TICKER_SYMBOL', 'END_DATE', 'REVENUE', 'Q_REVENUE']
    this_season = history[history['END_DATE'] == prior_quarter][base_columns].reset_index(drop=True)

    if has_actuals:
        # Attach the realised figures of the predicted quarter for scoring.
        actual_columns = ['TICKER_SYMBOL', 'REVENUE', 'MARKET_VALUE']
        next_season = history[history['END_DATE'] == pred_date][actual_columns].reset_index(drop=True)
        next_season.columns = ['TICKER_SYMBOL', 'NEXT_REVENUE', 'NEXT_MARKET_VALUE']
        this_season = this_season.merge(next_season, on='TICKER_SYMBOL')

    this_season = this_season.merge(rates, on='TICKER_SYMBOL')
    print (len(this_season) ,this_season.head())

    quarter_forecast = this_season['Q_REVENUE'] * (this_season['PRED_REVENUE_rate'] + 1)
    if pred_date in ('2018-03-31', '2017-03-31'):
        # First quarter: the forecast quarter is the whole reported figure.
        this_season['REVENUE_prd'] = quarter_forecast
    else:
        # Later quarters: add the forecast quarter to the prior quarter's REVENUE.
        this_season['REVENUE_prd'] = quarter_forecast + this_season['REVENUE']

    del this_season['Q_REVENUE']

    if has_actuals:
        # The scored frame is computed but not used further.
        scored = evaluation_indicator(this_season).reset_index(drop=True)
        # NOTE(review): `plt_scatter1` is not defined in the visible part of this
        # file (only `plt_scatter_1` is) -- confirm it exists, otherwise this raises NameError.
        plt_scatter1(this_season)
    return this_season

############################################################# ARMA #########################################################
# 函数
def test_stationarity(timeseries, adf_output):
    """Run an augmented Dickey-Fuller test and store the result in `adf_output`.

    Parameters:
        timeseries: the series passed to `adfuller`.
        adf_output: single-column DataFrame with at least seven rows. Rows 0-6
            of its first column are overwritten, in order, with: test
            statistic, p-value, lags used, number of observations, and the
            1% / 5% / 10% critical values.

    Returns:
        The same `adf_output` object, modified in place.
    """
    dftest = adfuller(timeseries, autolag='AIC')
    critical_values = dftest[4]
    values = [
        dftest[0],
        dftest[1],
        dftest[2],
        dftest[3],
        critical_values['1%'],
        critical_values['5%'],
        critical_values['10%'],
    ]
    # Write with a single positional indexer: the chained form
    # `adf_output.iloc[r][0] = v` assigns into a temporary row object and is
    # not guaranteed to reach the frame itself.
    for row, value in enumerate(values):
        adf_output.iloc[row, 0] = value
    return adf_output
def time_series_model(dataframe, date):
    """Fit one ARMA model per column and forecast the value at `date`.

    Parameters:
        dataframe: wide table with one column per series and an index that can
            be compared with `date` (model_3 passes the END_DATE x
            TICKER_SYMBOL growth-rate pivot). Only rows with index < `date`
            are used for fitting.
        date: period end to forecast, e.g. '2017-12-31'.

    Returns:
        Series indexed like `dataframe.columns`, holding the forecast mapped
        back through the inverse sigmoid. Only series that passed the ADF
        test, got a non-(0, 0) order and were fitted successfully are kept.
    """
    # 1. Work on a copy of the input.
    data_clr = dataframe.copy()
    # 2. Squash the values with the sigmoid c / (1 + e^-x), then keep only
    #    the history before the forecast date.
    c = 1.0
    data_clr = c / (1 + np.exp(-data_clr))
    data_clr_drop = data_clr[data_clr.index < date]

    # 3. Scratch frame that test_stationarity fills with the ADF test output.
    adf_output = pd.DataFrame(
        index=['Test Statistic Value', 'p-value', 'Lags Used', 'Number of Observations Used',
               'Critical Value(1%)', 'Critical Value(5%)', 'Critical Value(10%)'],
        columns=['Value'])

    # 4. Per-series bookkeeping, all float64 and initialised to zero.
    result_columns = ['p', 'q', 'ARMA_Error', 'ADF_p', 'ADF_Error', 'normal',
                      'predict_value', 'length of data', 'Error', date]
    df_result = pd.DataFrame(0.0, index=dataframe.columns, columns=result_columns)
    # Column positions for `.iat`. The chained form `df_result.iloc[i][name] = v`
    # writes into a temporary row and can be silently lost.
    col = {name: pos for pos, name in enumerate(result_columns)}

    adf_p = 0.05  # ADF p-value threshold; at or above it the series is flagged

    # 5. One model per series.
    for i in range(data_clr_drop.columns.size):
        t1 = time.time()

        # 1) Take column i as a series on the shared index.
        data_s1 = pd.Series(data_clr_drop.iloc[:, i].values, index=data_clr_drop.index)
        # `last_flags` holds the final occurrence of "is null" and of "is not
        # null", in index order. If the null flag comes first, the last gap
        # lies before the last valid value, so keep only what follows it.
        last_flags = data_s1.isnull().drop_duplicates(keep='last')
        if last_flags.iloc[0]:
            data_s1 = data_s1[data_s1.index > last_flags.index[0]]
        data_s = copy.deepcopy(data_s1.dropna())
        df_result.iat[i, col['length of data']] = float(len(data_s))

        # 2) ADF test.
        adf_pvalue = test_stationarity(data_s, adf_output).iloc[1, 0]
        df_result.iat[i, col['ADF_p']] = adf_pvalue
        if adf_pvalue >= adf_p:
            df_result.iat[i, col['ADF_Error']] = 1.0

        # 3) Order selection: prefer BIC, fall back to AIC, then HQIC,
        #    whenever the preferred criterion picks (0, 0).
        order = st.arma_order_select_ic(data_s, max_ar=3, max_ma=3, ic=['aic', 'bic', 'hqic'])
        t_order = order.bic_min_order
        if t_order == (0, 0):
            t_order = order.aic_min_order
        if t_order == (0, 0):
            t_order = order.hqic_min_order
        if t_order == (0, 0):
            df_result.iat[i, col['ARMA_Error']] = 1.0  # all three criteria chose (0, 0)
        df_result.iat[i, col['p']] = float(t_order[0])
        df_result.iat[i, col['q']] = float(t_order[1])

        # 4) Fit and forecast only when an order was found and the ADF
        #    p-value is below the threshold.
        fitted = False
        predict_value = 0.0
        if (t_order != (0, 0)) and (adf_pvalue < adf_p):
            # NOTE(review): `ARMA` comes from statsmodels.tsa.arima_model, which
            # recent statsmodels releases no longer ship -- confirm the pinned version.
            model = ARMA(data_s, order=t_order)
            try:
                result_arma = model.fit()
            except Exception:
                # A failed fit must not abort the whole run; flag the series and move on.
                df_result.iat[i, col['Error']] = 1.0
            else:
                # The prediction window start is hard-coded; the last element
                # is the forecast for `date`.
                forecast = result_arma.predict(start='2017-03-31', end=date)
                predict_value = np.asarray(forecast)[-1]
                df_result.iat[i, col['predict_value']] = predict_value
                df_result.iat[i, col['normal']] = 1.0
                fitted = True

        # 5) Undo the sigmoid for successfully fitted series.
        if fitted:
            df_result.iat[i, col[date]] = -np.log((c - predict_value) / predict_value)

        # 6) Progress output.
        print (i)
        t2 = time.time()
        print (u'用时:',t2-t1,'s')

    # 6. Return the forecasts of the successfully fitted series only.
    result = df_result[df_result['normal'] == 1.0][date]
    return result

def model_3(all_test_date, factor_df):
    """Run model 3 (per-stock ARMA on growth rates) and return its 2018-06-30 forecast.

    Parameters:
        all_test_date: back-test dates, iterated in the given order.
        factor_df: factor table handed to model3_data_pre_process.

    Returns:
        DataFrame indexed by 'TICKER_SYMBOL' with columns 'goal_REVENUE'
        (forecast revenue) and 'priority' (always 1).
    """
    growth_wide, growth_long = model3_data_pre_process(factor_df)

    for date in all_test_date:
        if date < '2017-03-31':
            print (date,":model3 has no predicted value in this date !")
            # NOTE(review): `break` also skips every later date in the list; if the
            # list is in ascending order `continue` may have been intended -- confirm.
            break
        back_test = time_series_model(growth_wide, date)
        # Called for its scoring/plotting side effects; the return value is not needed.
        model3_score(back_test, growth_long, date)

    result = time_series_model(growth_wide, '2018-06-30')
    result = model3_score(result, growth_long, '2018-06-30')

    # Take an explicit copy before adding a column, so the assignment does not
    # target a slice of `result`.
    all_0631_goal = result[['TICKER_SYMBOL', 'REVENUE_prd']].copy()
    all_0631_goal['priority'] = 1  # data priority used when merging models
    all_0631_goal = all_0631_goal.set_index('TICKER_SYMBOL')
    all_0631_goal.columns = ['goal_REVENUE', 'priority']
    return all_0631_goal
 
# 8.model_4---------------------------------------------------------------------------------------
def extract_for_model_4(in_df, test_date) :
    """Split the shifted factor table into train / test / goal sets for model 4.

    Parameters:
        in_df: table with a 'TICKER_SYMBOL' column (it becomes the index of
            all three outputs), 'END_DATE' and the revenue / growth columns
            selected below. The input itself is not modified.
        test_date: END_DATE value held out as the test set.

    Returns:
        (train_df, test_df, goal_df)
        train_df -- every row except `test_date` and '2018-06-30' (that
                    period has no target value yet).
        test_df  -- the rows of `test_date`.
        goal_df  -- the '2018-03-31' rows, with columns renamed so the latest
                    known quarter becomes the lag-1 input of the forecast.
    """
    by_ticker = in_df.copy().set_index('TICKER_SYMBOL')
    end_dates = by_ticker['END_DATE']

    train_df = by_ticker[(end_dates != test_date) & (end_dates != '2018-06-30')]
    test_df = by_ticker[end_dates == test_date]

    goal_columns = ['END_DATE', 'REVENUE', 'Q_REVENUE', 'Q_REVENUE_R',
                    'Q_REVENUE_R_1', 'Q_REVENUE_R_2', 'TYPE_NAME_EN']
    # Each growth-rate column moves one lag further back.
    lag_by_one = {
        'REVENUE': 'LAST_REVENUE',
        'Q_REVENUE': 'LAST_Q_REVENUE',
        'Q_REVENUE_R': 'Q_REVENUE_R_1',
        'Q_REVENUE_R_1': 'Q_REVENUE_R_2',
        'Q_REVENUE_R_2': 'Q_REVENUE_R_3',
    }
    goal_df = by_ticker[end_dates == '2018-03-31'][goal_columns].rename(columns=lag_by_one)
    return train_df, test_df, goal_df

# 机器学习
def sklearn_model_4(train_df, test_df, goal_df, x_name, y_name):
    """Train an RBF-kernel SVR on standardised data and predict test and goal rows.

    Parameters:
        train_df: training rows; must contain the `x_name` and `y_name` columns.
        test_df: held-out rows; must contain the `x_name` and `y_name` columns.
        goal_df: rows to forecast; must contain the `x_name` columns.
        x_name: list of feature column names.
        y_name: target column name.

    Returns:
        (test_out, goal_out)
        test_out -- copy of `test_df` without `y_name`, plus 'RE_rate' (the
                    target after a scale / inverse-scale round trip) and
                    'RE_rate_prd' (the prediction), both in original units.
        goal_out -- copy of `goal_df` plus 'goal_RE_rate_prd'.
        None of the input frames is modified.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR

    # Work on copies so the caller's frames are not modified.
    train_in = train_df.copy()
    test_in = test_df.copy()
    goal_in = goal_df.copy()

    x_train_in = train_in[x_name].values
    y_train_in = train_in[y_name].values.reshape(-1, 1)
    x_test_in = test_in[x_name].values
    y_test_in = test_in[y_name].values.reshape(-1, 1)
    x_goal_in = goal_in[x_name].values

    # Standardise. The scalers are fitted on the training data only and then
    # reused for the test and goal rows, so all three share one scale.
    ss_x = StandardScaler()
    ss_y = StandardScaler()
    x_train = ss_x.fit_transform(x_train_in)
    x_test = ss_x.transform(x_test_in)
    y_train = ss_y.fit_transform(y_train_in)
    y_test = ss_y.transform(y_test_in)
    x_goal = ss_x.transform(x_goal_in)

    # SVR with a radial-basis kernel ('linear' and 'poly' are the alternatives).
    rbf_svr = SVR(kernel='rbf')
    rbf_svr.fit(x_train, y_train.ravel())
    y_test_prd = rbf_svr.predict(x_test)
    y_goal_prd = rbf_svr.predict(x_goal)

    # Back to original units. `predict` returns 1-D arrays while the scaler
    # works on 2-D (n_samples, 1) input, so reshape before inverting and
    # flatten again for the column assignment.
    y_test_inverse = ss_y.inverse_transform(y_test).ravel()
    y_test_prd_inverse = ss_y.inverse_transform(y_test_prd.reshape(-1, 1)).ravel()
    y_goal_prd_inverse = ss_y.inverse_transform(y_goal_prd.reshape(-1, 1)).ravel()

    del test_in[y_name]
    test_in['RE_rate'] = y_test_inverse
    test_in['RE_rate_prd'] = y_test_prd_inverse
    goal_in['goal_RE_rate_prd'] = y_goal_prd_inverse

    return test_in, goal_in

def factor_shift(in_df) :
    """Add lagged growth-rate and revenue columns to one stock's history.

    The rows are sorted newest-first by 'END_DATE', so shifting by -k pulls in
    the value from k rows (periods) earlier.

    Parameters:
        in_df: rows with 'END_DATE', 'REVENUE', 'Q_REVENUE' and 'Q_REVENUE_R'
            columns. The input itself is not modified.

    Returns:
        The sorted copy with 'Q_REVENUE_R_1' .. 'Q_REVENUE_R_3',
        'LAST_REVENUE' and 'LAST_Q_REVENUE' added; every row containing a
        missing value in any column is dropped.
    """
    ordered = in_df.copy().sort_values('END_DATE', ascending=False)

    growth = ordered['Q_REVENUE_R']
    for lag in (1, 2, 3):
        ordered['Q_REVENUE_R_%d' % lag] = growth.shift(-lag)

    # Previous period's cumulative and single-quarter revenue.
    for source in ('REVENUE', 'Q_REVENUE'):
        ordered['LAST_' + source] = ordered[source].shift(-1)

    return ordered.dropna()

def model_4(all_industries, factor_df, percent=0.9):
    """Model 4: per-industry SVR on lagged quarterly revenue growth rates.

    For each of four hard-coded back-test dates and each industry, an SVR is
    trained on the three previous growth rates, evaluated on the back-test
    date and used to forecast the goal period (2018-03-31 inputs).

    Parameters:
        all_industries: iterable of 'TYPE_NAME_EN' values to model separately.
        factor_df: factor table handed to model3_data_pre_process.
        percent: score quantile passed to quantile_goal; on each back-test
            date only stocks scoring below that quantile are kept.

    Returns:
        (everytimes_goal_prd, goal_list4)
        everytimes_goal_prd -- frame indexed by ticker with one forecast
            column per back-test date ('goal_REVENUE_prd_<date>'), their row
            mean in 'goal_REVENUE', and 'priority' (always 1).
        goal_list4 -- tickers that stayed in the kept set on every back-test date.
    """
    dfnn = model3_data_pre_process(factor_df)
    df_copy0 = dfnn[1].copy()[['TICKER_SYMBOL', 'END_DATE', 'REVENUE', 'Q_REVENUE',
                               'Q_REVENUE_R', 'MARKET_VALUE', 'TYPE_NAME_EN']]
    # Build the lag features stock by stock. Iterating the groups explicitly
    # keeps the 'TICKER_SYMBOL' column in every piece, independent of how
    # groupby.apply treats grouping columns.
    df_shift = pd.concat(
        [factor_shift(group) for _, group in df_copy0.groupby(by='TICKER_SYMBOL')]
    ).reset_index(drop=True)

    all_test_date = ['2014-06-30', '2015-06-30', '2016-06-30', '2017-06-30']  # only for model_4

    prd_ticker_list = get_prd_tickers().TICKER_SYMBOL
    goal_set = set(prd_ticker_list)
    everytimes_goal_prd = pd.DataFrame(index=prd_ticker_list)

    x_name = ['Q_REVENUE_R_1', 'Q_REVENUE_R_2', 'Q_REVENUE_R_3']
    y_name = 'Q_REVENUE_R'

    for test_date in all_test_date:
        all_result_prd_list = []
        all_goal_prd_list = []

        print (test_date,'总共%d个行业'%len(all_industries))
        for industry in all_industries:
            # Train one model per industry.
            df_copy = df_shift.loc[df_shift['TYPE_NAME_EN'] == industry]
            # Drop factors with fewer non-missing values than a tenth of the rows.
            df_copy = df_copy.dropna(axis=1, thresh=int(len(df_copy) * 1.0 / 10))
            train_df, test_df, goal_df = extract_for_model_4(df_copy, test_date)

            result_prd, goal_prd = sklearn_model_4(train_df, test_df, goal_df, x_name, y_name)
            result_prd = result_prd[result_prd.index.isin(prd_ticker_list)]
            goal_prd = goal_prd[goal_prd.index.isin(prd_ticker_list)]

            all_result_prd_list.append(result_prd)  # back-test predictions, per industry
            all_goal_prd_list.append(goal_prd)      # goal-period predictions, per industry

        final_result = pd.concat(all_result_prd_list, axis=0)
        allindustries_goal_prd_df = pd.concat(all_goal_prd_list, axis=0)
        final_result['NEXT_MARKET_VALUE'] = final_result['MARKET_VALUE']

        # Rebuild cumulative revenue from the growth rate. For a first-quarter
        # row the quarter is the whole figure; otherwise it is added to the
        # previous period's revenue. Whole columns are assigned at once;
        # chained writes such as frame[col][mask] = ... go through a temporary
        # object and can be silently lost.
        is_first_quarter = (final_result['END_DATE'].dt.month == 3).values
        actual_quarter = final_result['LAST_Q_REVENUE'] * (1 + final_result['RE_rate'])
        predicted_quarter = final_result['LAST_Q_REVENUE'] * (1 + final_result['RE_rate_prd'])
        final_result['NEXT_REVENUE'] = np.where(
            is_first_quarter, actual_quarter, final_result['LAST_REVENUE'] + actual_quarter)
        final_result['REVENUE_prd'] = np.where(
            is_first_quarter, predicted_quarter, final_result['LAST_REVENUE'] + predicted_quarter)

        final_result = final_result[['END_DATE', 'NEXT_REVENUE', 'REVENUE', 'REVENUE_prd', 'NEXT_MARKET_VALUE']]

        # Goal-period forecast for this back-test date. The column name carries
        # the date so repeated merges never collide on the same name (merge
        # suffixes would otherwise produce duplicate column names).
        goal_column = 'goal_REVENUE_prd_%s' % test_date
        allindustries_goal_prd_df[goal_column] = (
            allindustries_goal_prd_df['LAST_REVENUE']
            + allindustries_goal_prd_df['LAST_Q_REVENUE'] * (1 + allindustries_goal_prd_df['goal_RE_rate_prd'])
        )
        everytimes_goal_prd = everytimes_goal_prd.merge(
            allindustries_goal_prd_df[[goal_column]], left_index=True, right_index=True, how='outer')

        # Keep only the stocks inside the requested score quantile on this date.
        goal_set = quantile_goal(final_result, goal_set, percent)

    # Average the per-date forecasts into the final goal value.
    everytimes_goal_prd['goal_REVENUE'] = everytimes_goal_prd.mean(axis=1)
    everytimes_goal_prd['priority'] = 1  # data priority used when merging models
    goal_list4 = list(goal_set)

    return everytimes_goal_prd, goal_list4
    
# 8.评审公式打分、画图展示、结果输出--------------------------------------------------------------------
# 评委会评估指标,返回一个含有预测股票score的dataframe,按评审公式打分
def evaluation_indicator(df_prd) :
    """Score predictions with the competition formula (lower is better, 0 is best).

    score = min(|REVENUE_prd / NEXT_REVENUE - 1|, 0.8) * log2(max(NEXT_MARKET_VALUE / 1e8, 2))

    Parameters:
        df_prd: frame with 'NEXT_MARKET_VALUE', 'REVENUE_prd' and
            'NEXT_REVENUE' columns. It is copied, not modified.

    Returns:
        Copy of `df_prd` where 'NEXT_MARKET_VALUE' is replaced by the log2
        weight and the columns 'temp' (capped relative error) and 'score'
        are added. An empty input yields an empty result.
    """
    df = df_prd.copy()
    # Market value in units of 1e8, floored at 2. One-sided clips are used on
    # purpose: clipping to [2, max(values)] collapses every value to the
    # maximum when all values are below 2, fails on an empty frame, and lets a
    # NaN in the computed bound spread to every row.
    market_value = (df['NEXT_MARKET_VALUE'] / 1e8).clip(lower=2)
    df['NEXT_MARKET_VALUE'] = np.log2(market_value)
    # Relative prediction error, capped at 0.8.
    df['temp'] = (df['REVENUE_prd'] / df['NEXT_REVENUE'] - 1.0).abs().clip(upper=0.8)
    df['score'] = df['temp'] * df['NEXT_MARKET_VALUE']
    return df

#选取指定百分位股票
def quantile_goal(final_result, goal_set, percent):
    """Intersect `goal_set` with the stocks whose score is inside a quantile.

    Parameters:
        final_result: prediction frame accepted by evaluation_indicator; its
            index labels are the stock identifiers.
        goal_set: set of stock identifiers kept so far.
        percent: quantile of the score distribution; rows scoring strictly
            below that quantile are kept. A single-row frame is kept as is.

    Returns:
        New set holding the members of `goal_set` that are also kept here.
    """
    scored = evaluation_indicator(final_result)
    if len(scored) != 1:
        cutoff = scored['score'].quantile(percent)
        scored = scored[scored['score'] < cutoff]
    return goal_set & set(scored.index.tolist())

# 画图看效果
def plt_scatter_1(final_result) :
    """Print the mean score and scatter-plot predicted against actual revenue.

    Parameters:
        final_result: frame with 'NEXT_REVENUE' and 'REVENUE_prd' columns,
            plus whatever evaluation_indicator needs for scoring.

    The dashed diagonal is y = x over the range of the actual values; points
    closer to it have a smaller prediction error.
    """
    scored = evaluation_indicator(final_result)
    print ('打分股票数:',len(scored))
    print ('目标股票评委会评估分数,0分为最高分:',scored['score'].mean())

    import matplotlib.pyplot as plt

    actual = final_result['NEXT_REVENUE']
    predicted = final_result['REVENUE_prd']
    diagonal = [min(actual), max(actual)]

    fig, ax = plt.subplots()
    ax.scatter(actual, predicted)
    ax.plot(diagonal, diagonal, 'k--', lw=4)
    ax.set_xlabel('True_value')
    ax.set_ylabel('Predicted_value')
    plt.show()
       
# 将最后结果以官方要求存入csv文件
# 输入说明:df_prd中TICKER_SYMBOL列为股票代码,Revenue_prd列为预测结果
def write_prd_tocsv(df_prd, test_date='2018-06-30', Flag='model_1', a=1):
    """Write predictions to a CSV file keyed by the official ticker list.

    Parameters:
        df_prd: predictions indexed by the 6-character ticker code. Needs a
            'goal_REVENUE' column when a == 1, otherwise a 'REVENUE_prd' column.
        test_date: date string embedded in the output file name.
        Flag: model label embedded in the output file name.
        a: 1 writes the submission file ('goal_REVENUE' divided by 1,000,000
            and formatted with two decimals); any other value writes the raw
            'REVENUE_prd' values for the tickers of the official list.

    The file is written to the current working directory without a header
    row; today's date is appended to its name.
    """
    from datetime import datetime

    # Official target list: full ticker plus its first six characters as join key.
    pool = pd.read_csv(tickers_Path, header=None)
    pool.columns = ['TICKER_SYMBOL']
    pool['TICKER_SYMBOL_Nu.'] = pool['TICKER_SYMBOL'].str.slice(0, 6)
    pool['Flag'] = 1

    merged = pd.merge(pool, df_prd, left_on='TICKER_SYMBOL_Nu.', right_index=True, how='outer')
    today = datetime.now().date().isoformat()

    if a == 1:
        merged = merged.set_index('TICKER_SYMBOL')
        in_millions = merged['goal_REVENUE'] / 1000000
        formatted = in_millions.apply("{0:.02f}".format)  # keep two decimals
        formatted.to_csv('FDDC_financial_submit_%s_%s_%s.csv'%(Flag,test_date,today), sep=',',index=True,header=None)
    else:
        # Flag == 1 marks rows that come from the official list.
        listed = merged[['TICKER_SYMBOL', 'REVENUE_prd']][merged['Flag'] == 1]
        listed = listed.set_index('TICKER_SYMBOL')
        listed.to_csv('REVENUE_prd_%s_%s_%s.csv'%(Flag,test_date,today), sep=',',index=True,header=None)

    print ('write is ok')
        

def main():
    """Run models 1, 2 and 4, merge their forecasts and write the submission CSV.

    Forecasts are combined per ticker: first the stocks each of model 4 and
    model 2 selected for itself, then, for tickers still missing, the full
    output of model 4, model 2 and model 1 in that order. The result is
    written to '../submit/submit_<timestamp>.csv' in millions, two decimals.
    """
    # Optional one-off conversion of the Excel statements to CSV:
    #     Excel_to_csv(Balance_sh_Path,Balance_sh_csv_Path)
    #     Excel_to_csv(Cash_F_st_Path,Cash_F_st_csv_Path)
    #     Excel_to_csv(Income_st_Path,Income_st_csv_Path)

    # Factor names taken from the three financial statements.
    factors_income = ['N_INCOME', 'INCOME_TAX', 'NOPERATE_INCOME', 'BIZ_TAX_SURCHG', 'COGS',
                      'GENL_ADMIN_EXP', 'REVENUE', 'OPERATE_PROFIT', 'INT_EXP']
    factors_balance = ['T_LIAB', 'T_ASSETS', 'T_SH_EQUITY', 'DIV_PAYABLE']
    factors_cash = ['N_CF_OPERATE_A', 'N_CF_FR_INVEST_A', 'N_CF_FR_FINAN_A', 'N_CE_END_BAL', 'C_OUTF_FR_FINAN_A']

    indu_tickers = ''  # empty string means all industries
    PIT_date = '2018-06-30'

    # Read the factors of the three statements plus market value into one frame.
    Factor_df = Factor_Merge(indu_tickers, factors_income, factors_balance, factors_cash, PIT_date)

    all_test_date = ['2014-03-31', '2015-03-31', '2016-03-31', '2017-03-31']
    goal_set = set(get_prd_tickers().TICKER_SYMBOL)
    tickers = pd.read_excel(Market_Data_Path, sheet_name=u'DATA')
    tickers['TICKER_SYMBOL'] = tickers['TICKER_SYMBOL'].astype(str).str.zfill(6)
    in_pool = (tickers['END_DATE_'] == '2018-05-31') & (tickers.TICKER_SYMBOL.isin(goal_set))
    all_industries = tickers[in_pool].TYPE_NAME_EN.drop_duplicates()

    df_model1, final_goal_list1 = model_1(all_test_date, all_industries, Factor_df)
    df_model2, final_goal_list2 = model_2(all_test_date, all_industries, Factor_df, percent=0.4)
    df_model4, final_goal_list4 = model_4(all_industries, Factor_df)

    # Forecasts each model made for the stocks it selected itself.
    df2 = df_model2[df_model2.index.isin(final_goal_list2)][['goal_REVENUE', 'priority']].sort_index()
    df4 = df_model4[df_model4.index.isin(final_goal_list4)][['goal_REVENUE', 'priority']].sort_index()

    # pd.concat replaces DataFrame.append, which newer pandas no longer provides.
    df4 = pd.concat([df4, df2]).reset_index()
    df4 = df4.sort_values(['TICKER_SYMBOL', 'priority']).drop_duplicates(['TICKER_SYMBOL'], keep='last')

    # Official target list: full ticker plus its first six characters as join key.
    df = pd.read_csv(tickers_Path, header=None)
    df.columns = ['TICKER_SYMBOL_a']
    df['TICKER_SYMBOL'] = df['TICKER_SYMBOL_a'].str.slice(0, 6)
    set1 = set(df.TICKER_SYMBOL)

    # Fill tickers that are still missing from the full model outputs, in
    # order of preference. Once nothing is missing the later models cannot
    # add anything, so the loop stops.
    for fallback in (df_model4, df_model2, df_model1):
        missing = set1 - set(df4.TICKER_SYMBOL)
        if not missing:
            break
        extra = fallback[['goal_REVENUE', 'priority']].reset_index()
        extra = extra[extra.TICKER_SYMBOL.isin(missing)]
        df4 = pd.concat([df4, extra])

    # One row per ticker, chosen by priority, restricted to the official list.
    df4 = df4.sort_values(['TICKER_SYMBOL', 'priority']).drop_duplicates(['TICKER_SYMBOL'], keep='first')
    df4 = df4[df4.TICKER_SYMBOL.isin(df.TICKER_SYMBOL)]
    df_merge = df4.merge(df, on='TICKER_SYMBOL', how='inner')
    df_result = df_merge[['TICKER_SYMBOL_a', 'goal_REVENUE']].set_index('TICKER_SYMBOL_a')
    df_result['goal_REVENUE'] = df_result['goal_REVENUE'] / 1000000  # in millions
    formater = "{0:.02f}".format
    df_result = df_result['goal_REVENUE'].apply(formater)  # two decimals

    import datetime
    nowTime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    df_result.to_csv('../submit/submit_%s.csv'%nowTime, sep=',',index=True,header=None)

# Script entry point: run the whole pipeline and print the elapsed wall-clock time in seconds.
t1 = time.time()
main()
t2 = time.time()
print ('Main total used time:',t2-t1)
Main total used time: 1014.0587222576141
部分结果(每行为:股票代码,预测营收,单位:百万元):
000001.XSHE,32235.43
000005.XSHE,811.81
000006.XSHE,1841.47
000009.XSHE,6205.56
000010.XSHE,5932.63
000011.XSHE,1367.86
000012.XSHE,5911.00
000014.XSHE,1009.92
000017.XSHE,1537.76
000018.XSHE,7146.10
000021.XSHE,8316.92
000025.XSHE,1507.55
000026.XSHE,1239.08
000027.XSHE,7874.28
……

猜你喜欢

转载自blog.csdn.net/weixin_42432468/article/details/89389265
今日推荐