FDDC2018金融算法挑战赛01-A股上市公司季度营收预测

天池大赛链接

我所用到的数据

1、income_gb_2代表的是我从天池原有的income_statement中的general business导出的,balance_gb_2和cash_gb_2 

首亦然。

2、 Macro为宏观数据,Market为市场数据

导入相关包,将工作目录改为数据所在目录

from pandas import DataFrame
from numpy import nan as NA
from pandas import Series
import os 
import pandas as pd
import numpy as  np
import random
import time
import threading as td
import multiprocessing as mp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA

#改变工作目录
os.chdir('E://kaggle//tc2')

1、由于资产负债表、利润表和现金流量表内各个数据间在存在等式关系,例如:资产=负债+ 股东权益,利润总额=营业收入-营业成本-各项费用等。根据这些内在逻辑剔除异常值

2、由于内在等式关系, 对于缺失值则直接填充0,不改变原有等式关系

3、数据中有些列是前方列的和,为排除多重共线性,将这些列予以剔除

#根据财务领域知识剔除异常值与线性相关的列
#利润表
#导入数据
income_gb2=pd.read_csv('income_gb_2.csv')
#填充缺失值
income_gb2=income_gb2.fillna(0)
#建立空列表,用于收集需要剔除的观测样本
income_drop_index=[]
#检测异常样本
for i in range(np.shape(income_gb2)[0]):
    if (income_gb2.ix[i,9]-income_gb2.ix[i,10:16].sum()) >1000 or \
       (income_gb2.ix[i,9]-income_gb2.ix[i,10:16].sum()) <-1000 or \
       (income_gb2.ix[i,16]-income_gb2.ix[i,17:32].sum()) >1000 or \
       (income_gb2.ix[i,16]-income_gb2.ix[i,17:32].sum()) <-1000 or \
       (income_gb2.ix[i,10:16].sum()-income_gb2.ix[i,17:32].sum()+income_gb2.ix[i,32:34].sum()+ \
        income_gb2.ix[i,35:40].sum()-income_gb2.ix[i,40]) > 1000 or \
       (income_gb2.ix[i,10:16].sum()-income_gb2.ix[i,17:32].sum()+income_gb2.ix[i,32:34].sum()+ \
        income_gb2.ix[i,35:40].sum()-income_gb2.ix[i,40]) < -1000 :
           income_drop_index.append(i)       
    print((i/np.shape(income_gb2)[0])*100)  

#剔除观测样本  
income_gb2_drop=income_gb2.drop(income_drop_index,axis=0)
#根据业务逻辑剔除数据中线性相关的列,防止多重共线性 
income_gb2_drop=income_gb2.drop(['T_REVENUE','T_COGS','OPERATE_PROFIT','N_INCOME','T_COMPR_INCOME'],axis=1)   
income_gb2_drop.to_csv('income_gb2_drop.csv',index=None)       
#资产负债表
#处理方式同上
balance_gb2=pd.read_csv('balance_gb_2.csv')
balance_gb2=balance_gb2.fillna(0)
balance_gb2_drop=balance_gb2
balance_gb2_drop1=balance_gb2.drop(['T_CA','T_NCA','T_ASSETS','T_CL','T_NCL','T_LIAB',
                                   'PREFERRED_STOCK_E','PREFERRED_STOCK_L','T_EQUITY_ATTR_P',
                                   'T_SH_EQUITY','T_LIAB_EQUITY'],axis=1)
balance_drop_index_total=[]
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop1.ix[i,9:list(balance_gb2_drop1.columns).index('ST_BORR')].sum() - \
        balance_gb2_drop1.ix[i,list(balance_gb2_drop1.columns).index('ST_BORR'):].sum()) >10000 or \
       (balance_gb2_drop1.ix[i,9:list(balance_gb2_drop1.columns).index('ST_BORR')].sum() - \
        balance_gb2_drop1.ix[i,list(balance_gb2_drop1.columns).index('ST_BORR'):].sum()) < -10000 :
        balance_drop_index_total.append(i) 
    print((i+1)/209872)

balance_drop_index_sum=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_ASSETS')] - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_LIAB_EQUITY')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_ASSETS')] - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_LIAB_EQUITY')]) < -10000 :
        balance_drop_index_sum.append(i) 
    print((i+1)/209872)     


balance_drop_index_TCA=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,9:list(balance_gb2_drop.columns).index('T_CA')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_CA')]) >10000 or \
       (balance_gb2_drop.ix[i,9:list(balance_gb2_drop.columns).index('T_CA')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_CA')]) < -10000 :
        balance_drop_index_TCA.append(i) 
    print((i+1)/209872)     

balance_drop_index_TNCA=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('DISBUR_LA'):list(balance_gb2_drop.columns).index('T_NCA')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_NCA')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('DISBUR_LA'):list(balance_gb2_drop.columns).index('T_NCA')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_NCA')]) < -10000 :
        balance_drop_index_TNCA.append(i) 
    print((i+1)/209872)     
#
balance_drop_index_T_CL=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('ST_BORR'):list(balance_gb2_drop.columns).index('T_CL')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_CL')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('ST_BORR'):list(balance_gb2_drop.columns).index('T_CL')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_CL')]) < -10000 :
        balance_drop_index_T_CL.append(i) 
    print((i+1)/209872)      
    
balance_drop_index_T_NCL=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('LT_BORR'):list(balance_gb2_drop.columns).index('T_NCL')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_NCL')] -balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PREFERRED_STOCK_L')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('LT_BORR'):list(balance_gb2_drop.columns).index('T_NCL')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_NCL')]-balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PREFERRED_STOCK_L')]) < -10000 :
        balance_drop_index_T_NCL.append(i) 
    print((i+1)/209872)    
    
balance_drop_index_T_EQUITY_ATTR_P=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PAID_IN_CAPITAL'):list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P')] -balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PREFERRED_STOCK_E')]) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PAID_IN_CAPITAL'):list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P')]-balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('PREFERRED_STOCK_E')]) < -10000 :
        balance_drop_index_T_EQUITY_ATTR_P.append(i) 
    print((i+1)/209872)

balance_drop_index_T_SH_EQUITY=[]    
for i in range(np.shape(balance_gb2_drop)[0]) :
    if (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P'):list(balance_gb2_drop.columns).index('T_SH_EQUITY')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_SH_EQUITY')] ) >10000 or \
       (balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_EQUITY_ATTR_P'):list(balance_gb2_drop.columns).index('T_SH_EQUITY')].sum() - \
        balance_gb2_drop.ix[i,list(balance_gb2_drop.columns).index('T_SH_EQUITY')]) < -10000 :
        balance_drop_index_T_SH_EQUITY.append(i) 
    print((i+1)/209872)      
    
balance_drop_index_final=balance_drop_index_sum+balance_drop_index_TCA+balance_drop_index_TNCA+balance_drop_index_T_CL+balance_drop_index_T_NCL+balance_drop_index_T_EQUITY_ATTR_P+balance_drop_index_T_SH_EQUITY
balance_drop_index_final=list(set(balance_drop_index_final))
balance_gb2_drop_final=balance_gb2.drop(balance_drop_index_final,axis=0) 
balance_gb2_drop_final=balance_gb2_drop_final.drop(['T_CA','T_NCA','T_ASSETS','T_CL','T_NCL','T_LIAB',
                                   'PREFERRED_STOCK_E','PREFERRED_STOCK_L','T_EQUITY_ATTR_P',
                                   'T_SH_EQUITY','T_LIAB_EQUITY'],axis=1) 
balance_gb2_drop_final.to_csv('balance_gb2_drop.csv',index=None)  

#现金流量表  
#处理方式同上  
cash_gb2=pd.read_csv('cash_gb_2.csv')
cash_gb2=cash_gb2.fillna(0)   

cash_drop_index_OPERATE_A=[]    
for i in range(np.shape(cash_gb2)[0]) :
    if abs(cash_gb2.ix[i,list(cash_gb2.columns).index('C_FR_SALE_G_S'):list(cash_gb2.columns).index('C_INF_FR_OPERATE_A')].sum() - \
        cash_gb2.ix[i,list(cash_gb2.columns).index('C_PAID_G_S'):list(cash_gb2.columns).index('C_OUTF_OPERATE_A')].sum() + \
        cash_gb2.ix[i,list(cash_gb2.columns).index('ANOCF')] - \
        cash_gb2.ix[i,list(cash_gb2.columns).index('N_CF_OPERATE_A')]) >10000 :
        cash_drop_index_OPERATE_A.append(i) 
    print((i+1)/209872)      
    
cash_drop_index_INVEST_A=[]    
for i in range(np.shape(cash_gb2)[0]) :
    if abs(cash_gb2.ix[i,list(cash_gb2.columns).index('PROC_SELL_INVEST'):list(cash_gb2.columns).index('C_INF_FR_INVEST_A')].sum() - \
        cash_gb2.ix[i,list(cash_gb2.columns).index('PUR_FIX_ASSETS_OTH'):list(cash_gb2.columns).index('C_OUTF_FR_INVEST_A')].sum() + \
        cash_gb2.ix[i,list(cash_gb2.columns).index('ANICF')] - \
        cash_gb2.ix[i,list(cash_gb2.columns).index('N_CF_FR_INVEST_A')]) >10000 :
        cash_drop_index_INVEST_A.append(i) 
    print((i+1)/209872)      

cash_drop_index_FINAN_A=[]    
for i in range(np.shape(cash_gb2)[0]) :
    if abs(cash_gb2.ix[i,list(cash_gb2.columns).index('C_FR_CAP_CONTR'):list(cash_gb2.columns).index('C_INF_FR_FINAN_A')].sum() - \
        cash_gb2.ix[i,list(cash_gb2.columns).index('C_PAID_FOR_DEBTS'):list(cash_gb2.columns).index('C_OUTF_FR_FINAN_A')].sum() + \
        cash_gb2.ix[i,list(cash_gb2.columns).index('ANFCF')] - \
        cash_gb2.ix[i,list(cash_gb2.columns).index('N_CF_FR_FINAN_A')] -\
        cash_gb2.ix[i,list(cash_gb2.columns).index('C_FR_MINO_S_SUBS')] + \
        cash_gb2.ix[i,list(cash_gb2.columns).index('DIV_PROF_SUBS_MINO_S')]) >10000 :
        cash_drop_index_FINAN_A.append(i) 
    print((i+1)/209872)     

cash_drop_index_BAL=[]    
for i in range(np.shape(cash_gb2)[0]) :
    if abs(cash_gb2.ix[i,list(cash_gb2.columns).index('N_CHANGE_IN_CASH'):list(cash_gb2.columns).index('N_CE_END_BAL')].sum() - \
           cash_gb2.ix[i,list(cash_gb2.columns).index('N_CE_END_BAL')]) >10000 :
        cash_drop_index_BAL.append(i) 
    print((i+1)/209872)
    
cash_drop_index_final=cash_drop_index_OPERATE_A+cash_drop_index_INVEST_A+cash_drop_index_FINAN_A+cash_drop_index_BAL
cash_drop_index_final=list(set(cash_drop_index_final))    
cash_gb2_drop_final=cash_gb2.drop(cash_drop_index_final,axis=0)     
cash_gb2_drop_final=cash_gb2_drop_final.drop(['C_INF_FR_OPERATE_A','C_OUTF_OPERATE_A','N_CF_OPERATE_A','C_INF_FR_INVEST_A','C_OUTF_FR_INVEST_A','N_CF_FR_INVEST_A',
                                   'C_INF_FR_FINAN_A','C_OUTF_FR_FINAN_A','N_CF_FR_FINAN_A',
                                   'N_CHANGE_IN_CASH','N_CE_END_BAL'],axis=1) 
cash_gb2_drop_final.to_csv('cash_gb2_drop.csv',index=False)

1、整理数据为后面形成训练集做准备。如同一个数据有多个值选取最近更新的数据

#打开文件
cash_gb0=pd.read_csv('cash_gb2_drop.csv')
#1、将时间列表提取
#2、然后从字符串变为时间戳,并改为数据框
#3、并入原数据框
date_pub=cash_gb0['PUBLISH_DATE'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_pub=DataFrame(date_pub.values,columns=['PUBLISH_DATE_mktime'])
date_rep=cash_gb0['END_DATE_REP'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_rep=DataFrame(date_rep.values,columns=['END_DATE_REP_mktime'])
date_end=cash_gb0['END_DATE'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_end=DataFrame(date_end.values,columns=['END_DATE_mktime'])

cash_gb0=pd.concat([cash_gb0,date_pub],axis=1)
cash_gb0=pd.concat([cash_gb0,date_rep],axis=1)
cash_gb0=pd.concat([cash_gb0,date_end],axis=1)
cash_gb0.sort_index(by=['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],inplace=True)
cash_gb0.set_index(['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],inplace=True,drop=False)
#用循环将最新截止日期的财报筛选出来
sum1=0
ticker_unique=cash_gb0['TICKER_SYMBOL'].unique()
for i in ticker_unique:   
    cash_slice=cash_gb0.ix[i]
    end_date_unique=cash_slice['END_DATE_mktime'].unique()
    sum1 += 1
    for j in end_date_unique:
        index_t1=cash_gb0.ix[i,j]['PUBLISH_DATE_mktime'].values[-1]
        index_t2=cash_gb0.ix[i,j]['END_DATE_REP_mktime'].values[-1]        
        cash_gb0.ix[(i,j,index_t1,index_t2),'PARTY_ID']=-1
    print(sum1/len(ticker_unique))        

cash_gb1=cash_gb0.drop(['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],axis=1)
#将'party_id'列不为-1的行全部转换为缺失值
#将索引转换为列
cash_gb1=cash_gb1.reset_index()
for i in range(np.shape(cash_gb1)[0]):
    if cash_gb1.ix[i,'PARTY_ID'] !=-1:
        cash_gb1.iloc[i]=NA
    else:           
        continue
    print(i/np.shape(cash_gb1)[0])
cash_gb1.to_csv('cash_gb1.csv')

cash_gb2=cash_gb1
#删除全为缺失值一行
cash_gb2=cash_gb2.dropna(how='all')
#设置层次化索引
cash_gb2=cash_gb2.set_index(['TICKER_SYMBOL','END_DATE'])
#删除不需要的列
cash_gb2=cash_gb2.drop(['PARTY_ID','EXCHANGE_CD','REPORT_TYPE','FISCAL_PERIOD','MERGED_FLAG','PUBLISH_DATE',
                        'END_DATE_REP','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],axis=1)
#为层次索引排序
cash_gb2=cash_gb2.sortlevel(0)
#文件输出
cash_gb2.to_csv('cash_data.csv')

#打开文件
balance_gb0=pd.read_csv('balance_gb2_drop.csv')
#1、将时间列表提取
#2、然后从字符串变为时间戳,并改为数据框
#3、并入原数据框
date_pub=balance_gb0['PUBLISH_DATE'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_pub=DataFrame(date_pub.values,columns=['PUBLISH_DATE_mktime'])
date_rep=balance_gb0['END_DATE_REP'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_rep=DataFrame(date_rep.values,columns=['END_DATE_REP_mktime'])
date_end=balance_gb0['END_DATE'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_end=DataFrame(date_end.values,columns=['END_DATE_mktime'])

balance_gb0=pd.concat([balance_gb0,date_pub],axis=1)
balance_gb0=pd.concat([balance_gb0,date_rep],axis=1)
balance_gb0=pd.concat([balance_gb0,date_end],axis=1)
balance_gb0.sort_index(by=['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],inplace=True)
balance_gb0.set_index(['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],inplace=True,drop=False)
#用循环将最新截止日期的财报筛选出来
sum1=0
ticker_unique=balance_gb0['TICKER_SYMBOL'].unique()
for i in ticker_unique:   
    balance_slice=balance_gb0.ix[i]
    end_date_unique=balance_slice['END_DATE_mktime'].unique()
    sum1 += 1
    for j in end_date_unique:
        index_t1=balance_gb0.ix[i,j]['PUBLISH_DATE_mktime'].values[-1]
        index_t2=balance_gb0.ix[i,j]['END_DATE_REP_mktime'].values[-1]        
        balance_gb0.ix[(i,j,index_t1,index_t2),'PARTY_ID']=-1
    print(sum1/len(ticker_unique))        
#将'party_id'列不为-1的行全部转换为缺失值
balance_gb1=balance_gb0.drop(['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],axis=1)
#将索引转换为列
balance_gb1=balance_gb1.reset_index()
for i in range(np.shape(balance_gb1)[0]):
    if balance_gb1.ix[i,'PARTY_ID'] !=-1:
        balance_gb1.iloc[i]=NA
    else:           
        continue
    print(i/np.shape(balance_gb1)[0])
balance_gb1.to_csv('balance_gb1.csv')

balance_gb2=balance_gb1
#删除全为缺失值一行
balance_gb2=balance_gb2.dropna(how='all')
#设置层次化索引
balance_gb2=balance_gb2.set_index(['TICKER_SYMBOL','END_DATE'])
#删除不需要的列
balance_gb2=balance_gb2.drop(['PARTY_ID','EXCHANGE_CD','REPORT_TYPE','FISCAL_PERIOD','MERGED_FLAG','PUBLISH_DATE',
                        'END_DATE_REP','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],axis=1)
#为层次索引排序
balance_gb2=balance_gb2.sortlevel(0)
#文件输出
balance_gb2.to_csv('balance_data.csv')

#打开文件
income_gb0=pd.read_csv('income_gb2_drop.csv')
#1、将时间列表提取
#2、然后从字符串变为时间戳,并改为数据框
#3、并入原数据框
date_pub=income_gb0['PUBLISH_DATE'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_pub=DataFrame(date_pub.values,columns=['PUBLISH_DATE_mktime'])
date_rep=income_gb0['END_DATE_REP'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_rep=DataFrame(date_rep.values,columns=['END_DATE_REP_mktime'])
date_end=income_gb0['END_DATE'].map(lambda x:int(time.mktime(time.strptime(x,'%Y-%m-%d'))))
date_end=DataFrame(date_end.values,columns=['END_DATE_mktime'])

income_gb0=pd.concat([income_gb0,date_pub],axis=1)
income_gb0=pd.concat([income_gb0,date_rep],axis=1)
income_gb0=pd.concat([income_gb0,date_end],axis=1)
income_gb0.sort_index(by=['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],inplace=True)
income_gb0.set_index(['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],inplace=True,drop=False)
#用循环将最新截止日期的财报筛选出来
sum1=0
ticker_unique=income_gb0['TICKER_SYMBOL'].unique()
for i in ticker_unique:   
    income_slice=income_gb0.ix[i]
    end_date_unique=income_slice['END_DATE_mktime'].unique()
    sum1 += 1
    for j in end_date_unique:
        index_t1=income_gb0.ix[i,j]['PUBLISH_DATE_mktime'].values[-1]
        index_t2=income_gb0.ix[i,j]['END_DATE_REP_mktime'].values[-1]        
        income_gb0.ix[(i,j,index_t1,index_t2),'PARTY_ID']=-1
    print(sum1/len(ticker_unique))        
#将'party_id'列不为-1的行全部转换为缺失值
income_gb1=income_gb0.drop(['TICKER_SYMBOL','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],axis=1)
#将索引转换为列
income_gb1=income_gb1.reset_index()
for i in range(np.shape(income_gb1)[0]):
    if income_gb1.ix[i,'PARTY_ID'] !=-1:
        income_gb1.iloc[i]=NA
    else:           
        continue
    print(i/np.shape(income_gb1)[0])
income_gb1.to_csv('income_gb1.csv')
income_gb2=income_gb1
#删除全为缺失值一行
income_gb2=income_gb2.dropna(how='all')
#设置层次化索引
income_gb2=income_gb2.set_index(['TICKER_SYMBOL','END_DATE'])
#删除不需要的列
income_gb2=income_gb2.drop(['PARTY_ID','EXCHANGE_CD','REPORT_TYPE','FISCAL_PERIOD','MERGED_FLAG','PUBLISH_DATE',
                        'END_DATE_REP','END_DATE_mktime','PUBLISH_DATE_mktime','END_DATE_REP_mktime'],axis=1)
#为层次索引排序
income_gb2=income_gb2.sortlevel(0)
#文件输出
income_gb2.to_csv('income_data.csv')  

1、字符串日期列通过函数分开为年列和月列。

2、将资产负债表,利润表和现金流量表根据证券代码和日期索引进行内联结合并。

#将资产负债表、利润表、现金流量表融合    
balance_gb3=pd.read_csv('balance_data.csv',index_col=['TICKER_SYMBOL','END_DATE'])
cash_gb3=pd.read_csv('cash_data.csv',index_col=['TICKER_SYMBOL','END_DATE'])
income_gb3=pd.read_csv('income_data.csv',index_col=['TICKER_SYMBOL','END_DATE'])
merge1=pd.merge(income_gb3,balance_gb3,left_index=True,right_index=True,how='inner')    
merge2=pd.merge(merge1,cash_gb3,left_index=True,right_index=True,how='inner')       

#财务报表数据第二版
merge3=merge2.reset_index()
def f1(x):
        return int(x[:4])
def f2(x):
    if len(x) ==10:
        return int(x[5:7])
    elif len(x)  ==8:
        return int(x[5:6])
    elif x[4:7].count('/') == 2 :
        return int(x[5:6])
    else:
        return int(x[5:7]) 
#将日期列通过函数分开为年列和月列
merge3['YEAR']=merge3['END_DATE'].map(f1)
merge3['MONTH']=merge3['END_DATE'].map(f2)    
merge3.drop(['END_DATE'],axis=1,inplace=True)
merge3=merge3.set_index(['TICKER_SYMBOL','YEAR','MONTH'])
merge3=merge3.sortlevel(0)
#保存数据
merge3.to_csv('merge_data(7.26).csv')

市场数据进行预处理

#市场数据第二版
market=pd.read_csv('Market.csv')
#查看是否有缺失值
market.isnull().sum()
#利用元素级函数将日期分为年月
def f1(x):
        return int(x[:4])
def f2(x):
    if len(x) ==10:
        return int(x[5:7])
    elif len(x)  ==8:
        return int(x[5:6])
    elif x[4:7].count('/') == 2 :
        return int(x[5:6])
    else:
        return int(x[5:7]) 
market['YEAR']=market['END_DATE_'].map(f1)
market['MONTH']=market['END_DATE_'].map(f2)    
market.drop(['SECURITY_ID','TYPE_ID','TYPE_NAME_CN','END_DATE_'],axis=1,inplace=True)
market=market.set_index(['TICKER_SYMBOL','YEAR','MONTH'])
market=market.sortlevel(0)
market.to_csv('market_final.csv')

对宏观数据进行预处理

#宏观数据
macro=pd.read_csv('Macro.csv')
#查看是否有缺失值并剔除
macro.isnull().sum()
macro.dropna(how='any',inplace=True)
macro=macro.set_index('FREQUENCY_CD')
macro=macro.sortlevel(0)

def f1(x):
        return int(x[:4])
def f2(x):
    if len(x) ==10:
        return int(x[5:7])
    elif len(x)  ==8:
        return int(x[5:6])
    elif x[4:7].count('/') == 2 :
        return int(x[5:6])
    else:
        return int(x[5:7]) 
year_test=[2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,2018]
#年度数据    
macro_A=macro.ix['A']
macro_A['YEAR']=macro_A['PERIOD_DATE'].map(f1)
macro_A.drop(['name_cn','PERIOD_DATE'],axis=1,inplace=True)
macro_A=macro_A.set_index(['indic_id','YEAR'])
macro_A=macro_A.sortlevel(0)
macro_A.to_csv('macro_A_final.csv')
        
#月度数据转换为季度数据
#提取月度数据
macro_M=macro.ix['M']
#将字符串时间列通过函数改变为年列和月列
macro_M['YEAR']=macro_M['PERIOD_DATE'].map(f1)
macro_M['MONTH']=macro_M['PERIOD_DATE'].map(f2)
macro_M.drop(['PERIOD_DATE'],axis=1,inplace=True)
macro_M=macro_M.set_index(['indic_id','YEAR','MONTH'],drop=False)
macro_M=macro_M.sortlevel(0)
#寻找确实观测值
macro_M_na_year=[]
macro_M_na_month=[]
for i in list(macro_M.index.levels[0]):
    try:
        for j in year_test:     
            macro_M.ix[i,j]  
    except:
            macro_M_na_year.append(i)
    else:
        for j in year_test: 
            for k in list(set(macro_M.ix[i,j]['MONTH'].values)):
                try :
                    macro_M.ix[i,j,k].values
                except:
                    macro_M_na_month.append([i,j,k])
macro_M.drop(['indic_id','YEAR','MONTH'],axis=1,inplace=True)        
macro_M=macro_M.reset_index(['YEAR','MONTH'])
#若整年数据缺失则剔除这个类型的数据
macro_M=macro_M.drop(macro_M_na_year,axis=0)
macro_M=macro_M.reset_index()
#某类数据仅缺少部分月份数据则用年平均值予以填充
for i in macro_M_na_month:
    part1=DataFrame([[i[0],i[1],i[2],0,NA]],columns=list(macro_M.columns))
    macro_M=pd.concat([macro_M,part1],ignore_index=True)
for i in range(np.shape(macro_M)[0]):
    if macro_M.ix[i,1] < 2006:
        macro_M.ix[i]=NA
macro_M.dropna(how='all',inplace=True)
macro_M=macro_M.set_index(['indic_id','YEAR'])
macro_M=macro_M.sortlevel(0)
macro_M=macro_M.fillna(macro_M.mean(level=[0,1]))
macro_M=macro_M.reset_index()
macro_M=macro_M.set_index(['indic_id','YEAR','MONTH'],drop=False)
macro_M=macro_M.sortlevel(0)
#按月份分为4个季度
for i in list(macro_M.index.levels[0]):
        for j in year_test:
            try:
                for k in list(set(macro_M.ix[i,j]['MONTH'].values)):
                    if   k <= 3:
                        macro_M.ix[(i,j,k),'name_cn'] = 3
                    elif k <= 6:
                        macro_M.ix[(i,j,k),'name_cn'] = 6
                    elif k <= 9 :
                        macro_M.ix[(i,j,k),'name_cn']=9 
                    else:
                        macro_M.ix[(i,j,k),'name_cn'] =12
            except:
                print(i,j)
macro_M.drop(['indic_id','YEAR','MONTH'],axis=1,inplace=True)
macro_M=macro_M.reset_index()
macro_M=macro_M.set_index(['indic_id','YEAR','name_cn'])
#求得某类数据某年季度数据
macro_M=macro_M.sum(level=[0,1,2])
macro_M.drop(['MONTH'],axis=1,inplace=True)
macro_M.to_csv('macro_M_final.csv')
#将周数据转换为季度数据
#处理方式同上
macro_W=macro.ix['W']
macro_W['YEAR']=macro_W['PERIOD_DATE'].map(f1)
macro_W['MONTH']=macro_W['PERIOD_DATE'].map(f2)
macro_W.drop(['PERIOD_DATE'],axis=1,inplace=True)
macro_W['name_cn']=0
macro_W=macro_W.set_index(['indic_id','YEAR','MONTH'])
macro_W=macro_W.sortlevel(0)
macro_W=macro_W.sum(level=('indic_id','YEAR','MONTH'))
macro_W=macro_W.reset_index()
macro_W=macro_W.set_index(['YEAR'])
macro_W=macro_W.drop(list(range(2002,2006,1)))
macro_W=macro_W.reset_index()
macro_W=macro_W.set_index(['indic_id','YEAR','MONTH'],drop=False)
for i in list(macro_W.index.levels[0]):
        for j in year_test:
            try:
                for k in list(set(macro_W.ix[i,j]['MONTH'].values)):
                    if   k <= 3:
                        macro_W.ix[(i,j,k),'name_cn'] = 3
                    elif k <= 6:
                        macro_W.ix[(i,j,k),'name_cn'] = 6
                    elif k <= 9 :
                        macro_W.ix[(i,j,k),'name_cn']=9 
                    else:
                        macro_W.ix[(i,j,k),'name_cn'] =12
            except:
                print(i,j)
macro_W.drop(['indic_id','YEAR','MONTH'],axis=1,inplace=True)
macro_W=macro_W.reset_index()
macro_W=macro_W.set_index(['indic_id','YEAR','name_cn'])
macro_W=macro_W.sum(level=[0,1,2])
macro_W.drop(['MONTH'],axis=1,inplace=True)
macro_W.to_csv('macro_W_final.csv')
#将日数据转换为季度数据
#处理方法同上
macro_D=macro.ix['D']
macro_D['YEAR']=macro_D['PERIOD_DATE'].map(f1)
macro_D['MONTH']=macro_D['PERIOD_DATE'].map(f2)
macro_D.drop(['PERIOD_DATE'],axis=1,inplace=True)
macro_D=macro_D.set_index('YEAR')
#剔除2006年以前的数据
macro_D.drop(list(range(1995,2006,1)),inplace=True)
macro_D=macro_D.reset_index()
macro_D=macro_D.set_index(['indic_id','YEAR','MONTH'],drop=False)
macro_D=macro_D.sortlevel(0)

macro_D_na_year=[]
macro_D_na_month=[]
#查看日数据有从2006年开始的完整数据,每个月是否有缺失值
for i in list(macro_D.index.levels[0]):
    try:
        for j in year_test:     
            macro_D.ix[i,j]  
    except:
            macro_D_na_year.append(i)
    else:
        for j in year_test: 
            for k in list(set(macro_D.ix[i,j]['MONTH'])):
                try :
                    len(macro_D.ix[i,j,k]['MONTH'].values)>=20
                    l=len(macro_D.ix[i,j,k]['MONTH'].values)
                except:
                    macro_D_na_month.append([i,j,k,l])
macro_D=macro_D.drop(['indic_id','YEAR','MONTH'],axis=1)
macro_D=macro_D.reset_index()
macro_D=macro_D.set_index('indic_id')
macro_D=macro_D.drop(macro_D_na_year)                    
macro_D=macro_D.reset_index()
macro_D=macro_D.set_index(['indic_id','YEAR','MONTH'],drop=False)
macro_D=macro_D.sortlevel(0)
for i in list(macro_D.index.levels[0]):
        for j in year_test:
            try:
                for k in list(set(macro_D.ix[i,j]['MONTH'].values)):
                    if   k <= 3:
                        macro_D.ix[(i,j,k),'name_cn'] = 3
                    elif k <= 6:
                        macro_D.ix[(i,j,k),'name_cn'] = 6
                    elif k <= 9 :
                        macro_D.ix[(i,j,k),'name_cn']=9 
                    else:
                        macro_D.ix[(i,j,k),'name_cn'] =12
            except:
                print(i,j)
macro_D.drop(['indic_id','YEAR','MONTH'],axis=1,inplace=True)
macro_D=macro_D.reset_index()
macro_D=macro_D.set_index(['indic_id','YEAR','name_cn'])
macro_D=macro_D.sum(level=[0,1,2])
macro_D.drop(['MONTH'],axis=1,inplace=True)
macro_D.to_csv('macro_D_final.csv')

构建训练集并提取列名

#构建训练集
train2=DataFrame()
sum_count_lost=0
list_count_lost=[]
sum_count_lost1=0
list_count_lost1=[]
sum_na=0
list_na=[]
year_test=[2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
ticker_list=list(merge3.index.levels[0])

#获取列名     
train_columns=[]
merge_columns=[]
for i in [0,'0_3_','1_12_','1_9_','1_6_','1_3_','2_12_','2_9_','2_6_','2_3_','3_12_','3_9_','3_6_','3_3_']:
    if i == 0:
        merge_columns.append('0_6_'+merge3.columns[0])
    else:
        for k in range(len(list(merge3.columns))):
            merge_columns.append(i+list(merge3.columns)[k])

market_columns=[]
for i in ['0_','1_','2_','3_']:
    if i == '0_':
        for j in range(5,0,-1):
            for k in range(len(list(market.columns))):
                market_columns.append(str(i)+str(j)+'_'+list(market.columns)[k])
    else:
        for j in range(12,0,-1):
            for k in range(len(list(market.columns))):
                market_columns.append(str(i)+str(j)+'_'+list(market.columns)[k])
macro_A_columns=[]
for i in list(macro_A.index.levels[0]):
    for j in ['1_','2_','3_']:
        macro_A_columns.append(j+str(i))

macro_M_columns=[]
for i in list(macro_M.index.levels[0]):
    for j in ['0_','1_','2_','3_']:
        if j =='0_':
            for k in [3]:
                macro_M_columns.append(j+str(k)+'_'+str(i))  
        else:        
            for k in sorted(list(macro_M.index.levels[2]),reverse=True):
                macro_M_columns.append(j+str(k)+'_'+str(i))

macro_W_columns=[]
for j in ['1_','2_','3_']:
    for k in sorted(list(macro_W.index.levels[2]),reverse=True):
        macro_W_columns.append(j+str(k)+'_'+'2160000101')

macro_D_columns=[]
for i in list(macro_D.index.levels[0]):
    for j in ['1_','2_','3_']:
        for k in sorted(list(macro_D.index.levels[2]),reverse=True):
            macro_D_columns.append(j+str(k)+'_'+str(i))
train_columns=train_columns+merge_columns+market_columns+macro_A_columns+macro_M_columns+macro_W_columns+macro_D_columns   
test_columns=train_columns[1:]
#形成训练集
      
for i in ticker_list :
    label_unique=[]
    for q in list(set(merge3.ix[i].index.labels[0])):
        label_unique.append(list(merge3.ix[i].index.levels[0])[q])
    year_list=sorted(label_unique,reverse=True)
    year_len=len(year_list)    
    for j in year_list:
        if year_len > 3:
            year_len=year_len-1            
            if j <2018:                
                train=DataFrame()
                try:#主要为了当年有剔除缺失值的样本                  
                    #获取当年的半年报营业收入,1季度报和近三年个季度报告
                    for l in [j,j-1,j-2,j-3]:
                        if l == j:
                            for k in [6,3]:
                                if k == 6:                       
                                    train=pd.concat([train,Series(merge3.ix[(i,l,k),:].iloc[0,0],index=[i])],axis=1,ignore_index=True)
                                else:
                                    train=pd.concat([train,DataFrame(merge3.ix[(i,l,k),:].iloc[0]).T.unstack().unstack()],axis=1,ignore_index=True)
                        else:
                            for k in [12,9,6,3]:                            
                                train=pd.concat([train,DataFrame(merge3.ix[(i,l,k),:].iloc[0]).T.unstack().unstack()],axis=1,ignore_index=True)
#                   获取当年的前五个月市场数据,上一年的月度市场数据
                    for l in [j,j-1,j-2,j-3]:
                        if l == j:                        
                            for o in list(range(5,0,-1)):
                                train=pd.concat([train,DataFrame(market.ix[(int(i),l,o),:]).T.unstack().unstack()],axis=1,ignore_index=True)
                        else:
                            for o in list(range(12,0,-1)):                  
                                train=pd.concat([train,DataFrame(market.ix[(int(i),l,o),:]).T.unstack().unstack()],axis=1,ignore_index=True)                                                 
                except Exception as e:
                    print(e)
                    sum_na +=1
                    list_na.append([i,l,k])
                    print(i,l,k,sum_na)
                    continue                  
                #获取近三年宏观年度数据            
                for m in list(macro_A.index.levels[0]):
                    for l in [j-1,j-2,j-3]:
                        train=pd.concat([train,Series(macro_A.ix[(m,l)][0],index=[i])],axis=1,ignore_index=True)
#               #近三年宏观月度数据转换而来的季度数据         
                for m in list(macro_M.index.levels[0]):
                    for l in [j,j-1,j-2,j-3]:
                        if l == j:
                            for k in [3]:
                                train=pd.concat([train,Series(macro_M.ix[(m,l,k),:][0],index=[i])],axis=1,ignore_index=True)    
                        else:
                            for k in sorted(list(macro_M.index.levels[2]),reverse=True):
                                train=pd.concat([train,Series(macro_M.ix[(m,l,k),:][0],index=[i])],axis=1,ignore_index=True)                         
                
                #近三年宏观周度数据转换而来的季度数据
                for l in [j-1,j-2,j-3]:
                    for k in sorted(list(macro_W.index.levels[2]),reverse=True):
                        train=pd.concat([train,Series(macro_W.ix[(2160000101,l,k),:][0],index=[i])],axis=1,ignore_index=True)
                #近三年宏观日度数据转换而来的季度数据
                for m in list(macro_D.index.levels[0]):
                    for l in [j-1,j-2,j-3]:
                        for k in sorted(list(macro_D.index.levels[2]),reverse=True):
                            train=pd.concat([train,Series(macro_D.ix[(m,l,k),:][0],index=[i])],axis=1,ignore_index=True)    
                train2=pd.concat([train2,train],axis=0,ignore_index=True)
    complieted=((ticker_list.index(i)+1)/3493)*100
    print('已完成:',complieted,'%')
train2.to_csv('train_set.csv',header=False,index=False)  

形成测试集

# Build the test set: load the full submission ticker list plus the
# bank / securities / insurance subsets (those sectors are modeled
# separately and will be excluded from the general-business set below).
submit_nes=pd.read_csv('submit_nes.csv')
submit_bank=pd.read_csv('submit_bank.csv')
submit_sec=pd.read_csv('submit_sec.csv')
submit_ins=pd.read_csv('submit_ins.csv')
# Reduce the three sector frames to plain ticker lists.
submit_bank=submit_bank['TICKER_SYMBOL2'].tolist()
submit_sec=submit_sec['TICKER_SYMBOL2'].tolist()
submit_ins=submit_ins['TICKER_SYMBOL2'].tolist()
# Normalize submission symbols to integer tickers and keep only
# general-business companies (banks / securities / insurance excluded).
submit_nes_change=[]
for row in submit_nes.values.tolist():
    # row[0] looks like '000001.XSHE' or '600000.XSHG'. BUGFIX: the
    # original used str.strip('.XSHE') / str.strip('.XSHG'), but strip()
    # removes a *set of characters* from both ends, not a suffix — it only
    # worked because tickers are all digits. Splitting off the exchange
    # suffix is correct for both exchanges, so the index<872 branch that
    # picked the suffix is no longer needed.
    submit_nes_change.append(int(row[0].split('.')[0]))
submit_gb_id=[]
no_count=0
# O(1) membership tests instead of three O(n) list scans per ticker.
excluded=set(submit_bank)|set(submit_sec)|set(submit_ins)
for i in submit_nes_change:
    if i in excluded:
        no_count +=1
    else:
        submit_gb_id.append(i)

# Assemble the 2018 prediction feature rows — one row per general-business
# ticker — mirroring the training-set layout built above.
list_test_na=[]     # (ticker, year, period) triples that raised a lookup error
sum_test_na=0       # count of tickers skipped because of missing data
test=DataFrame()
for pos,i in enumerate(submit_gb_id):
    j=2018          # prediction year: features come from 2018Q1 plus 2015-2017
    train=DataFrame()   # single feature row for this ticker
    try:# a missing index entry anywhere skips the whole ticker
        # Current-year Q1 report plus the four quarterly reports of each of
        # the three previous years. BUGFIX: the original had an unreachable
        # `if k == 6:` branch inside `for k in [3]:` — k was always 3, so
        # only the else ever ran; the dead branch is removed.
        for l in [j,j-1,j-2,j-3]:
            if l == j:
                k=3
                train=pd.concat([train,DataFrame(merge3.ix[(i,l,k),:].iloc[0]).T.unstack().unstack()],axis=1,ignore_index=True)
            else:
                for k in [12,9,6,3]:
                    train=pd.concat([train,DataFrame(merge3.ix[(i,l,k),:].iloc[0]).T.unstack().unstack()],axis=1,ignore_index=True)
        # Market data: first five months of the current year, then twelve
        # months for each of the three previous years (newest month first).
        for l in [j,j-1,j-2,j-3]:
            months=range(5,0,-1) if l == j else range(12,0,-1)
            for o in months:
                train=pd.concat([train,DataFrame(market.ix[(int(i),l,o),:]).T.unstack().unstack()],axis=1,ignore_index=True)
    except Exception as e:
        print(e)
        sum_test_na +=1
        list_test_na.append([i,l,k])
        print(i,l,k,sum_test_na)
        continue
    # Annual macro indicators for the three previous years.
    for m in list(macro_A.index.levels[0]):
        for l in [j-1,j-2,j-3]:
            train=pd.concat([train,Series(macro_A.ix[(m,l)][0],index=[i])],axis=1,ignore_index=True)
    # Quarterly features derived from monthly macro data; hoist the
    # loop-invariant sorted period list out of the m/l loops.
    macro_M_periods=sorted(macro_M.index.levels[2],reverse=True)
    for m in list(macro_M.index.levels[0]):
        for l in [j,j-1,j-2,j-3]:
            if l == j:
                for k in [3]:
                    train=pd.concat([train,Series(macro_M.ix[(m,l,k),:][0],index=[i])],axis=1,ignore_index=True)
            else:
                for k in macro_M_periods:
                    train=pd.concat([train,Series(macro_M.ix[(m,l,k),:][0],index=[i])],axis=1,ignore_index=True)

    # Quarterly features derived from weekly macro data (single indicator,
    # id 2160000101).
    macro_W_periods=sorted(macro_W.index.levels[2],reverse=True)
    for l in [j-1,j-2,j-3]:
        for k in macro_W_periods:
            train=pd.concat([train,Series(macro_W.ix[(2160000101,l,k),:][0],index=[i])],axis=1,ignore_index=True)
    # Quarterly features derived from daily macro data.
    macro_D_periods=sorted(macro_D.index.levels[2],reverse=True)
    for m in list(macro_D.index.levels[0]):
        for l in [j-1,j-2,j-3]:
            for k in macro_D_periods:
                train=pd.concat([train,Series(macro_D.ix[(m,l,k),:][0],index=[i])],axis=1,ignore_index=True)
    test=pd.concat([test,train],axis=0,ignore_index=True)
    # Progress report. BUGFIX: the original used a hard-coded total of 1460
    # and a quadratic submit_gb_id.index(i) lookup; use enumerate + len().
    print('已完成:',((pos+1)/len(submit_gb_id))*100,'%')
test.to_csv('test_set.csv',header=False,index=False)

1、构建拥有列名的完整训练集数据

2、与测试集数据一同剔除数据完全相同的重复列

3、将分类变量转换为哑变量

# Assemble the final train/test matrices with named columns, drop the
# duplicated categorical columns, and integer-encode the remaining
# industry-type column.
train_df=pd.read_csv('train_set.csv',header=None)
test_df=pd.read_csv('test_set.csv',header=None)
train_df=DataFrame(np.array(train_df),columns=train_columns)
test_df=DataFrame(np.array(test_df),columns=test_columns)
# The industry type is repeated in every report period; keep one copy
# ('0_5_TYPE_NAME_EN') and drop the identical duplicates.
drop_columns=[c for c in train_columns
              if 'TYPE_NAME_EN' in c and c != '0_5_TYPE_NAME_EN']
train_df=train_df.drop(drop_columns,axis=1)
test_df=test_df.drop(drop_columns,axis=1)
# Encode the categorical industry column. BUGFIX: the original fitted two
# independent LabelEncoders (lbl on train, tlbl on test), so the same
# industry name could map to DIFFERENT integers in train and test; fit a
# single encoder on the union of both columns and reuse it.
lbl=LabelEncoder()
lbl.fit(list(train_df['0_5_TYPE_NAME_EN'].values)+list(test_df['0_5_TYPE_NAME_EN'].values))
train_df['0_5_TYPE_NAME_EN']=lbl.transform(list(train_df['0_5_TYPE_NAME_EN'].values))
test_df['0_5_TYPE_NAME_EN']=lbl.transform(list(test_df['0_5_TYPE_NAME_EN'].values))
train_df.to_csv('train_final.csv',index=False)
test_df.to_csv('test_final.csv',index=False)

1、将特征之间相关度达到0.99的冗余变量剔除

2、利用PCA进行降维

# Drop redundant features: for every pair of features whose absolute
# pairwise correlation reaches the threshold, drop the earlier column of
# the pair (PCA dimensionality reduction follows in the next section).
train_df=pd.read_csv('train_final.csv')
test_df=pd.read_csv('test_final.csv')
y_train=train_df.ix[:,0:1]   # first column is the target revenue
x_train=train_df.ix[:,1:]    # everything else is a feature
drop_corr_columns=[]         # feature names selected for removal
check_corr_columns=[]        # human-readable log of correlated pairs
thresh_hold=0.99
x_train_corr=x_train.corr().abs()
n_features=np.shape(x_train.columns)[0]
for i in range(n_features):
    for j in range(i+1,n_features):
        if x_train_corr.ix[i,j]>=thresh_hold:
            if x_train.columns[i] not in drop_corr_columns:
                drop_corr_columns.append(list(x_train.columns)[i])
                check_corr_columns.append([str(x_train.columns[i])+'+'+str(x_train.columns[j])+'='+str(round(x_train_corr.ix[i,j],2))])
    # BUGFIX: progress used a hard-coded 3482 instead of the real count.
    print('已完成:',((i+1)/n_features)*100,'%')
# BUGFIX: %f rendered the integer count as a float; use %d.
print('有%d个多余特征' % len(drop_corr_columns))

x_train_afcorr=x_train.drop(drop_corr_columns,axis=1)
test_df_afcorr=test_df.drop(drop_corr_columns,axis=1)

# Hold the categorical industry column aside — PCA is only meaningful on
# the continuous features — and re-attach it after the projection.
train_list_tempo=x_train_afcorr['0_5_TYPE_NAME_EN']
test_list_tempo=test_df_afcorr['0_5_TYPE_NAME_EN']
x_train_afcorr.drop('0_5_TYPE_NAME_EN',axis=1,inplace=True)
test_df_afcorr.drop('0_5_TYPE_NAME_EN',axis=1,inplace=True)

pca=PCA(n_components=92)
pca.fit(x_train_afcorr)
pca_var_rat=pca.explained_variance_ratio_
pca_var=pca.explained_variance_
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
# BUGFIX: the original called fit_transform on BOTH train and test, which
# re-fitted the PCA on the test set — the two matrices ended up projected
# onto different component bases (a train/test leak that also makes the
# feature spaces incompatible). Transform both with the PCA already fitted
# on the training data.
x_train_new=pca.transform(x_train_afcorr)
x_train_new=pd.concat([DataFrame(x_train_new),train_list_tempo],axis=1)
test_new=pca.transform(test_df_afcorr)
test_new=pd.concat([DataFrame(test_new),test_list_tempo],axis=1)
x_train_new.to_csv('x_train_pca.csv',index=False)
test_new.to_csv('test_pca.csv',index=False)

1、运用XGBOOST算法训练数据

2、使用cross_val_score寻找最佳的学习器数量

3、使用GridSearchCV调整决策树的深度、 最小叶子的比例、每棵树所用到的样本比例、每棵树所用到的特征比例、正则参数等

# Load the PCA-reduced features and search for the best number of boosting
# rounds with 5-fold cross-validation.
train_df=pd.read_csv('train_final.csv')
x_train=pd.read_csv('x_train_pca.csv')
test=pd.read_csv('test_pca.csv')
y_train=(train_df.ix[:,0:1])   # target: next-quarter revenue

# Scan odd n_estimators values in [1, 1000). NOTE: refitting a fresh model
# for every candidate is ~500 full trainings x 5 folds; xgb.cv with early
# stopping would be far cheaper — kept as-is to preserve the original search.
k_estimators=list(range(1,1000,2))
k_score_mean=[]
k_score_std=[]
for i in k_estimators:
    xgb3=XGBRegressor(objective='reg:linear',
                      learning_rate=0.1,
                      max_depth=8,
                      min_child_weight=1,
                      subsample=0.3,
                      colsample_bytree=0.8,
                      colsample_bylevel=0.7,
                      seed=3,
                      eval_metric='rmse',
                      reg_alpha=2,
                      reg_lambda=0.1,
                      n_estimators=i)
    score=cross_val_score(xgb3,x_train.values,y_train.values,scoring='neg_mean_squared_error',cv=5,n_jobs=-1)
    print(i)
    print(score.mean())
    print(score.std())
    k_score_mean.append(score.mean())
    k_score_std.append(score.std())

plt.plot(k_estimators,k_score_mean)
plt.xlabel('value of k for xgb2')
# BUGFIX: the axis label had a doubled underscore ('neg__mean_squared_error').
plt.ylabel('neg_mean_squared_error')
plt.show()

# Grid-search the tree depth and minimum child weight around the fixed
# learning rate, sampling ratios and 216 boosting rounds.
xgb2=XGBRegressor(objective='reg:linear',
                  learning_rate=0.1,
                  max_depth=6,
                  min_child_weight=1,
                  subsample=0.3,
                  colsample_bytree=0.8,
                  colsample_bylevel=0.7,
                  seed=3,
                  eval_metric='rmse',
                  n_estimators=216)

param_test={'max_depth':[6,7,8,9],'min_child_weight':[1,2]}
clf=GridSearchCV(estimator=xgb2,param_grid=param_test,cv=5,scoring='neg_mean_squared_error')
clf.fit(x_train.values,y_train.values)
# Inspect per-candidate scores and the winning configuration.
clf.grid_scores_
clf.best_params_
clf.best_score_

# Grid-search the row subsample ratio and the per-tree column subsample
# ratio, with depth fixed at the previous step's best value.
xgb2=XGBRegressor(objective='reg:linear',
                  learning_rate=0.1,
                  max_depth=8,
                  min_child_weight=1,
                  subsample=0.3,
                  colsample_bytree=0.8,
                  colsample_bylevel=0.7,
                  seed=3,
                  eval_metric='rmse',
                  n_estimators=401)

param_test={'subsample':[0.3,0.4,0.5,0.6,0.7,0.8],
            'colsample_bytree':[0.6,0.7,0.8,0.9]}
clf=GridSearchCV(estimator=xgb2,param_grid=param_test,cv=5,scoring='neg_mean_squared_error')
clf.fit(x_train.values,y_train.values)
# Inspect per-candidate scores and the winning configuration.
clf.grid_scores_
clf.best_params_
clf.best_score_

# Grid-search the L1 (reg_alpha) and L2 (reg_lambda) regularisation
# strengths. Earlier runs already covered alpha in [0.1, 1, 1.5, 2] and
# lambda in [0.1, 0.5, 1, 2].
reg_alpha=[2,2.5,3]
reg_lambda=[0,0.05,0.1]
xgb2=XGBRegressor(objective='reg:linear',
                  learning_rate=0.1,
                  max_depth=8,
                  min_child_weight=1,
                  subsample=0.3,
                  colsample_bytree=0.8,
                  colsample_bylevel=0.7,
                  seed=3,
                  eval_metric='rmse',
                  n_estimators=401)

param_test={'reg_alpha':reg_alpha,'reg_lambda':reg_lambda}
clf=GridSearchCV(estimator=xgb2,param_grid=param_test,cv=5,scoring='neg_mean_squared_error')
clf.fit(x_train.values,y_train.values)
# Inspect per-candidate scores and the winning configuration.
clf.grid_scores_
clf.best_params_
clf.best_score_

# Final model: all tuned hyper-parameters gathered in one place, fitted on
# the full training set, then used to predict the submission features.
final_params=dict(objective='reg:linear',
                  learning_rate=0.1,
                  max_depth=8,
                  min_child_weight=3,
                  subsample=0.3,
                  colsample_bytree=0.8,
                  colsample_bylevel=0.7,
                  seed=3,
                  eval_metric='rmse',
                  reg_alpha=2,
                  reg_lambda=0.1,
                  n_estimators=466)
xgb2=XGBRegressor(**final_params)
xgb2.fit(x_train.values,y_train.values)
pred=xgb2.predict(test.values)

猜你喜欢

转载自blog.csdn.net/pandacode/article/details/81453298
今日推荐