使用随机森林进行特征选择的具体方法

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
# sklearn.cross_validation was removed from scikit-learn; the same helpers
# live in sklearn.model_selection.
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Univariate feature scoring with a random forest: each feature column is
# used on its own to predict the target, and the mean cross-validated R^2
# is reported as that feature's score.

# ------------------ Load the training data ------------------
LAG_ROWS = 227          # row offset between the lagged value and the feature rows
SAMPLES_PER_DAY = 288   # presumably samples per day (original comment: 288*30 = "30 days")
N_DAYS = 30             # length of the window that is scored

# pd.read_csv returns a DataFrame by default.
data0 = pd.read_csv('Data22.csv', index_col=None, parse_dates=True)

# Last column of the file with the final LAG_ROWS rows dropped, as a column
# vector. This is the lagged value that becomes the 'ex_Value' feature.
# NOTE(review): the original comment says "offset by one day", but the offset
# is 227 rows while a day appears to be 288 rows -- confirm the intended lag.
data1 = data0.iloc[:-LAG_ROWS, -1].values[:, np.newaxis]

# Columns 2..7 (iloc 2:8, end exclusive) starting at row LAG_ROWS, so that row
# k here lines up with row k of data1.
data2 = data0.iloc[LAG_ROWS:, 2:8].values

# Resulting layout: [lagged value | 5 feature columns | target column].
data = np.concatenate((data1, data2), axis=1)
train_x = data[:, :6]
train_y = data[:, 6]

feature_names = ['ex_Value', 'Season', 'IsHoliday', 'DW', 'Weather', 'Temperature']

# Restrict the analysis to the first N_DAYS worth of rows.
X = train_x[:SAMPLES_PER_DAY * N_DAYS]
Y = train_y[:SAMPLES_PER_DAY * N_DAYS]

names = feature_names

# ------------------ Score every feature on its own ------------------
# No random_state is set on the forest or the splitter, so the scores vary
# slightly from run to run.
rf = RandomForestRegressor(n_estimators=100, max_depth=4)
scores = []
for i, name in enumerate(names):
    # X[:, i:i+1] keeps the 2-D shape (n_samples, 1) that the estimator expects.
    # Current ShuffleSplit takes n_splits/test_size instead of the old
    # positional (n, n_iter, test_size) signature.
    score = cross_val_score(rf, X[:, i:i + 1], Y, scoring="r2",
                            cv=ShuffleSplit(n_splits=3, test_size=0.3))
    scores.append((round(np.mean(score), 3), name))

# Highest mean R^2 first.
print(sorted(scores, reverse=True))
上述代码的一次运行输出:[(0.708, 'ex_Value'), (0.138, 'Temperature'), (0.124, 'Weather'), (0.02, 'DW'), (0.018, 'IsHoliday'), (-0.0, 'Season')]

180天,特征相关性检测:[(0.736, 'ex_Value'), (0.162, 'Temperature'), (0.103, 'Season'), (0.031, 'Weather'), (0.009, 'IsHoliday'), (0.001, 'DW')]

7天,特征相关性检测:[(0.888, 'ex_Value'), (-0.001, 'Season'), (-0.002, 'Temperature'), (-0.003, 'Weather'), (-0.004, 'DW'), (-0.006, 'IsHoliday')]

30天,特征相关性检测:[(0.716, 'ex_Value'), (0.13, 'Temperature'), (0.124, 'Weather'), (0.023, 'DW'), (0.02, 'IsHoliday'), (-0.001, 'Season')]

360天,特征相关性检测:[(0.747, 'ex_Value'), (0.102, 'Temperature'), (0.037, 'Season'), (0.021, 'Weather'), (0.005, 'IsHoliday'), (0.001, 'DW')]

1天,特征相关性检测:[(0.919, 'ex_Value'), (0.101, 'Weather'), (0.085, 'Temperature'), (0.057, 'DW'), (-0.005, 'Season'), (-0.02, 'IsHoliday')]

猜你喜欢

转载自blog.csdn.net/elite666/article/details/80696228
今日推荐