# Feature-importance ranking with a random forest
# (original title: Python语言利用随机森林实现特征重要性排序)

from __future__ import division
import tensorflow as tf
import math
import csv
from sklearn import metrics
import numpy as np
from pylab import*
from sklearn import cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle
from sklearn import preprocessing
import matplotlib.pyplot as plt
# --- Load the training data ---------------------------------------------
# Reads D:\夏季.csv (header row + numeric rows), converts it to a float
# matrix, and splits it into features (all but last column) and target
# (last column).
i = 0
j = []
data = []
X = []
indicess = []
# Report file for the feature ranking, opened in append mode and written
# to (and closed) further down in the script.
file = open("E:/predict_rf_para.txt", 'a')
with open(r'D:\夏季.csv') as f:
    reader = csv.reader(f)
    for row in reader:
        if i == 0:
            # Skip the header row.
            # NOTE(review): the original had the else-branch misindented,
            # which is an IndentationError; fixed here.
            i += 1
            continue
        else:
            data.append(row[:])
print("data", data[1])
data = np.array(data)
print("the shape of data", np.shape(data))
m, n = np.shape(data)
print("the shape of data", m, n)
# Convert the whole string matrix to floats in one vectorized step.
# (The original element-wise loop called .astype on each scalar and
# assigned the result back into a string-dtype array, so the data
# silently stayed as strings.)
data = data.astype('float64')
y = data[:, -1]
y1 = data[:, -1]
set1 = data[:, :-1]   # feature columns
print("set1", set1)
set2 = data[:, -1]    # target column
def create_interval_dataset(dataset1, dataset2, xback, yback):
    """Slice two aligned sequences into sliding-window samples.

    Each sample pairs ``xback`` consecutive rows of *dataset1* with the
    ``yback`` rows of *dataset2* that immediately follow that window.

    Returns a tuple ``(dataX, dataY)`` of numpy arrays with
    ``len(dataset1) - xback - yback`` samples each.
    """
    n_samples = len(dataset1) - xback - yback
    windows = [dataset1[start:start + xback] for start in range(n_samples)]
    targets = [dataset2[start + xback:start + xback + yback]
               for start in range(n_samples)]
    return np.asarray(windows), np.asarray(targets)
# --- Build windowed samples, fit the forest, extract importances --------
dataX, dataY = create_interval_dataset(set1, set2, xback=1, yback=1)
print("dataY", np.shape(dataY))
print("dataX", np.shape(dataX))

# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataX, dataY, test_size=0.7)
# Flatten the (samples, xback=1, features) windows back to 2-D.  The
# original reshaped only X_test (to a hard-coded 7 columns) and passed a
# 3-D X_train to clf.fit, which scikit-learn rejects; derive the feature
# count from the data instead of hard-coding it.
n_features = set1.shape[1]
X_train = np.reshape(X_train, [-1, n_features])
X_test = np.reshape(X_test, [-1, n_features])
print("*********************")
y_test = y_test.astype('float64')
clf = RandomForestRegressor(oob_score=True, n_estimators=1500)
clf = clf.fit(X_train, y_train.ravel())
predicted = clf.predict(X_test).astype('float64')

X, y = X_train, y_train
importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees,
# used later as error bars on the importance plot.
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)

# --- Rank features and write the report ---------------------------------
# Human-readable labels for the feature columns, in column order.
# (The original named these `list` and the mapping `dict`, shadowing the
# builtins; renamed here.)
feature_labels = ['0', '1', '2', '3', '4', '5', '6']
print("importances", importances)
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]
# Pair each label with its importance and sort descending by importance.
ranked = sorted(zip(feature_labels, importances),
                key=lambda item: item[1], reverse=True)
# Labels in descending-importance order, used as x-tick labels later.
indicess = [label for label, _ in ranked]
print("Feature ranking:")
file.write("\n" + "\n")
file.write("                    雨天天气下随机森林算法中各个特征的排序              " + "\n")
file.write("———————————————————————————————————" + "\n")
for f in range(X.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, ranked[f][0], importances[indices[f]]))
    file.write(str(f + 1) + "  ")
    file.write(str(ranked[f][0]) + ": ")
    file.write(str(importances[indices[f]]))
    file.write("\n")

file.close()

# --- Plot the ranked importances ----------------------------------------
plt.figure()
plt.title("Feature importances")
# Bars in descending-importance order, with the per-tree std as error bars.
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
# Put the feature labels under the bars.  The original passed the label
# list as tick *positions* (plt.xticks(indicess)), which matplotlib
# rejects for string input; positions and labels are now given separately.
plt.xticks(range(X.shape[1]), indicess)
plt.show()

# Source: blog.csdn.net/pwtd_huran/article/details/79729144