# Evaluate feature importance with a random forest (RF) and plot the feature ranking.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report
def read_data(path="./output/killed_collision_normal2class.pkl"):
    """Load the pickled dataset and split it into train and test sets.

    The "KILLED" column is used as the label; all remaining columns are
    passed on as features. 30% of the rows are held out for testing, and
    the split is seeded with ``random_state=0``.

    Args:
        path: Location of the pickle file to load. Defaults to the path
            this script has always read, so calling ``read_data()`` with
            no arguments behaves as before.

    Returns:
        A 5-tuple ``(df, X_train, X_test, y_train, y_test)`` where ``df``
        is the full object returned by ``pd.read_pickle``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(path)

    features = df.drop(columns=["KILLED"])
    labels = df["KILLED"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=0
    )
    return df, X_train, X_test, y_train, y_test
# --------- Load the dataset
# This runs at import time; the resulting splits are module-level globals
# that feature_importance() reads directly.
pd_data,X_train, X_test, y_train, y_test = read_data()
def feature_importance(features_num=20):
    """Fit a random forest, print test metrics, and rank the top features.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Trains a 500-tree ``RandomForestClassifier``, prints a
    classification report for the test split, prints the ``features_num``
    highest-importance features with their scores, and passes them to
    ``draw_importance`` for plotting.

    Args:
        features_num: Number of top-ranked features to print and plot.
            If it exceeds the number of columns in ``X_train`` a message
            is printed and the function returns without training.

    Returns:
        None.
    """
    n_features = X_train.shape[1]
    if features_num > n_features:
        print("the features num is too big for the trainData")
        return

    # Cap max_features at the column count. A fixed 20 is out of range for
    # datasets with fewer than 20 columns even when features_num passes the
    # check above; some scikit-learn releases reject that with ValueError.
    forest = RandomForestClassifier(
        n_estimators=500,
        random_state=0,
        n_jobs=-1,
        max_features=min(20, n_features),
    )
    forest.fit(X_train, y_train)

    y_true, y_pred = y_test, forest.predict(X_test)
    print(classification_report(y_true, y_pred))

    importance = forest.feature_importances_
    # argsort is ascending; reverse it so the most important feature is first.
    indices = np.argsort(importance)[::-1]
    top_indices = indices[:features_num]

    print("----the importance of features and its importance_score------")
    column_names = X_train.columns.values
    features_names = [column_names[i] for i in top_indices]
    im_list = [importance[i] for i in top_indices]
    for rank, (f_name, score) in enumerate(zip(features_names, im_list), start=1):
        print(rank, f_name, score)

    draw_importance(features_names, im_list)
def draw_importance(features, importances):
    """Show a horizontal bar chart of the given feature importances.

    Bars are placed at positions 0..n-1 in ascending order of importance,
    each labelled with its feature name. The sort order and the feature
    list are also printed before plotting.

    Args:
        features: Feature names, aligned element-wise with ``importances``.
        importances: Importance score for each feature.
    """
    order = np.argsort(importances)
    print(order)
    print(features)

    positions = range(len(order))

    plt.title('Feature Importances')
    sorted_scores = np.array(importances)[order]
    plt.barh(positions, sorted_scores, color='b', align='center')
    sorted_names = np.array(features)[order]
    plt.yticks(positions, sorted_names)
    plt.xlabel('Relative Importance')
    plt.show()
# Script entry point: train the forest and plot the ranking using the
# default number of features.
if __name__ == "__main__":
    feature_importance()