Random Forests in sklearn

'''
    Ensemble algorithms:
        1. Boosting (e.g. AdaBoost)
        2. Bootstrap aggregation (bagging): each decision tree is trained on a sample drawn
                    from the full data set with replacement, so the ensemble consists of decision
                    trees built from different training samples. This weakens the influence of a few
                    strong samples on the prediction and improves the model's generalization.
        3. Random forests: built on top of bagging. Every time a decision tree is constructed,
                    not only is a random subset of the samples selected, but also a random subset of
                    the features. This kind of ensemble not only avoids the impact of strong samples
                    on the prediction, but also weakens the influence of strong features, so the
                    model generalizes better. (Averaging pulls the prediction toward the true value.)
                    A sketch contrasting bagging and random forests follows this docstring.
            Random-forest-related API:
                import sklearn.ensemble as se
                # Random forest regression model (an ensemble algorithm)
                # max_depth: maximum depth of each decision tree, here 10
                # n_estimators: build 1000 decision trees to train the model
                # min_samples_split: if a sub-table holds fewer samples than this minimum,
                #                    it is not split any further
                model = se.RandomForestRegressor(max_depth=10, n_estimators=1000, min_samples_split=2)

    Case study: shared-bicycle demand analysis, to decide how shared bicycles should be allocated.
        1. Read the data from bike_day.csv
        2. Organize the input and output sets, then split them into a training set and a test set
        3. Choose a model (random forest) and train it
        4. Output the r2 score on the test set
        5. Output the feature importances and plot them
'''
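To make points 2 and 3 above concrete, here is a minimal sketch (an addition, not from the original post) that fits a plain bagging ensemble and a random forest on the same toy data. Both resample rows with replacement; the random forest additionally considers only a random subset of features at each split (max_features).

# --- Added sketch: bagging vs. random forest (illustrative toy data) ---
import numpy as np
import sklearn.ensemble as se

x = np.random.rand(200, 5)              # 200 samples, 5 features (toy data)
y = 3 * x[:, 0] + np.random.rand(200)   # toy target driven by feature 0

# Bagging: each tree sees a bootstrap sample of the rows
bagging = se.BaggingRegressor(n_estimators=100).fit(x, y)
# Random forest: bootstrap rows AND a random feature subset per split
forest = se.RandomForestRegressor(n_estimators=100, max_features='sqrt').fit(x, y)
print(forest.feature_importances_)      # feature 0 should dominate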
import numpy as np
import matplotlib.pyplot as mp
import sklearn.metrics as sm
import sklearn.ensemble as se  # ensemble algorithm module
import sklearn.utils as su  # for shuffling data

'''=============================== analysis of bike_day.csv ==============================='''
# Reading the data, method 1
# data = [] 
# with open('./ml_data/bike_day.csv','r') as f:
#     for line in f.readlines():
#         data.append(line[:-1].split(','))
# print(data)
# data = np.array(data)

# Reading the data, method 2
data = np.loadtxt('./ml_data/bike_day.csv', unpack=False, dtype='U20', delimiter=',')
print(data.shape)
day_headers = data[0, 2:13]
print(day_headers)
x = np.array(data[1:, 2:13], dtype='f8')
y = np.array(data[1:, -1], dtype='f8')

# Split into a training set and a test set
x, y = su.shuffle(x, y, random_state=7)  # shuffle the samples
train_size = int(len(x) * 0.9)
train_x, test_x, train_y, test_y = x[:train_size], x[train_size:], y[:train_size], y[train_size:]
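As an aside not in the original post, sklearn can shuffle and split in a single call; the exact rows assigned may differ from the manual slicing above:

# Alternative split (assumed equivalent intent: 90/10, seeded shuffle):
# import sklearn.model_selection as ms
# train_x, test_x, train_y, test_y = ms.train_test_split(
#     x, y, test_size=0.1, random_state=7)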

# Train the model
model = se.RandomForestRegressor(max_depth=10, n_estimators=1000, min_samples_split=3)
model.fit(train_x, train_y)

# Test the model
pred_test_y = model.predict(test_x)

# Evaluate the model
print('r2_score of bike_day:', sm.r2_score(test_y, pred_test_y))
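For reference, r2_score is the coefficient of determination; a minimal sketch of its definition (an added illustration; sklearn's real implementation also handles sample weights and edge cases):

# r2 = 1 - SSE / SST: the fraction of target variance the model explains
def r2_sketch(y_true, y_pred):
    sse = np.sum((y_true - y_pred) ** 2)           # residual sum of squares
    sst = np.sum((y_true - y_true.mean()) ** 2)    # total sum of squares
    return 1 - sse / sst
# print(r2_sketch(test_y, pred_test_y))  # should match sm.r2_score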

# Output the model's feature importances
day_fi = model.feature_importances_
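feature_importances_ holds one impurity-based score per input column, normalized to sum to 1. A quick added inspection (its printout is not part of the original output below):

# Print each feature next to its importance, highest first
for name, score in sorted(zip(day_headers, day_fi),
                          key=lambda pair: pair[1], reverse=True):
    print('%-12s %.4f' % (name, score))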

'''=============================== analysis of bike_hour.csv ==============================='''
# Read the data
data = []
with open('./ml_data/bike_hour.csv', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))

data = np.array(data)
hour_headers = data[0, 2:14]
print(hour_headers)
x = np.array(data[1:, 2:14], dtype='f8')
y = np.array(data[1:, -1], dtype='f8')

# Split into a training set and a test set
x, y = su.shuffle(x, y, random_state=7)  # shuffle the samples
train_size = int(len(x) * 0.9)
train_x, test_x, train_y, test_y = x[:train_size], x[train_size:], y[:train_size], y[train_size:]

# Train the model
model = se.RandomForestRegressor(max_depth=10, n_estimators=1000, min_samples_split=3)
model.fit(train_x, train_y)

# Test the model
pred_test_y = model.predict(test_x)

# Evaluate the model
print('r2_score of bike_hour:', sm.r2_score(test_y, pred_test_y))

# Output the model's feature importances
hour_fi = model.feature_importances_

# Plot the feature importances for bike_day
mp.figure('Feature Importance', facecolor='lightgray')
mp.rcParams['font.sans-serif'] = 'SimHei'
mp.subplot(211)
mp.title('Bike_day FI')
mp.ylabel('Feature Importance')
mp.grid(linestyle=':')
sorted_indexes = day_fi.argsort()[::-1]  # indices sorted in descending order of importance
x = np.arange(day_headers.size)
mp.bar(x, day_fi[sorted_indexes], 0.7, color='dodgerblue', label='BDFI')
mp.xticks(x, day_headers[sorted_indexes])  # set the x-axis tick labels
mp.tight_layout()
mp.legend()

# Plot the feature importances for bike_hour
mp.subplot(212)
mp.title('Bike_hour FI')
mp.ylabel('Feature Importance')
mp.grid(linestyle=':')
sorted_indexes = hour_fi.argsort()[::-1]  # indices sorted in descending order of importance
x = np.arange(hour_headers.size)
mp.bar(x, hour_fi[sorted_indexes], 0.7, color='orangered', label='BHFI')
mp.xticks(x, hour_headers[sorted_indexes])  # set the x-axis tick labels
mp.tight_layout()
mp.legend()

mp.show () 
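As a design note (a hypothetical refactor, not from the original post), the two plotting blocks above are nearly identical and could share one helper:

# Hypothetical helper that draws one feature-importance subplot
def plot_importance(subplot, title, headers, fi, color, label):
    mp.subplot(subplot)
    mp.title(title)
    mp.ylabel('Feature Importance')
    mp.grid(linestyle=':')
    order = fi.argsort()[::-1]       # indices in descending importance
    pos = np.arange(headers.size)
    mp.bar(pos, fi[order], 0.7, color=color, label=label)
    mp.xticks(pos, headers[order])
    mp.tight_layout()
    mp.legend()

# plot_importance(211, 'Bike_day FI', day_headers, day_fi, 'dodgerblue', 'BDFI')
# plot_importance(212, 'Bike_hour FI', hour_headers, hour_fi, 'orangered', 'BHFI')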




Output:
(732, 16)
['season' 'yr' 'mnth' 'holiday' 'weekday' 'workingday' 'weathersit' 'temp'
 'atemp' 'hum' 'windspeed']
r2_score of bike_day: 0.8929064136199945
['season' 'yr' 'mnth' 'hr' 'holiday' 'weekday' 'workingday' 'weathersit'
 'temp' 'atemp' 'hum' 'windspeed']
r2_score of bike_hour: 0.9185230199218621

  
