Machine Learning Notes 10 -- Decision Tree and Random Forest (1): Overview of Random Forest

1. Disadvantages of decision trees

 

In the original figure (not reproduced here), the red circle marks the part of the tree that pruning removes.

2. Pruning
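As a concrete illustration (not from the original notes), scikit-learn supports both pre-pruning, which limits tree growth through parameters such as max_depth and min_samples_leaf, and post-pruning via cost-complexity pruning (the ccp_alpha parameter, available in scikit-learn 0.22+). A minimal sketch, assuming the iris dataset bundled with scikit-learn and an arbitrary ccp_alpha value:

# Minimal pruning sketch (illustrative only; the ccp_alpha value 0.02 is arbitrary).
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# Pre-pruning: limit growth while the tree is being built.
pre_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=10)
pre_pruned.fit(x_train, y_train)

# Post-pruning: grow the tree fully, then cut back weak branches by cost-complexity pruning.
post_pruned = DecisionTreeClassifier(criterion='entropy', ccp_alpha=0.02)
post_pruned.fit(x_train, y_train)

print('pre-pruned test accuracy : %.2f%%' % (100 * pre_pruned.score(x_test, y_test)))
print('post-pruned test accuracy: %.2f%%' % (100 * post_pruned.score(x_test, y_test)))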

 

3. Random Forest

 

Note: the modification made by Random Forest is that, at each split, it randomly selects k attributes from the full set of attributes and then chooses the best splitting attribute from those k (see the sketch below).
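A minimal sketch of this idea (not part of the original notes) using scikit-learn's RandomForestClassifier, where the max_features parameter plays the role of k, the number of attributes sampled at each split; the dataset and parameter values are assumptions for illustration:

# Minimal random forest sketch (illustrative only; dataset and parameters are assumptions).
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# n_estimators: number of trees, each trained on a bootstrap sample of the data.
# max_features=2: at every split, only k=2 randomly chosen attributes are considered,
# and the best splitting attribute is picked from those k.
rf = RandomForestClassifier(n_estimators=100, max_features=2, criterion='entropy', random_state=1)
rf.fit(x_train, y_train)
print('Random forest test accuracy: %.2f%%' % (100 * rf.score(x_test, y_test)))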

 

4. Code example

(1) Decision tree case

 

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def iris_type(s):
    it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    return it[s]


# iris features: sepal length, sepal width, petal length, petal width
iris_feature = u'sepal length', u'sepal width', u'petal length', u'petal width'

if __name__ == "__main__":
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    path = '8.iris.data'  # data file path
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    x, y = np.split(data, (4,), axis=1)
    # for visualization, use only the first two features
    x = x[:, :2]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
    # ss = StandardScaler()
    # ss = ss.fit(x_train)

    # decision tree parameters
    # min_samples_split=10: a node may be split only if it contains more than 10 samples
    # min_samples_leaf=10: a split is kept only if each resulting child node contains at least 10 samples; otherwise the node is not split
    model = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', DecisionTreeClassifier(criterion='entropy', max_depth=3))])
    # clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    model = model.fit(x_train, y_train)
    y_test_hat = model.predict(x_test)  # predictions on the test data

    # save the tree structure
    # render with: dot -Tpng -o 1.png 1.dot
    f = open('iris_tree.dot', 'w')
    tree.export_graphviz(model.get_params('DTC')['DTC'], out_file=f)

    # plot
    N, M = 100, 100  # number of grid points in each of the two directions
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of feature 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of feature 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)  # generate the sampling grid
    x_show = np.stack((x1.flat, x2.flat), axis=1)  # grid points to classify

    # # meaningless, just to fill in the other two dimensions
    # # before uncommenting, make sure to comment out x = x[:, :2]
    # x3 = np.ones(x1.size) * np.average(x[:, 2])
    # x4 = np.ones(x1.size) * np.average(x[:, 3])
    # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1)  # test points

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_show_hat = model.predict(x_show)  # predicted class for every grid point
    y_show_hat = y_show_hat.reshape(x1.shape)  # reshape to match the grid
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # show the predicted regions
    plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test.ravel(), edgecolors='k', s=100, cmap=cm_dark, marker='o')  # test data
    plt.scatter(x[:, 0], x[:, 1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)  # all data
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title(u'Decision tree classification of the iris data', fontsize=17)
    plt.show()

    # prediction results on the test set
    y_test = y_test.reshape(-1)
    print(y_test_hat)
    print(y_test)
    result = (y_test_hat == y_test)  # True: correct prediction, False: wrong prediction
    acc = np.mean(result)
    print('Accuracy: %.2f%%' % (100 * acc))

    # overfitting: test error rate as a function of tree depth
    depth = np.arange(1, 15)
    err_list = []
    for d in depth:
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf = clf.fit(x_train, y_train)
        y_test_hat = clf.predict(x_test)  # predictions on the test data
        result = (y_test_hat == y_test)  # True: correct prediction, False: wrong prediction
        err = 1 - np.mean(result)
        err_list.append(err)
        print(d, ' Error rate: %.2f%%' % (100 * err))
    plt.figure(facecolor='w')
    plt.plot(depth, err_list, 'ro-', lw=2)
    plt.xlabel(u'Decision tree depth', fontsize=15)
    plt.ylabel(u'Error rate', fontsize=15)
    plt.title(u'Decision tree depth and overfitting', fontsize=17)
    plt.grid(True)
    plt.show()
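After the script runs, the exported iris_tree.dot file can be rendered to an image with Graphviz, e.g. dot -Tpng -o iris_tree.png iris_tree.dot (assuming the Graphviz dot tool is installed), as noted in the comment in the code above.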

 

Result plots (images not included here): the decision regions of the tree over the first two iris features, and the test error rate versus tree depth.

 


Origin www.cnblogs.com/luckyplj/p/12679516.html