1. Disadvantages of decision trees
In the figure above, the part circled in red is what pruning removes.
2. Pruning
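As a minimal sketch of the idea (not from the original post), scikit-learn's DecisionTreeClassifier supports minimal cost-complexity post-pruning through the ccp_alpha parameter: larger values of ccp_alpha cut away more of the tree. The alpha value below is only an illustrative choice.

# Sketch: post-pruning with cost-complexity pruning (ccp_alpha); assumes scikit-learn >= 0.22
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

x, y = load_iris(return_X_y=True)

# Fully grown tree vs. a tree pruned with an illustrative ccp_alpha value
unpruned = DecisionTreeClassifier(criterion='entropy', random_state=1).fit(x, y)
pruned = DecisionTreeClassifier(criterion='entropy', ccp_alpha=0.02, random_state=1).fit(x, y)

print('unpruned leaves:', unpruned.get_n_leaves())
print('pruned leaves:  ', pruned.get_n_leaves())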
3. Random Forest
Note: The modification made by Random Forest is that, at each node, it randomly selects k attributes from all attributes and then chooses the best split attribute from those k.
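A minimal sketch of this idea (not from the original post): in scikit-learn's RandomForestClassifier, the max_features parameter plays the role of k, the number of attributes randomly considered at each split. The parameter values below are only illustrative.

# Sketch: Random Forest where each split considers only k randomly chosen attributes
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# 100 trees; each split is chosen among max_features=2 randomly drawn attributes (the "k" above)
rf = RandomForestClassifier(n_estimators=100, max_features=2, criterion='entropy', random_state=1)
rf.fit(x_train, y_train)
print('Random Forest accuracy: %.2f%%' % (100 * rf.score(x_test, y_test)))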
4. Code example
(1) Decision tree case
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def iris_type(s):
    it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    return it[s]


# sepal length, sepal width, petal length, petal width
iris_feature = u'sepal length', u'sepal width', u'petal length', u'petal width'

if __name__ == "__main__":
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    path = '8.iris.data'  # data file path
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    x, y = np.split(data, (4,), axis=1)
    # For visualization, keep only the first two features
    x = x[:, :2]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
    # ss = StandardScaler()
    # ss = ss.fit(x_train)

    # Decision tree parameters:
    # min_samples_split=10: a node may be split only if it contains more than 10 samples
    # min_samples_leaf=10: a split is kept only if every resulting child node contains more than 10 samples
    model = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', DecisionTreeClassifier(criterion='entropy', max_depth=3))])
    # clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    model = model.fit(x_train, y_train)
    y_test_hat = model.predict(x_test)  # predictions on the test data

    # Save the tree; render it with: dot -Tpng -o iris_tree.png iris_tree.dot
    with open('iris_tree.dot', 'w') as f:
        tree.export_graphviz(model.get_params()['DTC'], out_file=f)

    # Plot the decision regions
    N, M = 100, 100  # number of sample points along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)  # grid of sampling points
    x_show = np.stack((x1.flat, x2.flat), axis=1)  # points to classify

    # # Meaningless values, only to fill in the other two dimensions.
    # # Before uncommenting, make sure to comment out x = x[:, :2] above.
    # x3 = np.ones(x1.size) * np.average(x[:, 2])
    # x4 = np.ones(x1.size) * np.average(x[:, 3])
    # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1)  # points to classify

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_show_hat = model.predict(x_show)  # predicted class of every grid point
    y_show_hat = y_show_hat.reshape(x1.shape)  # same shape as the grid
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # predicted regions
    plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test.ravel(), edgecolors='k', s=100, cmap=cm_dark, marker='o')  # test data
    plt.scatter(x[:, 0], x[:, 1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)  # all data
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title(u'Decision tree classification of the iris data', fontsize=17)
    plt.show()

    # Prediction results on the test set
    y_test = y_test.reshape(-1)
    print(y_test_hat)
    print(y_test)
    result = (y_test_hat == y_test)  # True: correct prediction, False: wrong prediction
    acc = np.mean(result)
    print('Accuracy: %.2f%%' % (100 * acc))

    # Overfitting: error rate as a function of tree depth
    depth = np.arange(1, 15)
    err_list = []
    for d in depth:
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf = clf.fit(x_train, y_train)
        y_test_hat = clf.predict(x_test)  # predictions on the test data
        result = (y_test_hat == y_test)  # True: correct prediction, False: wrong prediction
        err = 1 - np.mean(result)
        err_list.append(err)
        print(d, 'Error rate: %.2f%%' % (100 * err))
    plt.figure(facecolor='w')
    plt.plot(depth, err_list, 'ro-', lw=2)
    plt.xlabel(u'Decision tree depth', fontsize=15)
    plt.ylabel(u'Error rate', fontsize=15)
    plt.title(u'Decision tree depth and overfitting', fontsize=17)
    plt.grid(True)
    plt.show()
Result plots: the decision regions over the first two features, and the error rate as a function of tree depth.