Implementing machine learning models from scratch: regression tree

Following the mathematical principles of the model, this is a simple from-scratch implementation with a usage test, written purely for self-study. The theory behind the model is not covered in detail here; only the formulas actually used are listed.

If there are errors or deficiencies in the text or code, please feel free to correct me.

A regression tree is a kind of decision tree: a binary tree built with the CART algorithm. Unlike the original classification tree, the dependent variable here is continuous, so the Gini index is no longer used as the criterion for choosing split attributes; instead, the mean squared error determines both the split attribute and the split point.
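Concretely, for each candidate feature $j$ and split point $s$, the standard CART regression criterion picks the pair that minimizes the squared error of the two child regions (note that the implementation below averages each region's squared error over its size instead of summing it, a slight variation):

$$\min_{j,\,s}\left[\sum_{x_i \in R_1(j,s)}\left(y_i-\bar{y}_{R_1}\right)^2+\sum_{x_i \in R_2(j,s)}\left(y_i-\bar{y}_{R_2}\right)^2\right]$$

where $R_1(j,s)=\{x\mid x_j\le s\}$ and $R_2(j,s)=\{x\mid x_j>s\}$ for a continuous feature (equal/not-equal for a discrete one), and $\bar{y}_{R_m}$ is the mean label within region $R_m$.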

Here I implement a regression tree myself and perform post-pruning based on the number of samples in each leaf, the height of the tree, and the mean squared error on a held-out validation set. Due to my limited ability, the functions were not abstracted well during implementation; many were bolted on as afterthoughts, so the algorithm is not very efficient. Consider it a learning experience.
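In effect, the validation-set pruning below is reduced-error pruning: working upward from the leaves, a node's children are removed whenever removing them does not increase the validation MSE,

$$\mathrm{MSE}_{\mathrm{val}}\left(T\setminus T_t\right)\le \mathrm{MSE}_{\mathrm{val}}(T)\;\Rightarrow\;\text{prune the subtree } T_t,$$

where $T_t$ denotes the subtree rooted at node $t$.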

import numpy as np
import pandas as pd
from collections import deque
from sklearn.model_selection import train_test_split

class TreeNode:
    def __init__(self,labels_idx=None,left=None,right=None,split_idx=None,is_discrete=None,split_value=None,father=None) -> None:
        """
        回归树树结构
        left:左子树
        right:右子树
        labels_idx:在训练中训练集的label对应的下标
        is_discrete:是否是离散量
        split_idx:划分特征对应的下标
        split_value:划分点
        father:父亲节点
        """
        self.labels_idx = labels_idx
        self.left = left
        self.right = right
        self.split_idx = split_idx
        self.is_discrete = is_discrete
        self.split_value = split_value
        self.father = father

class RegressionTree:
    def __init__(self,data,labels,is_discrete,validate_ratio=0.1):
        """
        初始化
        is_discrete:列表,传入特征是否是变量
        validate_ratio:保留验证集的比例
        """
        self.data = np.array(data)
        self.labels=np.array(labels)
        self.feature_num = self.data.shape[1]
        self.is_discrete = is_discrete
        self.validate_ratio = validate_ratio
        self.leaves = []
        if validate_ratio>0:
            all_index = np.arange(self.data.shape[0])  # use the converted array so list input also works
            self.train_idx,self.test_idx = train_test_split(all_index,test_size=validate_ratio)
            self.validate_data = self.data[self.test_idx,:]
            self.validate_label = self.labels[self.test_idx]
            self.train_data = self.data[self.train_idx,:]
            self.train_label = self.labels[self.train_idx]
    
    def get_mse(self,y_pred,y_true):
        """
        计算MSE
        """
        y_pred = np.array(y_pred)
        y_true = np.array(y_true)
        return np.mean(np.square(y_pred-y_true))

    def generate_tree(self,idxs,min_ratio):
        """
        递归生成树结构
        idxs:子树结构所含元素的下标
        min_ratio:叶子节点至少应当占(训练集+验证集)的比例
        """
        root = TreeNode(labels_idx=idxs)
        if len(idxs)/self.data.shape[0]<=min_ratio:
            return root
        idx,split_value = self.choose_feature(self.data[idxs,:],self.labels[idxs])
        root.split_value = split_value
        root.split_idx = idx
        left_idxs = []
        right_idxs = []
        if self.is_discrete[idx]:
            for i in idxs:
                # discrete feature: samples equal to the split value go left
                if self.data[i,idx] == split_value:
                    left_idxs.append(i)
                else:
                    right_idxs.append(i)
        else:
            for i in idxs:
                # continuous feature: samples <= the split point go left, matching predict_one
                if self.data[i,idx] <= split_value:
                    left_idxs.append(i)
                else:
                    right_idxs.append(i)
        # if the chosen split fails to separate the samples, keep this node as a leaf
        if len(left_idxs)==0 or len(right_idxs)==0:
            return root
        left_idxs = np.array(left_idxs)
        right_idxs = np.array(right_idxs)
        root.left = self.generate_tree(left_idxs,min_ratio)
        if root.left:
            root.left.father = root
        root.right = self.generate_tree(right_idxs,min_ratio)
        if root.right:
            root.right.father = root
        return root

    def train(self,max_depth = 0,min_ratio=0.05):
        """
        Training: build the decision tree, then prune it.
        max_depth: maximum height of the tree (0 means no limit)
        min_ratio: minimum fraction of (training + validation) samples a leaf must hold
        """
        if self.validate_ratio>0:
            idx = self.train_idx
        else:
            idx = range(len(self.labels))
        
        self.tree = self.generate_tree(idx,min_ratio)
        # when the validation ratio > 0, apply post-pruning
        if self.validate_ratio>0:
            self.find_leaves(self.tree)
            nodes = deque(self.leaves)
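            # process candidates bottom-up: start at the leaves, move to a parent when a prune is kept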
            while len(nodes)>0:
                n=len(nodes)
                for _ in range(n):
                    node = nodes.popleft()
                    if not node.father:
                        nodes = []
                        break
                    valid_pred = self.predict(self.validate_data)
                    mse_before = self.get_mse(valid_pred,self.validate_label)
                    # tentatively prune: back up the children, then cut them off
                    backup_left = node.father.left
                    backup_right= node.father.right
                    node.father.left = None
                    node.father.right = None
                    valid_pred = self.predict(self.validate_data)
                    mse_after = self.get_mse(valid_pred,self.validate_label)
                    if mse_after>mse_before:
                        # pruning increased validation error: restore the backed-up children
                        node.father.left = backup_left
                        node.father.right = backup_right
                    else:
                        # pruning did not hurt: keep it and try the parent next round
                        nodes.append(node.father)
        # enforce the maximum tree height: truncate everything below depth max_depth
        if max_depth>0:
            nodes = deque([self.tree])
            d=1
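            # d tracks the depth of the current BFS level (root = 1)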
            while len(nodes)>0 and d<max_depth:
                n = len(nodes)
                for _ in range(n):
                    node = nodes.popleft()
                    if node.left:
                        nodes.append(node.left)
                    if node.right:
                        nodes.append(node.right)
                d += 1
            if len(nodes)>0:
                for node in nodes:
                    node.left=None
                    node.right=None
        
        
    def find_leaves(self,node):
        """
        寻找叶子节点
        """
        if not node.left and not node.right:
            self.leaves.append(node)
            return None
        else:
            if node.left:
                self.find_leaves(node.left)
            if node.right:
                self.find_leaves(node.right)
        

    def predict_one(self,x,node=None):
        """
        根据决策树预测给定的单个样本
        """
        if node == None:
            node = self.tree
        while node.left and node.right:
            idx = node.split_idx
            if self.is_discrete[idx]:
                if x[idx]==node.split_value:
                    node = node.left
                else:
                    node = node.right
            else:
                if x[idx]>node.split_value:
                    node = node.right
                else:
                    node = node.left

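        # leaf reached: predict the mean label of the training samples in this node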
        res_idx = node.labels_idx
        return np.mean(self.labels[res_idx])
    
    def predict(self,x,node=None):
        """
        预测给定的样本集
        """
        x = np.array(x)
        predicts = []
        for i in range(x.shape[0]):
            res = self.predict_one(x[i,:],node)
            predicts.append(res)
        return predicts

    def sum_std(self,x):
        """
        计算均方误差
        """
        return np.sum(np.square(x-np.mean(x)))/len(x)
    
    def choose_feature(self,x,labels):
        """
        Choose the feature and split point that minimize the summed
        per-child squared error.
        """
        std_list = []
        split_value_list = []
        for i in range(x.shape[1]):
            final_split_value,final_sum_std=self.calc_std(x[:,i],self.is_discrete[i],labels)
            std_list.append(final_sum_std)
            split_value_list.append(final_split_value)
        idx = np.argmin(std_list)
        return idx,split_value_list[idx]
    
    def calc_std(self,feature,is_discrete,labels):
        """
        For a single feature, find the split that minimizes the summed
        per-child squared error.
        """
        final_sum_std = float("inf")
        final_split_value = 0
        idx = range(len(feature))
        # pair each value with its row index so label indices survive sorting
        feature_with_idx = np.c_[idx,feature]
        labels = np.array(labels)
        if is_discrete:
            values = list(set(feature))
            idx_dict = {v:[] for v in values}
            for i,fea in feature_with_idx:
                # np.c_ may promote the index column to float; cast back before indexing labels
                idx_dict[fea].append(int(i))
            for v in values:
                anti_idx = [i for i in idx if i not in idx_dict[v]]
                left = labels[idx_dict[v]]
                right = labels[anti_idx]
                if left.shape[0]==0 or right.shape[0] == 0:
                    continue
                sum_std = self.sum_std(left)+self.sum_std(right)
                if sum_std<final_sum_std:
                    final_sum_std = sum_std
                    final_split_value = v
        else:
            feature_with_idx = feature_with_idx[feature_with_idx[:,1].argsort()]
            feature = feature_with_idx[:,1]
            idx = feature_with_idx[:,0]
            for i in range(len(feature)-1):
                if feature[i]==feature[i+1]:
                    continue
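                # candidate split point: the midpoint between adjacent distinct values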
                split_value = (feature[i]+feature[i+1])/2
                idx_left = idx[:i+1]
                idx_right = idx[i+1:]
                sum_std = self.sum_std(labels[idx_left.astype('int64')])+self.sum_std(labels[idx_right.astype('int64')])
                if sum_std<final_sum_std:
                    final_sum_std = sum_std
                    final_split_value = split_value
                    
        return final_split_value,final_sum_std

Tested with the mpg (automobile fuel-economy) dataset:

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_excel("mpg.xlsx")
df.replace("?",pd.NA,inplace=True)
df.dropna(axis=0,inplace=True)
label = df.iloc[:,0].values
data = df.iloc[:,1:5].values
x_train,x_test,y_train,y_test = train_test_split(data,label)

rt = RegressionTree(x_train,y_train,is_discrete=[True,False,False,False],validate_ratio=0.1)
rt.train(max_depth=10)
res = rt.predict(x_test)
rt.get_mse(res,y_test)
"""
53.20787956384344
"""


from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()
dr.fit(x_train,y_train)
res2 = dr.predict(x_test)
rt.get_mse(res2,y_test)
"""
27.945918367346938
"""

The results are not as good as sklearn's implementation, and they are not stable: occasionally the tree collapses to just the root node. If you find any deficiencies or mistakes, please point them out.

Origin: blog.csdn.net/thorn_r/article/details/124232111