This post re-implements the model from its mathematical principles, with simple code and a usage test, purely as a self-learning exercise. The theory behind the model is not covered in detail here; only the formulas actually used are listed.
If there are errors or deficiencies in the text or code, please feel free to correct me.
A regression tree is a kind of decision tree: a binary tree built with the CART algorithm. Unlike the classification decision tree, its dependent variable is continuous, so the Gini coefficient is no longer used as the criterion for choosing splits; instead, the mean squared error determines both the splitting attribute and the split point.
Here a regression tree is implemented from scratch, with post-pruning based on the number of samples in a leaf node, the height of the tree, and the mean squared error on a validation set. Due to my limited ability, the functions were not abstracted well during implementation — many were bolted on as afterthoughts, so the algorithm is not very efficient; treat it as a learning exercise.
import numpy as np
import pandas as pd
import random
import collections
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from collections import deque
class TreeNode:
    """A single node of the regression tree (a CART-style binary node)."""

    def __init__(self, labels_idx=None, left=None, right=None, split_idx=None,
                 is_discrete=None, split_value=None, father=None) -> None:
        """
        Regression-tree node.

        labels_idx:  indices into the training labels of the samples held by this node
        left:        left subtree (None for a leaf)
        right:       right subtree (None for a leaf)
        split_idx:   index of the feature this node splits on
        is_discrete: whether the split feature is categorical
        split_value: split point (a category for discrete features, a threshold otherwise)
        father:      parent node (None for the root)
        """
        # Tree links first, then the split description.
        self.father = father
        self.left = left
        self.right = right
        self.labels_idx = labels_idx
        self.split_idx = split_idx
        self.split_value = split_value
        self.is_discrete = is_discrete
class RegressionTree:
    """CART-style regression tree with optional post-pruning on a held-out validation set."""

    def __init__(self, data, labels, is_discrete, validate_ratio=0.1):
        """
        Initialize the tree.

        data:           feature matrix, shape (n_samples, n_features)
        labels:         continuous target values, shape (n_samples,)
        is_discrete:    list of bools, one per feature — True if the feature is categorical
        validate_ratio: fraction of the data held out for post-pruning (0 disables pruning)
        """
        self.data = np.array(data)
        self.labels = np.array(labels)
        self.feature_num = self.data.shape[1]
        self.is_discrete = is_discrete
        self.validate_ratio = validate_ratio
        self.leaves = []
        if validate_ratio > 0:
            # Use self.data (already an ndarray) so plain-list inputs work too;
            # the original read `data.shape`, which fails for list inputs.
            all_index = list(range(self.data.shape[0]))
            self.train_idx, self.test_idx = train_test_split(all_index, test_size=validate_ratio)
            self.validate_data = self.data[self.test_idx, :]
            self.validate_label = self.labels[self.test_idx]
            self.train_data = self.data[self.train_idx, :]
            self.train_label = self.labels[self.train_idx]

    def get_mse(self, y_pred, y_true):
        """Return the mean squared error between predictions and ground truth."""
        y_pred = np.array(y_pred)
        y_true = np.array(y_true)
        return np.mean(np.square(y_pred - y_true))

    def generate_tree(self, idxs, min_ratio):
        """
        Recursively grow the tree.

        idxs:      indices (into self.data / self.labels) of the samples in this subtree
        min_ratio: a node becomes a leaf once it holds at most this fraction of all samples
        """
        root = TreeNode(labels_idx=idxs)
        if len(idxs) / self.data.shape[0] <= min_ratio:
            return root
        idx, split_value = self.choose_feature(self.data[idxs, :], self.labels[idxs])
        root.split_value = split_value
        root.split_idx = idx
        left_idxs = []
        right_idxs = []
        for i in idxs:
            if self.is_discrete[idx]:
                goes_left = self.data[i, idx] == split_value
            else:
                # BUG FIX: samples with value <= split point must go LEFT, matching
                # both predict_one() and the left/right convention of calc_std().
                # The original sent them RIGHT, so prediction walked the wrong branch.
                goes_left = self.data[i, idx] <= split_value
            (left_idxs if goes_left else right_idxs).append(i)
        if not left_idxs or not right_idxs:
            # No usable split found (e.g. all feature values identical) — stay a leaf
            # instead of recursing into an empty child (whose mean would be NaN).
            return root
        root.left = self.generate_tree(np.array(left_idxs), min_ratio)
        root.left.father = root
        root.right = self.generate_tree(np.array(right_idxs), min_ratio)
        root.right.father = root
        return root

    def train(self, max_depth=0, min_ratio=0.05):
        """
        Build the decision tree, then prune it.

        max_depth: if > 0, truncate the tree to this height after pruning
        min_ratio: minimum fraction of all samples a leaf may hold
        """
        if self.validate_ratio > 0:
            idx = self.train_idx
        else:
            idx = list(range(len(self.labels)))
        self.tree = self.generate_tree(idx, min_ratio)
        # When a validation set is available, apply bottom-up post-pruning.
        if self.validate_ratio > 0:
            self.leaves = []  # reset so repeated train() calls don't accumulate leaves
            self.find_leaves(self.tree)
            nodes = deque(self.leaves)
            while len(nodes) > 0:
                n = len(nodes)
                for _ in range(n):
                    node = nodes.popleft()
                    if not node.father:
                        nodes.clear()
                        break
                    father = node.father
                    if father.left is None and father.right is None:
                        continue  # already pruned while processing the sibling
                    valid_pred = self.predict(self.validate_data)
                    mse_before = self.get_mse(valid_pred, self.validate_label)
                    backup_left = father.left
                    backup_right = father.right
                    father.left = None
                    father.right = None
                    valid_pred = self.predict(self.validate_data)
                    mse_after = self.get_mse(valid_pred, self.validate_label)
                    if mse_after > mse_before:
                        # BUG FIX: restore the saved children. The original assigned
                        # `father.left = father.left` — already None at that point — so
                        # every candidate got pruned regardless of the MSE comparison,
                        # which is why the tree sometimes collapsed to a single root.
                        father.left = backup_left
                        father.right = backup_right
                    else:
                        nodes.append(father)
        # Enforce the maximum height: cut everything below level max_depth.
        if max_depth > 0:
            nodes = deque([self.tree])
            d = 1
            while len(nodes) > 0 and d < max_depth:
                n = len(nodes)
                for _ in range(n):
                    node = nodes.popleft()
                    if node.left:
                        nodes.append(node.left)
                    if node.right:
                        nodes.append(node.right)
                d += 1
            for node in nodes:
                node.left = None
                node.right = None

    def find_leaves(self, node):
        """Collect every leaf of the subtree rooted at `node` into self.leaves."""
        if not node.left and not node.right:
            self.leaves.append(node)
            return None
        if node.left:
            self.find_leaves(node.left)
        if node.right:
            self.find_leaves(node.right)

    def predict_one(self, x, node=None):
        """Predict one sample by walking from `node` (default: the root) down to a leaf."""
        if node is None:  # `is None`, not `== None`
            node = self.tree
        while node.left and node.right:
            idx = node.split_idx
            if self.is_discrete[idx]:
                node = node.left if x[idx] == node.split_value else node.right
            else:
                node = node.right if x[idx] > node.split_value else node.left
        # A leaf predicts the mean label of the training samples it holds.
        return np.mean(self.labels[node.labels_idx])

    def predict(self, x, node=None):
        """Predict every row of the sample matrix `x`; returns a list of predictions."""
        x = np.array(x)
        return [self.predict_one(x[i, :], node) for i in range(x.shape[0])]

    def sum_std(self, x):
        """Mean squared deviation of `x` from its own mean (the per-node MSE)."""
        return np.sum(np.square(x - np.mean(x))) / len(x)

    def choose_feature(self, x, left_labels):
        """Pick the feature and split point minimizing the summed child MSE."""
        std_list = []
        split_value_list = []
        for i in range(x.shape[1]):
            final_split_value, final_sum_std = self.calc_std(x[:, i], self.is_discrete[i], left_labels)
            std_list.append(final_sum_std)
            split_value_list.append(final_split_value)
        idx = np.argmin(std_list)
        return idx, split_value_list[idx]

    def calc_std(self, feature, is_discrete, labels):
        """
        For a single feature column, find the split minimizing the summed child MSE.

        Returns (split_value, summed_mse); summed_mse is +inf when no valid split exists.
        """
        final_sum_std = float("inf")
        final_split_value = 0
        labels = np.array(labels)
        n = len(feature)
        if is_discrete:
            # Group row positions by category. enumerate() keeps the indices as Python
            # ints — the original stacked them into a float ndarray via np.c_ and then
            # indexed `labels` with floats, which modern NumPy rejects.
            groups = {}
            for i, value in enumerate(feature):
                groups.setdefault(value, []).append(i)
            for value, members in groups.items():
                mask = np.ones(n, dtype=bool)
                mask[members] = False
                left = labels[members]
                right = labels[mask]
                if left.shape[0] == 0 or right.shape[0] == 0:
                    continue
                sum_std = self.sum_std(left) + self.sum_std(right)
                if sum_std < final_sum_std:
                    final_sum_std = sum_std
                    final_split_value = value
        else:
            # Sort by feature value and carry the labels along; candidate splits are
            # the midpoints between consecutive distinct values.
            order = np.argsort(feature)
            sorted_feature = np.asarray(feature)[order]
            sorted_labels = labels[order]
            for i in range(n - 1):
                if sorted_feature[i] == sorted_feature[i + 1]:
                    continue  # no boundary between equal values
                split_value = (sorted_feature[i] + sorted_feature[i + 1]) / 2
                sum_std = self.sum_std(sorted_labels[:i + 1]) + self.sum_std(sorted_labels[i + 1:])
                if sum_std < final_sum_std:
                    final_sum_std = sum_std
                    final_split_value = split_value
        return final_split_value, final_sum_std
Tested with the mpg (car fuel-economy) dataset:
# Evaluate the hand-written regression tree on the mpg dataset.
import pandas as pd

frame = pd.read_excel("mpg.xlsx")
# The raw file marks missing entries with "?"; drop those rows entirely.
frame.replace("?", pd.NA, inplace=True)
frame.dropna(axis=0, inplace=True)
# First column is the target (mpg); the next four columns are the features.
target = frame.iloc[:, 0].values
features = frame.iloc[:, 1:5].values
x_train, x_test, y_train, y_test = train_test_split(features, target)
rt = RegressionTree(x_train, y_train, is_discrete=[True, False, False, False], validate_ratio=0.1)
rt.train(max_depth=10)
res = rt.predict(x_test)
rt.get_mse(res, y_test)
"""
53.20787956384344
"""
# Baseline: sklearn's regression tree on the same train/test split.
from sklearn.tree import DecisionTreeRegressor

sk_tree = DecisionTreeRegressor()
sk_tree.fit(x_train, y_train)
res2 = sk_tree.predict(x_test)
rt.get_mse(res2, y_test)
"""
27.945918367346938
"""
The results are not as good as sklearn's implementation, and they are unstable — sometimes the tree collapses to only the root node. If you find any deficiencies or mistakes, please let me know.