"Machine Learning Formula Derivation and Code Implementation" chapter11-GBDT

"Machine Learning Formula Derivation and Code Implementation" study notes, record your own learning process, please buy the author's book for detailed content.

GBDT

Boosting is the general term for a class of algorithms that promote weak learners to a strong learner. A boosting tree is a boosting method that uses decision trees as the weak learners. The boosting tree model is typically solved by combining an additive model with the forward stagewise algorithm. When the loss function is the squared loss or the exponential loss, each iteration of the forward stagewise algorithm is easy to solve; for a general loss function, however, each step of the forward stagewise algorithm is not so easy to optimize. For this reason, it was proposed to use the negative gradient of the loss function evaluated at the current model to solve a more general boosting tree model, known as the gradient boosting tree.
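
For reference, the boosting tree can be written as an additive model of M decision trees fitted by the forward stagewise algorithm one tree at a time (standard notation added here for these notes; the book's own derivation is more detailed):

f_M(x) = \sum_{m=1}^{M} T(x; \Theta_m)

\hat{\Theta}_m = \arg\min_{\Theta_m} \sum_{i=1}^{N} L\bigl(y_i,\ f_{m-1}(x_i) + T(x_i; \Theta_m)\bigr)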

1 GBDT algorithm introduction

GBDT stands for gradient boosting decision tree. When the base model (weak learner) of gradient boosting is a CART decision tree, the model is called GBDT; the corresponding gradient boosting tree for regression problems is called GBRT (gradient boosting regression tree). Combining multiple decision trees additively gives the boosting tree model, and optimizing the boosting tree model with gradient descent gives the gradient boosting tree model. In most cases a general loss function is hard to optimize directly, so the gradient boosting tree solves the boosting tree model based on the negative gradient: the value of the negative gradient of the loss function at the current model is used as an approximation of the residual of the regression boosting tree.
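The figures from the original post are not reproduced here. The key formula behind the gradient boosting step (standard notation, added for reference) is the pseudo-residual, i.e. the negative gradient of the loss evaluated at the current model, which the next tree is fit to:

r_{mi} = -\left[\frac{\partial L(y_i, f(x_i))}{\partial f(x_i)}\right]_{f = f_{m-1}}

For the squared loss L(y, f) = \frac{1}{2}(y - f)^2 this is exactly the ordinary residual r_{mi} = y_i - f_{m-1}(x_i).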

2 GBDT algorithm implementation

import numpy as np

# GBDT loss function: squared loss
class SquareLoss:
    def loss(self, y, y_pred): # squared loss
        return 0.5 * np.power((y - y_pred), 2)
    
    def gradient(self, y, y_pred): # first derivative of the squared loss w.r.t. the prediction
        return -(y - y_pred)
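
As a quick sanity check of the gradient method above (my own note): differentiating the squared loss with respect to the prediction gives

\frac{\partial}{\partial \hat{y}} \, \frac{1}{2}(y - \hat{y})^2 = -(y - \hat{y}),

so the negative gradient is exactly the residual y - \hat{y}. In the fit loop below, each new tree is fit to the gradient and its prediction is subtracted from the model output, which amounts to the same thing.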

We directly reuse the CART decision tree to define the GBDT class. Its attributes include the basic hyperparameters of GBDT, such as the number of trees, the learning rate, the minimum number of samples required to split a node, the maximum tree depth, and so on (see the earlier CART implementation).
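
If the book's cart.py is not available, a minimal stand-in with the same interface can be used instead (my own sketch wrapping sklearn's DecisionTreeRegressor, not the book's implementation; min_gini_impurity is accepted but ignored, so results will differ). In that case, skip the from cart import RegressionTree line below.

# Minimal stand-in for cart.RegressionTree (assumption: the GBDT code below only
# needs fit/predict and these three hyperparameters)
from sklearn.tree import DecisionTreeRegressor

class RegressionTree:
    def __init__(self, min_samples_split=2, min_gini_impurity=999, max_depth=float('inf')):
        self.min_gini_impurity = min_gini_impurity # kept for interface compatibility, not used
        sk_depth = None if max_depth == float('inf') else int(max_depth) # sklearn expects int or None
        self.tree = DecisionTreeRegressor(min_samples_split=min_samples_split, max_depth=sk_depth)

    def fit(self, X, y):
        self.tree.fit(X, y)
        return self

    def predict(self, X):
        return self.tree.predict(X)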

from cart import RegressionTree

# GBDT class definition
class GBDT(object):
    
    def __init__(self, n_estimators, learning_rate, min_samples_split, min_gini_impurity, max_depth, regression):
        self.n_estimators = n_estimators # number of trees
        self.learning_rate = learning_rate # learning rate
        self.min_samples_split = min_samples_split # minimum number of samples required to split a node
        self.min_gini_impurity = min_gini_impurity # minimum Gini impurity of a node
        self.max_depth = max_depth # maximum tree depth
        self.regression = regression # regression tree by default
        self.loss = SquareLoss() # for a classification tree, a classification loss function would be needed
        self.estimators = []
        for _ in range(self.n_estimators):
            self.estimators.append(RegressionTree(min_samples_split=self.min_samples_split, min_gini_impurity=self.min_gini_impurity, max_depth=self.max_depth))
    
    def fit(self, X, y):
        self.estimators[0].fit(X, y) # initialize the forward stagewise model: fit the first tree
        y_pred = self.estimators[0].predict(X) # predictions of the first tree

        for i in range(1, self.n_estimators): # forward stagewise iterative training
            gradient = self.loss.gradient(y, y_pred) # gradient of the loss at the current model
            self.estimators[i].fit(X, gradient) # fit the next tree to the gradient
            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X)) # update the model prediction
    
    def predict(self, X):
        y_pred = self.estimators[0].predict(X) # prediction of the first tree
        for i in range(1, self.n_estimators): # accumulate the remaining trees' contributions
            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))
        if not self.regression: # classification: map scores to class labels
            proba = 1 / (1 + np.exp(-y_pred)) # sigmoid to get probabilities
            proba = np.vstack([1 - proba, proba]).T
            y_pred = np.argmax(proba, axis=1)
        return y_pred

With the GBDT parent class in place, the GBDT classifier and the GBRT regressor are implemented as subclasses:

# GBDT classifier
class GBDTClassifier(GBDT):
    def __init__(self, n_estimators=2, learning_rate=.5, min_samples_split=2, min_info_gain=999, max_depth=float('inf')):
        super(GBDTClassifier, self).__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            min_samples_split=min_samples_split,
            min_gini_impurity=min_info_gain,
            max_depth=max_depth,
            regression=False
        )

# GBRT regressor
class GBDTRegressor(GBDT):
    def __init__(self, n_estimators=2, learning_rate=0.1, min_samples_split=3, min_var_reduction=999, max_depth=float('inf')):
        super(GBDTRegressor, self).__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            min_samples_split=min_samples_split,
            min_gini_impurity=min_var_reduction,
            max_depth=max_depth,
            regression=True
        )
# Regression tree test
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.datasets import load_boston
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = GBDTRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
Mean Squared Error: 72.14748355263158
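
Note: load_boston was removed in scikit-learn 1.2, so the snippet above only runs on older versions. A drop-in alternative on newer versions (my own adaptation, not from the book) is the California housing data; it is much larger, so the pure-Python trees train noticeably slower:

# Alternative regression test for scikit-learn >= 1.2 (load_boston removed)
from sklearn.datasets import fetch_california_housing
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
model = GBDTRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))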
# Classification tree test
from sklearn.datasets import make_blobs # import the simulated binary classification data generator
X, y = make_blobs(n_samples=150, n_features=2, centers=2, cluster_std=1.2, random_state=40) # generate a simulated binary classification dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
model = GBDTClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
0.9777777777777777

3 Implementation of GBDT and GBRT based on sklearn

# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
reg = GradientBoostingRegressor(n_estimators=200, learning_rate=0.5, max_depth=4, random_state=0)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
print(mse)
11.275700412479738
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
X, y = make_blobs(n_samples=150, n_features=2, centers=2, cluster_std=1.2, random_state=40) # generate a simulated binary classification dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
cls = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5, max_depth=4, random_state=0)
cls.fit(X_train, y_train)
y_pred = cls.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
1.0
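
To see how the boosting iterations improve the model, sklearn's gradient boosting estimators expose staged_predict, which yields predictions after each added tree (a small illustrative addition, not from the book):

# Accuracy after each boosting round of the classifier above
stage_acc = [accuracy_score(y_test, y_stage) for y_stage in cls.staged_predict(X_test)]
print("rounds to reach best accuracy:", stage_acc.index(max(stage_acc)) + 1)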

Notebook GitHub address


Origin blog.csdn.net/cjw838982809/article/details/131236079