机器学习 集成学习篇——python实现Bagging和AdaBOOST算法

机器学习 集成学习篇——python实现Bagging和AdaBOOST算法

摘要

本文使用 Python 实现了集成学习中的 Bagging 和 AdaBoost 算法,并将代码封装成类,方便读者直接调用。

Bagging算法

import numpy as np
import pandas as pd
class Cyrus_bagging(object):
    """Bagging (bootstrap aggregating) classifier around a base estimator.

    Each ensemble member is an independent copy of ``estimator`` trained on a
    bootstrap sample (drawn with replacement, same size as the training set).
    Predictions are combined by unweighted majority vote.

    Parameters
    ----------
    estimator : object
        Unfitted template exposing ``fit(x, y)`` and ``predict(x)``. It is
        copied for every ensemble member and never fitted itself.
    n_estimators : int, default 20
        Number of ensemble members to train.

    Attributes
    ----------
    models : list or None
        Fitted ensemble members; ``None`` until ``fit`` has been called.
    """

    def __init__(self, estimator, n_estimators=20):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.models = None

    def fit(self, x, y):
        """Train ``n_estimators`` copies of the base estimator.

        Parameters
        ----------
        x : array-like
            Training samples; the first axis indexes samples.
        y : array-like
            Training labels; flattened to one dimension.

        Returns
        -------
        Cyrus_bagging
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``x`` contains no samples.
        """
        import copy

        x = np.asarray(x)
        y = np.asarray(y).reshape(-1)
        n_samples = x.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit a bagging ensemble on an empty data set")

        indices = np.arange(n_samples)
        self.models = []
        for _ in range(self.n_estimators):
            index = np.random.choice(indices, n_samples)
            # Fit a fresh copy each round: fitting the shared template would
            # leave every list entry pointing at the same (last-fitted) model.
            member = copy.deepcopy(self.estimator)
            member.fit(x[index], y[index])
            self.models.append(member)
        return self

    def predict(self, x):
        """Predict labels by majority vote over the fitted members.

        Ties are resolved in favour of the smallest label (the order produced
        by ``np.unique``). The returned labels keep the dtype produced by the
        members' ``predict``.

        Parameters
        ----------
        x : array-like
            Samples to classify; the first axis indexes samples.

        Returns
        -------
        numpy.ndarray
            One predicted label per sample.

        Raises
        ------
        RuntimeError
            If called before ``fit`` (or with an empty ensemble).
        """
        if not self.models:
            raise RuntimeError("fit must be called before predict")

        x = np.asarray(x)
        # One column of predictions per ensemble member.
        votes = np.column_stack([member.predict(x) for member in self.models])
        if votes.shape[0] == 0:
            return np.array([], dtype=votes.dtype)

        labels = np.unique(votes)
        # counts[i, k] = number of members voting labels[k] for sample i.
        counts = np.column_stack(
            [(votes == label).sum(axis=1) for label in labels]
        )
        return labels[counts.argmax(axis=1)]
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
# Usage example: bag k-nearest-neighbour classifiers and print test metrics.
# NOTE(review): x_train, y_train, x_test and y_test are not defined in this
# snippet; they must be prepared by the caller before this code runs.
knn = KNeighborsClassifier()
model = Cyrus_bagging(knn)  # n_estimators is left at its default (20)
model.fit(x_train,y_train)
y_pre = model.predict(x_test)
print(classification_report(y_test,y_pre))

为了突出集成算法与单一模型在准确率上的差异,示例使用了特征较少的数据,因此准确率不算特别高;不过与未使用集成算法的模型相比,准确率已经高出不少。

precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       0.67      0.67      0.67         9
          2       0.70      0.70      0.70        10

avg / total       0.80      0.80      0.80        30

AdaBoost算法

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
class CyrusAdaBoost(object):
    """Boosting-style ensemble classifier driven by weighted resampling.

    Every round draws a bootstrap sample according to the current per-sample
    weights, fits an independent copy of ``estimator`` on it, evaluates that
    copy on the full training set, and re-weights the training samples so
    that misclassified ones are more likely to be drawn next round.

    NOTE: the weighting rule is a simplified heuristic, not the canonical
    AdaBoost formula ``alpha = 0.5 * ln((1 - err) / err)``: sample weights
    are scaled by ``exp(+err)`` / ``exp(-err)`` and each member votes with
    weight ``1 - err``, where ``err`` is its unweighted training error.

    Parameters
    ----------
    estimator : object
        Unfitted template exposing ``fit(x, y)`` and ``predict(x)``. It is
        copied for every round and never fitted itself.
    n_estimators : int, default 20
        Number of boosting rounds.

    Attributes
    ----------
    error_rate : list of float or None
        Training-set error rate of each member, in fitting order.
    model : list or None
        Fitted members, in fitting order.
    """

    def __init__(self, estimator, n_estimators=20):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.error_rate = None
        self.model = None

    def update_w(self, y, pre_y, w):
        """Return re-normalised sample weights after one boosting round.

        ``y``, ``pre_y`` and ``w`` must be aligned element by element: entry
        ``i`` of each refers to the same sample. The input ``w`` is left
        unmodified.

        Parameters
        ----------
        y : array-like
            True labels.
        pre_y : array-like
            Labels predicted for the same samples.
        w : array-like of float
            Current sample weights.

        Returns
        -------
        numpy.ndarray
            New weights summing to 1; misclassified samples are scaled by
            ``exp(err)`` and correct ones by ``exp(-err)``, where ``err`` is
            the fraction of misclassified samples.
        """
        y = np.asarray(y).reshape(-1)
        pre_y = np.asarray(pre_y).reshape(-1)
        w = np.asarray(w, dtype=float)

        missed = y != pre_y
        error_rate = missed.mean()
        # Building a new array (instead of assigning into ``w``) keeps the
        # caller's weight vector intact.
        scaled = w * np.exp(np.where(missed, error_rate, -error_rate))
        return scaled / scaled.sum()

    def cal_label(self, result, alpha):
        """Combine member predictions by weighted vote.

        Ties are resolved in favour of the smallest label (the order produced
        by ``np.unique``). Labels may be any values comparable with ``==``,
        including negative numbers.

        Parameters
        ----------
        result : array-like, shape (n_samples, n_members)
            ``result[i, j]`` is member ``j``'s prediction for sample ``i``.
        alpha : array-like, shape (n_members,)
            Vote weight of each member.

        Returns
        -------
        numpy.ndarray
            The label with the largest total vote weight for each sample.
        """
        result = np.asarray(result)
        alpha = np.asarray(alpha, dtype=float)
        if result.size == 0:
            return np.array([], dtype=result.dtype)

        classes = np.unique(result)
        # scores[i, k] = total weight of members voting classes[k] on sample i.
        scores = np.column_stack(
            [(result == label).astype(float) @ alpha for label in classes]
        )
        return classes[scores.argmax(axis=1)]

    def fit(self, x, y):
        """Run ``n_estimators`` rounds of weighted resampling and fitting.

        Parameters
        ----------
        x : array-like
            Training samples; the first axis indexes samples.
        y : array-like
            Training labels; flattened to one dimension.

        Returns
        -------
        CyrusAdaBoost
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``x`` contains no samples.
        """
        import copy

        x = np.asarray(x)
        y = np.asarray(y).reshape(-1)
        n_samples = x.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit a boosting ensemble on an empty data set")

        self.error_rate = []
        self.model = []
        w0 = np.full(n_samples, 1.0 / n_samples)
        indices = np.arange(n_samples)
        for _ in range(self.n_estimators):
            index = np.random.choice(indices, size=n_samples, p=w0)
            # Fit a fresh copy each round: fitting the shared template would
            # leave every list entry pointing at the same (last-fitted) model.
            model0 = copy.deepcopy(self.estimator)
            model0.fit(x[index], y[index])
            # Evaluate on the full training set, in original order, so that
            # pre_y[i] lines up with w0[i]. Scoring the resampled draw would
            # update the weights of unrelated samples.
            pre_y = np.asarray(model0.predict(x)).reshape(-1)
            self.error_rate.append(float(np.mean(pre_y != y)))
            self.model.append(model0)
            w0 = self.update_w(y, pre_y, w0)
        return self

    def predict(self, x):
        """Predict labels by weighted vote of the fitted members.

        Each member votes with weight ``1 - error_rate`` recorded in ``fit``.

        Parameters
        ----------
        x : array-like
            Samples to classify; the first axis indexes samples.

        Returns
        -------
        numpy.ndarray
            One predicted label per sample.

        Raises
        ------
        RuntimeError
            If called before ``fit`` (or with an empty ensemble).
        """
        if not self.model:
            raise RuntimeError("fit must be called before predict")

        x = np.asarray(x)
        # One column of predictions per ensemble member.
        res = np.column_stack([member.predict(x) for member in self.model])
        alpha = 1 - np.asarray(self.error_rate, dtype=float)
        return self.cal_label(res, alpha)
from sklearn.tree import DecisionTreeClassifier
# Usage example: boost 50 decision trees and print the test-set accuracy.
# NOTE(review): x_train, y_train, x_test and y_test are not defined in this
# snippet; they must be prepared by the caller before this code runs.
model = CyrusAdaBoost(estimator=DecisionTreeClassifier(),n_estimators=50)
model.fit(x_train,y_train)
y_pre = model.predict(x_test)
print(accuracy_score(y_pre,y_test))
0.932

by CyrusMay 2020 06 12

这世界全部的漂亮
不过你的可爱模样
——————五月天(爱情的模样)——————

猜你喜欢

转载自blog.csdn.net/Cyrus_May/article/details/106714283