Implementing the Random Forest Algorithm in Python (Without sklearn)

Preface

There is already plenty of material online explaining how random forests work, so this post does not dwell on the theory behind random forests or behind Bagging and Boosting, nor is it about calling a library to do the work. While writing my own random forest recently, I noticed that many of the Python implementations online do not actually follow the original random forest formulation, and some are needlessly convoluted or outright wrong. So I decided to write my own Python implementation, both to record my learning process and to share code that others can use.

The code below runs as-is and should be free of errors. I have added comments at the points I consider most critical, and every function has a short description. That said, this is the most basic random forest, implemented directly from the original paper with essentially no optimization, so on the dataset used here it only reaches an accuracy of **75%-80%**. The goal is learning and getting the basic mechanics right.

Function Overview

csv_to_list() # convert the imported CSV file into a list for easier manipulation
data_split() # build the training and validation folds
train_split() # split a dataset into left and right branches
gini_calculate() # compute the Gini index of a split (see the formula below)
get_split() # find the best feature and value to split on
to_terminal() # take the most common class label as the final leaf prediction; the same majority-vote idea as the bagging() function
split() # recursive splitting function
build_tree() # build a decision tree
predict() # prediction function for a single decision tree
bagging() # bagging (bootstrap aggregating): the core idea of the random forest algorithm!!
subsample() # bootstrap sampling for each decision tree
random_forest() # build the random forest model
acc() # evaluation function, with accuracy (ACC) as the metric
calculate_nfolds_score() # evaluate model performance and return the 4-fold scores
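
For reference, the split score that gini_calculate() computes below is the standard class-weighted Gini impurity of the two child groups:

$$\mathrm{Gini}(\text{split}) = \sum_{g \in \{\text{left},\,\text{right}\}} \frac{|g|}{|D|} \sum_{c} p_{g,c}\,(1 - p_{g,c})$$

where $|D| = |\text{left}| + |\text{right}|$ and $p_{g,c}$ is the fraction of rows in group $g$ whose class label is $c$ (note that $\sum_c p_c(1-p_c) = 1 - \sum_c p_c^2$, the more familiar form). A perfectly pure split scores 0, which is why get_split() keeps the candidate with the smallest value.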

The full code is as follows:

from random import randrange
import pandas as pd

# Convert the imported CSV file into a list for easier manipulation
def csv_to_list(data1):
    data = []
    for i in range(len(data1)):
        data.append(list(data1.iloc[i])) # note: convert each DataFrame row into a plain list to simplify the computations below
    return data
# Build the training and validation folds
def data_split(dataset, n_folds):
    data = list()
    dataset_copy = list(dataset)      
    fold_size = len(dataset) // n_folds
    for i in range(n_folds):
        fold = list()                 
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            # !! note: I use sampling WITHOUT replacement here; with replacement the
            # validation accuracy looks better, but generalization is weaker.
            # pop(index) also fixes a bug: remove() deletes the first equal row,
            # not necessarily the one at `index`.
            fold.append(dataset_copy.pop(index))
        data.append(fold)
    return data
# Build the left and right branches of a split
def train_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Compute the Gini index of a split
def gini_calculate(groups, class_values):
    gini = 0.0
    D = len(groups[0]) + len(groups[1])
    for class_value in class_values:    
        for group in groups:          
            size = len(group)
            if size == 0:
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += float(size)/D * (proportion * (1.0 - proportion))   
    return gini
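
# A quick worked example for gini_calculate (illustration only, not part of the model):
#   groups = ([[1, 'A'], [2, 'A']], [[3, 'B']]), class_values = ['A', 'B'], D = 3
#   left:  p(A) = 1.0 -> 2/3 * 1.0*(1-1.0) = 0 ;  p(B) = 0.0 -> 0
#   right: p(B) = 1.0 -> 1/3 * 1.0*(1-1.0) = 0 ;  p(A) = 0.0 -> 0
#   total Gini = 0.0: a perfectly pure split scores 0 and will be preferred.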

# Find the best split feature and value
def get_split(dataset, n_features):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    features = list()
    while len(features) < n_features:
        index = randrange(len(dataset[0])-1)  # the second core idea of random forests: choose candidate feature columns at random for each tree!!
        if index not in features:
            features.append(index)
    for index in features:
        for row in dataset:
            groups = train_split(index, row[index], dataset) 
            gini = gini_calculate(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

# Take the most common class label in a group as the final leaf prediction; the same majority-vote idea as bagging()
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Recursive splitting function
def split(node, max_depth, min_size, n_features, depth): 
    left, right = node['groups']
    del(node['groups'])
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    if depth >= max_depth:   # if the maximum depth is reached before the split is finished, take the majority label, which guards against overfitting
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left, n_features) 
        split(node['left'], max_depth, min_size, n_features, depth+1)
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    # get_split returns the best column and the associated split information
    root = get_split(train, n_features)
    split(root, max_depth, min_size, n_features, 1)
    return root
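
# The tree returned by build_tree() is a nested dict:
#   {'index': feature_column, 'value': split_threshold,
#    'left': subtree_or_class_label, 'right': subtree_or_class_label}
# Leaves are plain class labels, which is exactly what predict() below checks
# with isinstance(node['left'], dict).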

# Prediction function for a single decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):      
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Bagging (bootstrap aggregating): the core idea of the random forest algorithm!!
def bagging(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count) # the class predicted by the most trees becomes the final prediction

# Bootstrap sampling for a single decision tree
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset)) # random sampling WITH replacement
        sample.append(dataset[index])
    return sample
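
# Note on the bootstrap: since subsample() draws WITH replacement, with
# ratio = 1.0 each tree sees on average only about 63.2% (1 - 1/e) of the
# distinct training rows, the rest being duplicates. This per-tree randomness
# is what makes the individual trees differ from one another.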

# Build the random forest model
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    trees = list()
    for i in range(n_trees):
        sample = subsample(train, sample_size)
        tree = build_tree(sample, max_depth, min_size, n_features) # build one decision tree
        trees.append(tree)
    predictions = [bagging(trees, row) for row in test] # predict the test set by bagging the trees
    return predictions

# Evaluation function, with accuracy (ACC) as the metric
def acc(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate model performance and return the n-fold scores
def calculate_nfolds_score(dataset, algorithm, n_folds, *args):
    folds = data_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train = list(folds)
        train.remove(fold)
        train = sum(train, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None    # hide the true label from the model
            test_set.append(row_copy)
        predicted = algorithm(train, test_set, *args)
        test_label = [row[-1] for row in fold] # the actual labels, compared against the predictions below
        acc_score = acc(test_label, predicted) # accuracy of the random forest's predictions
        scores.append(acc_score)
    return scores

data = pd.read_csv('sonar.csv', header=None)
data = csv_to_list(data)
n_folds = 4  # why 4 folds: I wanted folds of equal size, and with 208 rows a 5-fold split is uneven, so for convenience the data is divided into 4 parts
max_depth = 5   
min_size = 1    
sample_size = 1.0  # these hyperparameters are free to tune; the values here were picked fairly arbitrarily
n_features = 15    
for n_trees in [1, 10, 20, 30]:
    scores = calculate_nfolds_score(data, random_forest, n_folds, max_depth, min_size,
                                sample_size, n_trees, n_features)
    print('Number of decision trees in the forest: %d' % n_trees)
    print('Score on each validation fold: %s' % scores)
    print('Mean ACC score: %.3f%%' % (sum(scores)/float(len(scores))))
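
If you just want to train a single forest on all the data and classify one new sample, rather than run cross-validation, a minimal sketch using the functions above looks like this (same arbitrary hyperparameter values as in the script; `data` is assumed to have been loaded via csv_to_list as above):

trees = []
for _ in range(10):
    sample = subsample(data, 1.0)                # bootstrap sample for this tree
    trees.append(build_tree(sample, 5, 1, 15))   # max_depth=5, min_size=1, n_features=15
new_row = list(data[0])
new_row[-1] = None                               # pretend the label is unknown
print(bagging(trees, new_row))                   # majority vote across the 10 trees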

The code above should run without syntax errors; just point the dataset path at your own copy. The dataset download link is below.
I only added comments where I felt they mattered most. If anything is still unclear, the references below may help: some of the operations in that code are fairly convoluted, and I simplified and improved a few of them here, but the annotations there are very detailed and good for cross-reference:
https://github.com/apachecn/AiLearning/blob/e6ddd161f89f42d45fcee483b2292a8c7b2a9638/src/py2.x/ml/7.RandomForest/randomForest.py#L136
https://www.cnblogs.com/carlber/p/11812986.html
Dataset download link: https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/
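
If you'd rather not download the file manually, pandas can usually read the UCI file directly from the URL. This is a small convenience sketch; the filename sonar.all-data is assumed from the UCI directory listing, so double-check it there, and note it requires network access:

import pandas as pd
# direct load; the exact filename is assumed from the UCI directory listing
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'
data = pd.read_csv(url, header=None)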

Final Words

I hope everyone shares their own learning notes and questions for discussion. Stay happy!

Reposted from blog.csdn.net/qq_44694861/article/details/111040483