Machine Learning in Action Series (2): Decision Trees


Decision Trees

Note:

Place the dataset file 'lenses.txt' in the current working directory.



from math import log2
from collections import Counter
import numpy as np

1.1 Building the Decision Tree

1.1.1 Information Gain


def ent(dataset):
    """Shannon entropy of the class labels (last column)."""
    dataset=np.array(dataset)
    m=dataset.shape[0]
    label_counts=Counter(dataset[:,-1])
    # H(D) = -sum(p * log2(p)) over class probabilities p = v/m
    return sum(-(v/m)*log2(v/m) for v in label_counts.values())
    

def create_dataset():
    data_set=[[1,1,'yes'],
              [1,1,'yes'],
              [1,0,'no'],
              [0,1,'no'],
              [0,1,'no']]
    labels=['no surfacing','flippers']
    return data_set,labels





my_data,labels=create_dataset()
print(my_data)
print(ent(my_data))
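

As a quick hand check (a sketch using the same formula): the toy set has 2 'yes' and 3 'no' labels out of 5 samples, so the printed entropy should be about 0.9710.

print(-(2/5)*log2(2/5) - (3/5)*log2(3/5))  # ≈ 0.9710, matching ent(my_data)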



my_data[0][-1]='maybe'  # introduce a third class label; the entropy rises to ≈ 1.3710
print(my_data)
print(ent(my_data))

1.1.2 Splitting the Dataset



def split_dataset(dataset,axis,value):
    # np.array() coerces the mixed int/str rows to a str array,
    # so `value` must be passed as a string (e.g. '1', not 1)
    dataset=np.array(dataset)
    idx=np.where(dataset[:,axis]==value)
    # keep the matching rows and drop the feature column just used
    return np.delete(dataset[idx],axis,axis=1)




my_data,labels=create_dataset()
split_dataset(my_data,0,'1')


split_dataset(my_data,0,'0')
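

If the split works as intended, the two calls above should produce roughly the following (all strings, a consequence of the np.array coercion):

# split_dataset(my_data, 0, '1') -> rows where feature 0 == '1', minus that column:
#   [['1' 'yes']
#    ['1' 'yes']
#    ['0' 'no']]
# split_dataset(my_data, 0, '0') ->
#   [['1' 'no']
#    ['1' 'no']]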


def choose_best_feature2split(dataset):
    dataset=np.array(dataset)
    m,n=dataset.shape
    base_ent=ent(dataset)
    best_info_gain=0.0
    best_feature=0  # fall back to the first feature if no split improves entropy
    for i in range(n-1):
        feature_counts=Counter(dataset[:,i])
        feature_prob={k:v/m for (k,v) in feature_counts.items()}
        # entropy of each subset produced by splitting on feature i
        feature_ent={k:ent(split_dataset(dataset,i,k)) for k in feature_counts}
        # conditional entropy H(D | feature i)
        feature_cond_ent=sum(feature_prob[k]*feature_ent[k] for k in feature_counts)
        info_gain=base_ent-feature_cond_ent
        # keep the feature with the largest information gain
        if info_gain>best_info_gain:
            best_info_gain=info_gain
            best_feature=i
    return best_feature
        


choose_best_feature2split(my_data)
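

The call should return 0. To see why feature 0 wins, the information-gain arithmetic can be spelled out with the same helpers (a minimal sketch):

base = ent(my_data)  # ≈ 0.9710
# feature 0: value '1' covers 3 rows (2 yes / 1 no), value '0' covers 2 rows (both no)
gain0 = base - (3/5)*ent(split_dataset(my_data,0,'1')) - (2/5)*ent(split_dataset(my_data,0,'0'))
# feature 1: value '1' covers 4 rows (2 yes / 2 no), value '0' covers 1 row (no)
gain1 = base - (4/5)*ent(split_dataset(my_data,1,'1')) - (1/5)*ent(split_dataset(my_data,1,'0'))
print(gain0, gain1)  # ≈ 0.4200 and ≈ 0.1710, so feature 0 is chosen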

1.1.3 Recursively Building the Tree



def majority_cnt(class_list):
    # most common class label among the remaining samples
    return Counter(class_list).most_common(1)[0][0]





def create_tree(dataset,labels):
    dataset=np.array(dataset)
    class_list=dataset[:,-1]
    # stop if all samples share one class
    if len(set(class_list))==1:
        return class_list[0]
    # stop if only the label column is left; vote for the majority class
    if len(dataset[0])==1:
        return majority_cnt(class_list)
    best_feature=choose_best_feature2split(dataset)
    best_feature_label=labels[best_feature]
    my_tree={best_feature_label:{}}
    del labels[best_feature]  # NB: mutates the caller's list, so labels must be rebuilt before predicting
    feat_values=dataset[:,best_feature]
    unique_vals=set(feat_values)
    for val in unique_vals:
        sublabels=labels.copy()
        my_tree[best_feature_label][val]=create_tree(split_dataset(dataset,best_feature,val),sublabels)
    return my_tree




my_tree=create_tree(my_data,labels)
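

With the toy data, my_tree should come out as the familiar two-level tree; branch keys are the strings '0'/'1' because of the coercion, and leaf values may display as NumPy string scalars depending on the NumPy version:

print(my_tree)
# {'no surfacing': {'0': 'no', '1': {'flippers': {'0': 'no', '1': 'yes'}}}}
# (branch order may vary, since unique_vals is a set)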

1.2 Testing and Storing the Classifier

1.2.1 Testing



def predict(input_tree,feature_labels,test_vec):
    first_feature=next(iter(input_tree))
    sub_tree=input_tree[first_feature]
    feature_index=feature_labels.index(first_feature)
    for key in sub_tree.keys():
        if test_vec[feature_index]==key:
            if isinstance(sub_tree[key],dict):
                # internal node: recurse into the matching branch
                return predict(sub_tree[key],feature_labels,test_vec)
            else:
                # leaf node: return the class label
                return sub_tree[key]





my_data,labels=create_dataset()  # create_tree consumed labels via del, so rebuild them





print(predict(my_tree,labels,['1','0']))
print(predict(my_tree,labels,['1','1']))
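
These should print 'no' and 'yes' respectively. Note that the test-vector entries must be strings ('1'/'0'): the tree's branch keys were coerced to str, so a vector like [1, 0] would match no branch and predict() would fall through and return None.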

1.2.2 Serializing the Decision Tree


import pickle

def save_model(input_tree,filename):
    # serialize the tree dict to disk
    with open(filename,'wb') as fw:
        pickle.dump(input_tree,fw)


def load_model(filename):
    # deserialize the tree dict; the with-block also closes the file handle
    with open(filename,'rb') as fr:
        return pickle.load(fr)



save_model(my_tree,'decision_tree_storage.pkl')



load_model('decision_tree_storage.pkl')
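

A quick round-trip check (a minimal sketch):

print(load_model('decision_tree_storage.pkl') == my_tree)  # should print True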

1.3 Predicting Contact Lens Type with the Decision Tree



with open('lenses.txt') as fr:
    lenses=[line.strip().split('\t') for line in fr]
lenses_labels=['age','prescript','astigmatic','tear_rate']
lenses_tree=create_tree(lenses,lenses_labels)
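
The nested dict reads more easily pretty-printed; on the standard lenses data the root split should land on tear_rate, with a 'reduced' tear rate mapping straight to 'no lenses':

from pprint import pprint
pprint(lenses_tree)  # root key should be 'tear_rate'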

Reposted from blog.csdn.net/weixin_38966454/article/details/90039033