Preface
This is one of my assignments for a Statistical Learning Methods course. The instructor required that everything be programmed in MATLAB, and since I had not used MATLAB extensively before, some algorithms may take detours simply because I did not know the right function or toolbox. I looked for MATLAB syntax highlighting but could not find it, so C highlighting is used instead. Some of the algorithms were adapted from Python implementations. Suggestions and error reports are welcome. Thanks!
Dataset
The provided flower.txt file is the iris dataset, containing 3 classes of flowers (the first 50 samples belong to one class, the middle 50 to another, and the last 50 to a third); each flower has 4 measured features. Since we only deal with binary classification here, only the first 100 samples are used. Within each class, the first 40 samples are used for training and the last 10 for testing, giving 80 training samples and 20 test samples.
Main program
First, create a script file classification.m in the MATLAB project; reference code follows:
classification.m
clear;
%% step 1: split the dataset
flower = load('flower.txt');
train_X = [flower(1:40, :);flower(51:90, :)];
train_Y = [ones(40, 1); zeros(40, 1)];
test_X = [flower(41:50, :);flower(91:end, :)];
test_Y = [ones(10, 1); zeros(10, 1)];
%% step 2: train each classifier and predict test_Y
output_perceptron = perceptron(train_X, train_Y, test_X); % a random misclassified point is chosen each step, so results may vary between runs
output_knn = knn(train_X, train_Y, test_X); % after quick tests, k = 9 worked well (knn.m below uses k = 7)
output_logistic = logistic(train_X, train_Y, test_X);
output_entropy = entropy(train_X, train_Y, test_X);
output_tree = tree(train_X, train_Y, test_X);
output_nb = nb(train_X, train_Y, test_X); % the features are continuous, so a Gaussian naive Bayes classifier is needed
%% step 3: analyze the results (accuracy, precision, recall, etc.)
output_set = [output_perceptron;output_knn;output_logistic;output_entropy;output_tree;output_nb];
compare(test_Y,output_set);
function compare(test_Y,output)
result = [];
for i = 1:length(output(:,1))
[ACC,PRE,REC] = evaluation(test_Y,output(i,:));
result = [result;[ACC,PRE,REC]];
end
bar(result);
grid on;
legend('ACC','PRE','REC');
set(gca,'XTickLabel',{'perceptron','knn','logistic','entropy','tree','nb'});
xlabel('classifier');
ylabel('metric value');
end
function [ACC,PRE,REC] = evaluation(test_Y,output) % evaluate classification performance: ACC accuracy, PRE precision, REC recall
TP = 0;
FN = 0;
FP = 0;
TN = 0;
for i = 1:length(test_Y)
if test_Y(i)==1
if output(i)==1
TP = TP + 1; % positive predicted as positive
else
FN = FN + 1; % positive predicted as negative
end
else
if output(i)==1
FP = FP + 1; % negative predicted as positive
else
TN = TN + 1; % negative predicted as negative
end
end
end
ACC = (TP+TN)/(TP+FN+FP+TN);
PRE = TP/(TP+FP);
REC = TP/(TP+FN);
end
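For reference, the three metrics computed by evaluation follow directly from the confusion-matrix counts:

$$ACC = \frac{TP+TN}{TP+FN+FP+TN}, \qquad PRE = \frac{TP}{TP+FP}, \qquad REC = \frac{TP}{TP+FN}$$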
Subprograms
Step two: create 6 function files, perceptron, knn, logistic, entropy, tree, and nb, implementing the corresponding 6 classifiers; the code for each is listed below. Each classifier takes the training set (X and Y) and the test-set X as input and returns the predicted test labels as a vector.
perceptron.m
function [ test_Y ] = perceptron( X, Y, test_X )
%% initialize w, b, alpha
w = [0,0,0,0];
b = 0;
alpha = 1; % learning rate
sample = X;
labels = zeros(length(Y), 1); % the perceptron needs labels in {-1, +1}; avoid shadowing the built-in sign
for i = 1:length(Y)
if Y(i)==1
labels(i) = 1;
else
labels(i) = -1;
end
end
maxstep = 1000;
%% update w and b
for i=1:maxstep
[idx_misclass, counter] = find_misclassified(sample, labels, w, b);
% idx_misclass: indices of the misclassified samples; counter: how many there are
if (counter~=0) % ~= means "not equal"
R = unidrnd(counter); % draw one uniformly random integer, i.e., update on a randomly chosen misclassified point
w = w + alpha * sample(idx_misclass(R),:) * labels(idx_misclass(R));
b = b + alpha * labels(idx_misclass(R));
else
break
end
end
%%
for i=1:length(test_X)
if(w*test_X(i,:)'+b >=0)
test_Y(i) = 1;
else
test_Y(i) = 0;
end
end
end
function [idx_misclass, counter] = find_misclassified(sample, label, w, b) % renamed from class to avoid shadowing the built-in
counter = 0;
idx_misclass = [];
for i=1:length(label)
if (label(i)*(w*sample(i,:)'+b)<=0) % a misclassified point triggers an update
idx_misclass = [idx_misclass i];
counter = counter + 1;
end
end
end
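The update above is the textbook perceptron rule: a sample is misclassified whenever $y_i(w \cdot x_i + b) \le 0$, and the parameters then move toward it:

$$w \leftarrow w + \alpha\, y_i x_i, \qquad b \leftarrow b + \alpha\, y_i$$

Because the misclassified point is drawn at random, different runs can converge to different separating hyperplanes, which is why classification.m notes that the output may vary.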
knn.m
function [ test_Y ] = knn( X, Y, test_X )
k = 7;
for i = 1:size(test_X,1)
for j = 1:size(X,1)
dist(i,j) = norm(test_X(i,:)-X(j,:)); % Euclidean distance over all four features
end
[B,idx] = mink(dist(i,:),k); % the k nearest training samples
result(i,:) = Y(idx);
end
for i = 1:size(test_X,1)
if(sum(result(i,:))>=k/2) % majority vote among the k neighbors
test_Y(i) = 1;
else
test_Y(i) = 0;
end
end
end
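As an aside, the double distance loop can be vectorized. The sketch below assumes the Statistics and Machine Learning Toolbox is available (for pdist2) and R2017b or later (for mink along a dimension):

D = pdist2(test_X, X); % 20x80 matrix of pairwise Euclidean distances
[~, idx] = mink(D, k, 2); % indices of the k nearest training samples per test row
test_Y = (sum(Y(idx), 2) >= k/2)'; % majority vote, returned as a row vector of 0/1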
logistic.m
function [J, grad] = costFunction(theta, X, Y, lambda)
% regularized logistic-regression cost and gradient; theta(1) is the
% intercept term and is not regularized
m = length(Y);
grad = zeros(size(theta));
J = 1 / m * sum( -Y .* log(sigmoid(X * theta)) - (1 - Y) .* log(1 - sigmoid(X * theta)) ) + lambda / (2 * m) * sum( theta(2 : end) .^ 2 );
for j = 1 : numel(theta)
if j == 1
grad(j) = 1 / m * sum( (sigmoid(X * theta) - Y)' * X(:, j) );
else
grad(j) = 1 / m * sum( (sigmoid(X * theta) - Y)' * X(:, j) ) + lambda / m * theta(j);
end
end
end
function g = sigmoid(z)
g = zeros(size(z));
g = 1 ./ (1 + exp(-z));
end
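Note that the listing above defines only costFunction and sigmoid, while classification.m calls logistic(train_X, train_Y, test_X). A minimal driver along the following lines must come first in logistic.m (the main function of a MATLAB file leads the file); it is a sketch, with lambda = 0 and the iteration cap chosen arbitrarily here rather than taken from the assignment. It fits theta with fminunc using the gradient from costFunction and thresholds the predicted probability at 0.5:

function [ test_Y ] = logistic( X, Y, test_X )
X_b = [ones(size(X, 1), 1), X]; % prepend an intercept column
theta0 = zeros(size(X_b, 2), 1);
lambda = 0; % no regularization by default
options = optimset('GradObj', 'on', 'MaxIter', 400);
theta = fminunc(@(t) costFunction(t, X_b, Y, lambda), theta0, options);
test_Xb = [ones(size(test_X, 1), 1), test_X];
test_Y = double(sigmoid(test_Xb * theta) >= 0.5)'; % row vector of 0/1 labels
end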
entropy.m
function [ test_Y ] = entropy( X, Y, test_X )
%% variable definitions
maxstep=10;
w = [];
labels = []; % the set of class labels
fea_list = []; % the set of feature functions
px = containers.Map(); % empirical marginal distribution P~(x)
pxy = containers.Map(); % empirical joint distribution P~(x,y); the feature functions are binary (0/1), so this equals the empirical expectation of each feature
exp_fea = containers.Map(); % empirical expectation of each feature over the dataset
N = 0; % total number of samples
M = 0; % number of features per sample, assumed constant; its reciprocal acts like a learning rate
n_fea = 0; % total number of feature functions
fit(X,Y);
test_Y = [];
for q = 1:length(test_X) % predict on the test set
p_Y = predict(test_X(q,:));
test_Y = [test_Y,p_Y];
end
%% initialization
function init_param(X,Y)
N = length(X);
labels = unique(Y);
% initialize px and pxy
for i = 1:length(X)
px(num2str(X(i,:))) = 0;
pxy(num2str([X(i,:),Y(i)])) = 0;
end
% initialize exp_fea
for i = 1:length(X)
for j = 1:length(X(i,:))
fea = [j,X(i,j),Y(i)];
exp_fea(num2str(fea)) = 0;
end
end
fea_func(X,Y);
n_fea = length(fea_list);
w = zeros(n_fea,1);
f_exp_fea(X,Y);
end
%% build the feature functions
function fea_func(X,Y)
for i = 1:length(X)
px(num2str(X(i,:))) = px(num2str(X(i,:))) + 1.0/double(N);
pxy(num2str([X(i,:),Y(i)])) = pxy(num2str([X(i,:),Y(i)]))+ 1.0/double(N);
for j = 1:length(X(1,:))
key = [j,X(i,j),Y(i)];
fea_list = [fea_list;key];
end
end
fea_list = unique(fea_list,'rows');
M = size(X,2); % number of features per sample
end
function f_exp_fea(X,Y)
for i = 1:length(X)
for j = 1:length(X(i,:))
fea = [j,X(i,j),Y(i)];
exp_fea(num2str(fea)) = exp_fea(num2str(fea)) + pxy(num2str([X(i,:),Y(i)]));
end
end
end
%% conditional probability P(y|x) under the current weights w
function py_X =f_py_X(X)
py_X = containers.Map();
for i = 1:length(labels)
py_X(num2str(labels(i))) = 0.0;
end
for i = 1:length(labels)
s = 0;
for j = 1:length(X)
tmp_fea = [j,X(j),labels(i)];
[is_fea, loc] = ismember(tmp_fea, fea_list, 'rows'); % locate this feature function in fea_list
if(is_fea)
s = s + w(loc);
end
end
py_X(num2str(labels(i))) = exp(s);
end
normalizer = 0;
k_names = py_X.keys();
for i = 1:py_X.Count
normalizer = normalizer + py_X(char(k_names(i)));
end
for i = 1:py_X.Count
py_X(char(k_names(i))) = double(py_X(char(k_names(i))))/double(normalizer);
end
end
%% expected value of each feature under the current model
function est_fea = f_est_fea(X,Y)
est_fea = containers.Map();
for i = 1:length(X)
for j = 1:length(X(i,:))
est_fea(num2str([j,X(i,j),Y(i)])) = 0;
end
end
for i = 1:length(X)
py_x = f_py_X(X(i,:)); % conditional distribution for this sample
py_x = py_x(num2str(Y(i)));
for j = 1:length(X(i,:))
est_fea(num2str([j,X(i,j),Y(i)])) = est_fea(num2str([j,X(i,j),Y(i)])) + px(num2str(X(i,:)))*py_x;
end
end
end
%% GIS update for delta
function delta = GIS()
est_fea = f_est_fea(X,Y);
delta = zeros(n_fea,1);
for i = 1:n_fea
try
% standard GIS step: delta_i = (1/M) * log(empirical expectation / model expectation)
delta(i) = 1 / double(M) * log(double(exp_fea(num2str(fea_list(i,:)))) / double(est_fea(num2str(fea_list(i,:)))));
catch
continue;
end
end
delta = double(delta) / double(sum(delta)); % normalize the step
end
%% training: iteratively update w
function fit(X,Y)
init_param(X,Y);
i = 0;
while( i < maxstep)
i = i + 1;
delta = GIS();
w = w + delta;
end
end
%% given a sample x (a vector), return the label with the largest conditional probability
function best_label = predict(X)
py_x = f_py_X(X);
k_names = py_x.keys();
best_p = 0; % avoid shadowing the built-in max
best_label = -1;
for i = 1:py_x.Count
if(py_x(char(k_names(i))) > best_p)
best_p = py_x(char(k_names(i)));
best_label = str2num(char(k_names(i)));
end
end
end
end
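For context, the step implemented in GIS() is the standard generalized iterative scaling update: with M feature functions active per sample, each weight moves by

$$\delta_i = \frac{1}{M}\log\frac{E_{\tilde p}[f_i]}{E_p[f_i]}$$

where $E_{\tilde p}[f_i]$ is the empirical expectation of feature $f_i$ (exp_fea) and $E_p[f_i]$ is its expectation under the current model (est_fea).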
tree.m
function [ test_Y ] = tree( X, Y, test_X )
global tree;
tree = [];
disp("The Tree is :");
build_tree(X,Y);
% disp(tree);
S = regexp(tree, '\s+', 'split');
test_Y = [];
var = 0;
condi = 0;
for i = 1:length(test_X)
tx = test_X(i,:);
j = 1;
% walk the flattened token list; a while loop is used because MATLAB
% discards assignments to a for-loop variable at the next iteration
while j <= length(S)
if contains(S(j),'node')
var = tx(str2num(char(S(j+1)))); % value of the split feature
condi = str2num(char(S(j+2))); % split threshold
if var < condi
if contains(S(j+3),'leaf')
ty = char(S(j+3));
ty = ty(6); % the label character inside 'leaf(x)'
test_Y = [test_Y,str2num(ty)];
break
else
j = j + 3; % descend into the left subtree
continue
end
else
if contains(S(j+4),'leaf')
ty = char(S(j+4));
ty = ty(6);
test_Y = [test_Y,str2num(ty)];
break
else
j = j + 4; % only lands on the right subtree when the left child is a single leaf token
continue
end
end
end
j = j + 1;
end
end
end
function build_tree(x, y, L, level, parent_y, sig, p_value)
% A simple hand-rolled routine for building a decision tree, intended for
% binary attributes and binary classification. (It can be modified to
% handle continuous attributes as well.)
% Inputs:
% x: numeric matrix of sample attributes (one sample per row)
% y: numeric vector of the corresponding labels
% The remaining parameters can be omitted by the caller; they take effect
% during recursion.
% Output: prints the decision tree.
global tree;
if nargin == 2
level = 0;
parent_y = -1;
L = 1:size(x, 2);
sig = -1;
p_value = [];
% bin_f = zeros(size(x, 2), 1);
% for k=1:size(x, 2)
% if length(unique(x(:,k))) == 2
% bin_f(k) = 1;
% end
% end
end
class = [0, 1];
[r, label] = is_leaf(x, y, parent_y); % check whether this node is a leaf
if r
if sig ==-1
disp([repmat(' ', 1, level), 'leaf (', num2str(label), ')'])
tree = [tree,[' leaf(', num2str(label), ')']];
elseif sig ==0
disp([repmat(' ', 1, level), '<', num2str(p_value),' leaf (', num2str(label), ')']);
tree = [tree,[' leaf(', num2str(label), ')']];
else
disp([repmat(' ', 1, level), '>', num2str(p_value),' leaf (', num2str(label), ')']);
tree = [tree,[' leaf(', num2str(label), ')']];
end
else
[ind, value, i_] = find_best_test(x, y, L); % find the best split test
[x1, y1, x2, y2] = split_(x, y, i_, value); % perform the split
if sig ==-1
disp([repmat(' ', 1, level), 'node (', num2str(ind), ', ', num2str(value), ')']);
tree = [tree,[' node ', num2str(ind), ' ',num2str(value)]];
elseif sig ==0
disp([repmat(' ', 1, level), '<', num2str(p_value),' node (', num2str(ind), ', ', num2str(value), ')']);
tree = [tree,[' node ',num2str(ind), ' ', num2str(value)]];
else
disp([repmat(' ', 1, level), '>', num2str(p_value),' node (', num2str(ind), ', ', num2str(value), ')']);
tree = [tree,[' node ',num2str(ind), ' ', num2str(value)]];
end
% if bin_f(i_) == 1
x1(:,i_) = [];
x2(:,i_) = [];
L(:,i_) = [];
% bin_f(i_) = [];
% end
build_tree(x1, y1, L, level+1, y, 0, value); % recurse on the left branch
build_tree(x2, y2, L, level+1, y, 1, value); % recurse on the right branch
end
function [ind, value, i_] = find_best_test(xx, yy, LL) % subfunction: find the best split point (works for continuous attributes)
imp_min = inf;
i_ = 1;
ind = LL(i_);
for i=1:size(xx,2)
if length(unique(xx(:,i))) ==1
continue;
end
% [xx_sorted, ii] = sortrows(xx, i);
% yy_sorted = yy(ii, :);
vv = unique(xx(:,i));
imp_min_i = inf;
best_point = mean([vv(1), vv(2)]);
value = best_point;
for j = 1:length(vv)-1
point = mean([vv(j), vv(j+1)]);
[xx1, yy1, xx2, yy2] = split_(xx, yy, i, point);
imp = calc_imp(yy1, yy2);
if imp<imp_min_i
best_point = point;
imp_min_i = imp;
end
end
if imp_min_i < imp_min
value = best_point;
imp_min = imp_min_i;
i_ = i;
ind = LL(i_);
end
end
end
function imp = calc_imp(y1, y2) % subfunction: entropy impurity (unweighted sum of the two children's entropies)
p11 = sum(y1==class(1))/length(y1);
p12 = sum(y1==class(2))/length(y1);
p21 = sum(y2==class(1))/length(y2);
p22 = sum(y2==class(2))/length(y2);
if p11==0
t11 = 0;
else
t11 = p11*log2(p11);
end
if p12==0
t12 = 0;
else
t12 = p12*log2(p12);
end
if p21==0
t21 = 0;
else
t21 = p21*log2(p21);
end
if p22==0
t22 = 0;
else
t22 = p22*log2(p22);
end
imp = -t11-t12-t21-t22;
end
function [x1, y1, x2, y2] = split_(x, y, i, point) % subfunction: perform the split
index = (x(:,i)<point);
x1 = x(index,:);
y1 = y(index,:);
x2 = x(~index,:);
y2 = y(~index,:);
end
function [r, label] = is_leaf(xx, yy, parent_yy) % subfunction: decide whether this node is a leaf
if isempty(xx)
r = true;
label = mode(parent_yy);
elseif length(unique(yy)) == 1
r = true;
label = unique(yy);
else
t = xx - repmat(xx(1,:),size(xx, 1), 1);
if all(all(t ==0))
r = true;
label = mode(yy);
else
r = false;
label = [];
end
end
end
end
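The impurity computed by calc_imp is the binary entropy of each child node, summed over both children:

$$\mathrm{imp} = -\sum_{c\in\{0,1\}} p_{1c}\log_2 p_{1c} \;-\; \sum_{c\in\{0,1\}} p_{2c}\log_2 p_{2c}$$

where $p_{kc}$ is the fraction of class $c$ in child $k$. A size-weighted average of the two entropies, as in standard information gain, would be the more common choice, but the unweighted sum still prefers pure splits.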
nb.m
function [ test_Y ] = nb( X, Y, test_X )
%% prior probability of each class
labelProbability = tabulate(Y);
% tabulate lists values in ascending order (0 before 1) and reports
% percentages in column 3, hence the division by 100
P_y1 = labelProbability(labelProbability(:,1)==1, 3)/100; % P(y=1)
P_y2 = labelProbability(labelProbability(:,1)==0, 3)/100; % P(y=0)
%% split the training samples by class
X1 = [];
X2 = [];
for i = 1:length(X)
if(Y(i)==1)
X1 = [X1;X(i,:)];
else
X2 = [X2;X(i,:)];
end
end
%% per-class mean and standard deviation of each attribute (column)
x1_mean = mean(X1,1);
x1_std = std(X1,0,1);
x2_mean = mean(X2,1);
x2_std = std(X2,0,1);
test_Y = [];
for i = 1:length(test_X)
p1 = 1; % score for class 1
p2 = 1; % score for class 2
for j = 1:length(test_X(1,:))
p1 = p1 * gaussian(test_X(i,j),x1_mean(j),x1_std(j)); %p(xj|y)
p2 = p2 * gaussian(test_X(i,j),x2_mean(j),x2_std(j));
end
p1 = p1 * P_y1;
p2 = p2 * P_y2;
if(p1>p2)
test_Y = [test_Y,1];
else
test_Y = [test_Y,0];
end
end
%% Gaussian (normal) density function
function p = gaussian(x, mu, sigma)
p = 1/sqrt(2*pi*sigma*sigma)*exp(-(x-mu)*(x-mu)/(2*sigma*sigma)); % note the variance sigma^2 in the exponent
end
end
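What nb implements is exactly the Gaussian naive Bayes decision rule: assuming the features are conditionally independent given the class, each class receives the score

$$P(y)\prod_{j=1}^{4}\mathcal{N}\!\left(x_j;\ \mu_{j,y},\ \sigma_{j,y}^2\right)$$

and the label with the larger score wins. Comparing log-scores instead would avoid numerical underflow in higher dimensions, though with only 4 features the direct product is unproblematic.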
Results and analysis
Run the classification.m script, compare the outputs of the 6 classifiers with the true test_Y, and briefly analyze the results.
Each run produces a predicted label for every test sample. Comparing the predictions against the true test labels yields a confusion matrix, from which the accuracy (ACC), precision (PRE), and recall (REC) can be computed. Plotting these metrics as a grouped bar chart gives the comparison shown in the figure below; the code is in step 3 of classification.m.
Judging from the results, the perceptron, logistic regression, decision tree, and naive Bayes perform best, with all metrics reaching their maximum value; this reflects both the strength of these classifiers on this task and the limited number of training samples. KNN performs moderately, its instability driven by the choice of k. The maximum-entropy classifier performs worst, probably because the training data are continuous while the implementation treats each record as a discrete feature function. Notably, the first naive Bayes implementation had the same problem, but naive Bayes can handle continuous data under a normality assumption; implemented as Gaussian naive Bayes, it performs well.