Generative Learning Algorithms, Gaussian Discriminant Analysis, Naive Bayes - Machine Learning Open Course, Lecture 5

Machine Learning Open Course, Lecture 5

These notes correspond to the fifth video of the open course. The topics covered are generative learning algorithms, Gaussian Discriminant Analysis (GDA), naive Bayes, and Laplace smoothing.
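
To summarize the model the code below implements: GDA assumes y ~ Bernoulli(φ) and models each class-conditional density as a Gaussian with its own mean but a shared covariance matrix,

p(y) = \phi^{y} (1-\phi)^{1-y}

p(x \mid y=k) = \frac{1}{(2\pi)^{n/2} |\Sigma|^{1/2}} \exp\left( -\frac{1}{2} (x-\mu_k)^{T} \Sigma^{-1} (x-\mu_k) \right), \quad k \in \{0, 1\}

Maximizing the joint likelihood gives the closed-form estimates computed by calculate_para below:

\phi = \frac{1}{m} \sum_{i=1}^{m} 1\{y^{(i)}=1\}, \qquad \mu_k = \frac{\sum_{i=1}^{m} 1\{y^{(i)}=k\}\, x^{(i)}}{\sum_{i=1}^{m} 1\{y^{(i)}=k\}}, \qquad \Sigma = \frac{1}{m} \sum_{i=1}^{m} (x^{(i)}-\mu_{y^{(i)}})(x^{(i)}-\mu_{y^{(i)}})^{T}

Prediction then applies Bayes' rule:

p(y=1 \mid x) = \frac{p(x \mid y=1)\,\phi}{p(x \mid y=1)\,\phi + p(x \mid y=0)\,(1-\phi)}
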
Algorithm implementation (Python code):

import xlrd
import numpy as np
import math

# Load the data; returns the feature set and the label set as numpy arrays
def load_data(filename):
    workbook = xlrd.open_workbook(filename)
    boyinfo = workbook.sheet_by_index(0)
    row_num = boyinfo.nrows
    # Feature set and label set
    dataset = []
    labelset = []
    for i in range(1, row_num):  # skip the header row
        row = boyinfo.row_values(i)
        dataset.append([row[0], row[1]])
        labelset.append(row[2])
    return np.array(dataset), np.array(labelset)

# Split the samples by label: returns the positive/negative counts and the
# positive/negative feature arrays (originally written to prepare data for plotting)
def plotData(X, y):
    # Indices of the samples labeled 1 and 0
    pos = np.where(y == 1)
    neg = np.where(y == 0)
    pos_num = len(pos[0])
    neg_num = len(neg[0])
    pos_dataset = []
    neg_dataset = []
    for i in pos[0]:
        pos_dataset.append(X[i].tolist())
    for i in neg[0]:
        neg_dataset.append(X[i].tolist())
    return pos_num, neg_num, np.array(pos_dataset), np.array(neg_dataset)

# Mean normalization: subtract each column's mean and divide by the column's range
def normalization(X):
    Xmin = np.min(X, axis=0)
    Xmax = np.max(X, axis=0)
    Xmu = np.mean(X, axis=0)
    X_norm = (X - Xmu) / (Xmax - Xmin)
    return X_norm
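
# A tiny worked example (assumed values, not from the dataset):
# normalization(np.array([[1., 2.], [3., 6.]])) returns
# [[-0.5, -0.5], [0.5, 0.5]] -- each column then has zero mean and unit range.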
# Maximum-likelihood estimates of phi, mu0, mu1 and the shared covariance sigma
def calculate_para(X_norm, y, pos_data, neg_data, pos_num, neg_num):
    m = pos_num + neg_num
    phi = pos_num / m
    print("phi:", phi)
    mu1 = np.sum(pos_data, axis=0) / pos_num
    mu0 = np.sum(neg_data, axis=0) / neg_num
    print("mu1:", mu1)
    print("mu0:", mu0)
    # Accumulate (x - mu_y)^T (x - mu_y) over all samples
    cov_sum = np.zeros((2, 2), dtype=float)
    X_normmat = np.mat(X_norm)
    for i in range(m):
        if y[i] == 0:
            cov_sum += (X_normmat[i] - mu0).T * (X_normmat[i] - mu0)
        else:
            cov_sum += (X_normmat[i] - mu1).T * (X_normmat[i] - mu1)
    sigma = cov_sum / m
    print("sigma:", sigma)
    return phi, mu0, mu1, sigma

# Posterior p(y=1|x) from the two Gaussians with shared covariance, via Bayes' rule
def norm_distribution(x, mu1, mu0, sigma, phi):
    n = len(mu1)
    mu_mat1 = np.mat(mu1)
    mu_mat0 = np.mat(mu0)
    sigma_mat = np.mat(sigma)
    det_sigma = np.linalg.det(sigma)
    # Multivariate normal densities p(x|y=0) and p(x|y=1); the shared
    # normalizing constant is 1 / ((2*pi)^(n/2) * |sigma|^(1/2))
    coef = 1 / ((2 * math.pi) ** (n / 2) * det_sigma ** (1 / 2))
    px_y0 = coef * np.exp(-(1 / 2) * (x - mu_mat0) * sigma_mat.I * (x - mu_mat0).T)
    px_y1 = coef * np.exp(-(1 / 2) * (x - mu_mat1) * sigma_mat.I * (x - mu_mat1).T)
    px = px_y1[0, 0] * phi + px_y0[0, 0] * (1 - phi)
    py1_x = phi * px_y1[0, 0] / px
    return py1_x

if __name__ == '__main__':
    dataset, labelset = load_data("data.xls")
    m, n = dataset.shape
    # Normalize the inputs
    X_norm = normalization(dataset)
    pos_num, neg_num, pos_dataset, neg_dataset = plotData(X_norm, labelset)
    print("number of positive samples:", pos_num)
    print("number of negative samples:", neg_num)
    phi, mu0, mu1, sigma = calculate_para(X_norm, labelset.tolist(), pos_dataset, neg_dataset, pos_num, neg_num)
    # Posterior p(y=1|x) for every sample
    py1_x = []
    for i in range(m):
        py1_x.append(norm_distribution(X_norm[i], mu1, mu0, sigma, phi))
    # Threshold the posterior at 0.5 to get hard predictions
    for i in range(m):
        if py1_x[i] > 0.5:
            py1_x[i] = 1.0
        else:
            py1_x[i] = 0.0
    # Measure the accuracy of GDA on the training set
    y = labelset.tolist()
    cnt = 0
    for i in range(m):
        if y[i] == py1_x[i]:
            cnt += 1
    print("accuracy rate: {0}".format(cnt / m))

Running the code, we can see that the classification accuracy reaches 92%, while the earlier logistic regression model reaches 94% on the same dataset. As for why logistic regression is more accurate, I think the dataset itself probably does not follow a Gaussian distribution very well, so GDA's accuracy falls short of logistic regression's.
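
As a quick cross-check of the hand-rolled estimator, scikit-learn's LinearDiscriminantAnalysis fits the same shared-covariance Gaussian model; a minimal sketch (assuming scikit-learn is installed and the load_data / normalization helpers above are in scope):

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X, y = load_data("data.xls")
X_norm = normalization(X)
# LDA is equivalent to GDA with a shared covariance matrix
lda = LinearDiscriminantAnalysis()
lda.fit(X_norm, y)
print("sklearn LDA training accuracy:", lda.score(X_norm, y))

Its training accuracy should land close to the 92% above.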
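
The naive Bayes classifier below uses the multinomial event model with Laplace smoothing. With vocabulary size |V| and labels y = 1 for ham, y = 0 for spam, the estimates are

p(y=1) = \frac{1}{m} \sum_{i=1}^{m} 1\{y^{(i)}=1\}, \qquad p(w \mid y=k) = \frac{1 + \text{count of word } w \text{ in class-}k \text{ emails}}{|V| + \text{total word count in class-}k \text{ emails}}

and an email with count vector x is labeled ham when

\log p(y=1) + \sum_{w} x_w \log p(w \mid y=1) > \log p(y=0) + \sum_{w} x_w \log p(w \mid y=0)

Working in log space avoids the floating-point underflow that multiplying many small probabilities would cause; the +1 numerator and +|V| denominator are exactly what the np.ones initialization and the num_character denominators in naivebayes implement.
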
Algorithm implementation (Python code):

import numpy as np
import re

# Read a text file and tokenize it into a list of lowercase words
def text2list(filename):
    rows = open(filename).read()
    # Split on runs of non-word characters; \W+ (not \W*) avoids zero-length matches
    tokens = re.split(r'\W+', rows)
    return [tok.lower() for tok in tokens if len(tok) > 2]
# Read 20 ham and 20 spam emails as the training set (label 1 = ham, 0 = spam)
def get_trainset():
    trainemails = []
    trainlabels = []
    for i in range(1, 21):
        wordlist1 = text2list("email/ham/%d.txt" % i)
        trainemails.append(wordlist1)
        trainlabels.append(1)
        wordlist0 = text2list("email/spam/%d.txt" % i)
        trainemails.append(wordlist0)
        trainlabels.append(0)
    return trainemails, trainlabels
# Build the vocabulary: the set of all words seen in the training emails
def createvocab(trainemails):
    vocabulary = set()
    for email in trainemails:
        vocabulary = vocabulary | set(email)
    return list(vocabulary)
# Convert a tokenized email into a word-count vector over the vocabulary
def list2vector(email, vocabulary):
    vector = [0] * len(vocabulary)
    for word in email:
        if word in vocabulary:
            vector[vocabulary.index(word)] += 1
        else:
            print("%s is not in my vocabulary" % word)
    return vector
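
# A tiny worked example (assumed values): with vocabulary ["spam", "free", "hello"],
# the email ["free", "free", "hello"] maps to the count vector [0, 2, 1].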
# Estimate the class prior p(y=1) and the per-word log conditional probabilities
def naivebayes(trainemail_array, trainlabels):
    num_email = len(trainemail_array)
    num_character = len(trainemail_array[0])  # vocabulary size
    print(num_character, num_email)
    # p(y=1): fraction of training emails labeled 1 (ham)
    p_y1 = sum(trainlabels) / num_email
    print("p_y1 = {0}".format(p_y1))
    # Laplace smoothing: start every word count at 1 ...
    p1_vec = np.ones(num_character)
    p0_vec = np.ones(num_character)
    # ... and start each denominator at the vocabulary size
    p1Num = num_character
    p0Num = num_character
    for i in range(num_email):
        if trainlabels[i] == 1:
            p1_vec += trainemail_array[i]
            p1Num += sum(trainemail_array[i])
        else:
            p0_vec += trainemail_array[i]
            p0Num += sum(trainemail_array[i])
    # Work in log space to avoid floating-point underflow
    p1_vec = np.log(p1_vec / p1Num)
    p0_vec = np.log(p0_vec / p0Num)
    return p1_vec, p0_vec, p_y1
# Classify an email by comparing the two class log posteriors
# (the shared log p(x) term is omitted since it cancels in the comparison)
def classify(myemail, p1_vec, p0_vec, p_y1):
    p1 = sum(myemail * p1_vec) + np.log(p_y1)
    p0 = sum(myemail * p0_vec) + np.log(1 - p_y1)
    if p1 > p0:
        return 1
    else:
        return 0

if __name__ == "__main__":
    trainemails, trainlabels = get_trainset()
    myvocab = createvocab(trainemails)
    # 将所有邮件有转换到向量形式
    trainemail_array = []
    for i in range(len(trainlabels)):
        trainemail_array.append(list2vector(trainemails[i], myvocab))
    p1_vec, p0_vec, pAbusive = naivebayes(trainemail_array, trainlabels)
    # 测试
    testemails = []
    testlabels = []
    for i in range(1, 11):
        wordlist1 = text2list("email/test/%d.txt" % i)
        testemails.append(wordlist1)
        if(i<6):
            testlabels.append(1)
        else:
            testlabels.append(0)
    testemail_array = []
    for i in range(len(testlabels)):
        testemail_array.append(list2vector(testemails[i], myvocab))
    cnt = 0
    for i in range(len(testlabels)):
        y = classify(testemail_array[i], p1_vec, p0_vec, pAbusive)
        if(y == testlabels[i]):
            cnt += 1
    print("accurancy rate: {0}".format(cnt/len(testlabels)))

Running the code, we can see that the emails are classified accurately.
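
For comparison, the same count vectors can be fed to scikit-learn's MultinomialNB, whose alpha=1.0 default is the same add-one (Laplace) smoothing; a minimal sketch (assuming scikit-learn is installed and reusing the arrays built in the main block above):

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=1.0)  # alpha=1.0 is add-one (Laplace) smoothing
clf.fit(np.array(trainemail_array), trainlabels)
pred = clf.predict(np.array(testemail_array))
print("sklearn NB accuracy:", np.mean(pred == np.array(testlabels)))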


Reprinted from blog.csdn.net/h__ang/article/details/83783119