Python EM算法4(身高体重数据集)

1 处理数据

import numpy as np
# 预处理数据
def loadData(filename):
    """Load a tab-separated numeric dataset.

    Each line is split on tabs and every field is converted to float,
    yielding one row per line.

    Args:
        filename: path to a TSV file containing only numbers.

    Returns:
        list[list[float]]: the parsed rows.
    """
    dataSet = []
    # 'with' guarantees the handle is closed even if float() raises
    # (the original open()/close() pair leaked the file on error).
    with open(filename) as fr:
        for line in fr:
            curLine = line.strip('\n').split('\t')
            dataSet.append([float(v) for v in curLine])
    return dataSet
# 计算高斯函数
def Gaussian(data, mean, cov):
    """Multivariate Gaussian probability density of `data`.

    If `cov` is singular (zero determinant), a small ridge
    (0.01 * I) is added before taking the determinant and inverse.

    Args:
        data: sample vector.
        mean: component mean vector.
        cov: (dim, dim) covariance matrix.

    Returns:
        float: the density value at `data`.
    """
    dim = np.shape(cov)[0]
    det = np.linalg.det(cov)
    if det == 0:
        # Regularize a singular covariance so it can be inverted.
        det = np.linalg.det(cov + np.eye(dim) * 0.01)
        inv = np.linalg.inv(cov + np.eye(dim) * 0.01)
    else:
        inv = np.linalg.inv(cov)
    diff = data - mean
    exponent = -0.5 * np.dot(np.dot(diff, inv), diff)
    norm_const = np.power(np.power(2 * np.pi, dim) * abs(det), 0.5)
    return np.exp(exponent) / norm_const

2 获取初始聚类中心

# Draw K random initial cluster centers inside the data's bounding box,
# retrying until all pairs are sufficiently far apart.
def GetInitialMeans(data, K, criterion):
    dim = data.shape[1]
    # Per-dimension [min, max] bounds of the data.
    minmax = np.array([np.array([min(data[:, j]), max(data[:, j])])
                       for j in range(dim)])
    means = [[] for _ in range(K)]
    while True:
        for k in range(K):
            center = []
            for j in range(dim):
                low, high = minmax[j][0], minmax[j][1]
                # Uniform sample within this dimension's range.
                center.append(np.random.random() * (high - low) + low)
            means[k] = np.array(center)
        # Accept only if no two centers are closer than `criterion`.
        if isdistance(means, criterion):
            break
    return means
# Check that every pair of candidate centers is at least `criterion` apart.
def isdistance(means, criterion=0.03):
    num = len(means)
    for a in range(num):
        for b in range(a + 1, num):
            # Reject as soon as any pair is too close.
            if np.linalg.norm(means[a] - means[b]) < criterion:
                return False
    return True

3 GMM主程序

def GMM(data, K, ITER):
    """Fit a K-component Gaussian mixture to `data` with EM.

    Args:
        data: (N, dim) array of samples.
        K: number of mixture components.
        ITER: maximum number of EM iterations. The loop also stops
            earlier once the log-likelihood change drops below 1e-5.

    Returns:
        (phi, means, convs): mixing weights (list of K floats),
        component means (list of K (dim,) arrays) and covariance
        matrices (list of K (dim, dim) arrays).
    """
    N = data.shape[0]
    dim = data.shape[1]
    # Initialize means with K-means; every covariance starts as the
    # covariance of the whole dataset; weights start uniform.
    means = Kmeans(data, K)
    convs = [np.cov(data.T) for _ in range(K)]
    phi = [1.0 / K] * K
    omega = [np.zeros(K) for _ in range(N)]  # responsibilities, one row per sample
    loglikelyhood = 0
    oldloglikelyhood = 1

    # BUGFIX: the original decremented ITER but never tested it, so a
    # non-converging run looped forever; ITER now caps the iterations.
    while np.abs(loglikelyhood - oldloglikelyhood) > 0.00001 and ITER > 0:
        oldloglikelyhood = loglikelyhood

        # E-step: responsibility of component k for sample i.
        for i in range(N):
            res = [phi[k] * Gaussian(data[i], means[k], convs[k]) for k in range(K)]
            sumres = np.sum(res)
            for k in range(K):
                omega[i][k] = res[k] / sumres

        # M-step: re-estimate weights, means and covariances.
        for k in range(K):
            Nk = np.sum([omega[n][k] for n in range(N)])  # effective sample count of component k
            phi[k] = 1.0 * Nk / N
            means[k] = (1.0 / Nk) * np.sum([omega[n][k] * data[n] for n in range(N)], axis=0)
            xdiffs = data - means[k]
            # Weighted sum of outer products: (dim,1) * (dim,) -> (dim,dim).
            convs[k] = (1.0 / Nk) * np.sum(
                [omega[n][k] * xdiffs[n].reshape(dim, 1) * xdiffs[n] for n in range(N)], axis=0)

        # Log-likelihood of the data under the updated parameters.
        loglikelyhood = np.sum(
            [np.log(np.sum([phi[k] * Gaussian(data[n], means[k], convs[k]) for k in range(K)])) for n in range(N)])
        ITER -= 1
    return phi, means, convs

在GMM中用到的Kmeans算法如下:

# K均值算法,估计大约几个样本属于一个GMM
import copy
def Kmeans(data, K):
    """Cluster `data` into K groups and return the cluster centroids.

    Used to produce initial means for the GMM.

    Args:
        data: (N, dim) array of samples.
        K: number of clusters.

    Returns:
        list of K (dim,) arrays: the converged centroids.
    """
    N = data.shape[0]    # number of samples
    dim = data.shape[1]  # sample dimensionality
    means = GetInitialMeans(data, K, 0.03)
    means_old = [np.zeros(dim) for k in range(K)]
    # Iterate until the centroids stop moving.
    while np.sum([np.linalg.norm(means_old[k] - means[k]) for k in range(K)]) > 0.0001:
        means_old = copy.deepcopy(means)
        # BUGFIX: counts previously started at 1 (a divide-by-zero hack),
        # which biased every centroid: mean = sum / (count + 1).
        numlog = [0] * K                             # samples assigned to each cluster
        sumlog = [np.zeros(dim) for k in range(K)]   # per-cluster coordinate sums
        # Assignment step: each sample joins its nearest centroid.
        for i in range(N):
            dislog = [np.linalg.norm(data[i] - means[k]) for k in range(K)]
            tok = dislog.index(np.min(dislog))
            numlog[tok] += 1
            sumlog[tok] += data[i]
        # Update step: recompute centroids; an empty cluster keeps its
        # previous centroid instead of dividing by zero.
        for k in range(K):
            if numlog[k] > 0:
                means[k] = 1.0 / numlog[k] * sumlog[k]
    return means

def computeOmega(X, mu, sigma, phi, multiGaussian):
    """Responsibility matrix: gamma[i, k] = P(component k | sample i).

    Args:
        X: (n_samples, dim) array of samples.
        mu: per-component means, len n_clusters.
        sigma: per-component covariances, len n_clusters.
        phi: mixing weights, len n_clusters.
        multiGaussian: callable (x, mean, cov) -> density value.

    Returns:
        (n_samples, n_clusters) array of row-normalized responsibilities.
    """
    n_samples = X.shape[0]
    n_clusters = len(phi)
    gamma = np.zeros((n_samples, n_clusters))
    for i in range(n_samples):
        # Weighted density of sample i under every component.
        g = np.array([phi[j] * multiGaussian(X[i], mu[j], sigma[j])
                      for j in range(n_clusters)])
        # Normalize once per row (the original re-summed g inside the
        # inner loop, O(K^2) per sample).
        gamma[i] = g / np.sum(g)
    return gamma
def predict(data, m, c, p):
    """Hard-assign every sample to the mixture component with the
    highest responsibility.

    Args:
        data: samples (array-like, converted to ndarray).
        m, c, p: component means, covariances and mixing weights.

    Returns:
        (n_samples,) array of component indices.
    """
    responsibilities = computeOmega(np.array(data), m, c, p, Gaussian)
    return np.argmax(responsibilities, axis=1)

4 测试身高体重数据集

# Load the dataset: tab-separated rows, last column is the text label.
d = []
with open('d:/dataset1.txt', 'r') as f:
    d = [row.strip('\n').split('\t') for row in f.readlines()]
d1 = np.array(d)
# All columns except the last are numeric features.
d2 = [[float(v) for v in row] for row in d1[:, :-1]]
data = np.array(d2)
t = d1[:, -1]  # label column
# Fit a 2-component GMM (max 50 EM iterations) and label every sample.
p2, m2, c2 = GMM(data, 2, 50)
pt2 = predict(data, m2, c2, p2)
pt2

结果如下:

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

p2,m2,c2的值分别为:

p2
[0.7822324209405439, 0.21776757905945612]
m2
[array([170.5300851 ,  59.69283016]), array([175.14653505,  73.79246884])]
c2
[array([[63.72099898, 54.66242696],
        [54.66242696, 73.46041058]]), array([[ 21.00207568,  14.73278643],
        [ 14.73278643, 140.29202475]])]

误差和错误率:

# Map the text labels to cluster ids ('f' -> 1, 'm' -> 0) and count
# how many GMM predictions disagree with the ground truth.
t = d1[:, -1]
t[t == 'f'] = 1
t[t == 'm'] = 0
t = list(map(int, t))
t = np.array(t)
# Vectorized mismatch count (the original looped over t == pt2 and
# incremented a counter element by element).
c = int(np.sum(t != pt2))
print('错误数为:', c)
print('错误率为:', round(c / len(t), 3))

结果为:
错误数为: 176
错误率为: 0.389

猜你喜欢

转载自blog.csdn.net/xxuffei/article/details/90180337