Combining KMeans with a deep-learning autoencoder (AutoEncoder) to improve clustering results

The features are the amounts each user spent on different goods categories; part of the raw data looks like this:

id,goods_name,goods_amount
1,男士手袋,1882.0
2,淑女装,2491.0
3,淑女装,2492.0
2,女士手袋,345.0
4,基础内衣,328.0
5,商务正装,4985.0
5,时尚,969.0
5,女饰品,86.0
6,专业运动,399.0
6,童装(中大童),2033.0
6,男士配件,38.0

Since the same id appears in multiple consumption records, the data cannot be used directly. I wrote a Python script, datadeal.py, to preprocess it:

# !/usr/bin/python
# coding:utf-8
# author:wuyy
'''
Data preprocessing
'''
import pandas as pd
import numpy as np
import time

# Load the file
x = pd.read_csv('info.txt', sep=",")
x = x.dropna(axis=0)
a1 = list(x.iloc[:, 0])  # user ids
a2 = list(x.iloc[:, 1])  # goods names
a3 = list(x.iloc[:, 2])  # amounts
print("data table:", x)

# A is the list of goods categories
dicta = dict(zip(a2, zip(a1, a3)))
print("dicta:", dicta)
A = list(dicta.keys())
# B is the list of user ids
B = list(set(a1))

# Map each goods category to a column index
a = np.arange(len(A))
lista = list(a)
dict_class = dict(zip(A, lista))

# Write the goods-category mapping to a file
f = open('class.txt', 'w')
for k, v in dict_class.items():
    f.write(str(k) + '\t' + str(v) + '\n')
f.close()

start = time.perf_counter()  # time.clock() was removed in Python 3.8
# Build one big dictionary: user id -> vector of amounts per category
dictall = {}
for i in range(len(a1)):
    if a1[i] in dictall.keys():
        value = dictall[a1[i]]
        j = dict_class[a2[i]]
        value[j] = a3[i]
        dictall[a1[i]] = value
    else:
        value = list(np.zeros(len(A)))
        j = dict_class[a2[i]]
        value[j] = a3[i]
        dictall[a1[i]] = value
print('dictall:', dictall)

# Convert the dictionary to a DataFrame (rows = users, columns = categories)
dictall1 = pd.DataFrame(dictall)
dictall_matrix = dictall1.T
print("dictall_matrix:", dictall_matrix)
dictall_matrix.to_csv("data_matrix.txt", index=True, header=None)
end = time.perf_counter()
print("assignment loop took: %f s" % (end - start))

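Incidentally, the same user × category matrix can be built in a few lines with pandas alone. A minimal sketch, assuming the same info.txt layout as above; note that pivot_table aggregates duplicate (id, goods_name) pairs (here by sum), while the loop above keeps only the last record:

import pandas as pd

# Hedged alternative to the manual loop: one pivot_table call.
x = pd.read_csv('info.txt').dropna(axis=0)
matrix = x.pivot_table(index='id', columns='goods_name',
                       values='goods_amount', aggfunc='sum', fill_value=0)
matrix.to_csv("data_matrix.txt", header=None)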

After the data processing is complete, AE.py encodes the matrix with an AutoEncoder:

# !/usr/bin/python
# coding:utf-8
# author:wuyy
'''
AE model (auto-encoder)
Its main job here is to compress the data. If the input dimensionality is large,
say thousands of features, feeding them all into the algorithm rarely works well,
because not every feature is informative. With an AE you can compress the input
down to m dimensions (m being the number of nodes in the hidden layer).
'''

import pandas as pd
import numpy as np
from sklearn import preprocessing


class AutoEncoder():
    """ Auto Encoder
    layer      1     2    ...    ...    L-1    L
      W        0     1    ...    ...    L-2
      B        0     1    ...    ...    L-2
      Z              0     1     ...    L-3    L-2
      A              0     1     ...    L-3    L-2
    """

    def __init__(self, X, Y, nNodes):
        # training samples
        self.X = X
        self.Y = Y
        # number of samples
        self.M = len(self.X)
        # layers of networks
        self.nLayers = len(nNodes)
        # nodes at layers
        self.nNodes = nNodes
        # parameters of networks
        self.W = list()
        self.B = list()
        self.dW = list()
        self.dB = list()
        self.A = list()
        self.Z = list()
        self.delta = list()
        for iLayer in range(self.nLayers - 1):
            self.W.append(
                np.random.rand(nNodes[iLayer] * nNodes[iLayer + 1]).reshape(nNodes[iLayer], nNodes[iLayer + 1]))
            self.B.append(np.random.rand(nNodes[iLayer + 1]))
            self.dW.append(np.zeros([nNodes[iLayer], nNodes[iLayer + 1]]))
            self.dB.append(np.zeros(nNodes[iLayer + 1]))
            self.A.append(np.zeros(nNodes[iLayer + 1]))
            self.Z.append(np.zeros(nNodes[iLayer + 1]))
            self.delta.append(np.zeros(nNodes[iLayer + 1]))

        # value of cost function
        self.Jw = 0.0
        # activation function (logistic sigmoid)
        self.sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
        # learning rate
        self.alpha = 2.5
        # number of training iterations
        self.steps = 10000

    def BackPropAlgorithm(self):
        # reset accumulated cost and gradients
        self.Jw = 0.0
        for iLayer in range(self.nLayers - 1):
            self.dW[iLayer] = np.zeros_like(self.dW[iLayer])
            self.dB[iLayer] = np.zeros_like(self.dB[iLayer])
        # propagation (iteration over M samples)
        for i in range(self.M):
            # forward propagation
            for iLayer in range(self.nLayers - 1):
                if iLayer == 0:  # first layer
                    self.Z[iLayer] = np.dot(self.X[i], self.W[iLayer])
                else:
                    self.Z[iLayer] = np.dot(self.A[iLayer - 1], self.W[iLayer])
                self.A[iLayer] = self.sigmoid(self.Z[iLayer] + self.B[iLayer])
            # back propagation (reverse layer order)
            for iLayer in range(self.nLayers - 1)[::-1]:
                if iLayer == self.nLayers - 2:  # output layer
                    self.delta[iLayer] = -(self.Y[i] - self.A[iLayer]) * (self.A[iLayer] * (1 - self.A[iLayer]))
                    self.Jw += np.dot(self.Y[i] - self.A[iLayer], self.Y[i] - self.A[iLayer]) / self.M
                else:
                    # error is propagated back through the *next* layer's weights
                    self.delta[iLayer] = np.dot(self.W[iLayer + 1], self.delta[iLayer + 1]) * (
                        self.A[iLayer] * (1 - self.A[iLayer]))
                # accumulate dW and dB (outer products of activations and deltas)
                if iLayer == 0:
                    self.dW[iLayer] += np.outer(self.X[i], self.delta[iLayer])
                else:
                    self.dW[iLayer] += np.outer(self.A[iLayer - 1], self.delta[iLayer])
                self.dB[iLayer] += self.delta[iLayer]
        # gradient-descent update
        for iLayer in range(self.nLayers - 1):
            self.W[iLayer] -= (self.alpha / self.M) * self.dW[iLayer]
            self.B[iLayer] -= (self.alpha / self.M) * self.dB[iLayer]

    def PlainAutoEncoder(self):
        for i in range(self.steps):
            self.BackPropAlgorithm()
            print("step:%d" % i, "Jw=%f" % self.Jw)

    def ValidateAutoEncoder(self):
        # collect the hidden-layer activations (the compressed features)
        nHidden = self.nNodes[1]
        df = pd.DataFrame(np.arange(1, nHidden + 1), columns=['weidu'])  # weidu = dimension index
        for i in range(self.M):
            print(self.X[i])
            for iLayer in range(self.nLayers - 1):
                if iLayer == 0:  # input layer
                    self.Z[iLayer] = np.dot(self.X[i], self.W[iLayer])
                else:
                    self.Z[iLayer] = np.dot(self.A[iLayer - 1], self.W[iLayer])
                self.A[iLayer] = self.sigmoid(self.Z[iLayer] + self.B[iLayer])
                print("\t layer=%d" % iLayer, self.A[iLayer])
                if iLayer == 0:
                    df[str(i + 1)] = self.A[iLayer]

        df.to_csv("jaingwei.txt", index=False)




# Load the user x category matrix produced by datadeal.py
data = []
index = []
with open('./data_matrix.txt', 'r') as f:
    for line in f.readlines():
        ss = line.replace('\n', '').split(',')
        index.append(ss[0])
        tmp = [float(v) for v in ss[1:]]
        data.append(tmp)

x = np.array(data)
# Standardize to zero mean and unit variance
# (note: the sigmoid outputs lie in (0,1), so standardized targets outside that
# range cannot be reconstructed exactly; the hidden codes are still usable)
xx = preprocessing.scale(x)
# 10 input nodes = number of goods categories; 5 = size of the compressed code
nNodes = np.array([10, 5, 10])
ae3 = AutoEncoder(xx, xx, nNodes)
ae3.PlainAutoEncoder()
ae3.ValidateAutoEncoder()

print("AE result:", ae3.A[0])

# # A small toy example with one-hot inputs; the output has the same structure
# xx = np.array([[0,0,0,0,0,0,0,1], [0,0,0,0,0,0,1,0], [0,0,0,0,0,1,0,0], [0,0,0,0,1,0,0,0],[0,0,0,1,0,0,0,0], [0,0,1,0,0,0,0,0]])
# nNodes = np.array([ 8, 3, 8 ])
# ae2 = AutoEncoder(xx,xx,nNodes)
# ae2.PlainAutoEncoder()
# ae2.ValidateAutoEncoder()
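If you want the compressed features without going through jaingwei.txt, the hidden codes can also be read straight off the trained model. A minimal sketch; the encode helper is hypothetical (not part of the original script) but mirrors the forward pass of the AutoEncoder class, where A[0] = sigmoid(X·W[0] + B[0]):

import numpy as np

def encode(ae, X):
    # Hypothetical helper: hidden-layer activations (compressed features)
    # for each row of X, using the first weight matrix of a trained model.
    codes = []
    for row in X:
        z = np.dot(row, ae.W[0]) + ae.B[0]
        codes.append(1.0 / (1.0 + np.exp(-z)))  # same sigmoid as the class
    return np.array(codes)

# Usage: codes = encode(ae3, xx)  ->  shape (n_samples, 5)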

Finally, cluster the compressed features with sklearn's KMeans:

# !/usr/bin/python
# coding:utf-8
# Author :wuyy

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn import metrics


# Load the encoded features
data = pd.read_csv('jaingwei.txt', sep=",")
data = data.T
x = data.iloc[1:, 0:5]  # .ix was removed from pandas; use .iloc instead
print(x)
card = data.iloc[:, 0]

x1 = np.array(x)
print("x1:", x1)
xx = preprocessing.scale(x1)

print("preprocessing.scale xx:",xx)
num_clusters = 3

clf = KMeans(n_clusters=num_clusters, n_init=1, verbose=1)  # n_jobs was removed from KMeans in recent scikit-learn
clf.fit(xx)
print("label:",clf.labels_)
labels = clf.labels_
# score is the silhouette coefficient
score = metrics.silhouette_score(xx, labels)
# clf.inertia_ helps judge whether the number of clusters is right:
# the smaller the within-cluster distance, the better the clustering
print("clf.inertia_", clf.inertia_)
print(score)
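Here num_clusters = 3 is fixed by hand. A minimal sketch for choosing k instead, assuming the same scaled matrix xx: scan a few cluster counts and compare the silhouette score and inertia:

from sklearn.cluster import KMeans
from sklearn import metrics

# Hedged sketch: try several cluster counts and report both metrics.
for k in range(2, 8):
    km = KMeans(n_clusters=k, n_init=10).fit(xx)
    s = metrics.silhouette_score(xx, km.labels_)
    print("k=%d  silhouette=%.3f  inertia=%.1f" % (k, s, km.inertia_))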

GitHub: https://github.com/wu-yy/Kmeans

Reprinted from: http://www.cnblogs.com/charlotte77/p/5366578.html
