I. Single-pass clustering

There are many classic methods for clustering a fixed, historical collection of texts, such as k-means, DBSCAN, and so on. When texts arrive as a stream and each new document must be assigned to a cluster on arrival, those methods are not applicable. There are, of course, many dynamic clustering methods for streaming data, but dynamic clustering brings its own challenges: the number of clusters is not fixed, and the similarity threshold that drives cluster assignment is hard to design well. These problems remain open for further study. This post implements a simple single-pass clustering algorithm. Similarity between texts is measured by cosine distance, and a text is vectorized with tf-idf (the idf statistics can be computed once on a large document collection and then applied directly to the words of each new text); texts can also be represented with embeddings such as word2vec, BERT, or other Chinese pre-trained models.
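
To make this concrete, here is a minimal sketch, assuming only scikit-learn (the two example sentences are invented for illustration): it represents two texts as tf-idf vectors and compares them with cosine similarity, which is exactly the measurement the single-pass loop below repeats for every incoming document.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two toy documents, invented for illustration.
docs = ["the cat sat on the mat", "a cat lay on the mat"]
tfidf = TfidfVectorizer().fit_transform(docs)   # 2 x vocab sparse tf-idf matrix
sim = cosine_similarity(tfidf[0], tfidf[1])[0, 0]
print('cosine similarity: {:.4f}'.format(sim))  # fairly high: the texts overlap heavily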

II. Program

import numpy as np
import os
import sys
import pickle
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import corpora, models, matutils
from utils.tokenizer import load_stopwords, load_samples, tokenizer, word_segment, load_data, read_data_to_list
from gensim.models import doc2vec, Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity

'''
General flow:
input: doc vectors; threshold
output: clusters
begin
    input doc vectors
    input threshold
    take the first doc as the first cluster, with its vector as the cluster center
    while(doc vectors){
        while(clusters){
            max_sim, max_cluster = similarity(doc vector, cluster);
        }
        if(max_sim > threshold){
            max_cluster.put(doc vector);
            max_cluster.update_center()
        }
        else{
            build new cluster(doc vector);
        }
    }
end
'''
class SinglePassCluster(object):

    '''
        1. Compute cosine similarity from tf-idf vectors
    '''
    def tfidf_vec(self, corpus, pivot=10, slope=0.25):
        dictionary = corpora.Dictionary(corpus)  # build the token dictionary
        self.dict_size = len(dictionary)
        print('dictionary size:{}'.format(len(dictionary)))
        corpus = [dictionary.doc2bow(text) for text in corpus]  # bag-of-words vectors
        tfidf = models.TfidfModel(corpus, pivot=pivot, slope=slope)
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf

    def get_max_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        print('vector:{}'.format(vector))
        for k, core in cluster_cores.items():
            print('core:{}'.format(core))
            similarity = matutils.cossim(vector, core)
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_max_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    text_matrix = matutils.corpus2dense(clusters[max_index], num_terms=self.dict_size,
                                                        num_docs=len(clusters[max_index])).T  # sparse -> dense
                    core = np.mean(text_matrix, axis=0)  # recompute the cluster center as the member mean
                    core = matutils.any2sparse(core)  # dense -> sparse, back to gensim's (id, weight) format
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

    def fit_transform(self, corpus, raw_data, theta=0.5):
        tfidf_vec = self.tfidf_vec(corpus)  # tfidf_vec holds sparse tf-idf vectors
        clusters, cluster_text = self.single_pass(tfidf_vec, raw_data, theta)
        return clusters, cluster_text


    '''
        2. Compute cosine similarity from doc2vec vectors
    '''
    def fit(self, doc2vec_model, corpus, raw_data, theta=0.5):
        doc_vec = self.doc_vec(doc2vec_model, corpus)
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, raw_data, theta)
        return clusters, cluster_text

    def fit_2(self, doc_vec, text2index, theta):
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, text2index, theta)
        return clusters, cluster_text

    def doc_vec(self, doc2vec_model, x_train):
        print('doc2vec infered vec...')
        infered_vectors_list = []
        for text, label in x_train:
            vector = doc2vec_model.infer_vector(text)
            infered_vectors_list.append(vector)
            print('infered vector size:{}'.format(len(infered_vectors_list)))
            if len(infered_vectors_list) >= 100:  # only infer vectors for the first 100 docs
                break
        return infered_vectors_list

    def get_doc2vec_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():  # core -> np.ndarray
            similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
            similarity = similarity[0, 0]
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value
132 
133     def doc2vec_single_pass(self, corpus_vec, corpus, theta):
134         clusters = {}
135         cluster_cores = {}
136         cluster_text = {}
137         num_topic = 0
138         cnt = 0
139         for vector, text in zip(corpus_vec, corpus):
140             if num_topic == 0:
141                 clusters.setdefault(num_topic, []).append(vector)
142                 cluster_cores[num_topic] = vector
143                 cluster_text.setdefault(num_topic, []).append(text)
144                 num_topic += 1
145             else:
146                 max_index, max_value = self.get_doc2vec_similarity(cluster_cores, vector)
147                 if max_value > theta:
148                     clusters[max_index].append(vector)
149                     core = np.mean(clusters[max_index], axis=0)  # 更新簇中心
150                     cluster_cores[max_index] = core
151                     cluster_text[max_index].append(text)
152                 else:  # 创建一个新簇
153                     clusters.setdefault(num_topic, []).append(vector)
154                     cluster_cores[num_topic] = vector
155                     cluster_text.setdefault(num_topic, []).append(text)
156                     num_topic += 1
157             cnt += 1
158             if cnt % 100 == 0:
159                 print('processing {}...'.format(cnt))
160         return clusters, cluster_text


def sim(doc_vec):
    vector = doc_vec[0]
    print('vector:{}'.format(type(vector)))
    for core in doc_vec:
        similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
        similarity = similarity[0, 0]
        print("similarity:{}".format(similarity))

if __name__ == '__main__':
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    process_text = base_path + '/data/process_text.txt'  # path to the preprocessed samples
    doc2vec_path = base_path + '/data/doc2vec.pkl'
    cluster_result = base_path + '/data/cluster_result.txt'
    doc_vec_path = base_path + '/data/doc_vec.vec'  # document vectors inferred by doc2vec

    corpus = load_data(process_text)
    raw_text = load_samples(process_text)

    index2corpus = collections.OrderedDict()
    for index, line in enumerate(raw_text):
        index2corpus[index] = line
    text2index = list(index2corpus.keys())  # cluster over document indices; map back to text via index2corpus
    print('docs total size:{}'.format(len(text2index)))

    single_cluster = SinglePassCluster()

    cal_vec_type = 'doc2vec'

    if cal_vec_type == 'tfidf':
        clusters, cluster_text = single_cluster.fit_transform(corpus, text2index, theta=0.4)

    if cal_vec_type == 'doc2vec':
        with open(doc_vec_path, 'rb') as file:
            infered_vectors_list = pickle.load(file)
        clusters, cluster_text = single_cluster.fit_2(infered_vectors_list, text2index, theta=0.6)

        '''
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        clusters, cluster_text = single_cluster.fit(doc2vec_model, x_train, text2index, theta=0.6)
        '''

    if cal_vec_type == 'd2vsim':
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        doc_vec = single_cluster.doc_vec(doc2vec_model, x_train)
        sim(doc_vec)



    print("............................................................................")
    print("number of clusters obtained: {}...".format(len(clusters)))
    print("............................................................................\n")
    # sort clusters by the number of documents they contain, descending, then write the result
    clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True)
    with open(cluster_result, 'w', encoding='utf-8') as file_write:
        for k in clusterTopic_list:
            cluster_members = []  # formatted document lines for this cluster
            for index, value in enumerate(k[1], start=1):
                cluster_members.append('(' + str(index) + '): ' + index2corpus[value])
            cluster_members = '\n'.join(cluster_members)
            file_write.write("[cluster index]:{}\n[number of documents]:{}\n[documents]:\n{}".format(
                k[0], len(k[1]), cluster_members))
            file_write.write('\n')
            file_write.flush()
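
For reference, here is a minimal standalone sketch of the gensim machinery the tf-idf path relies on (the toy token lists are invented): documents become sparse (token_id, weight) vectors, matutils.cossim compares them, and the cluster-center update is the same dense/sparse round trip single_pass performs.

import numpy as np
from gensim import corpora, models, matutils

texts = [['cat', 'sat', 'mat'], ['cat', 'lay', 'mat'], ['dog', 'ran', 'home']]
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(bow)
vecs = [tfidf[doc] for doc in bow]          # each doc: list of (token_id, weight)
print(matutils.cossim(vecs[0], vecs[1]))    # > 0: the docs share 'cat' and 'mat'
print(matutils.cossim(vecs[0], vecs[2]))    # 0.0: no terms in common

# Cluster-center update: densify the members, average, convert back to sparse.
members = [vecs[0], vecs[1]]
dense = matutils.corpus2dense(members, num_terms=len(dictionary),
                              num_docs=len(members)).T
core = matutils.any2sparse(np.mean(dense, axis=0))
print(core)                                 # the new center, again (id, weight) pairs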

 

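Similarly, a minimal sketch of producing the dense document vectors consumed by the doc2vec path (toy corpus invented; in the post itself the vectors are trained elsewhere and loaded from doc_vec.vec):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

texts = [['cat', 'sat', 'mat'], ['cat', 'lay', 'mat'], ['dog', 'ran', 'home']]
documents = [TaggedDocument(words, [i]) for i, words in enumerate(texts)]
model = Doc2Vec(documents, vector_size=50, min_count=1, epochs=40)

v0 = model.infer_vector(texts[0])   # np.ndarray, same shape as the cluster cores
v1 = model.infer_vector(texts[1])
print(cosine_similarity(v0.reshape(1, -1), v1.reshape(1, -1))[0, 0])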