Text clustering is usually performed on a batch of existing, historical data, using common methods such as k-means, DBSCAN and so on. If streaming text needs to be clustered (i.e. each document is assigned to a cluster as it arrives), these methods are not applicable. There are, of course, many other dynamic clustering methods for streaming data, but dynamic clustering poses many challenges: for example, the number of clusters is not fixed, and the similarity threshold used for clustering is hard to choose well. These questions still need further study. This article implements a simple single-pass clustering. The similarity between texts is measured with cosine similarity, and the text vectors can be TF-IDF vectors (the IDF values can be computed once on a large document collection and then applied directly to the words of new texts); the texts can also be represented by other vectors, such as those produced by word2vec, BERT or other Chinese pre-trained models.
II. Program
import numpy as np
import os
import sys
import pickle
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import corpora, models, matutils
from utils.tokenizer import load_stopwords, load_samples, tokenizer, word_segment, load_data, read_data_to_list
from gensim.models import doc2vec, Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity

'''
Overall flow:
input: doc vector; threshold
output: cluster
begin
    input doc vector
    input threshold
    first doc as first cluster and its vector as the center of the cluster
    while(doc vectors){
        while(clusters){
            max_sim, max_cluster = similarity(doc vector, cluster);
        }
        if(max_sim > threshold){
            max_cluster.put(doc vector);
            max_cluster.update_center()
        }
        else{
            build new cluster(doc vector);
        }
    }
end
'''


class SingelPassCluster(object):
    """Single-pass clustering over a sequence of document vectors.

    Documents are visited once, in order. Each document joins the existing
    cluster whose centre is most similar to it when that similarity exceeds
    a threshold; otherwise it starts a new cluster. Two variants are
    provided: one on sparse gensim TF-IDF vectors and one on dense vectors
    (e.g. inferred by doc2vec).
    """

    # ------------------------------------------------------------------
    # 1. Cosine similarity on TF-IDF vectors
    # ------------------------------------------------------------------
    def tfidf_vec(self, corpus, pivot=10, slope=0.25):
        """Build a dictionary from ``corpus`` and return its TF-IDF vectors.

        Args:
            corpus: iterable of tokenised documents (each a list of tokens).
            pivot: forwarded to ``models.TfidfModel``.
            slope: forwarded to ``models.TfidfModel``.

        Returns:
            The corpus transformed by the fitted TF-IDF model (sparse
            gensim vectors).

        Side effects:
            Stores the dictionary size in ``self.dict_size``; it is needed
            by ``single_pass`` to densify vectors.
        """
        dictionary = corpora.Dictionary(corpus)  # build the token -> id mapping
        self.dict_size = len(dictionary)
        print('dictionary size:{}'.format(len(dictionary)))
        corpus = [dictionary.doc2bow(text) for text in corpus]  # bag-of-words vectors
        tfidf = models.TfidfModel(corpus, pivot=pivot, slope=slope)
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf

    def get_max_similarity(self, cluster_cores, vector):
        """Return ``(index, similarity)`` of the cluster centre closest to ``vector``.

        Args:
            cluster_cores: dict mapping cluster index to its sparse centre.
            vector: sparse gensim vector of the document.

        Returns:
            ``(max_index, max_value)``; ``(-1, 0)`` when no centre has a
            similarity greater than 0.
        """
        max_value = 0
        max_index = -1
        print('vector:{}'.format(vector))
        for k, core in cluster_cores.items():
            print('core:{}'.format(core))
            similarity = matutils.cossim(vector, core)
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def single_pass(self, corpus_vec, corpus, theta):
        """Cluster sparse TF-IDF vectors in a single pass.

        Args:
            corpus_vec: iterable of sparse gensim vectors.
            corpus: iterable of per-document payloads (text or ids),
                aligned with ``corpus_vec``.
            theta: similarity threshold; a document joins the best cluster
                only when its similarity is strictly greater than this.

        Returns:
            ``(clusters, cluster_text)``: dicts mapping cluster index to the
            list of member vectors and to the list of member payloads.
        """
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                # The first document seeds the first cluster.
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_max_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    # Sparse -> dense, one row per member document.
                    text_matrix = matutils.corpus2dense(
                        clusters[max_index],
                        num_terms=self.dict_size,
                        num_docs=len(clusters[max_index])).T
                    core = np.mean(text_matrix, axis=0)  # update the cluster centre
                    core = matutils.any2sparse(core)  # dense centre back to sparse
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

    def fit_transform(self, corpus, raw_data, theta=0.5):
        """Vectorise ``corpus`` with TF-IDF and cluster it in a single pass.

        Args:
            corpus: iterable of tokenised documents.
            raw_data: per-document payloads aligned with ``corpus``.
            theta: similarity threshold passed to ``single_pass``.

        Returns:
            ``(clusters, cluster_text)`` as returned by ``single_pass``.
        """
        tfidf_vec = self.tfidf_vec(corpus)  # tfidf_vec holds sparse vectors
        clusters, cluster_text = self.single_pass(tfidf_vec, raw_data, theta)
        return clusters, cluster_text

    # ------------------------------------------------------------------
    # 2. Cosine similarity on doc2vec vectors
    # ------------------------------------------------------------------
    def fit(self, doc2vec_model, corpus, raw_data, theta=0.5):
        """Infer doc2vec vectors for ``corpus`` and cluster them.

        Args:
            doc2vec_model: model exposing ``infer_vector``.
            corpus: iterable of ``(text, label)`` pairs.
            raw_data: per-document payloads aligned with ``corpus``.
            theta: similarity threshold.

        Returns:
            ``(clusters, cluster_text)`` as returned by
            ``doc2vec_single_pass``.
        """
        doc_vec = self.doc_vec(doc2vec_model, corpus)
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, raw_data, theta)
        return clusters, cluster_text

    def fit_2(self, doc_vec, text2index, theta):
        """Cluster already-computed dense document vectors.

        Args:
            doc_vec: iterable of dense vectors.
            text2index: per-document payloads aligned with ``doc_vec``.
            theta: similarity threshold.

        Returns:
            ``(clusters, cluster_text)`` as returned by
            ``doc2vec_single_pass``.
        """
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, text2index, theta)
        return clusters, cluster_text

    def doc_vec(self, doc2vec_model, x_train):
        """Infer a vector for each ``(text, label)`` pair in ``x_train``.

        Returns:
            List of inferred vectors.
        """
        print('doc2vec infered vec...')
        infered_vectors_list = []
        for text, label in x_train:
            vector = doc2vec_model.infer_vector(text)
            infered_vectors_list.append(vector)
            print('infered vector size:{}'.format(len(infered_vectors_list)))
            # NOTE(review): inference stops after 100 vectors; this looks
            # like a debugging cap -- confirm before running on a full corpus.
            if len(infered_vectors_list) >= 100:
                break
        return infered_vectors_list

    def get_doc2vec_similarity(self, cluster_cores, vector):
        """Return ``(index, similarity)`` of the dense centre closest to ``vector``.

        Args:
            cluster_cores: dict mapping cluster index to its centre
                (objects supporting ``reshape``, e.g. ``np.ndarray``).
            vector: dense document vector supporting ``reshape``.

        Returns:
            ``(max_index, max_value)``; ``(-1, 0)`` when no centre has a
            similarity greater than 0.
        """
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():  # core -> np.ndarray
            similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
            similarity = similarity[0, 0]
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def doc2vec_single_pass(self, corpus_vec, corpus, theta):
        """Cluster dense document vectors in a single pass.

        Args:
            corpus_vec: iterable of dense vectors.
            corpus: per-document payloads aligned with ``corpus_vec``.
            theta: similarity threshold; a document joins the best cluster
                only when its similarity is strictly greater than this.

        Returns:
            ``(clusters, cluster_text)``: dicts mapping cluster index to the
            list of member vectors and to the list of member payloads.
        """
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                # The first document seeds the first cluster.
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_doc2vec_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    core = np.mean(clusters[max_index], axis=0)  # update the cluster centre
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text


def sim(doc_vec):
    """Print the cosine similarity between the first vector and every vector."""
    vector = doc_vec[0]
    print('vector:{}'.format(type(vector)))
    for core in doc_vec:
        similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
        similarity = similarity[0, 0]
        print("similarity:{}".format(similarity))


if __name__ == '__main__':
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    process_text = base_path + '/data/process_text.txt'  # path of the pre-processed samples
    doc2vec_path = base_path + '/data/doc2vec.pkl'
    cluster_result = base_path + '/data/cluster_result.txt'
    doc_vec_path = base_path + '/data/doc_vec.vec'  # text vectors inferred by doc2vec

    corpus = load_data(process_text)
    raw_text = load_samples(process_text)

    index2corpus = collections.OrderedDict()
    for index, line in enumerate(raw_text):
        index2corpus[index] = line
    text2index = list(index2corpus.keys())
    print('docs total size:{}'.format(len(text2index)))

    single_cluster = SingelPassCluster()

    cal_vec_type = 'doc2vec'

    if cal_vec_type == 'tfidf':
        clusters, cluster_text = single_cluster.fit_transform(corpus, text2index, theta=0.4)

    if cal_vec_type == 'doc2vec':
        # NOTE: pickle.load can execute arbitrary code; only load vector
        # files produced by a trusted run of this project.
        with open(doc_vec_path, 'rb') as file:
            infered_vectors_list = pickle.load(file)
            clusters, cluster_text = single_cluster.fit_2(infered_vectors_list, text2index, theta=0.6)

        # Alternative: infer the vectors from a saved doc2vec model instead
        # of loading pre-computed ones.
        # if os.path.exists(doc2vec_path):
        #     print('doc2vec model loading...')
        #     doc2vec_model = Doc2Vec.load(doc2vec_path)
        #     x_train = read_data_to_list(process_text)
        #     clusters, cluster_text = single_cluster.fit(doc2vec_model, x_train, text2index, theta=0.6)

    if cal_vec_type == 'd2vsim':
        # NOTE(review): this mode only prints similarities; `clusters` and
        # `cluster_text` are never assigned, so the report below would raise
        # NameError if this mode were selected.
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
            x_train = read_data_to_list(process_text)
            doc_vec = single_cluster.doc_vec(doc2vec_model, x_train)
            sim(doc_vec)

    print("." * 92)
    print("number of clusters obtained: {} ...".format(len(clusters)))
    print("." * 92 + "\n")
    # Write the clustering result, largest clusters first.
    clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True)
    with open(cluster_result, 'w', encoding='utf-8') as file_write:
        for k in clusterTopic_list:
            # Separate name so the `cluster_text` dict is not shadowed.
            lines = []
            for index, value in enumerate(k[1], start=1):
                lines.append('(' + str(index) + '): ' + index2corpus[value])
            lines = '\n'.join(lines)
            file_write.write(
                "[cluster index]: {}\n[number of documents in cluster]: {}\n[cluster documents]:\n{}".format(
                    k[0], len(k[1]), lines))
            file_write.write('\n')
            file_write.flush()