Foreword:
K-means clustering implemented via the sklearn package. Purpose: text clustering for natural language processing. (This is a rough, unpolished version kept mainly as the author's own record.)
A cleaner, more concise version can be found here:
https://blog.csdn.net/u013521274/article/details/87924876
Data set address used:
https://download.csdn.net/download/u013521274/11080094
For newcomers to Python, it is a real headache to find example code that is understandable but cannot actually be run because the data is missing. The data set is therefore provided here, so that more people can study the example hands-on and make progress together.
Code:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import codecs
from scipy import ndimage
from sklearn import manifold,datasets
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer,HashingVectorizer
# Step 1: compute the TF-IDF matrix of the corpus.
# One document per line of the input file.
# Use a context manager so the file handle is closed deterministically
# (the original opened the file and never closed it), and build the list
# with a comprehension instead of readlines() + append.
with open('data/01_All_BHSpider_Content_Result.txt', 'r',
          encoding='utf-8', errors='ignore') as f:
    corpus = [line.strip() for line in f]
print('库有', len(corpus), '行,个文档')

# Convert the documents into a term-frequency matrix: element a[i][j] is
# the frequency of word j in document i.
vectorizer = CountVectorizer()
# Converts a count matrix into tf-idf weights.
transformer = TfidfTransformer()
# The inner fit_transform builds the count matrix;
# the outer fit_transform turns it into tf-idf values.
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
# All words of the bag-of-words vocabulary.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# on modern versions use vectorizer.get_feature_names_out() instead.
word = vectorizer.get_feature_names()
# Dense tf-idf matrix: w[i][j] is the tf-idf weight of word j in document i.
weight = tfidf.toarray()
num = np.array(weight)
print("列是", num.shape[1])
# Print the vocabulary size.
print('单词大小:' + str(len(word)))
# `word` is the vocabulary list and `weight` the (dense, potentially huge)
# tf-idf matrix. The original file also carried a commented-out snippet that
# dumped both to data/BHTfidf_Result.txt; removed here as dead code.
### Step 2: k-means clustering
from sklearn.cluster import KMeans

# Cluster into 4 groups: scenic spots, animals, people, countries.
clf = KMeans(n_clusters=4)
s = clf.fit(weight)
print(s)

# Cluster centres: for a 2-D array, len() is the number of rows, i.e. 4.
print(len(clf.cluster_centers_))
# print(clf.cluster_centers_)  # the centre point of each of the 4 clusters

# Cluster assignment of every document, printed with a 1-based index.
# enumerate() replaces the original hand-rolled while-loop counter.
label = []
for i, lab in enumerate(clf.labels_, start=1):
    print(i, lab)
    label.append(lab)

# clf.inertia_ (sum of squared distances to the nearest centre) can be used
# to judge whether the cluster count is appropriate — smaller is better;
# pick the elbow point. Observed here: ~958.15.
# print(clf.inertia_)
#### Step 3: dimensionality reduction and plotting ####
from sklearn.decomposition import PCA

# Project the tf-idf matrix down to 2 dimensions so it can be scattered.
pca = PCA(n_components=2)
newData = pca.fit_transform(weight)
print('降维后有几行?', len(newData))
print(newData)

# The corpus is ordered by category: documents 0-399 are 5A scenic spots,
# 400-599 animals, 600-799 people, 800-999 countries.
# Plot each contiguous slice in its own colour (red, green, blue, black);
# this replaces four copy-pasted while loops that built x/y lists by hand.
segments = [
    (0, 400, 'or'),     # 5A scenic spots - red circles
    (400, 600, 'og'),   # animals - green circles
    (600, 800, 'ob'),   # people - blue circles
    (800, 1000, 'ok'),  # countries - black circles
]
for start, stop, style in segments:
    plt.plot(newData[start:stop, 0], newData[start:stop, 1], style)
plt.show()
Clustering results: