"""《三国演义》词频统计
(原创文章,转载请标明出处)

使用Jieba分词,统计《三国演义》的词频,最后生成词云。
主要特点如下:
1. 制作了两个自定义字典(三国演义人物名、三国演义官职一览表)
2. 使用停用词词典
"""
import re
import jieba
import csv
from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import Page, WordCloud
from pyecharts.globals import SymbolType
def ReadText(filename):
    """Read a UTF-8 text file and return its full contents as one string."""
    with open(filename, 'r', encoding='utf-8') as infile:
        return infile.read()
def CutWords(text, *filelist):
    """Tokenize Chinese text with jieba.

    Parameters:
        text: the raw text to tokenize.
        *filelist: paths of custom user-dictionary files to load into
            jieba before tokenizing (e.g. character names, official titles).

    Returns:
        A list of tokens, each at least two characters long.
    """
    # Keep only CJK unified ideographs. NOTE: the original pattern
    # '([\u4e00-\u9fa5]+)' removed the Chinese text itself (leaving
    # nothing to tokenize); the class must be negated to strip the
    # non-Chinese characters instead.
    text = re.sub(r'[^\u4e00-\u9fa5]+', '', text)
    for file in filelist:
        jieba.load_userdict(file)
    words = jieba.lcut(text)  # tokenize
    # Single-character tokens are mostly noise for frequency counting.
    words = [word for word in words if len(word) > 1]
    return words
def StopWords(words, stopfile):
    """Filter out stop words.

    Reads *stopfile* (UTF-8, one stop word per line) and returns the
    tokens from *words* that are not listed there, preserving order.
    """
    with open(stopfile, 'r', encoding='utf-8') as handle:
        # A set gives O(1) membership tests; strip only the newline,
        # matching the original line-by-line handling.
        stopset = {line.strip('\n') for line in handle}
    return [token for token in words if token not in stopset]
def WriteCSV(filename, freqdict, num=0):
    """Write word-frequency results to a CSV file.

    Parameters:
        filename: output CSV path.
        freqdict: a collections.Counter of word -> frequency.
        num: how many of the most common words to write; 0 (the
            default) writes every word.
    """
    if num == 0:
        num = len(freqdict)  # default: dump the whole counter
    with open(filename, 'w', encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(('词汇', '词频'))
        # most_common yields (word, count) tuples, already rows.
        writer.writerows(freqdict.most_common(num))
if __name__ == '__main__':
    # Load the novel and manually drop the ubiquitous classical-Chinese
    # verb '曰' ("said") before tokenizing.
    raw = ReadText('三国演义.txt')
    raw = re.sub('曰', '', raw)
    # Custom jieba dictionaries: character names and official titles.
    userdicts = ['三国演义人物名.txt', '三国演义官职一览表.txt']
    tokens = CutWords(raw, *userdicts)
    tokens = StopWords(tokens, stopfile='stop_words.txt')
    freq = Counter(tokens)  # word -> frequency
    WriteCSV('三国演义词频统计.csv', freq, num=50)  # top 50
    # Render the word cloud from the same top-50 list.
    cloud = WordCloud()
    cloud.add('', freq.most_common(50), word_size_range=[20, 100])
    cloud.set_global_opts(title_opts=opts.TitleOpts(title='三国演义词云Top50'))
    cloud.render('三国演义词云图Top50.html')