Jieba与三国？——使用jieba统计《三国演义》词频

《三国演义》词频统计

（原创文章，转载请标明出处）

使用Jieba分词，统计《三国演义》的词频，最后生成词云

主要特点如下：

1. 制作了两个自定义字典（三国演义人物名、三国演义官职一览表）
2. 使用停用词词典过滤停用词

import re
import jieba
import csv
from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import Page, WordCloud
from pyecharts.globals import SymbolType


def ReadText(filename):
    """Return the full contents of the UTF-8 text file *filename* as one string."""
    with open(filename, mode='r', encoding='utf-8') as source:
        return source.read()


def CutWords(text, *filelist):
    """Segment *text* with jieba and return the tokens longer than one character.

    Every path passed in *filelist* is loaded as a jieba user dictionary
    before the text is segmented.
    """
    # Remove parenthesised annotations: any run of CJK characters enclosed
    # in full-width brackets is deleted together with the brackets.
    cleaned = re.sub('（[\u4e00-\u9fa5]+）', '', text)
    for dict_path in filelist:
        jieba.load_userdict(dict_path)
    # Single-character tokens are dropped from the result.
    return [token for token in jieba.lcut(cleaned) if len(token) > 1]


def StopWords(words, stopfile):
    """Return *words* with every entry listed in *stopfile* removed.

    Args:
        words: Iterable of hashable tokens (strings) to filter.
        stopfile: Path to a UTF-8 text file holding one stop word per line.

    Returns:
        A new list of the surviving tokens, in their original order and
        with duplicates kept.
    """
    with open(stopfile, 'r', encoding='utf-8') as f:
        # A set makes each membership test O(1); the previous list lookup
        # made filtering cost O(len(words) * len(stop words)).
        stopset = {line.strip('\n') for line in f}
    # Filtering happens after the file is closed; it only needs the set.
    return [word for word in words if word not in stopset]


def WriteCSV(filename, freqdict, num=0):
    """Write the top *num* entries of *freqdict* to *filename* as CSV.

    The file starts with a header row, followed by one (word, count) row
    per entry in the order returned by ``freqdict.most_common``. When
    *num* is 0 (the default), every entry is written.
    """
    with open(filename, 'w', encoding='utf-8', newline='') as out:
        # num == 0 means "no limit": fall back to the total number of entries.
        limit = num if num != 0 else len(freqdict.keys())
        top_entries = freqdict.most_common(limit)   # list of (word, count) tuples
        table = csv.writer(out)
        table.writerow(('词汇', '词频'))
        table.writerows((word, count) for word, count in top_entries)
            

if __name__ == '__main__':
    # Load the novel and delete every '曰' character (manual cleanup).
    novel = ReadText('三国演义.txt').replace('曰', '')

    # Custom jieba dictionaries: character names and official titles.
    userdicts = ('三国演义人物名.txt', '三国演义官职一览表.txt')
    tokens = StopWords(CutWords(novel, *userdicts), stopfile='stop_words.txt')

    # Count occurrences and save the 50 most frequent words.
    wordfreq = Counter(tokens)
    WriteCSV('三国演义词频统计.csv', wordfreq, num=50)

    # Render a word cloud of the same top 50 words to an HTML file.
    chart = WordCloud()
    chart.add('', wordfreq.most_common(50), word_size_range=[20, 100])
    chart.set_global_opts(title_opts=opts.TitleOpts(title='三国演义词云Top50'))
    chart.render('三国演义词云图Top50.html')

词云图Top50

Jieba与三国？——使用jieba统计《三国演义》词频

《三国演义》词频统计

使用Jieba分词，统计《三国演义》的词频，最后生成词云

猜你喜欢