Jieba与三国?——使用jieba统计《三国演义》词频

《三国演义》词频统计

(原创文章,转载请标明出处)

使用Jieba分词,统计《三国演义》的词频,最后生成词云

主要特点如下:

1. 制作了两个自定义字典(三国演义人物名、三国演义官职一览表)
2. 使用停用词词典

import re
import jieba
import csv
from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import Page, WordCloud
from pyecharts.globals import SymbolType


def ReadText(filename):
    """Return the whole content of a UTF-8 encoded text file.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a single string.
    """
    with open(filename, mode='r', encoding='utf-8') as source:
        return source.read()


def CutWords(text, *filelist):
    """Segment Chinese text with jieba and return the multi-character words.

    Args:
        text: Raw text to segment.
        *filelist: Paths of user-dictionary files; each one is passed to
            ``jieba.load_userdict`` before segmentation.

    Returns:
        A list of the segmented words that are longer than one character,
        in the order jieba produced them.
    """
    # Keep only Chinese characters (U+4E00..U+9FA5). The character class is
    # negated so that everything that is NOT Chinese gets removed. Each removed
    # run becomes a single space rather than nothing, so that characters on
    # either side of punctuation or a line break are not joined into one word.
    text = re.sub(r'[^\u4e00-\u9fa5]+', ' ', text)
    for file in filelist:
        jieba.load_userdict(file)
    words = jieba.lcut(text)
    # Drop single-character tokens; this also discards the separator spaces.
    return [word for word in words if len(word) > 1]


def StopWords(words, stopfile):
    """Remove stop words from a list of words.

    Args:
        words: Iterable of word strings to filter.
        stopfile: Path of a UTF-8 text file holding one stop word per line.

    Returns:
        A new list with the words that are not listed in ``stopfile``,
        in their original order (duplicates are kept).
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise stick to the first stop word and keep it from
    # ever matching.
    with open(stopfile, 'r', encoding='utf-8-sig') as f:
        # A set gives O(1) membership tests instead of scanning a list for
        # every word.
        stopset = {line.strip('\n') for line in f}
    return [word for word in words if word not in stopset]


def WriteCSV(filename, freqdict, num=0):
    """Write the most frequent words and their counts to a CSV file.

    Args:
        filename: Path of the CSV file to create (overwritten if present).
        freqdict: A ``collections.Counter`` mapping word -> frequency.
        num: How many of the top entries to write; 0 (the default) writes
            every entry.

    The file is UTF-8 encoded and starts with a header row.
    """
    with open(filename, 'w', encoding='utf-8', newline='') as out:
        # A limit of 0 stands for "all words".
        limit = num if num != 0 else len(freqdict)
        top_rows = freqdict.most_common(limit)   # (word, count) tuples, highest count first
        sheet = csv.writer(out)
        sheet.writerow(('词汇', '词频'))
        sheet.writerows((word, count) for word, count in top_rows)
            

if __name__ == '__main__':
    # Custom jieba dictionaries: character names and official titles.
    userdicts = ['三国演义人物名.txt', '三国演义官职一览表.txt']

    # Load the novel and manually strip the very frequent character '曰'.
    novel = ReadText('三国演义.txt').replace('曰', '')

    # Segment, drop stop words, then count what is left.
    tokens = CutWords(novel, *userdicts)
    wordfreq = Counter(StopWords(tokens, stopfile='stop_words.txt'))

    # Save the 50 most frequent words as CSV.
    WriteCSV('三国演义词频统计.csv', wordfreq, num=50)

    # Render a word cloud of the same top 50 to an HTML page.
    cloud = (
        WordCloud()
        .add('', wordfreq.most_common(50), word_size_range=[20,100])
        .set_global_opts(title_opts=opts.TitleOpts(title='三国演义词云Top50'))
    )
    cloud.render('三国演义词云图Top50.html')

词云图Top50

猜你喜欢

转载自blog.csdn.net/Funny_Cheng/article/details/106854390