Python文本处理: 分词和词云图

'''
import os
import jieba  # 分词包
import numpy  # numpy计算包
import codecs  # codecs提供open方法指定打开的文件的语言编码,它会在读取时自动转换为内部的unicode
import pandas  # 统计学工具包
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator    # 词云包
from scipy.misc import imread
from time import sleep

def join_txt():
    """Concatenate every file in ``./corpus`` into ``all_result.txt``.

    The ``corpus`` directory is looked up under the current working
    directory and the merged file is written to the current working
    directory.  Each source file is read as UTF-8 and copied verbatim,
    followed by one extra newline so that consecutive files never run
    together on the same line.

    Files are merged in sorted filename order so the output is
    reproducible; entries that are not regular files are skipped.

    Raises:
        OSError: if the ``corpus`` directory or one of its files cannot be
            read, or ``all_result.txt`` cannot be written.
        UnicodeDecodeError: if a source file is not valid UTF-8.
    """
    # os.path.join keeps the path valid on every platform (a hard-coded
    # '\\' separator only works on Windows).
    corpus_dir = os.path.join(os.getcwd(), 'corpus')
    filenames = sorted(os.listdir(corpus_dir))

    # NOTE: the output is written with the platform default encoding on
    # purpose -- make_pic() reads all_result.txt back without specifying an
    # encoding, so both sides must keep using the same default.
    with open('all_result.txt', 'w') as merged:
        for filename in filenames:
            filepath = os.path.join(corpus_dir, filename)
            if not os.path.isfile(filepath):
                # Sub-directories cannot be opened as text; ignore them.
                continue
            # The context manager closes each source file as soon as it
            # has been copied.
            with open(filepath, encoding='utf-8') as source:
                merged.writelines(source)
            merged.write('\n')
def make_pic():
    """Render a word cloud from ``all_result.txt`` and save it as ``ciyun.png``.

    Steps: read the merged corpus, segment it with jieba, drop every token
    listed in ``stopword.txt``, count how often each remaining token occurs,
    and draw the counts with WordCloud using ``mangguo.png`` as both the
    mask and the source of the colours.

    All input and output paths are relative to the current working
    directory; the font is loaded from ``msyh.ttf``.
    """
    # Read the merged corpus.  No encoding is given, so the platform default
    # is used -- the same default join_txt() writes the file with.
    with open('all_result.txt', 'r') as corpus:
        content = corpus.read()

    # Segment with jieba, keeping only tokens longer than one character.
    segment = [seg for seg in jieba.cut(content)
               if len(seg) > 1 and seg != '\r\n']

    # Remove stop words (text de-noising).  The stop word list holds one
    # word per line; quoting=3 (csv.QUOTE_NONE) keeps quote characters
    # literal.
    words_df = pandas.DataFrame({'segment': segment})
    stopwords = pandas.read_csv("stopword.txt", index_col=False,
                                quoting=3, sep='\t', names=['stopword'], encoding="utf8")
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Frequency table, most frequent first.  groupby().size() replaces the
    # dict-renaming form of agg(), which pandas 1.0 removed.
    words_stat = words_df.groupby('segment').size().reset_index(name='count')
    words_stat = words_stat.sort_values(by="count", ascending=False)

    # fit_words() expects a dict mapping word -> frequency.
    top_words = words_stat.head(990000)
    frequencies = dict(zip(top_words['segment'], top_words['count']))

    # Custom word cloud shape taken from the background picture.
    # NOTE(review): `imread` comes from the file-level `scipy.misc` import,
    # which newer SciPy releases no longer provide -- confirm the installed
    # version still has it.
    bimg = imread('mangguo.png')
    wordcloud = WordCloud(background_color="white", mask=bimg, font_path='msyh.ttf')
    wordcloud = wordcloud.fit_words(frequencies)

    # Take the colour values from the background picture.
    bimgColors = ImageColorGenerator(bimg)
    plt.axis("off")
    plt.imshow(wordcloud.recolor(color_func=bimgColors))
    wordcloud.to_file("ciyun.png")

def main():
    """Merge the corpus files, then render the word cloud, printing progress."""
    join_txt()
    # join_txt() closes all_result.txt before it returns, so the merged text
    # is already on disk; no delay is needed before reading it back.
    print('txt 文件整合完成!----')
    make_pic()
    print(' 词云 图片生成 完成-----ciyun.png ')


if __name__ == '__main__':
    main()

'''

在 Windows 上,wordcloud 包需要手动安装:可以到 https://www.lfd.uci.edu/~gohlke/pythonlibs/
下载与自己 Python 版本对应的 whl 文件,然后用 pip 安装。

需要注意:

wordcloud = wordcloud.fit_words(dict(words_stat.head(990000).itertuples(index=False)))
这里的 fit_words 接受的参数必须是 dict 类型(词 -> 词频),因此要先把词频表转换成字典再传入。

猜你喜欢

转载自blog.51cto.com/13000661/2116624
今日推荐