python 词云实例

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 23 16:03:41 2019

@author: Administrator
"""

import os
import sys
import docx
import jieba
import jieba.posseg as pseg
import re
import collections
from PIL import Image
import numpy as np
from docx import Document
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import wordcloud

# Work inside the project folder so the relative file names below resolve.
# A raw string is used because '\w' is not a valid escape sequence and newer
# Python versions warn about it in an ordinary string literal.
os.chdir(r'E:\wordcloud')
n = 0  # not referenced anywhere else in the visible script
wd_lists = ''  # not referenced anywhere else in the visible script

# Read the whole document and dump its text into words.txt.
document = Document('bid_document.docx')
with open('words.txt', 'w', encoding='utf-8') as f:
    for paragraph in document.paragraphs:
        # split() drops all whitespace and writelines() adds no separators,
        # so the file ends up as one continuous, whitespace-free string.
        f.writelines(paragraph.text.split())

# Load the stop-word list (one entry per line). A set is enough because the
# collection is only used for membership tests; the context manager closes
# the file even if reading fails.
with open('chineseStopWords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split('\n'))

# Load the user-defined dictionary for jieba.
# Individual words could be registered instead, e.g. jieba.add_word('石墨烯').
jieba.load_userdict('userdict.txt')

# Read the extracted text back as a list of whitespace-separated chunks.
with open('words.txt', 'r', encoding='utf-8') as f:
    text = f.read().split()

# Segment the text with jieba. Joining the chunks (rather than indexing
# text[0]) covers every chunk and avoids an IndexError when words.txt is
# empty; for a single-chunk file the result is the same string.
segs = jieba.cut("".join(text))

# Keep only tokens of two or more characters that are not stop words.
mytext_list = [
    seg.replace(" ", "")
    for seg in segs
    if seg not in stopwords and len(seg) > 1
]
# Slash-separated form of the tokens, usable with WordCloud.generate().
cloud_text = "/".join(mytext_list)

# Word-frequency statistics: tally every kept token.
word_counts = collections.Counter()
word_counts.update(mytext_list)
# The ten most frequent tokens, printed as a quick sanity check.
word_counts_top10 = word_counts.most_common(10)
print(word_counts_top10)

# Render the word cloud.
# Load the mask image into an array; the context manager closes the file
# handle once the pixel data has been copied.
with Image.open('mask.png') as mask_image:
    mask = np.array(mask_image)

wc = WordCloud(
    background_color="black",                  # canvas background colour
    max_words=30,                              # maximum number of words drawn
    font_path="C:/Windows/Fonts/simfang.ttf",  # font file used to draw the words
    min_font_size=15,
    max_font_size=100,
    # NOTE(review): per the wordcloud docs, width/height are ignored when a
    # mask is supplied (the mask's shape is used) - confirm before relying on it.
    width=400,
    mask=mask,
)
# Alternative: wc.generate(cloud_text) lets WordCloud count the words itself.
wc.generate_from_frequencies(word_counts)

# Recolour the words using the colours of the mask image.
image_colors = wordcloud.ImageColorGenerator(mask)
wc.recolor(color_func=image_colors)

# Save before showing: plt.show() blocks until the window is closed, so
# writing the file first keeps the output even if the display step fails.
wc.to_file("pic.png")

plt.imshow(wc)
plt.axis('off')
plt.show()

python 词云 实例

猜你喜欢