Word Frequency Analysis of Shakespeare's Collected Works in Python

Conclusion: reading ten thousand volumes is no match for one palm-sized dictionary!

The code is simple: read in the corpus, expand contractions, normalize and tokenize the text, count word frequencies with collections.Counter, then visualize the results with pyecharts:

import collections
import re
# pyecharts 0.x API; in pyecharts 1.x+ these classes moved to pyecharts.charts and the call style changed
from pyecharts import Pie,Bar,WordCloud,Page
import webbrowser

# Read in the Shakespeare corpus text
with open("shakespeare0.txt","r",encoding="utf-8") as f:
    t=f.read()

# Expand common contractions back into their full words

# to find the 's following the pronouns; re.I means ignore case
pat_is = re.compile(r"(it|he|she|that|this|there|here)('s)", re.I)
# to find the 's following other letters (possessives)
pat_s = re.compile(r"(?<=[a-zA-Z])'s")
# to find the ' following words ending in s
pat_s2 = re.compile(r"(?<=s)'s?")
# to find the abbreviation of not
pat_not = re.compile(r"(?<=[a-zA-Z])n't")
# to find the abbreviation of would
pat_would = re.compile(r"(?<=[a-zA-Z])'d")
# to find the abbreviation of will
pat_will = re.compile(r"(?<=[a-zA-Z])'ll")
# to find the abbreviation of am
pat_am = re.compile(r"(?<=[Ii])'m")
# to find the abbreviation of are
pat_are = re.compile(r"(?<=[a-zA-Z])'re")
# to find the abbreviation of have
pat_ve = re.compile(r"(?<=[a-zA-Z])'ve")

t = pat_is.sub(r"\1 is", t)
t = pat_s.sub("", t)
t = pat_s2.sub("", t)
t = pat_not.sub(" not", t)
t = pat_would.sub(" would", t)
t = pat_will.sub(" will", t)
t = pat_am.sub(" am", t)
t = pat_are.sub(" are", t)
t = pat_ve.sub(" have", t)
t = t.replace('\'', ' ')
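# A quick illustrative check of the rules above (the sample sentence is made up, not taken from the corpus):
#   "She's sure it's the king's men; they'll march, but I'm not afraid, we've won."
# becomes, after the substitutions,
#   "She is sure it is the king men; they will march, but I am not afraid, we have won."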

# Convert everything to lowercase
t=t.lower()
# Replace all non-word characters (punctuation, newlines, etc.) with a single space
pattern=re.compile(r"\W+")
t=re.sub(pattern," ",t)
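# For example (illustrative only): re.sub(r"\W+", " ", "o, that this too too solid flesh would melt!")
# returns "o that this too too solid flesh would melt " -- punctuation collapses to single spaces.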


# Split the text into words on whitespace; split() with no argument also drops empty strings
ts=t.split()

# Total number of words (with duplicates)
word_count=len(ts)

print("total words:",word_count)

# Count word frequencies with Python's built-in collections.Counter
tc=collections.Counter(ts)
# Number of distinct words
word_count_unique=len(tc)
# Percentage of distinct words
word_unique_percent=word_count_unique/word_count*100
other_percent=100-word_unique_percent
other_word_count=word_count-word_count_unique

print("unique word count:",word_count_unique)

# The n most frequent words (here n = 100)
most_common_words=tc.most_common(100)

print("most common words:",most_common_words)

# Data visualization
page=Page()
html_filename="shakespeare_word_count.html"
# Bar chart: total vs. distinct word counts
bar=Bar("Word statistics","Shakespeare's works")
bar.add("word count",["total words (with duplicates)","distinct words"],[word_count,word_count_unique],is_label_show=True)
page.add_chart(bar)
# Pie chart: share of distinct words
pie=Pie("Word counts")
pie.add("",["","distinct words"],[other_word_count,word_count_unique],is_label_show=False)
#pie.print_echarts_options()
pie._option["color"]=["lightgreen","red"]
page.add_chart(pie)

x=[]
y=[]
for wc in most_common_words:
	x.append(wc[0])
	y.append(wc[1])
	
# Word-frequency bar chart: ranks 1-20
bar0=Bar("Word frequencies","top 20",width=2000)
bar0.add("frequency",x[:20],y[:20],is_label_show=True)
page.add_chart(bar0)

# Word-frequency bar chart: ranks 21-40
bar1=Bar("Word frequencies","ranks 21-40",width=2000)
bar1.add("frequency",x[20:40],y[20:40],is_label_show=True)
page.add_chart(bar1)



# Word cloud of the 100 most frequent words
word_cloud=WordCloud(width=1000,height=1000)
word_cloud.add("high-frequency words",x,y,word_size_range=[10,200])
page.add_chart(word_cloud)
# Render the pyecharts page to an HTML file and open it in the browser
page.render(html_filename)
webbrowser.open(html_filename)

Reposted from blog.csdn.net/MAILLIBIN/article/details/86507976