Text word frequency statistics
I. Overview
1. Requirements: an article which appeared in the word? Which words appear most frequently?
2. First, know the English word frequency statistics text and Chinese text are different
two, "of HAMLET"
1. noise processing: extracting words, removing unnecessary other things.
2. extracts words, on an empty Geqie Split points, forming a listing
3. words and corresponding statistical word frequency, using the dictionary
4. Press key word frequency: sorting number appears, listing using Method Sort
5. The output
Hamlet
def gettext():
text = open("hamlet.txt",'r').read()
text = text.lower()
for ch in '"#$%^&*()_+-,./<>=@{}[]\~\'':
text = text.replace(ch,'')
return text
hamlettext = gettext()
words = hamlettext.split()
counts = {}
for word in words:
counts[word]=counts.get(word,0)+1
items = list(counts.items())
items.sort(key = lambda x:x[1],reverse = True)
for i in range (20):
word,count = items[i]
print("{0:<10}{1:>5}".format(word,count))
Third, the "Three Kingdoms" names appearances statistics
1. First Edition
#三国演义
#first,get words;second,count the times word appear in text;third,print the top 20
import jieba
txt = open('三国演义.txt','r',encoding='utf-8').read()
words = jieba.lcut(txt)
counts = {}
for word in words:
if len(word)==1:
continue
else:
counts[word] = counts.get(word,0) + 1
items = list(counts.items())
items.sort(key = lambda x:x[1],reverse = True)
for i in range (10):
word,times = items[i]
print('{0:<10}{1:>5}'.format(word,times))
Found the problem:
Ming and Ming said should be counted as a person
Jingzhou names not
improve:
delete the group from the list of non-human nouns
in the establishment of a set of statistical appearances of the words, the Ming and Ming said, counted a times.
2. second edition
import jieba
txt =open('D:/pythonfiles/三国演义.txt','r',encoding='utf-8').read()
excludes = {'将军','却说','荆州','二人','不可','不能','如此','如何','军士','商议','左右','军马','次日','引兵','大喜','天下','东吴','于是','今日','不敢'}
words = jieba.lcut(txt)
counts = {}
for word in words:
if len (word) == 1 :
continue
elif word == '诸葛亮' or word == '孔明曰':
reword = '孔明'
elif word == '玄德' or word == '玄德曰':
reword = '刘备'
elif word == '关公' or word == '云长':
reword = '关羽'
elif word == '孟德' or word == '丞相':
reword = '曹操'
else :
reword = word
counts[reword]=counts.get(reword,0)+1
for word in excludes:
del counts[word]
items = list(counts.items())
items.sort(key= lambda x:x[1],reverse = True)
for i in range(20):
word, count = items[i]
print('{:<10}{:>5}'.format(word,count))
Still the old problems, in accordance with the improved method can be further optimized.
Three Kingdoms TOP20
import jieba
txt = open('三国演义.txt','r',encoding='utf-8').read()
txt = jieba.lcut(txt)
counts = {}
def countword(a):
global counts
counts[a] = counts.get(a,0)+1
for word in txt:
if len(word) == 1:
continue
elif word == '孔明曰' or word == '诸葛亮' :
word = '孔明'
countword(word)
elif word == '玄德' or word =='玄德曰' or word == '主公' or word == '先主':
word = '刘备'
countword(word)
elif word == '丞相' or word == '孟德':
word ='曹操'
countword(word)
elif word == '关公' or word == '云长':
word = '关羽'
countword(word)
elif word == '都督':
word = '周瑜'
countword(word)
elif word == '后主':
word = '刘禅'
countword(word)
else:
countword(word)
excluse = ['二人','不可','荆州','却说','不能','将军','如此','军士','如何','商议','左右','次日','引兵','大喜','天下','东吴','军马','于是',
'今日','不敢','陛下','魏兵','人马','一人','不知','汉中','众将','只见','蜀兵','大叫','上马','此人','太守','天子','背后','后人','城中'
,'一面','何不','忽报','大军','先生','何故','然后','先锋','夫人','不如','赶来','原来','令人','江东','徐州','正是','忽然','下马','喊声'
,'成都','因此','百姓','未知','大败','一军','大事','之后','不见','起兵','接应','军中','进兵','引军','大惊','可以']
for i in excluse:
del counts[i]
items = list(counts.items())
items.sort(key = lambda x:x[1],reverse = True)
for i in range(20):
item ,times = items[i]
print('{0:<10}{1:>5}'.format(item,times))