Python-文本词频统计

文本准备

英文文本(hamlet分析词频)：https://python123.io/resources/pye/hamlet.txt

中文文本(三国演义分析人物)：https://python123.io/resources/pye/threekingdoms.txt

Hamlet英文词频统计

#CalHamletV1.py

#获取文本内容并去噪及归一化
def getText(path=r"D:\Personal\Desktop\hamlet.txt"):
    """Read a text file and return it lower-cased with punctuation blanked.

    Args:
        path: Location of the text file. Defaults to the local copy of
            hamlet.txt that this script was written against.

    Returns:
        The file content in lower case, with every character of the
        punctuation set below replaced by a single space.
    """
    # The default path is a raw string: "\P", "\D" and "\h" are invalid
    # escape sequences in a normal literal and trigger a warning on modern
    # Python. The context manager closes the file even if read() raises.
    with open(path, "r") as f:
        text = f.read().lower()
    # Turn punctuation into spaces so split() later yields bare words
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
        text = text.replace(ch, " ")
    return text

hamletTxt = getText()

# Split the normalized text into a list of words
words = hamletTxt.split()

# Frequency table: the key is a word, the value is how often it occurs
counts = {}

# Walk the word list and tally every occurrence in counts
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Convert the dict_items view into a list so it can be sorted in place
items = list(counts.items())

# Sort by count; reverse=True is descending, reverse=False is ascending
items.sort(key=lambda x: x[1], reverse=True)

# Slice instead of indexing over range(10): a text with fewer than 10
# distinct words would otherwise raise IndexError
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))

#运行结果
the        1138
and         965
to          754
of          669
you         550
i           542
a           542
my          514
hamlet      462
in          436

知识点：字典，列表，元组的相关操作

1. 字典的items()函数返回值类型是 'dict_items'，需转化为列表使用

# Named "capitals" rather than "dict" so the built-in dict type is not shadowed
capitals = {"中国": "北京", "美国": "华盛顿", "法国": "巴黎"}
a = capitals.items()
print("dict.items()的返回值是:{0}".format(a))
# Output
# dict.items()的返回值是:dict_items([('中国', '北京'), ('美国', '华盛顿'), ('法国', '巴黎')])

print("dict.items()返回值的类型是:{0}".format(type(a)))
# Output
# dict.items()返回值的类型是:<class 'dict_items'>

# Materialize the view as a list of (key, value) tuples
b = list(a)
print("b={0}".format(b))
# Output
# b=[('中国', '北京'), ('美国', '华盛顿'), ('法国', '巴黎')]

2.列表排序

#语法
list.sort(key=None, reverse=False);

cmp -- 仅Python 2支持的可选参数，如果指定了该参数会使用该参数的方法进行排序；Python 3已移除该参数，需要时可用functools.cmp_to_key把比较函数转换为key
key -- 主要是用来比较的元素，只有一个参数，具体的函数的参数就是取自于可迭代对象中，指定可迭代对象中的一个元素来进行排序
reverse -- 排序规则， reverse = True,降序， reverse = False升序（默认）

eg1:
aList = [123, 'Google', 'Runoob', 'Taobao', 'Facebook']
# Python 3 cannot order int against str (TypeError), so compare every element
# by its string form; digits sort before letters, hence 123 comes first
aList.sort(key=str)
print("List : ", aList)
# Output
# List :  [123, 'Facebook', 'Google', 'Runoob', 'Taobao']

eg2:降序输出列表
# Start from the vowels in arbitrary order
vowels = list('eauoi')

# Reorder the list in place, largest first
vowels.sort(reverse=True)

# Show the result
print('降序输出:', vowels)

# Output
# 降序输出: ['u', 'o', 'i', 'e', 'a']

eg3:通过指定列表中的元素排序来输出列表
def takeSecond(elem):
    """Return the item at index 1 of ``elem``; used as the sort key below."""
    second = elem[1]
    return second


# Pairs to be ordered by their second component
random = [(2, 2), (3, 4), (4, 1), (1, 3)]

# In-place sort using takeSecond as the key function
random.sort(key=takeSecond)

# Display the sorted list
print('排序列表：', random)

# Output
# 排序列表： [(4, 1), (2, 2), (1, 3), (3, 4)]

3.lambda表达式

#lambda用于定义简单的，能够在一行内表示的函数
-------------------------------------
|<函数名> = lambda <参数>:<表达式>    
|                                   
|等价于                             
|
|def <函数名>(<参数>):
|    <函数体>
|    return <返回值>
-------------------------------------
def takeSecond(elem):
    return elem[1];
<=>(等价于)
takeSecond = lambda elem:elem[1];

eg4:通过lambda表达式指定列表中的元素排序来输出列表
# Pairs to be ordered by their second component
random = [(2, 2), (3, 4), (4, 1), (1, 3)]

# Sort in place by the second component, with the key given as an inline lambda
random.sort(key=lambda pair: pair[1])

# Display the sorted list
print('排序列表：', random)

# Output
# 排序列表： [(4, 1), (2, 2), (1, 3), (3, 4)]

《三国演义》人物出场统计

#CalThreekingdoms.py
import jieba

# Read the whole novel; the context manager closes the file after reading
with open("D:\\Personal\\Desktop\\threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# Words removed from the tally before ranking
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此", "商议", "如何", "主公", "军士", "左右", "军马", "引兵"}

# Alternative names mapped to the single name they are counted under
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

# Segment the text into a list of words with jieba
words = jieba.lcut(txt)
counts = {}

for word in words:
    # Single-character tokens are skipped
    if len(word) == 1:
        continue
    # Fold aliases into one name; any other word is counted as itself
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# pop() with a default instead of del: del raises KeyError when an excluded
# word never occurs in the text
for word in excludes:
    counts.pop(word, None)

items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slice rather than index over range(20) so fewer than 20 entries is not an error
for word, count in items[:20]:
    print("{0:<10}{1:>5}".format(word, count))
#运行结果
曹操         1451
孔明         1383
刘备         1252
关羽          784
张飞          358
吕布          300
赵云          278
次日          271
大喜          268
孙权          264
天下          255
东吴          251
于是          250
今日          243
不敢          239
魏兵          233
陛下          223
一人          221
都督          221
司马懿         221

本文仅为学习Python记录，资料来源于中国大学MOOC《Python语言程序设计》—嵩天

Python-文本词频统计

猜你喜欢