def cut(path='d:/style.txt', encoding=None):
    """Count how often each word occurs in a lyrics text file.

    The text is lower-cased, common punctuation marks are replaced by
    spaces, and the result is split on whitespace.

    Args:
        path: Text file to read. Defaults to the originally hard-coded
            location ``d:/style.txt``.
        encoding: Encoding passed to ``open``. ``None`` keeps the platform
            default, which is what the function used before.

    Returns:
        A dict mapping each word to its number of occurrences.

    Raises:
        OSError: If the file cannot be opened.
    """
    from collections import Counter

    # Read the whole file at once; newlines are treated as ordinary
    # whitespace by split() below, so the last word of one line is no longer
    # glued to the first word of the next line.
    with open(path, 'r', encoding=encoding) as f:
        text = f.read()

    # Strings are immutable: calling text.replace(...) without using the
    # result changes nothing. translate() swaps every punctuation mark for a
    # space in a single pass. Both ASCII and full-width marks are covered.
    punctuation = ',.?!,。?!'
    text = text.translate(str.maketrans(dict.fromkeys(punctuation, ' ')))

    # split() without an argument collapses runs of whitespace, so no empty
    # strings end up being counted as words.
    words = text.lower().split()

    # Counter tallies in one pass instead of calling list.count per word.
    return dict(Counter(words))


def sort_dic(dic):
    """Return the ``(word, count)`` pairs of *dic*, most frequent first.

    Args:
        dic: Mapping of word to occurrence count.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in descending
        order. Pairs with equal counts keep the order they have in *dic*.
    """
    # sorted() accepts any iterable; the key picks the count, i.e. the
    # second element of each (word, count) tuple.
    return sorted(dic.items(), key=lambda item: item[1], reverse=True)


if __name__ == '__main__':
    dic = cut()
    print(sort_dic(dic))
英文通过空格分割,中文使用jieba分词并加载停用词表
二、中文分词
1.准备utf-8编码的文本文件file
2.通过文件读取字符串 str
3.对文本进行预处理
4.分解提取单词 list
5.单词计数字典 set , dict
6.按词频排序 list.sort(key=)
7.排除语法型词汇,代词、冠词、连词等无语义词
8.输出TOP(20)
def ch_cut(file, stop, encoding='utf-8'):
    """Return the word frequencies of a Chinese text, most frequent first.

    The text is segmented with ``jieba.cut``; tokens listed in the stop-word
    file and tokens consisting only of whitespace are dropped before
    counting.

    NOTE: ``jieba`` must already be imported at module level; the import is
    not visible in this part of the file.

    Args:
        file: Path of the text file to analyse.
        stop: Path of the stop-word file, one stop word per line.
        encoding: Encoding used for both files. Defaults to UTF-8, which is
            the encoding the accompanying notes ask the input to be saved in.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in descending
        order; slice it (e.g. ``[:20]``) to get the top entries.

    Raises:
        OSError: If either file cannot be opened.
    """
    from collections import Counter

    with open(file, encoding=encoding) as text_file:
        text = text_file.read()

    # A set gives O(1) membership tests; a list would be scanned once per
    # token.
    with open(stop, encoding=encoding) as stop_file:
        stopwords = {line.strip() for line in stop_file}

    # The tokenizer also yields spaces and line breaks as tokens; those are
    # not words and would otherwise dominate the ranking.
    words = [
        token for token in jieba.cut(text)
        if token.strip() and token not in stopwords
    ]

    # most_common() with no argument returns every pair, highest count first.
    return Counter(words).most_common()


if __name__ == '__main__':
    print(ch_cut('d:/xiaoshuo.txt', 'd:/stopword.txt')[:20])