中英文分词生成词频

def cut(path='d:/style.txt', encoding=None):
    """Count how often each word occurs in a whitespace-delimited text file.

    The text is read line by line, a fixed set of punctuation marks is
    replaced with spaces, everything is lower-cased, and the result is
    split on whitespace.

    Args:
        path: File to read. Defaults to the location the original script
            hard-coded.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, as before.

    Returns:
        A dict mapping each word to the number of times it occurs. An
        empty or punctuation-only file yields an empty dict.
    """
    from collections import Counter  # local import: no import block in view

    # Join the stripped lines with a space so that the last word of one
    # line does not fuse with the first word of the next.
    with open(path, 'r', encoding=encoding) as f:
        text = ' '.join(line.strip() for line in f)

    # Strings are immutable: the translated copy has to be kept (the
    # original called str.replace and threw the result away).
    # ASCII and full-width forms are both treated as separators.
    punctuation = ',,.。??!!'
    text = text.translate(str.maketrans(dict.fromkeys(punctuation, ' ')))

    # split() without an argument collapses runs of whitespace, so no
    # empty-string "words" end up in the counts.
    words = text.lower().split()
    return dict(Counter(words))
def sort_dic(dic):
    """Return the (key, value) pairs of *dic* ordered by value, largest first.

    Pairs with equal values keep the order in which the dict yields them.
    """
    pairs = list(dic.items())
    # Sort on the second element of each tuple, i.e. the count.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

if __name__ == '__main__':
    # Count the words in the lyrics file and print them, most frequent first.
    print(sort_dic(cut()))

小结:英文文本按空格分词;中文文本使用 jieba 分词,并加载停用词表过滤无语义的词。

二、中文分词

1.准备utf-8编码的文本文件file

2.通过文件读取字符串 str

3.对文本进行预处理

4.分解提取单词 list

5.用字典统计每个单词出现的次数(set 去重,dict 计数)

6.按词频从高到低排序 list.sort(key=..., reverse=True)

7.排除语法型词汇,代词、冠词、连词等无语义词

8.输出TOP(20)

def ch_cut(file, stop, encoding='utf-8'):
    """Count token frequencies in a text file, skipping stop words.

    The file's text is passed to ``jieba.cut`` and every token it yields
    is counted unless it appears in the stop-word file.

    Args:
        file: Path of the text file to analyse.
        stop: Path of the stop-word file, one entry per line; surrounding
            whitespace on each line is stripped.
        encoding: Encoding used to read both files. Defaults to UTF-8,
            which is what the accompanying instructions ask the input to
            be saved as.

    Returns:
        A list of ``(token, count)`` tuples sorted by count, highest
        first. Tokens with equal counts appear in the order they were
        first seen.
    """
    from collections import Counter  # local import: no import block in view

    with open(file=file, encoding=encoding) as f:
        text = f.read()

    # A set gives constant-time membership tests while filtering.
    with open(file=stop, encoding=encoding) as stopword:
        stopwords = {line.strip() for line in stopword}

    # Single pass over the tokens instead of one list.count() per unique token.
    counts = Counter(tok for tok in jieba.cut(text) if tok not in stopwords)
    return counts.most_common()

if __name__ == '__main__':
    # Print the 20 most frequent tokens of the novel.
    print(ch_cut(file='d:/xiaoshuo.txt', stop='d:/stopword.txt')[:20])

猜你喜欢

转载自www.cnblogs.com/smallgrass/p/9789748.html