Computing word frequencies of Chinese text in a txt file with jieba segmentation

# -*- coding: utf-8 -*-
"""
Created on Tue Feb 25 17:37:55 2020

@author: weisssun
"""

import jieba
import re
import csv
from collections import Counter

with open(r'D:\Python\dict\dict\stopwords.txt', encoding='utf-8') as f:
    stopw = [line.strip() for line in f]
#read the stop-word dictionary
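
# Optional sketch (an addition, not in the original script): if the corpus
# contains domain-specific terms that jieba tends to split apart, a custom
# user dictionary can be loaded before segmentation with jieba.load_userdict.
# The path below is a placeholder, not a file from the original post.
import os
if os.path.exists(r'D:\Python\dict\userdict.txt'):
    jieba.load_userdict(r'D:\Python\dict\userdict.txt')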

cut_words = ''
for line in open(r'D:\Python\family.txt', encoding='utf-8'):
    #read the txt file to be segmented
    line = line.strip()
    line = re.sub(r'[A-Za-z0-9:·—,。“ ”]', '', line)  # drop letters, digits and common punctuation
    seg_list = jieba.cut(line, cut_all=True)  # full-mode segmentation
    cut_words += ' '.join(seg_list) + ' '  # keep a separator so words from adjacent lines do not merge
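
# A hedged aside (not from the original post): cut_all=True is jieba's full
# mode, which can emit overlapping fragments; the default precise mode is
# often preferred when counting word frequencies.
print('/'.join(jieba.cut('我来到北京清华大学')))                # precise mode
print('/'.join(jieba.cut('我来到北京清华大学', cut_all=True)))  # full mode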
all_words=cut_words.split()
new_words = [w for w in all_words if w not in stopw]
#remove stop words from the segmented words
#print(all_words)
#word_dict = Counter(all_words)
print(new_words)
word_dict = Counter(new_words)
print(word_dict)
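
# Optional sketch (an addition, not in the original post): Counter.most_common
# returns the highest-frequency words directly, e.g. the top 20.
print(word_dict.most_common(20))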


with open(r'D:\Python\family_words.csv', 'w', newline='', encoding='gbk') as f:  # write the word-frequency results to a csv file
    writer = csv.writer(f)            
    for k, v in word_dict.items():
        writer.writerow([k, v])
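
# A hedged note (an assumption, not from the original post): gbk cannot encode
# every token jieba may produce, so the write above can raise
# UnicodeEncodeError. One alternative sketch is utf-8-sig, which Excel also
# opens correctly; the output path here is a placeholder.
with open(r'D:\Python\family_words_utf8.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    for k, v in word_dict.most_common():  # sorted by frequency, highest first
        writer.writerow([k, v])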


Reposted from blog.csdn.net/Sun_Weiss/article/details/104616804