1. Data set preparation
2. Code implementation
The script below walks each subdirectory of the data set root, segments every .txt file with jieba (full mode), keeps only tokens longer than one character that contain Chinese, writes the '/'-joined result to test1.txt, and finally dumps an index-to-word dictionary to output.txt.
import csv
import fnmatch
import os
import re
from collections import OrderedDict
import jieba
text_list = []

def word_frequency_analysis(path):
    files = os.listdir(path)  # every entry inside path
    for filename in files:  # walk all files in the folder
        if not fnmatch.fnmatch(filename.lower(), '*.txt'):  # keep only .txt files, case-insensitively
            continue
        txt_path = os.path.join(path, filename)
        with open(txt_path, 'rb') as f:
            txt_content = f.read()  # raw bytes; jieba decodes UTF-8 input itself
        text_cutted = jiebaCutText(txt_content)  # segment with jieba
        text_list.append(text_cutted)  # collect the segmented text
def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=True)  # full-mode segmentation
    liststr = '/'.join(seg_list)  # separate tokens with '/'
    return liststr  # the result still contains punctuation
def clearText(text):
    mywordlist = []
    for myword in text.split('/'):
        # keep only tokens longer than one character that contain Chinese
        if len(myword.strip()) > 1 and contain_zh(myword.strip()):
            mywordlist.append(myword.strip())
    return '/'.join(mywordlist)
def contain_zh(word):
    zh = re.compile(u'[\u4e00-\u9fa5]+')  # CJK Unified Ideographs start at \u4e00, not \u4200
    match = zh.search(word)
    return match
def countwords(text, counter_file):
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1
    # sort by count (ascending) before writing word,count rows
    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:  # utf-8-sig writes the BOM; newline='' avoids blank CSV rows
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())
if __name__ == '__main__':
    rootdir = r"C:\Users\fyyuj\Desktop\实验室练习\大作业\dataset"
    subdirs = os.listdir(rootdir)  # every directory and file under the data set root ('subdirs' avoids shadowing the built-in 'list')
    for name in subdirs:
        path = os.path.join(rootdir, name)
        word_frequency_analysis(path)
    with open(r'test1.txt', 'w', encoding='utf-8') as newfile:
        for text_cutted in text_list:
            text_clear = clearText(text_cutted)
            newfile.write(text_clear)
    with open('test1.txt', 'r', encoding='utf-8') as f:
        contents = f.read()
    contents = contents.split('/')
    with open('output.txt', 'w', encoding='utf-8') as f:
        dic = dict(enumerate(contents))  # map each position index to its word
        f.write(str(dic))
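Note that countwords is defined but never called in the main block, so no frequency CSV is actually produced. A minimal sketch of how it could be invoked on the cleaned text, assuming a hypothetical output file name word_counts.csv:

# Usage sketch (assumption: 'word_counts.csv' is a hypothetical file name;
# countwords and test1.txt come from the script above).
with open('test1.txt', 'r', encoding='utf-8') as f:
    cleaned = f.read()
countwords(cleaned, 'word_counts.csv')  # writes one word,count row per distinct token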