Chinese word segmentation and dictionary building

1. Dataset preparation

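The original screenshots are omitted here. Judging from the code in the next section, the dataset is assumed to be a root folder whose sub-folders each contain plain-text .TXT files. A minimal sketch for checking that layout (using the same rootdir path as the main script below) could be:

import fnmatch
import os

rootdir = r"C:\Users\fyyuj\Desktop\实验室练习\大作业\dataset"  # same root directory as in the main script

# Print every sub-folder and how many .txt files it contains.
for sub in os.listdir(rootdir):
    sub_path = os.path.join(rootdir, sub)
    if not os.path.isdir(sub_path):
        continue
    txt_files = [name for name in os.listdir(sub_path) if fnmatch.fnmatch(name.lower(), '*.txt')]
    print(sub, len(txt_files))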

2. Code implementation

import csv
import fnmatch
import os
import re
from collections import OrderedDict
import jieba

text_list = []


def word_frequency_analysis(path):
    files = os.listdir(path)  # files is a list of every file name under path
    for filename in files:  # iterate over all files in the folder
        if not fnmatch.fnmatch(filename.lower(), '*.txt'):  # keep only .txt files (case-insensitive)
            continue
        txt_path = os.path.join(path, filename)
        with open(txt_path, 'rb') as f:
            txt_content = f.read()  # read the raw bytes; jieba decodes them itself

        text_cutted = jiebaCutText(txt_content)  # segment the text with jieba
        text_list.append(text_cutted)  # append the segmented result to the global list


def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=True)  # full mode: list every word jieba can find
    liststr = '/'.join(seg_list)  # join the tokens with '/'
    return liststr  # at this point the result still contains punctuation
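
# A quick way to see what full mode does, using the demo sentence from jieba's own
# documentation (not from this dataset): cut_all=True returns every word jieba can
# find, including overlapping ones, while cut_all=False returns one best segmentation.
#
#     print('/'.join(jieba.cut("我来到北京清华大学", cut_all=True)))
#     print('/'.join(jieba.cut("我来到北京清华大学", cut_all=False)))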


def clearText(text):
    # keep only tokens that are longer than one character and contain Chinese characters
    mywordlist = []
    for myword in text.split('/'):
        if len(myword.strip()) > 1 and contain_zh(myword.strip()):
            mywordlist.append(myword.strip())
    return '/'.join(mywordlist)


def contain_zh(word):
    # \u4e00-\u9fa5 is the basic CJK Unified Ideographs range
    zh = re.compile(u'[\u4e00-\u9fa5]+')
    match = zh.search(word)
    return match


def countwords(text, counter_file):
    # count how many times each token occurs
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1

    # sort by frequency and write the counts to a CSV file
    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())


if __name__ == '__main__':
    rootdir = r"C:\Users\fyyuj\Desktop\实验室练习\大作业\dataset"

    subdirs = os.listdir(rootdir)  # list every directory and file under the root folder

    for i in range(0, len(subdirs)):
        path = os.path.join(rootdir, subdirs[i])
        word_frequency_analysis(path)

    newfile = open(r'test1.txt', 'w', encoding='utf-8')
    for i in range(0, len(text_list)):
        text_clear = clearText(text_list[i])  # strip punctuation and non-Chinese tokens
        newfile.write(text_clear)
    newfile.close()

    with open('test1.txt', 'r', encoding='utf-8') as f:
        contents = f.read()
        contents = contents.split('/')

    with open('output.txt', 'w', encoding='utf-8') as f:
        dic = dict(enumerate(contents))  # number each token to build the word dictionary
        f.write(str(dic))
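
Note that the countwords helper is defined above but never called in the main block. A minimal sketch of how it could be wired in (the word_count.csv file name is my own choice, not from the original post):

# Hypothetical use of countwords(): dump per-word frequencies of the cleaned text
# to a CSV file. 'word_count.csv' is an assumed name, not part of the original script.
with open('test1.txt', 'r', encoding='utf-8') as f:
    cleaned = f.read()
countwords(cleaned, 'word_count.csv')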

3. Result fragment

(Screenshot of a fragment of the result, the numbered word dictionary written to output.txt, omitted.)

Source: blog.csdn.net/yjh_SE007/article/details/108297266