Python 实现中文分词

正向最大匹配分词FMM

def fmm_line(line, max_len, dic):
    """Segment *line* with forward maximum matching (FMM).

    Starting at the left end, take the longest candidate of at most
    ``max_len`` characters and drop characters from its right side until the
    candidate is found in ``dic`` or only one character is left. That
    candidate becomes the next token and scanning resumes right after it.

    Args:
        line: The text to segment.
        max_len: Maximum token length to try; must be at least 1.
        dic: Container of known words, tested with ``in``.

    Returns:
        The tokens in left-to-right order. A single character that is not in
        ``dic`` is still emitted as a one-character token.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    # A zero-width window yields an empty candidate that never consumes any
    # input, so the scan below would never terminate.
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    seg_list = []
    total = len(line)
    start = 0
    # Walk an index over the text instead of re-slicing the remainder on
    # every iteration.
    while start < total:
        end = min(start + max_len, total)
        while end - start > 1 and line[start:end] not in dic:
            end -= 1
        seg_list.append(line[start:end])
        start = end
    return seg_list

逆向最大匹配分词BMM

def bmm_line(line, max_len, dic):
    """Segment *line* with backward maximum matching (BMM).

    Starting at the right end, take the longest candidate of at most
    ``max_len`` characters and drop characters from its left side until the
    candidate is found in ``dic`` or only one character is left. That
    candidate becomes the next token and scanning resumes just before it.

    Args:
        line: The text to segment.
        max_len: Maximum token length to try; must be at least 1.
        dic: Container of known words, tested with ``in``.

    Returns:
        The tokens in left-to-right order. A single character that is not in
        ``dic`` is still emitted as a one-character token.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    # With slice arithmetic, ``line[-0:]`` is the whole string, so a zero
    # max_len would silently remove the window limit; negative values would
    # produce empty tokens. Reject both explicitly.
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    found = []
    end = len(line)
    while end > 0:
        start = max(end - max_len, 0)
        while end - start > 1 and line[start:end] not in dic:
            start += 1
        found.append(line[start:end])
        end = start
    # Tokens were collected right-to-left; reverse once rather than
    # inserting at the front of the list on every step.
    found.reverse()
    return found

双向最大匹配法（BM）

def bm(line, max_len, dic):
    """Segment *line* with bidirectional matching.

    Runs both the forward and the backward maximum-matching segmenters and
    picks one of the two results:

    * if the token counts differ, the result with fewer tokens;
    * otherwise the forward result when it has strictly fewer
      single-character tokens, and the backward result in every other case
      (including a tie).

    Args:
        line: The text to segment.
        max_len: Maximum token length passed to both segmenters.
        dic: Container of known words passed to both segmenters.

    Returns:
        The chosen token list.
    """
    forward = fmm_line(line, max_len, dic)
    backward = bmm_line(line, max_len, dic)

    # Different token counts: the shorter segmentation wins outright.
    if len(forward) != len(backward):
        if len(forward) < len(backward):
            return forward
        return backward

    # Same token count: compare how many one-character tokens each produced.
    forward_singles = sum(1 for word in forward if len(word) == 1)
    backward_singles = sum(1 for word in backward if len(word) == 1)
    if forward_singles < backward_singles:
        return forward
    return backward

调用方式

def main():
    """Print the backward maximum-matching segmentation of a sample sentence."""
    sentence = '我要去北京玩'
    vocabulary = ['我', '要', '去', '北京', '玩', '京玩']
    window = 5
    segments = bmm_line(sentence, window, vocabulary)
    print(segments)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
发布了 13 篇原创文章 · 获赞 8 · 访问量 9302

猜你喜欢

转载自blog.csdn.net/qq_43481201/article/details/103717971