# 正向最大匹配分词FMM
def fmm_line(line, max_len, dic):
    """Segment ``line`` with forward maximum matching (FMM).

    Scanning left to right, the longest prefix of the remaining text that is
    at most ``max_len`` characters long and is contained in ``dic`` is emitted
    as one segment. When no prefix of length >= 2 matches, the single leading
    character is emitted (whether or not it is in ``dic``) so the scan always
    advances.

    Args:
        line: Text to segment.
        max_len: Maximum candidate length in characters; must be >= 1.
        dic: Vocabulary; any container supporting the ``in`` operator.

    Returns:
        List of segments in reading order. Concatenating them reproduces
        ``line``. An empty ``line`` yields an empty list.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        # A zero-width window produces an empty candidate that consumes no
        # text, so the scan below would never terminate.
        raise ValueError("max_len must be >= 1, got {!r}".format(max_len))

    seg_list = []
    start, end = 0, len(line)
    while start < end:
        # Widest window allowed at this position, clipped to the text end.
        stop = min(start + max_len, end)
        # Shrink from the right until a dictionary hit or one character left.
        while stop - start > 1 and line[start:stop] not in dic:
            stop -= 1
        seg_list.append(line[start:stop])
        start = stop
    return seg_list
# 逆向最大匹配分词BMM
def bmm_line(line, max_len, dic):
    """Segment ``line`` with backward (reverse) maximum matching (BMM).

    Scanning right to left, the longest suffix of the remaining text that is
    at most ``max_len`` characters long and is contained in ``dic`` is emitted
    as one segment. When no suffix of length >= 2 matches, the single trailing
    character is emitted (whether or not it is in ``dic``) so the scan always
    advances.

    Args:
        line: Text to segment.
        max_len: Maximum candidate length in characters; must be >= 1.
        dic: Vocabulary; any container supporting the ``in`` operator.

    Returns:
        List of segments in reading order. Concatenating them reproduces
        ``line``. An empty ``line`` yields an empty list.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        # With max_len == 0 a slice like line[-0:] is the whole line, which
        # silently ignores the limit; negative values yield empty tokens.
        raise ValueError("max_len must be >= 1, got {!r}".format(max_len))

    # Segments are found last-to-first; collect them with O(1) appends and
    # reverse once at the end instead of inserting at the front each time.
    found = []
    end = len(line)
    while end > 0:
        # Widest window allowed ending at ``end``, clipped to the text start.
        start = max(end - max_len, 0)
        # Shrink from the left until a dictionary hit or one character left.
        while end - start > 1 and line[start:end] not in dic:
            start += 1
        found.append(line[start:end])
        end = start
    found.reverse()
    return found
# 双向匹配法BM
def bm(line, max_len, dic):
    """Segment ``line`` by bidirectional matching.

    Runs both ``fmm_line`` and ``bmm_line`` and returns one of the two results:

    1. the one with fewer segments;
    2. if they have the same number of segments, the one with fewer
       single-character segments;
    3. if that is also a tie, the backward (``bmm_line``) result.

    Args:
        line: Text to segment.
        max_len: Maximum candidate length, passed through to both matchers.
        dic: Vocabulary, passed through to both matchers.

    Returns:
        The chosen list of segments.
    """
    forward = fmm_line(line, max_len, dic)
    backward = bmm_line(line, max_len, dic)

    # Different segment counts: the shorter segmentation wins outright.
    if len(forward) != len(backward):
        if len(forward) < len(backward):
            return forward
        return backward

    # Same segment count: prefer the result with fewer one-character pieces.
    forward_singles = sum(1 for word in forward if len(word) == 1)
    backward_singles = sum(1 for word in backward if len(word) == 1)
    if forward_singles < backward_singles:
        return forward
    return backward
# 调用方式
def main():
    """Print the backward maximum matching segmentation of a sample sentence."""
    vocabulary = ['我', '要', '去', '北京', '玩', '京玩']
    sentence = '我要去北京玩'
    window = 5
    segments = bmm_line(sentence, window, vocabulary)
    print(segments)


if __name__ == '__main__':
    # Run the demo only when executed as a script, not when imported.
    main()