版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_32113189/article/details/80496338
最近我在做一个对话系统,需要对输入的句子进行分词。苦于懒病,我先去百度上查有没有开源的实现,看到了正向最大匹配算法和反向最大匹配算法的实现,还看到有人用 Java 实现了正反向结合的最大匹配算法。于是小编开始搜索是否有 Python 实现,结果很失望,竟然没有。没办法,只能自己动手了。以下就是小编自己实现的正反向结合的最大匹配算法,我想它会比单独的正向或反向匹配算法的准确度要高一些,希望大家能给建议,并关注小编哦。
# -*- coding: utf-8 -*-
def add_dict(
    dict_path='C:/users/Administrator/PycharmProject/untitled1/src/resource/dictfile_correct.txt',
):
    """Load the word dictionary used by the maximum-matching segmenters.

    The file is read as UTF-8 with one dictionary entry per line.
    Surrounding whitespace is stripped from every entry and blank lines
    are skipped.

    Args:
        dict_path: Path of the dictionary file. Defaults to the location
            that was previously hard-coded in this function.

    Returns:
        A ``(dictionary, max_length)`` tuple: the list of entries in file
        order, and the length in characters of the longest entry (never
        less than 1).

    Raises:
        OSError: If the dictionary file cannot be opened.
    """
    dictionary = []
    max_length = 1
    # The context manager guarantees the handle is closed even on error.
    with open(dict_path, 'r', encoding="utf-8") as f_dict:
        print("开始初始化词典")
        for line in f_dict:
            word = line.strip()
            if not word:
                # A blank line is not a dictionary entry.
                continue
            dictionary.append(word)
            # Measure the stripped word: the raw line still carries its
            # trailing newline, which would inflate the length by one.
            if len(word) > max_length:
                max_length = len(word)
    print("完成词典初始化,添加词典条目数:" + str(len(dictionary)))
    print("最大分词长度:" + str(max_length))
    return dictionary, max_length
'''正向最大匹配'''
def ForwardBMM(line, dictionary=None, max_length=None):
    """Segment ``line`` with forward (left-to-right) maximum matching.

    Starting at the left end, the longest candidate of at most
    ``max_length`` characters is tried first and shortened from the right
    until it is found in the dictionary. A single character is always
    accepted, so text that is not in the dictionary is emitted one
    character at a time.

    Args:
        line: The text to segment.
        dictionary: Optional iterable of known words. When omitted, the
            dictionary is loaded with ``add_dict()``.
        max_length: Optional upper bound on candidate length. When
            omitted, it is taken from ``add_dict()`` if the dictionary was
            loaded there, otherwise from the longest word supplied.

    Returns:
        The list of segments in left-to-right order; joined together they
        reproduce ``line``.
    """
    if dictionary is None:
        dictionary, loaded_max = add_dict()
        if max_length is None:
            max_length = loaded_max
    # A set gives O(1) membership tests; a list is scanned on every probe.
    if isinstance(dictionary, (set, frozenset)):
        words = dictionary
    else:
        words = set(dictionary)
    if max_length is None:
        max_length = max((len(word) for word in words), default=1)
    # A window shorter than one character would never consume any input.
    max_length = max(max_length, 1)

    segments = []
    total = len(line)
    start = 0
    while start < total:
        end = min(start + max_length, total)
        # Shrink from the right until a dictionary hit or one character.
        while end - start > 1 and line[start:end] not in words:
            end -= 1
        segments.append(line[start:end])
        start = end
    return segments
'''反向最大匹配'''
def ReverseBMM(line, dictionary=None, max_length=None):
    """Segment ``line`` with reverse (right-to-left) maximum matching.

    Starting at the right end, the longest candidate of at most
    ``max_length`` characters is tried first and shortened from the left
    until it is found in the dictionary. A single character is always
    accepted, so text that is not in the dictionary is emitted one
    character at a time.

    Args:
        line: The text to segment.
        dictionary: Optional iterable of known words. When omitted, the
            dictionary is loaded with ``add_dict()``.
        max_length: Optional upper bound on candidate length. When
            omitted, it is taken from ``add_dict()`` if the dictionary was
            loaded there, otherwise from the longest word supplied.

    Returns:
        The list of segments in left-to-right reading order; joined
        together they reproduce ``line``.
    """
    if dictionary is None:
        dictionary, loaded_max = add_dict()
        if max_length is None:
            max_length = loaded_max
    # A set gives O(1) membership tests; a list is scanned on every probe.
    if isinstance(dictionary, (set, frozenset)):
        words = dictionary
    else:
        words = set(dictionary)
    if max_length is None:
        max_length = max((len(word) for word in words), default=1)
    # A window shorter than one character would never consume any input.
    max_length = max(max_length, 1)

    segments = []
    end = len(line)
    while end > 0:
        start = max(end - max_length, 0)
        # Shrink from the left until a dictionary hit or one character.
        while end - start > 1 and line[start:end] not in words:
            start += 1
        segments.append(line[start:end])
        end = start
    # Segments were collected right-to-left; restore reading order.
    segments.reverse()
    return segments
'''正反向最大匹配算法合并'''
def BMM(line):
    """Segment ``line`` by combining forward and reverse maximum matching.

    Both segmentations are computed. If they agree, that result is used.
    Otherwise the reverse result is chosen when the forward result has
    more single-character segments; in every other case the forward
    result is chosen. The chosen segmentation is printed and returned.

    Args:
        line: The text to segment.

    Returns:
        The chosen list of segments.
    """
    forward = ForwardBMM(line)
    reverse = ReverseBMM(line)
    if forward == reverse:
        result = forward
    else:
        # Count single-character segments over each list independently.
        # The two lists can differ in length, so walking them with one
        # shared index either overruns the shorter list or skips the tail
        # of the longer one.
        fsingle = sum(1 for word in forward if len(word) == 1)
        rsingle = sum(1 for word in reverse if len(word) == 1)
        result = reverse if fsingle > rsingle else forward
    print(result)
    return result
def main():
    """Prompt for a question on stdin and segment it with ``BMM``."""
    print("请输入您的问题")
    question = input()
    BMM(line=question)
# Run the interactive prompt only when executed as a script, not on import.
if __name__ == '__main__':
    main()