正反向最大匹配算法

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_32113189/article/details/80496338

最近我在做一个对话系统,需要对输入的句子进行分词。因为想偷懒,我先去百度上查有没有开源实现,找到了正向最大匹配算法和反向最大匹配算法的实现,还看到有人用 Java 实现了正反向结合的最大匹配算法。于是小编又去搜索有没有 Python 版本,结果很失望,竟然没有。没办法,只能自己动手了。以下就是小编自己实现的正反向结合的最大匹配算法,我想它会比单独的正向或反向匹配算法准确度高一些,希望大家多提建议,并关注小编哦。

# -*- coding:utf-8 -*-
def add_dict(path='C:/users/Administrator/PycharmProject/untitled1/src/resource/dictfile_correct.txt'):
    """Load the segmentation dictionary from a UTF-8 text file.

    The file is expected to hold one dictionary entry per line.

    Args:
        path: Location of the dictionary file. Defaults to the path this
            script was originally hard-wired to.

    Returns:
        A ``(dictionary, max_length)`` tuple: the list of entries with
        surrounding whitespace stripped, and the length of the longest
        stripped entry (never less than 1).

    Raises:
        OSError: If the dictionary file cannot be opened.
    """
    print("开始初始化词典")
    max_length = 1
    dictionary = []
    # The context manager closes the file even if reading fails part-way.
    with open(path, 'r', encoding="utf-8") as f_dict:
        for line in f_dict:
            word = line.strip()
            dictionary.append(word)
            # Measure the stripped word so the trailing newline is not
            # counted towards the maximum word length.
            max_length = max(max_length, len(word))
    print("完成词典初始化,添加词典条目数:" + str(len(dictionary)))
    print("最大分词长度:" + str(max_length))
    return dictionary, max_length

def ForwardBMM(line, dictionary=None, max_length=None):
    """Segment *line* with forward (left-to-right) maximum matching.

    Starting at the left edge, the longest candidate (at most ``max_length``
    characters) is looked up in the dictionary and shortened from the right
    until it matches or only one character is left; that piece is emitted
    and the scan continues after it.

    Args:
        line: Text to segment.
        dictionary: Optional iterable of known words. When omitted, the
            dictionary is loaded with ``add_dict()``.
        max_length: Optional maximum candidate length. When omitted, it is
            taken from ``add_dict()`` (if the dictionary was loaded there)
            or computed from the supplied dictionary.

    Returns:
        The list of segments in reading order; an empty list for empty input.
    """
    if dictionary is None:
        dictionary, loaded_max = add_dict()
        if max_length is None:
            max_length = loaded_max
    elif max_length is None:
        max_length = max((len(word) for word in dictionary), default=1)

    # A set gives O(1) membership tests inside the matching loop.
    words = set(dictionary)
    # A window shorter than one character could never make progress.
    max_length = max(1, max_length)

    segments = []
    start = 0
    total = len(line)
    while start < total:
        end = min(start + max_length, total)
        # Shrink the candidate from the right until it is a known word or a
        # single character remains (unknown characters are emitted alone).
        while end - start > 1 and line[start:end] not in words:
            end -= 1
        segments.append(line[start:end])
        start = end
    return segments

def ReverseBMM(line, dictionary=None, max_length=None):
    """Segment *line* with reverse (right-to-left) maximum matching.

    Starting at the right edge, the longest candidate (at most ``max_length``
    characters) is looked up in the dictionary and shortened from the left
    until it matches or only one character is left; that piece is emitted
    and the scan continues before it.

    Args:
        line: Text to segment.
        dictionary: Optional iterable of known words. When omitted, the
            dictionary is loaded with ``add_dict()``.
        max_length: Optional maximum candidate length. When omitted, it is
            taken from ``add_dict()`` (if the dictionary was loaded there)
            or computed from the supplied dictionary.

    Returns:
        The list of segments in reading order; an empty list for empty input.
    """
    if dictionary is None:
        dictionary, loaded_max = add_dict()
        if max_length is None:
            max_length = loaded_max
    elif max_length is None:
        max_length = max((len(word) for word in dictionary), default=1)

    # A set gives O(1) membership tests inside the matching loop.
    words = set(dictionary)
    # A window shorter than one character could never make progress.
    max_length = max(1, max_length)

    segments = []
    end = len(line)
    while end > 0:
        start = max(end - max_length, 0)
        # Shrink the candidate from the left until it is a known word or a
        # single character remains (unknown characters are emitted alone).
        while end - start > 1 and line[start:end] not in words:
            start += 1
        segments.append(line[start:end])
        end = start
    # Segments were collected right-to-left; restore reading order.
    segments.reverse()
    return segments

def BMM(line):
    """Segment *line* by combining forward and reverse maximum matching.

    Both segmentations are computed. If they agree, that result is used.
    Otherwise the one with fewer single-character segments wins, and the
    forward result is kept on a tie.

    Args:
        line: Text to segment.

    Returns:
        The chosen list of segments (also printed to stdout).
    """
    forward = ForwardBMM(line)
    reverse = ReverseBMM(line)
    if forward == reverse:
        result = forward
    else:
        # Count singles over each list independently: the two segmentations
        # may have different lengths, so they cannot be indexed in lockstep.
        fsingle = sum(1 for word in forward if len(word) == 1)
        rsingle = sum(1 for word in reverse if len(word) == 1)
        result = reverse if fsingle > rsingle else forward
    print(result)
    return result

def main():
    """Prompt for a sentence on stdin and run the combined segmenter on it."""
    print("请输入您的问题")
    question = input()
    BMM(line=question)


# Run the interactive prompt only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/qq_32113189/article/details/80496338
今日推荐