Word segmentation using the reverse maximum matching method (Python)

flow chart:

# Reverse (backward) maximum matching
class IMM(object):
    """Dictionary-based word segmenter using reverse maximum matching.

    The text is scanned from its end towards its start. At each position the
    longest dictionary word that ends there is taken as the next token.
    """

    def __init__(self, dic_path):
        """Load the word dictionary.

        Args:
            dic_path: Path to a UTF-8 text file with one word per line.
                Surrounding whitespace is stripped and blank lines are
                skipped.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.dictionary = set()  # known words
        self.maximum = 0  # length of the longest dictionary word
        with open(dic_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                self.dictionary.add(line)
                if len(line) > self.maximum:
                    self.maximum = len(line)

    def cut(self, text):
        """Segment *text* and return the tokens in reading order.

        Characters that are not covered by any dictionary word are emitted
        as single-character tokens, so joining the result reproduces *text*.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens; empty when *text* is empty.
        """
        result = []
        index = len(text)  # exclusive end of the window being matched
        while index > 0:
            # Try the longest candidate first; never reach past the start.
            for size in range(min(self.maximum, index), 0, -1):
                piece = text[index - size:index]
                if piece in self.dictionary:
                    break
            else:
                # No dictionary word ends here: keep the character itself
                # instead of silently dropping it.
                size = 1
                piece = text[index - 1]
            result.append(piece)
            index -= size
        # Tokens were collected back-to-front.
        result.reverse()
        return result

if __name__ == '__main__':
    # Path to the dictionary file (UTF-8, one word per line).
    # Fill this in before running the script.
    data_path = ""
    if not data_path:
        raise SystemExit(
            "Set data_path to a dictionary file (UTF-8, one word per line)."
        )

    text = '待切分文本'
    # Pass the variable, not the literal string 'data_path'.
    tokenizer = IMM(data_path)
    print(tokenizer.cut(text))

Note: The dictionary (corpus) file is not provided here; you need to find and supply one yourself.

Guess you like

Origin blog.csdn.net/m0_52051577/article/details/124039543