# Flow chart:
# Reverse maximum matching (RMM) word segmentation.
class IMM(object):
    """Dictionary-based tokenizer using reverse maximum matching.

    The text is scanned from its end towards its start. At each position
    the longest dictionary word that ends there is taken as the next token.

    Attributes are plain and public:
        dictionary: set of known words loaded from the dictionary file.
        maximum: length of the longest word in ``dictionary`` (0 if empty).
    """

    def __init__(self, dic_path):
        """Load the word dictionary from *dic_path*.

        Args:
            dic_path: Path to a text file opened as UTF-8. Each non-blank
                line, with surrounding whitespace stripped, is one word.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.dictionary = set()  # known words
        self.maximum = 0  # length of the longest known word
        with open(dic_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()  # drop the newline and padding
                if not line:
                    continue  # skip blank lines
                self.dictionary.add(line)
                if len(line) > self.maximum:
                    self.maximum = len(line)

    def cut(self, text):
        """Split *text* into tokens by reverse maximum matching.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens in reading order. Characters that no dictionary
            word covers are returned as single-character tokens, so joining
            the tokens always reproduces *text*. Empty text gives ``[]``.
        """
        result = []
        index = len(text)  # exclusive end of the still-unsegmented prefix
        while index > 0:
            # Try the longest window first; it can never reach past the
            # start of the text, hence the clamp to ``index``.
            for size in range(min(self.maximum, index), 0, -1):
                piece = text[index - size:index]
                if piece in self.dictionary:
                    break
            else:
                # No dictionary word ends here: keep the character as its
                # own token instead of silently dropping it.
                size = 1
                piece = text[index - 1]
            result.append(piece)
            index -= size
        # Tokens were collected back to front.
        result.reverse()
        return result
if __name__ == '__main__':
    # Path to the dictionary file (UTF-8, one word per line). It is empty
    # here and must be set to a real file before running.
    data_path = ""
    # Placeholder input; replace with the text to segment.
    text = '待切分文本'
    # Pass the variable, not the literal string 'data_path'.
    tokenizer = IMM(data_path)
    print(tokenizer.cut(text))
# Note: the dictionary corpus is not provided here; you need to obtain one yourself.