jieba fine-grained segmentation / add_word has no effect / forcing a finer split

import jieba

def fenci(one_string):
    one_string = one_string.replace(" ", "")  # strip all spaces

    def isAllZh(s):  # True only if every character is Chinese
        for c in s:
            if not ('\u4e00' <= c <= '\u9fa5'):
                return False
        return True

    final_result = []
    temp_list = jieba.lcut(one_string)
    for word in temp_list:
        if not isAllZh(word):
            continue
        # if jieba.get_FREQ(word) == 1:
        #     print(word)  # debug: inspect low-frequency dictionary words
        # Force apart any multi-character word missing from jieba's dictionary
        # (frequency None or 0), and any word longer than three characters.
        if (len(word) > 1 and not jieba.get_FREQ(word)) or len(word) > 3:
            jieba.del_word(word)  # remove it so lcut can no longer emit it whole
            final_result.extend(jieba.lcut(word))
        else:
            final_result.append(word)
    return final_result
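
A minimal call sketch for reference (it reuses the two test sentences from below; the exact segments depend on the dictionary that is loaded):

print(fenci('丰田太省了'))
print(fenci('我们中出了一个叛徒'))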

In practice the result looks roughly the same as simply cutting with HMM=False:

print(jieba.lcut('丰田太省了', HMM=False))
print(jieba.lcut('我们中出了一个叛徒', HMM=False))
print(jieba.lcut('丰田太省了', HMM=True))
print(jieba.lcut('我们中出了一个叛徒', HMM=True))
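
If add_word alone does not change the output, jieba also provides jieba.suggest_freq, which tunes dictionary frequencies so that a span is forced apart (or kept together). A small sketch, assuming the default dictionary; the sample sentence is illustrative only:

import jieba

print(jieba.lcut('如果放到post中将出错', HMM=False))  # '中将' may come out as one word
jieba.suggest_freq(('中', '将'), tune=True)           # tune frequencies so the pair splits
print(jieba.lcut('如果放到post中将出错', HMM=False))  # now segmented as '中' / '将'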


Reposted from blog.csdn.net/guotong1988/article/details/80690523