Environment requirements: Python 3.x, numpy, pandas
Required data: a Chinese dictionary file listing words and their corresponding frequencies. It should cover as many Chinese words as possible, and the corpus the frequencies were computed from should be as large as possible.
A sample dictionary file is shown below.
An example document is shared here: [link], extraction code: 1ltr
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import pandas as pd
import numpy as np
class Pwms(object):
    """Maximum-probability word segmentation over a word-frequency dictionary.

    The dictionary CSV (``self.word_file``) is read with no header and must
    have at least three comma-separated columns: word, <unused>, and the
    word's frequency as a percentage string such as ``"0.5%"`` (gbk-encoded).
    Segmentation is a Viterbi-style dynamic program over candidate word
    spans, maximising the sum of log probabilities.
    """

    def __init__(self):
        self.word_file = ''  # path to the word-frequency CSV file
        self.sentence = ''  # sentence currently being segmented
        self.candidate_words = {}  # (word, (start, end)) -> log probability
        self.left_neighbours = {}  # candidate key -> list of left-adjacent candidate keys
        self.Cumulative_probability = {}  # candidate key -> best cumulative log probability
        self.Best_left_neighbour = {}  # candidate key -> best left neighbour key ([] for first word)

    def get_word_pro_dict(self):
        """Load the dictionary file.

        :return: dict mapping word -> log(probability).  Working in log
            space turns the product of word probabilities into a sum,
            which avoids floating-point underflow on long sentences.
        """
        df_data = pd.read_csv(self.word_file, sep=',', encoding='gbk', header=None)
        probabilities = df_data[2].str.strip('%').astype(float) / 100
        return dict(zip(df_data[0].values, np.log(probabilities)))

    def get_candidate_words(self):
        """Collect every dictionary word occurring in ``self.sentence``.

        Populates ``self.candidate_words`` with (word, (start, end)) span
        keys mapped to log probabilities.  Single characters absent from
        the dictionary get a small default log probability so that the
        segmentation lattice always stays connected.
        """
        self.candidate_words = {}
        word_pro_dict = self.get_word_pro_dict()
        # Sentinel '#' appended so the scan below also flushes the last word.
        sentence_temp = self.sentence + '#'
        lth = len(sentence_temp)
        index = 0        # start index of the current candidate
        last_index = 1   # end index (exclusive) of the current candidate
        while index <= lth and last_index <= lth:
            candidate_word = sentence_temp[index:last_index]
            if candidate_word in word_pro_dict:
                # Known word: record it and try to extend it by one char.
                self.candidate_words[candidate_word, (index, last_index)] = word_pro_dict[candidate_word]
                last_index += 1
            else:
                if last_index - index == 1:
                    # BUG FIX: store a scalar log probability, not a
                    # one-element list, so it adds cleanly with the other
                    # entries in get_best_neighbour().
                    self.candidate_words[candidate_word, (index, last_index)] = np.log(0.000001)
                index += 1
                last_index = index + 1
        # Drop the sentinel's own entry from the lattice.
        del self.candidate_words[('#', (lth - 1, lth))]

    def get_left_neighbours(self):
        """For each candidate, list the candidates ending exactly where it starts."""
        self.left_neighbours = {}
        for key in self.candidate_words:
            start = key[1][0]
            if start == 0:
                # First word of the sentence: no left neighbour.
                self.left_neighbours[key] = []
            else:
                self.left_neighbours[key] = [
                    other for other in self.candidate_words
                    if other[1][1] == start
                ]

    def get_best_neighbour(self):
        """Dynamic-programming step.

        For every candidate, pick the left neighbour that maximises the
        cumulative log probability.  Relies on dict insertion order:
        candidates were inserted left to right, so every left neighbour is
        processed before the key that references it.
        """
        self.Cumulative_probability = {}
        self.Best_left_neighbour = {}
        for key, left_candidates in self.left_neighbours.items():
            if not left_candidates:
                # No left neighbour: cumulative probability is the word's own.
                self.Cumulative_probability[key] = self.candidate_words[key]
                self.Best_left_neighbour[key] = []
            else:
                # key's own probability is a constant term, so the argmax
                # over neighbours depends only on their cumulative scores.
                best_left = max(
                    left_candidates,
                    key=lambda word: self.Cumulative_probability[word],
                )
                self.Best_left_neighbour[key] = best_left
                self.Cumulative_probability[key] = (
                    self.candidate_words[key] + self.Cumulative_probability[best_left]
                )

    def main_method(self):
        """Run the full pipeline (candidates, neighbours, DP) for self.sentence."""
        self.get_candidate_words()
        self.get_left_neighbours()
        self.get_best_neighbour()

    def get_result(self, question):
        """Segment a sentence.

        :param question: the sentence to segment
        :return: '分词结果是:' followed by the words joined with '/'
        """
        self.sentence = question
        self.main_method()
        # Candidates ending exactly at the sentence end are the possible
        # last words of a complete segmentation; pick the most probable.
        candidate_result = {
            key: pro
            for key, pro in self.Cumulative_probability.items()
            if key[1][1] == len(self.sentence)
        }
        last_result = max(candidate_result, key=candidate_result.get)
        result = [last_result[0]]
        # BUG FIX: start the back-trace from the chosen word's own start
        # index.  The original initialised result_index to len(sentence),
        # which entered the loop and raised IndexError ([][0]) whenever
        # the best segmentation was a single word starting at index 0.
        result_index = last_result[1][0]
        while result_index != 0:
            last_result = self.Best_left_neighbour[last_result]
            result.append(last_result[0])
            result_index = last_result[1][0]
        result.reverse()
        return '分词结果是:' + '/'.join(result)
if __name__ == '__main__':
    pmws = Pwms()
    # BUG FIX: the original never assigned word_file, so get_result()
    # failed inside pd.read_csv('') before any segmentation happened.
    # Point this at the word-frequency CSV described at the top of the file.
    pmws.word_file = 'word_frequency.csv'  # TODO: set to the real dictionary path
    sentence = '幼儿园地节目'
    sentence_cut = pmws.get_result(sentence)
    print(sentence_cut)