A maximum-probability Chinese word segmentation algorithm implemented in Python

Environment requirements: Python 3.x, numpy, pandas
Required data file: a dictionary of Chinese words with their corresponding frequencies/probabilities. The more words it covers the better, and ideally the probabilities are computed from a sufficiently large corpus.
A sample of the dictionary file format is shown below:
(picture omitted)
A ready-made dictionary file is shared here: see the link in the original post, extraction code: 1ltr

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import pandas as pd
import numpy as np


class Pwms(object):
    """Maximum-probability Chinese word segmentation.

    Loads a word/probability dictionary from a CSV file, then segments a
    sentence by dynamic programming: every dictionary word occurring in the
    sentence becomes a candidate, each candidate records its best "left
    neighbour" (the preceding candidate maximizing cumulative probability),
    and the answer is recovered by back-tracking those links.
    """

    def __init__(self):
        self.word_file = ''  # path to the word-frequency CSV file (must be set before use)
        self.sentence = ''  # sentence currently being segmented
        self.candidate_words = {}  # {(word, (start, end)): log-probability}
        self.left_neighbours = {}  # {candidate key: [keys of candidates ending where it starts]}
        self.Cumulative_probability = {}  # {candidate key: best cumulative log-probability}
        self.Best_left_neighbour = {}  # {candidate key: key of its best left neighbour, or []}

    def get_word_pro_dict(self):
        """Load the dictionary file and return ``{word: log(probability)}``.

        The CSV (GBK-encoded, no header) is expected to carry the word in
        column 0 and a percentage string such as ``"0.5%"`` in column 2.
        Probabilities are stored as logarithms so that cumulative products
        become sums and cannot underflow to zero.
        """
        df_data = pd.read_csv(self.word_file, sep=',', encoding='gbk', header=None)
        probabilities = df_data[2][:].str.strip("%").astype(float) / 100
        return dict(zip(df_data[0][:].values, np.log(probabilities)))

    def get_candidate_words(self):
        """Collect every dictionary word occurring in the sentence.

        Populates ``self.candidate_words`` with keys ``(word, (start, end))``
        and log-probability values.  Out-of-vocabulary single characters get
        a small default probability so segmentation never dead-ends.
        """
        self.candidate_words = {}
        word_pro_dict = self.get_word_pro_dict()
        sentence_temp = self.sentence + '#'  # sentinel appended so the scan terminates cleanly
        index = 0  # start index of the current candidate
        last_index = 1  # end index (exclusive) of the current candidate
        lth = len(sentence_temp)
        while index != lth + 1 and last_index != lth + 1:
            candidate_word = sentence_temp[index: last_index]
            if candidate_word in word_pro_dict:
                # Known word: record it and try to extend it by one character.
                self.candidate_words[candidate_word, (index, last_index)] = word_pro_dict[candidate_word]
                last_index += 1
            else:
                if last_index - index == 1:
                    # BUGFIX: store a scalar log-probability (the original stored a
                    # one-element list, which made float + list blow up later when
                    # summing cumulative probabilities in get_best_neighbour).
                    self.candidate_words[candidate_word, (index, last_index)] = np.log(0.000001)
                index += 1
                last_index = index + 1
        # Remove the sentinel '#' entry.
        del self.candidate_words[('#', (len(sentence_temp) - 1, len(sentence_temp)))]

    def get_left_neighbours(self):
        """Build, for each candidate, the list of candidates that end where it starts."""
        self.left_neighbours = {}
        temp_candidate = self.candidate_words
        for key, pro in self.candidate_words.items():
            self.left_neighbours[key] = []
            if key[1][0] == 0:  # starts at 0: first word of the sentence, no left neighbour
                self.left_neighbours[key] = []
            else:  # otherwise scan all candidates for those whose end index equals our start index
                for t_key, t_pro in temp_candidate.items():
                    if t_key[1][1] == key[1][0]:
                        self.left_neighbours[key].append(t_key)

    def get_best_neighbour(self):
        """Compute each candidate's best left neighbour and cumulative log-probability."""
        self.Cumulative_probability = {}
        self.Best_left_neighbour = {}
        for key, left_candidate in self.left_neighbours.items():
            if len(left_candidate) == 0:
                # No left neighbours: this is a sentence-initial word, so its
                # cumulative probability is just its own probability.
                self.Cumulative_probability[key] = self.candidate_words[key]
                self.Best_left_neighbour[key] = []
            else:
                temp_probability = {}  # cumulative log-probability through each possible left neighbour
                for word in left_candidate:
                    # Log-probabilities add where raw probabilities would multiply.
                    probability = self.candidate_words[key] + self.Cumulative_probability[word]
                    temp_probability[word] = probability
                best_left = max(temp_probability, key=temp_probability.get)
                self.Best_left_neighbour[key] = best_left
                self.Cumulative_probability[key] = temp_probability[best_left]

    def main_method(self):
        """Run the full pipeline: candidates, left neighbours, best neighbours."""
        self.get_candidate_words()
        self.get_left_neighbours()
        self.get_best_neighbour()

    def get_result(self, question):
        """Segment *question* and return the result string.

        :param question: the sentence to segment
        :return: '分词结果是:' followed by the words joined with '/'
        """
        self.sentence = question
        self.main_method()
        # Among candidates that reach the end of the sentence, pick the one
        # with the highest cumulative probability as the final word.
        candidate_result = {}
        for key, pro in self.Cumulative_probability.items():
            if key[1][1] == len(self.sentence):
                candidate_result[key] = pro
        last_result = max(candidate_result, key=candidate_result.get)
        result = [last_result[0]]
        # BUGFIX: back-track from the chosen word's own start index.  The
        # original initialized this to len(self.sentence), which is always
        # nonzero, so when a single dictionary word covered the whole sentence
        # the loop dereferenced its empty Best_left_neighbour ([]) and raised
        # IndexError.
        result_index = last_result[1][0]
        while result_index != 0:
            last_result = self.Best_left_neighbour[last_result]
            result.append(last_result[0])
            result_index = last_result[1][0]
        result.reverse()
        return '分词结果是:'+'/'.join(result)


if __name__ == '__main__':
    # NOTE(review): word_file is never assigned here, so running this as-is
    # will fail when the dictionary CSV is loaded — set pmws.word_file to a
    # valid dictionary path first.
    segmenter = Pwms()
    demo_sentence = '幼儿园地节目'
    print(segmenter.get_result(demo_sentence))


Published 12 original articles · won praise 3 · Views 2047

Guess you like

Origin blog.csdn.net/weixin_40902563/article/details/102584752