基于划分的新词发现

 

主要的技术点:

(1)左右熵、凝固度。

(2)该脚本目前应用于短文本。

目前问题:

对英文、数字的效果不好;如果直接使用,需要辅以其他策略,例如用正则表达式将部分内容直接提取为整体,不进行分割。

对低频实体词效果不好,可能无法发现该词,或者发现的词有误。

部分效果如下:

processing line : 办公室主任
[1, 4, 6]
办公室
主任
processing line : 私家车司机
[1, 4, 6]
私家车
司机
processing line : 土建预算员
[1, 3, 6]
土建
预算员
processing line : 医院挂号员
[1, 3, 6]
医院
挂号员
processing line : 网上兼职
[1, 3, 5]
网上
兼职
processing line : 小儿推拿师
[1, 3, 6]
小儿
推拿师
processing line : 皮肤管理师
[1, 3, 6]
皮肤
管理师
processing line : 档案录入员
[1, 3, 6]
档案
录入员
processing line : 工厂叉车工
[1, 3, 6]
工厂
叉车工
processing line : 超市促销员
[1, 3, 6]
超市
促销员
processing line : 711便利店
[1, 3, 7]
71
1便利店
processing line : 出海船员
[1, 3, 5]
出海
船员
processing line : 中专辅导员
[1, 3, 6]
中专
辅导员
processing line : 德勤华永会计师事务所 
[1, 3, 5, 7, 9, 12]
德勤
华永
会计
师事
务所 
processing line : cosplay模特
[1, 3, 5, 7, 10]
co
sp
la
y模特
processing line : gucci销售
[1, 3, 6, 8]
gu
cci
销售

实现方式如下:

# encoding: utf-8

import jieba
import math
import copy as cp

class new_word_tst(object):
    """New-word discovery test bed based on split points.

    Builds unigram, forward-bigram and reverse-bigram counts from a corpus,
    then segments a line of text by cutting at positions where the cohesion
    (a PMI-like ratio) between adjacent tokens is low.  Left/right branching
    entropy is used to decide which side a stranded single token joins.
    """

    def __init__(self):
        # token -> occurrence count
        self.one_gram = {}
        # token -> {next token -> count}   (forward bigrams)
        self.two_gram = {}
        # token -> {previous token -> count}   (reverse bigrams)
        self.two_gram_reverse = {}

    def load_data(self, file_path, w_type="word"):
        """Tokenize every line of the corpus and accumulate n-gram counts.

        :param file_path: UTF-8 text file, one sample per line.
        :param w_type: "word" splits lines into characters, "words" uses
                       jieba tokens; any other value skips the line.
        """
        index = 0
        with open(file_path, "r", encoding="utf-8") as reader:
            for line in reader:
                line = line.strip()
                if not line:
                    continue
                index += 1
                if index % 10000 == 0:
                    print("processing data : {}".format(index))
                if index % 400000 == 0:
                    # cap the amount of data used to build the model
                    break

                if w_type == "word":
                    tmp = ["<s>"] + list(line) + ["<e>"]
                elif w_type == "words":
                    tmp = ["<s>"] + list(jieba.cut(line)) + ["<e>"]
                else:
                    tmp = []

                for w in tmp:
                    self.one_gram[w] = self.one_gram.get(w, 0) + 1

                for i in range(len(tmp) - 1):
                    cur = tmp[i]
                    nxt = tmp[i + 1]
                    bucket = self.two_gram.setdefault(cur, {})
                    bucket[nxt] = bucket.get(nxt, 0) + 1

                # BUG FIX: the original looped `range(len(tmp) - 1)` with
                # `tmp[i - 1]`, so at i == 0 it recorded tmp[-1] ("<e>") as
                # the *left* neighbour of "<s>" (Python negative-index
                # wrap-around), and the final token never received a
                # reverse-bigram entry at all.
                for i in range(1, len(tmp)):
                    cur = tmp[i]
                    prev = tmp[i - 1]
                    bucket = self.two_gram_reverse.setdefault(cur, {})
                    bucket[prev] = bucket.get(prev, 0) + 1

    def solidification_list(self, wd_list):
        """Cohesion score for every adjacent pair, in both directions.

        :param wd_list: list of tokens (normally a line prefixed with "<s>").
        :return: (right_result, left_result).  right_result[i] scores the
                 pair (wd_list[i], wd_list[i+1]) with forward bigrams;
                 left_result[i-1] scores (wd_list[i], wd_list[i-1]) with
                 reverse bigrams.  Add-one smoothing throughout.
        """
        left_result = []
        right_result = []
        wd_list_len = len(wd_list)

        # These totals are loop-invariant; the original recomputed them for
        # every pair, which made each call O(len(wd_list) * vocabulary).
        unigram_total = sum(self.one_gram.values()) + len(self.one_gram)
        forward_total = len(self.two_gram)
        for v in self.two_gram.values():
            forward_total += sum(v.values())
        reverse_total = len(self.two_gram_reverse)
        for v in self.two_gram_reverse.values():
            reverse_total += sum(v.values())

        for i in range(wd_list_len - 1):
            w1 = wd_list[i]
            w2 = wd_list[i + 1]
            w1_c = self.one_gram.get(w1, 0)
            w2_c = self.one_gram.get(w2, 0)
            w1_w2_c = self.two_gram.get(w1, {}).get(w2, 0)

            p1 = (1 + w1_c) / unigram_total
            p2 = (1 + w2_c) / unigram_total
            p1p2 = (1 + w1_w2_c) / forward_total
            # ratio > 1 means the pair co-occurs more than independence predicts
            right_result.append(p1p2 / (p1 * p2))

        for i in range(1, wd_list_len):
            w1 = wd_list[i]
            w2 = wd_list[i - 1]
            w1_c = self.one_gram.get(w1, 0)
            w2_c = self.one_gram.get(w2, 0)
            w1_w2_c = self.two_gram_reverse.get(w1, {}).get(w2, 0)

            p1 = (1 + w1_c) / unigram_total
            p2 = (1 + w2_c) / unigram_total
            p1p2 = (1 + w1_w2_c) / reverse_total
            left_result.append(p1p2 / (p1 * p2))

        return right_result, left_result

    def freedom_list(self, wd_list):
        """Branching entropy (degree of freedom) for each token.

        :param wd_list: list of tokens.
        :return: (right_result, left_result) — entropy over successors for
                 positions 0..n-2 and over predecessors for positions 1..n-1.
        """
        left_result = []
        right_result = []
        wd_list_len = len(wd_list)

        for i in range(wd_list_len - 1):
            w1 = wd_list[i]
            # the +1 mirrors the original's smoothing of the denominator
            w1_c = 0
            if w1 in self.one_gram:
                w1_c = self.one_gram[w1] + 1

            entropy = 0
            for cnt in self.two_gram.get(w1, {}).values():
                p = cnt / w1_c
                entropy -= p * math.log(p, 2)
            right_result.append(entropy)

        for i in range(1, wd_list_len):
            w1 = wd_list[i]
            w1_c = 0
            if w1 in self.one_gram:
                w1_c = self.one_gram[w1] + 1

            entropy = 0
            for cnt in self.two_gram_reverse.get(w1, {}).values():
                p = cnt / w1_c
                entropy -= p * math.log(p, 2)
            left_result.append(entropy)

        return right_result, left_result

    def init_data_model(self, path):
        """Build the n-gram model from the corpus at `path` (char mode)."""
        self.load_data(path)

    def split_list_by_small(self, data_list, data_list_right, data_list_left, resList):
        """Choose split positions by repeatedly cutting at the smallest value.

        Mutates `resList` in place into a sorted list of split indices into
        `data_list`; 0 and len(data_list) are always present.  `data_list`
        is also mutated — callers must pass a copy.  `data_list_right` /
        `data_list_left` are the right/left entropy series used to decide
        which neighbour a stranded single-token segment should join.
        """
        data_list_len = len(data_list)
        resList.append(0)
        resList.append(data_list_len)
        max_v = max(data_list)
        # only cut where cohesion is at most half of the maximum
        max_v_point = max_v/2
        # the leading "<s>" boundary must never be selected as a minimum
        data_list[0] = max_v

        for i in range(data_list_len):
            resList.sort()
            for j in range(len(resList) - 1):
                start = resList[j]
                end = resList[j + 1]
                tmp_data_list = data_list[start:end]

                min_v = min(tmp_data_list)
                index_v = tmp_data_list.index(min_v)
                index_v_a = index_v + start

                # a cut that would leave <=1 token on both sides is useless
                if len(tmp_data_list[0: index_v]) <= 1 and len(tmp_data_list[index_v:]) <= 1:
                    continue
                else:
                    if index_v_a not in resList and data_list[index_v_a] <= max_v_point:
                        resList.append(index_v_a)
                        # mark as consumed so it cannot be the minimum again
                        data_list[index_v_a] = max_v

        # drop a split that strands a single token at either end
        if len(resList) >= 2 and resList[0] + 1 == resList[1]:
            resList.pop(1)
        if len(resList) >= 2 and resList[-2] + 1 == resList[-1]:
            resList.pop(-2)

        # interior single-token segments: attach to the higher-entropy side
        single_word_index = []
        for i in range(len(resList) - 1):
            if resList[i] + 1 == resList[i + 1]:
                single_word_index.append(resList[i])

        for single_v in single_word_index:
            if single_v in resList:
                single_v_index = resList.index(single_v)

                # NOTE(review): these index the entropy series by the
                # *position inside resList*, not by the token position —
                # looks suspicious; confirm the intended heuristic before
                # changing.
                right_v = data_list_right[single_v_index]
                left_v = data_list_left[single_v_index - 1]
                if right_v >= left_v:
                    resList.pop(single_v_index + 1)
                else:
                    resList.pop(single_v_index)

    def process_line(self, line):
        """Segment one line of text.

        The forward-cohesion series is split at its local minima, the split
        indices are shifted past the "<s>" sentinel, and adjacent pure
        ASCII-letter fragments are merged back together (e.g. "gu"+"cci").

        :param line: raw text line.
        :return: list of segment strings.
        """
        print("processing line : {}".format(line))
        line_list = list(line)
        line_list.insert(0, "<s>")
        right_result, left_result = self.solidification_list(line_list)  # cohesion, both directions
        right_result1, left_result1 = self.freedom_list(line_list)  # branching entropy

        res_spl_list = []
        final_res_1 = []
        self.split_list_by_small(cp.copy(right_result), cp.copy(right_result1), cp.copy(left_result1), res_spl_list)
        # shift every index past the inserted "<s>" sentinel
        res_spl_list = [v + 1 for v in res_spl_list]
        print(res_spl_list)
        for i in range(len(res_spl_list) - 1):
            fragment = "".join(line_list[res_spl_list[i]: res_spl_list[i + 1]])
            print(fragment)
            final_res_1.append(fragment)

        # Merge adjacent pure-ASCII-letter fragments.
        # BUG FIX: the original used `for i in range(...)` with an `i += 1`
        # inside the body; in Python that does NOT skip the next iteration,
        # so the right-hand fragment of every merged pair was emitted a
        # second time (e.g. ["gu", "cci"] -> ["gucci", "cci"]).
        final_res = []
        n = len(final_res_1)
        i = 0
        while i < n - 1:
            cur = final_res_1[i]
            nxt = final_res_1[i + 1]
            # bytes.isalpha() is True only for ASCII letters, so multi-byte
            # UTF-8 (e.g. Chinese) fragments are never merged here
            if cur.encode('UTF-8').isalpha() and nxt.encode('UTF-8').isalpha():
                final_res.append(cur + nxt)
                i += 2
            else:
                final_res.append(cur)
                i += 1
        if i == n - 1:
            # last fragment was not consumed by a merge
            final_res.append(final_res_1[-1])

        return final_res

    def process_file(self):
        """Segment every line of the corpus and write "line ==> seg1\\tseg2...".

        Results go to new_word_res5.txt.  Using `with` fixes the original's
        leaked writer handle (it was never closed or flushed).
        """
        with open("new_word_res5.txt", "w", encoding="utf-8") as writer, \
                open(r"word_count_only_word", "r", encoding="utf-8") as reader:
            for line in reader:
                line = line.strip()
                if not line:
                    continue
                res = self.process_line(line)
                writer.write(line + " ==> " + "\t".join(res) + "\n")

if __name__ == "__main__":
    # Build the model from the corpus, run a batch of demo lines through
    # the segmenter, then process the whole file.
    model = new_word_tst()
    model.init_data_model(r"word_count_only_word")
    demo_lines = [
        "c1司机驾驶员",
        "会计师事务所",
        "办公室文员",
        "图书馆管理员",
        "跨境电商",
        "一级建造师",
        "房地产开发商直招",
        "高铁乘务员",
        "信息流优化师",
        "unity",
        "linux运维工程师",
        "办公室主任",
        "私家车司机",
        "土建预算员",
        "医院挂号员",
        "网上兼职",
        "小儿推拿师",
        "皮肤管理师",
        "档案录入员",
        "工厂叉车工",
        "超市促销员",
        "711便利店",
        "出海船员",
        "中专辅导员",
        "出海船员",
        "德勤华永会计师事务所 ",
        "cosplay模特",
        "gucci销售",
        "出海船员",
    ]
    for demo in demo_lines:
        model.process_line(demo)
    model.process_file()

猜你喜欢

转载自blog.csdn.net/cyinfi/article/details/107329202