主要的技术点:
(1)左右熵、凝固度。
(2)该脚本目前应用于短文本。
目前问题:
对英文、数字效果不好,如果直接使用,需要辅助其他策略,例如用部分正则直接提取整体,不进行分割。
对低频实体词效果不好,可能无法发现该词,或者发现的词有错误。
部分效果如下:
processing line : 办公室主任
[1, 4, 6]
办公室
主任
processing line : 私家车司机
[1, 4, 6]
私家车
司机
processing line : 土建预算员
[1, 3, 6]
土建
预算员
processing line : 医院挂号员
[1, 3, 6]
医院
挂号员
processing line : 网上兼职
[1, 3, 5]
网上
兼职
processing line : 小儿推拿师
[1, 3, 6]
小儿
推拿师
processing line : 皮肤管理师
[1, 3, 6]
皮肤
管理师
processing line : 档案录入员
[1, 3, 6]
档案
录入员
processing line : 工厂叉车工
[1, 3, 6]
工厂
叉车工
processing line : 超市促销员
[1, 3, 6]
超市
促销员
processing line : 711便利店
[1, 3, 7]
71
1便利店
processing line : 出海船员
[1, 3, 5]
出海
船员
processing line : 中专辅导员
[1, 3, 6]
中专
辅导员
processing line : 德勤华永会计师事务所
[1, 3, 5, 7, 9, 12]
德勤
华永
会计
师事
务所
processing line : cosplay模特
[1, 3, 5, 7, 10]
co
sp
la
y模特
processing line : gucci销售
[1, 3, 6, 8]
gu
cci
销售
实现方式如下:
# encoding: utf-8
import jieba
import math
import copy as cp
class new_word_tst(object):
    """Prototype for new-word discovery on short Chinese text.

    Word boundaries are found by combining branching entropy (left/right
    "freedom") with a PMI-style cohesion ("solidification") score, both
    derived from character n-gram statistics accumulated on this object.
    """

    def __init__(self):
        # unigram counts: token -> frequency
        self.one_gram = dict()
        # forward bigram counts: token -> {following token -> frequency}
        self.two_gram = dict()
        # backward bigram counts: token -> {preceding token -> frequency}
        self.two_gram_reverse = dict()
def load_data(self, file_path, w_type="word"):
    """Accumulate unigram and bigram statistics from a text corpus.

    Each non-empty line is tokenized either character-by-character
    (w_type="word") or with jieba (w_type="words"), wrapped in the
    sentinels <s>/<e>, and folded into:

      * self.one_gram          token -> count
      * self.two_gram          token -> {following token -> count}
      * self.two_gram_reverse  token -> {preceding token -> count}

    :param file_path: UTF-8 text file, one sample per line.
    :param w_type: "word" for character tokens, "words" for jieba tokens;
                   any other value yields only the sentinel pair per line.
    :return: None — statistics accumulate on self across calls.
    """
    index = 0
    with open(file_path, "r", encoding="utf-8") as reader:
        for line in reader:
            line = line.strip()
            if len(line) == 0:
                continue
            index += 1
            if index % 10000 == 0:
                print("processing data : {}".format(index))
            # hard cap: stop after 400k non-empty lines (keeps memory bounded)
            if index % 400000 == 0:
                break
            tmp = []
            if w_type == "word":
                tmp = list(line)
            if w_type == "words":
                tmp = list(jieba.cut(line))
            tmp.insert(0, "<s>")
            tmp.append("<e>")
            for w in tmp:
                self.one_gram[w] = self.one_gram.get(w, 0) + 1
            # forward bigrams: tmp[i] -> tmp[i+1]
            for i in range(len(tmp) - 1):
                nxt = self.two_gram.setdefault(tmp[i], {})
                nxt[tmp[i + 1]] = nxt.get(tmp[i + 1], 0) + 1
            # backward bigrams: tmp[i] -> tmp[i-1].
            # BUG FIX: the original iterated i in range(len(tmp)-1) while
            # reading tmp[i-1], so i == 0 wrapped around and paired <s>
            # with <e>, and the predecessor of the final token was never
            # counted. Iterate from 1 instead.
            for i in range(1, len(tmp)):
                prv = self.two_gram_reverse.setdefault(tmp[i], {})
                prv[tmp[i - 1]] = prv.get(tmp[i - 1], 0) + 1
def solidification_list(self, wd_list):
    """Solidification (cohesion) score for every adjacent token pair.

    For each pair (w1, w2) the score is a Laplace-smoothed pointwise
    mutual information ratio  P(w1,w2) / (P(w1) * P(w2)),  computed once
    with the forward bigram table (right_result) and once with the
    backward table (left_result).

    :param wd_list: token list (typically a line with the <s> sentinel).
    :return: (right_result, left_result) — each of length len(wd_list)-1.
    """
    # PERF FIX: the smoothed denominators below are loop invariants; the
    # original recomputed the unigram total and the full bigram mass NN
    # (a pass over the entire table) for every single pair.
    uni_total = sum(self.one_gram.values()) + len(self.one_gram)
    fwd_total = len(self.two_gram)
    for nbrs in self.two_gram.values():
        fwd_total += sum(nbrs.values())
    rev_total = len(self.two_gram_reverse)
    for nbrs in self.two_gram_reverse.values():
        rev_total += sum(nbrs.values())

    right_result = []
    for w1, w2 in zip(wd_list, wd_list[1:]):
        pair_c = self.two_gram.get(w1, {}).get(w2, 0)
        p1 = (1 + self.one_gram.get(w1, 0)) / uni_total
        p2 = (1 + self.one_gram.get(w2, 0)) / uni_total
        p1p2 = (1 + pair_c) / fwd_total
        right_result.append(p1p2 / (p1 * p2))

    left_result = []
    for i in range(1, len(wd_list)):
        w1, w2 = wd_list[i], wd_list[i - 1]
        pair_c = self.two_gram_reverse.get(w1, {}).get(w2, 0)
        p1 = (1 + self.one_gram.get(w1, 0)) / uni_total
        p2 = (1 + self.one_gram.get(w2, 0)) / uni_total
        p1p2 = (1 + pair_c) / rev_total
        left_result.append(p1p2 / (p1 * p2))
    return right_result, left_result
def freedom_list(self, wd_list):
    """Left/right branching entropy ("freedom") for each token.

    right_result[i] is the Shannon entropy of the distribution of tokens
    that FOLLOW wd_list[i] (computed for i in 0..n-2); left_result[i-1]
    is the entropy of tokens that PRECEDE wd_list[i] (for i in 1..n-1).
    The denominator keeps the original +1 smoothing on the unigram count.

    :param wd_list: token list.
    :return: (right_result, left_result) — each of length len(wd_list)-1.
    """
    def _branch_entropy(total, counts):
        # Entropy of the neighbour distribution; `total` may exceed the
        # sum of `counts` because of the +1 smoothing above.
        h = 0
        for c in counts:
            p = c / total
            h -= p * math.log(p, 2)
        return h

    right_result = []
    for w in wd_list[:-1]:
        total = self.one_gram[w] + 1 if w in self.one_gram else 0
        counts = self.two_gram[w].values() if w in self.two_gram else ()
        # guard: an unseen token contributes zero entropy (the original
        # would divide by zero if it somehow had bigram neighbours)
        right_result.append(_branch_entropy(total, counts) if total else 0)

    left_result = []
    for w in wd_list[1:]:
        total = self.one_gram[w] + 1 if w in self.one_gram else 0
        counts = self.two_gram_reverse[w].values() if w in self.two_gram_reverse else ()
        left_result.append(_branch_entropy(total, counts) if total else 0)
    return right_result, left_result
def init_data_model(self, path):
    """Pipeline entry point: ingest the corpus at ``path`` and build the
    n-gram statistics that the scoring methods read."""
    self.load_data(path)
def split_list_by_small(self, data_list, data_list_right, data_list_left, resList):
    """Split a score sequence at local minima (in-place on resList).

    Cut points are accumulated into ``resList`` (starting with the two
    boundaries 0 and len(data_list)); ``data_list`` is MUTATED: chosen
    minima are overwritten with the maximum so they are not picked again.
    Only positions whose score is at most half of the maximum become
    cuts. Afterwards, cuts that would produce single-token fragments at
    either edge or in the middle are resolved using the entropy lists.

    :param data_list: pairwise scores to segment (mutated; must be
        non-empty — ``max(data_list)`` raises ValueError otherwise).
    :param data_list_right: right-entropy scores, used to decide which
        side a stranded single token attaches to.
    :param data_list_left: left-entropy scores (same role, other side).
    :param resList: output list of cut indices, filled in place.
    :return: None — the result is ``resList``.
    """
    data_list_len = len(data_list)
    resList.append(0)
    resList.append(data_list_len)
    max_v = max(data_list)
    # threshold: only scores <= half the maximum qualify as cut points
    max_v_point = max_v/2
    # mask position 0 so the leading boundary is never re-selected
    data_list[0] = max_v
    # outer loop bounds the number of refinement passes; each pass scans
    # every current segment for its smallest remaining score
    for i in range(data_list_len):
        resList.sort()
        for j in range(len(resList) - 1):
            start = resList[j]
            end = resList[j + 1]
            tmp_data_list = data_list[start:end]
            min_v = min(tmp_data_list)
            index_v = tmp_data_list.index(min_v)
            index_v_a = index_v + start
            # skip if cutting here would leave fragments of length <= 1
            # on both sides of this segment
            if len(tmp_data_list[0: index_v]) <= 1 and len(tmp_data_list[index_v:]) <= 1:
                continue
            else:
                if index_v_a not in resList and data_list[index_v_a] <= max_v_point:
                    resList.append(index_v_a)
                    # mask the chosen minimum so later passes ignore it
                    data_list[index_v_a] = max_v
    # drop a cut adjacent to the left boundary (single leading token)
    # NOTE(review): resList may be unsorted here if the final pass
    # appended a cut after the last sort — confirm intended.
    if len(resList) >= 2 and resList[0] + 1 == resList[1]:
        resList.pop(1)
    # drop a cut adjacent to the right boundary (single trailing token)
    if len(resList) >= 2 and resList[-2] + 1 == resList[-1]:
        resList.pop(-2)
    # remaining interior single-token fragments: merge each with the
    # neighbour chosen by comparing right vs. left entropy
    single_word_index = []
    for i in range(len(resList) - 1):
        if resList[i] + 1 == resList[i + 1]:
            single_word_index.append(resList[i])
    for single_v in single_word_index:
        if single_v in resList:
            single_v_index = resList.index(single_v)
            # NOTE(review): the entropy lists are indexed by the cut's
            # POSITION IN resList, not by the token position single_v —
            # looks suspicious; verify against the callers.
            right_v = data_list_right[single_v_index]
            left_v = data_list_left[single_v_index - 1]
            if right_v >= left_v:
                # attach the single token to the segment on its right
                resList.pop(single_v_index + 1)
            else:
                # attach it to the segment on its left
                resList.pop(single_v_index)
def process_line(self, line):
    """Segment one line of text into discovered words.

    Scores each adjacent character pair with solidification (cohesion)
    and branching entropy, cuts the line at low-score positions via
    split_list_by_small, then glues consecutive pure-ASCII-letter
    fragments back together so English words are not shredded.

    :param line: raw text, no sentinels (a <s> sentinel is added here).
    :return: list of segment strings.
    """
    print("processing line : {}".format(line))
    line_list = list(line)
    line_list.insert(0, "<s>")
    right_result, left_result = self.solidification_list(line_list)  # pairwise cohesion
    right_result1, left_result1 = self.freedom_list(line_list)  # branching entropy
    res_spl_list = []
    self.split_list_by_small(cp.copy(right_result), cp.copy(right_result1), cp.copy(left_result1), res_spl_list)
    # shift every cut index past the <s> sentinel
    res_spl_list = [v + 1 for v in res_spl_list]
    print(res_spl_list)
    segments = []
    for i in range(len(res_spl_list) - 1):
        seg = "".join(line_list[res_spl_list[i]: res_spl_list[i + 1]])
        print(seg)
        segments.append(seg)
    # BUG FIX: the original merged adjacent ASCII fragments by bumping
    # the for-loop variable (`i += 1`), which has no effect on a Python
    # range loop, so the second fragment of each merged pair was emitted
    # twice (e.g. "gu","cci" -> "gucci","cci"). Use an explicit cursor
    # and merge whole RUNS of ASCII-letter fragments instead of pairs.
    final_res = []
    i = 0
    n = len(segments)
    while i < n:
        # bytes.isalpha() is True only for pure ASCII letters, so CJK
        # fragments (multi-byte UTF-8) are never merged.
        if segments[i].encode('UTF-8').isalpha():
            j = i + 1
            while j < n and segments[j].encode('UTF-8').isalpha():
                j += 1
            final_res.append("".join(segments[i:j]))
            i = j
        else:
            final_res.append(segments[i])
            i += 1
    return final_res
def process_file(self):
    """Segment every line of the corpus and write the results.

    Reads "word_count_only_word" (one phrase per line, UTF-8) and writes
    ``<line> ==> seg1<TAB>seg2...`` per line to "new_word_res5.txt".

    BUG FIX: the output handle was opened without ever being closed
    (leak, and buffered output could be lost); both files are now
    managed by a ``with`` statement.

    :return: None.
    """
    with open("new_word_res5.txt", "w", encoding="utf-8") as writer, \
            open(r"word_count_only_word", "r", encoding="utf-8") as reader:
        for line in reader:
            line = line.strip()
            if len(line) == 0:
                continue
            res = self.process_line(line)
            writer.write(line + " ==> " + "\t".join(res) + "\n")
if __name__ == "__main__":
    # Build the n-gram model from the corpus, then segment a batch of
    # sample job-title phrases (duplicates kept deliberately as repeat
    # checks) before processing the whole file.
    n_word = new_word_tst()
    n_word.init_data_model(r"word_count_only_word")
    samples = (
        "c1司机驾驶员",
        "会计师事务所",
        "办公室文员",
        "图书馆管理员",
        "跨境电商",
        "一级建造师",
        "房地产开发商直招",
        "高铁乘务员",
        "信息流优化师",
        "unity",
        "linux运维工程师",
        "办公室主任",
        "私家车司机",
        "土建预算员",
        "医院挂号员",
        "网上兼职",
        "小儿推拿师",
        "皮肤管理师",
        "档案录入员",
        "工厂叉车工",
        "超市促销员",
        "711便利店",
        "出海船员",
        "中专辅导员",
        "出海船员",
        "德勤华永会计师事务所 ",
        "cosplay模特",
        "gucci销售",
        "出海船员",
    )
    for sample in samples:
        n_word.process_line(sample)
    n_word.process_file()