1.算左右熵:
import re import math # print(doc2list(str1)) def gen_words(self, doc): # pattern = re.compile('[:“。”,!?、《》……;’‘\n——\r\t)、(——^[1-9]d*$]') # pattern = re.compile('[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。??:、~@#”“¥:%……&*()]+|[[A-Za-z0-9]*$]'.decode('utf-8')) # pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+') pattern = re.compile('[^\u4e00-\u9fa5-_——/\\\]+') doc = pattern.sub(r'', doc) word_index = extract_cadicateword(doc, self.max_word_len) word_cad = {} # 后选词的字典 for suffix in word_index: word = doc[suffix[0]:suffix[1]] if word not in word_cad: word_cad[word] = wordinfo(word) # record frequency of word and left neighbors and right neighbors # print(doc[suffix[0]-1:suffix[0]],doc[suffix[1]:suffix[1]+1],word) word_cad[word].update_data(doc[suffix[0] - 1:suffix[0]], doc[suffix[1]:suffix[1] + 1]) length = len(doc) # computing frequency of candicate word and entropy of left/right neighbors for word in word_cad: word_cad[word].compute_indexes(length) # ranking by length of word values = sorted(word_cad.values(), key=lambda x: len(x.text)) for v in values: if len(v.text) == 1: continue v.compute_pmi(word_cad) # ranking by freq return sorted(values, key=lambda v: len(v.text), reverse=False) class wordinfo(object): ''' Record every candidate word information include left neighbors, right neighbors, frequency, PMI ''' def __init__(self,text): super(wordinfo,self).__init__() self.text = text self.freq = 0.0 self.left = [] #record left neighbors self.right = [] #record right neighbors self.pmi = 0 def update_data(self,left,right): self.freq += 1.0 if left: self.left.append(left) if right: self.right.append(right) def compute_indexes(self,length): #compute frequency of word,and left/right entropy self.freq /= length self.left = compute_entropy(self.left) self.right = compute_entropy(self.right) def compute_pmi(self,words_dict): #compute all kinds of combines for word sub_part = gen_bigram(self.text) if len(sub_part) > 0: # 
print(min(map(lambda x : math.log(self.freq/words_dict[x[0]].freq/words_dict[x[-1]].freq),sub_part))) self.pmi = min(map(lambda x : math.log(self.freq/words_dict[x[0]].freq/words_dict[x[-1]].freq),sub_part)) # print(len(sub_part)) def extract_cadicateword(_doc, _max_word_len): # _doc 全语料,拼起来 indexes = [] # print(_doc) doc_length = len(_doc) for i in range(doc_length): for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1)): indexes.append((i, j)) # print(indexes) return sorted(indexes, key=lambda x: x[-1]) # return indexes ##################################### def gen_bigram(_word_str): ''' A word is divide into two part by following all possible combines. For instance, ABB can divide into (a,bb),(ab,b) :param _word_str: :return: ''' return [(_word_str[0:_i],_word_str[_i:]) for _i in range(1,len(_word_str))] def compute_pmi(words_dict,pmi_word): text = pmi_word word_freq = words_dict[text] print('word_freq==',word_freq) sub_part = gen_bigram(text) print('sub_part==',sub_part) pmi=-10000000000 if len(sub_part) > 0: # pmi = min(map(lambda x : math.log(word_freq/words_dict[x[0]]/words_dict[x[-1]]),sub_part)) pmi = min(map(lambda x: len(doc)*(word_freq / words_dict[x[0]] / words_dict[x[-1]]), sub_part)) print('pmi===',pmi) print('word, pmi==',text,' ',pmi) if __name__ == "__main__": # doc是你要统计的语料。比如30M各个领域的语料拼接起来的长长字符串(需要去掉句号等符号)。 doc= '本届大赛共设置单项奖、单元奖和年度追光奖等奖项。其中单项奖,分别设有最佳剪辑奖、最佳创意奖、最佳故事奖、最佳男、女主角奖以及最佳人气奖六个奖项。颁奖典礼上,著名导演陆川、腾讯新闻运营总经理黄晨霞、腾讯媒体市场部总经理、区域集群总经理易海燕,腾讯新闻产品总经理王京津,知名电视策划人石述思等嘉宾分别为获奖选手颁奖。' #doc= '本届大赛共设置单项奖、单元奖和年度追光奖等奖项。' indexes = extract_cadicateword(doc,6) # 这个6就是'长阳二店分行'的长度 print(indexes) from collections import Counter word_dict=Counter() for start,end in indexes: word_dict.update([doc[start:end]]) print('word_dict=',word_dict) compute_pmi(word_dict,'长阳二店分行') # 计算'长阳二店分行'的左右熵
2. 算左右熵:新词发现是在第一步算了互信息后,在互信息经过阈值过滤的基础上再进行左右熵的计算,把符合阈值的词保留下来:
下面给出两个计算左右熵的函数,第一个如下:
def compute_entropy(_list):
    """Shannon entropy of a list of neighbor characters.

    Tallies how often each distinct element appears, then returns
    ``-sum(p * log(p))`` over the resulting distribution. An empty list
    yields 0.
    """
    total = float(len(_list))
    if total == 0:
        return 0
    counts = {}
    for item in _list:
        counts[item] = counts.get(item, 0) + 1
    return -sum((c / total) * math.log(c / total) for c in counts.values())
另外一个更好用,直接输入要算的词和文本:
def _get_entropy(self,candidate,text): matchd = re.finditer(candidate,text) left_char_dic=Counter() right_char_dic=Counter() for item in matchd: start,end=item.span() print(start,text[start],end,text[end-1]) if start!=0: left_char_dic.update([text[start-1]]) if end!=len(text): right_char_dic.update([text[end]]) print(left_char_dic,right_char_dic) length = float(sum(left_char_dic.values())) left_entropy = sum(map(lambda x: - x/length * math.log(x/length) , left_char_dic.values())) if length!=0 else 0 length = float(sum(right_char_dic.values())) right_entropy = sum(map(lambda x: - x/length * math.log(x/length) , right_char_dic.values())) if length!=0 else 0 print(left_entropy,right_entropy) return min(left_entropy,right_entropy)