# coding: utf-8
import json
import re
import gensim
import random
from gensim.models import Word2Vec
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm
# import word2vec
def process(src, tgt=None):
    """Convert a raw embedding dump into word2vec text format.

    The source file holds a Python-style list of words on its first line
    and one space-separated vector per following line.
    """
    f = open(src, "r", encoding="utf-8")
    g = open(tgt, "w", encoding="utf-8")
    line = f.readline().strip("\ufeff\n")
    words = eval(line)  # first line is a list literal of words
    print(type(words))
    words_len = len(words)
    print(words_len)
    line = f.readline().strip()
    line_ = line.split()
    dim = len(line_)
    # word2vec text format starts with a "<vocab size> <dim>" header line
    g.write(str(words_len) + ' ' + str(dim) + '\n')
    g.write(words[0].strip() + " " + line + "\n")
    i = 1
    while i < words_len:
        line = f.readline().strip()
        g.write(words[i].strip() + " " + line + "\n")
        i += 1
    f.close()
    g.close()
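# Hedged usage sketch for process(); the file names are hypothetical, and the
# source layout (a list literal of words on line 1, then one vector per line)
# follows the parsing above:
# process("raw_vectors.txt", "vectors_w2v.txt")
# model = gensim.models.KeyedVectors.load_word2vec_format("vectors_w2v.txt", binary=False)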
def get_words_from_conll(src="new_test-raw_words.txt"):
    """Read a CoNLL-style file (one character per line, blank line between
    sentences) and return a list of character sequences."""
    f = open(src, "r", encoding="utf-8")
    words = []
    seqs = []
    lines = f.readlines()
    for line in lines:
        if line != '\n':
            line = line.strip('\ufeff\n').split()
            char = line[0]  # first column is the character
            words.append(char)
        else:
            seqs.append(words)
            words = []
    if words:  # keep the last sentence if the file lacks a trailing blank line
        seqs.append(words)
    print(len(seqs))
    f.close()
    return seqs
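# Expected input shape for get_words_from_conll(), sketched with made-up
# labels (only the first column is used; a blank line ends a sentence):
#   新 B-ORG
#   华 I-ORG
#   社 I-ORG
#   <blank line>
#   ...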
def get_words_from_txt(src="new_test-raw_words.txt"):
    """Read a plain text file and return its lines as a list."""
    f = open(src, "r", encoding="utf-8")
    words = []
    lines = f.readlines()
    for line in lines:
        line = line.strip('\ufeff\n')
        words.append(line)
    print(len(words))
    f.close()
    return words
def get_labels(src=None):
    """Read one label per line and return them as a list of ints."""
    f = open(src, "r", encoding="utf-8")
    words = []
    lines = f.readlines()
    for line in lines:
        word = line.strip("\ufeff\n")
        words.append(int(word))  # each line holds a numeric label
    print(len(words))
    f.close()
    return words
def calculate_similar(model, word_list, test_real_labels):
    """Predict 1 (similar) when the cosine similarity of a word pair falls in
    [0.9, 1.0), else 0, then report precision/recall/F1."""
    g = open("./result.txt", 'w', encoding="utf-8")
    test_pred_labels = []
    for i, word in enumerate(word_list):
        w1, w2 = word.split()
        if i % 100 == 0:
            print(i)
        try:
            score = model.similarity(w1, w2)
            if 0.9 <= score < 1.0:
                test_pred_labels.append(1)
                g.write("1" + "\n")
            else:
                test_pred_labels.append(0)
                g.write("0" + "\n")
        except KeyError:  # out-of-vocabulary word
            print(w1 + " " + w2 + " is problematic")
            test_pred_labels.append(0)
            g.write("0" + "\n")
    g.close()
    compute_metrics(test_real_labels, test_pred_labels)
def compute_metrics(real_labels, preds_labels):
    print("binary precision (pos_label=0):", precision_score(real_labels, preds_labels, average="binary", pos_label=0))
    print("binary recall (pos_label=0):", recall_score(real_labels, preds_labels, average="binary", pos_label=0))
    print("binary F1 (pos_label=0):", f1_score(real_labels, preds_labels, average="binary", pos_label=0))
    print("binary precision (pos_label=1):", precision_score(real_labels, preds_labels, average="binary", pos_label=1))
    print("binary recall (pos_label=1):", recall_score(real_labels, preds_labels, average="binary", pos_label=1))
    print("binary F1 (pos_label=1):", f1_score(real_labels, preds_labels, average="binary", pos_label=1))
def read_from_json(src):
    """Load a single-line JSON object from a file."""
    f = open(src, 'r', encoding='utf-8')
    line = f.readline()
    line = line.strip('\ufeff\n')
    f.close()  # the original never closed this handle
    return json.loads(line)
def get_word_from_json(words2id_txt="words2id.txt", tencent="tencent_pre_processed_with_200.txt", tgt=None):
    """Copy every Tencent embedding line whose word appears in words2id_txt."""
    words2id = read_from_json(words2id_txt)
    del words2id['<pad>']
    del words2id['<unk>']
    remaining = set(words2id.keys())  # set lookup instead of O(n) list scans
    f = open(tencent, 'r', encoding='utf-8')
    g = open(tgt, 'w', encoding='utf-8')
    i = 0
    while True:
        if i % 10000 == 0:
            print(i)
        i += 1
        line = f.readline()
        if not line:  # EOF guard: the original looped forever on short files
            break
        line = line.strip('\ufeff\n')
        line_ = line.split()
        word = line_[0]
        if word in remaining:
            g.write(line + '\n')
            remaining.discard(word)
            if len(remaining) == 0:
                break
    f.close()
    g.close()
def extract_ngram(all_sentences, min_freq=0):
    # all_sentences = [['迈','向','充','满','希','望','的','新','世','纪','-','-','一','九','九','八','年',...],[...],...]
    n_gram_dict = {}
    new_all_sentences = []
    for sen in all_sentences:
        str_sen = ''.join(sen)
        # split on anything that is not a CJK character, digit or Latin letter
        new_sen = re.split(u'[^\u4e00-\u9fa50-9a-zA-Z]', str_sen)
        for s in new_sen:
            if len(s) > 0:
                new_all_sentences.append(s)  # flatten the 2-D list into 1-D
    for sentence in new_all_sentences:
        for i in range(len(sentence)):
            for n in range(1, 5):  # collect 1- to 4-grams
                if i + n > len(sentence):
                    break
                n_gram = ''.join(sentence[i:i + n])
                if n_gram not in n_gram_dict:
                    n_gram_dict[n_gram] = 1
                else:
                    n_gram_dict[n_gram] += 1
    # drop rare grams and grams containing Latin letters
    new_ngram_dict = {
        gram: c for gram, c in n_gram_dict.items() if (c > min_freq and not re.search('[a-zA-Z]', gram))}
    return new_ngram_dict
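# Toy check for extract_ngram() (made-up input): 1- to 4-grams are counted
# over each contiguous CJK/alphanumeric run, and grams with Latin letters
# are dropped.
# extract_ngram([['新', '华', '社'], ['新', '华']])
# -> {'新': 2, '新华': 2, '新华社': 1, '华': 2, '华社': 1, '社': 1}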
def get_all_sentences(train, dev, test):
    train_sent = get_words_from_conll(train)
    dev_sent = get_words_from_conll(dev)
    test_sent = get_words_from_conll(test)
    all_sentences = train_sent + dev_sent + test_sent
    return all_sentences
def get_wb_ngram_txt(char2num, tencent_ngram, tgt=None):
    """Map each character to the 2- to 4-gram Tencent n-grams containing it;
    grams with Latin letters are filtered out."""
    char2ngram = {}
    g = open(tgt, 'w', encoding='utf-8')
    for ngram in tqdm(tencent_ngram):
        if 1 < len(ngram) < 5 and not re.search('[a-zA-Z]', ngram):
            # iterate the gram's own characters rather than all of char2num
            for char in set(ngram):
                if char in char2num:
                    if char not in char2ngram:
                        char2ngram[char] = [ngram]
                    else:
                        char2ngram[char].append(ngram)
    # one single-item JSON object per line, as choose_ngram() below expects
    for char, ngrams in char2ngram.items():
        g.write(json.dumps({char: ngrams}, ensure_ascii=False) + '\n')
    g.close()
def get_chars_from_conll(all_sentences, tgt):
    """Count character frequencies over all sentences and dump them as JSON."""
    chars2num = {}
    for seq in all_sentences:
        for char in seq:
            chars2num[char] = chars2num.get(char, 0) + 1
    g = open(tgt, "w", encoding="utf-8")
    res = json.dumps(chars2num, ensure_ascii=False)
    g.write(res + '\n')
    g.close()
    return chars2num
def choose_ngram(src, tgt, k=5, seed=66):
    """Randomly sample k n-grams per character and write the deduplicated
    union to tgt."""
    random.seed(seed)
    f = open(src, 'r', encoding='utf-8')
    g = open(tgt, 'w', encoding='utf-8')
    seen = set()  # set membership instead of list scans
    lines = f.readlines()
    for line in lines:
        line = line.strip('\ufeff\n')
        line = json.loads(line)
        (char, ngrams), = line.items()  # each line holds a single-item object
        # skip non-Chinese characters and characters with fewer than k n-grams
        if not is_cn_char(char) or len(ngrams) < k:
            continue
        k_ngrams = random.sample(ngrams, k)
        for n in k_ngrams:
            if n not in seen:
                seen.add(n)
                g.write(n + '\n')
    f.close()
    g.close()
def is_cn_char(word):
    # check whether a single character is a Chinese (CJK) character
    return 0x4e00 <= ord(word) <= 0x9fa5
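# e.g. is_cn_char('中') -> True, is_cn_char('a') -> False; note the argument
# must be a single character, since ord() rejects longer strings.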
if __name__ == "__main__":
    # Step 1: extract n-grams from the dataset
    # train = "ontonote_ner/train.txt"
    # dev = "ontonote_ner/dev.txt"
    # test = "ontonote_ner/test.txt"
    # all_sentences = get_all_sentences(train, dev, test)
    # weibo_ngram_dict = extract_ngram(all_sentences)
    # weibo_ngram = list(weibo_ngram_dict.keys())  # rough extraction
    # tencent_ngram_txt = "tencent_ngrams.txt"
    # tencent_ngram = get_words_from_txt(tencent_ngram_txt)
    # new_ngram = {}
    # for ngram in tqdm(tencent_ngram):
    #     if ngram in weibo_ngram:
    #         new_ngram[ngram] = weibo_ngram_dict[ngram]
    #         weibo_ngram.pop(weibo_ngram.index(ngram))
    #         if len(weibo_ngram) == 0:
    #             break
    # print(len(new_ngram))
    # g = open("weibo_ngram_from_tencent.txt", 'w', encoding='utf-8')
    # res = json.dumps(new_ngram, ensure_ascii=False)
    # g.write(res + '\n')
    # g.close()

    # Step 2: generate the per-character n-gram file
    # char_txt = "ontonote_ner/char2num.txt"
    # # char2num = get_chars_from_conll(all_sentences, char_txt)
    # char2num = read_from_json(char_txt)
    # tencent_ngram_txt = "tencent_ngrams.txt"
    # tencent_ngram = get_words_from_txt(tencent_ngram_txt)
    # tgt = "ontonote_ner/ontonote_all_ngram_from_tencent_get_by_char.json"
    # get_wb_ngram_txt(char2num, tencent_ngram, tgt)

    # Step 3: pull the embeddings for the words listed in word2ids.txt
    tencent = "tencent_pre_processed_with_200.txt"
    tgt = "new_ot_cws/tencent_words_emb_for_ot.txt"
    get_word_from_json(words2id_txt="new_re_cws/word2ids.txt", tencent=tencent, tgt=tgt)

    # Step 4: randomly pick 5 n-grams per character
    # src = "ontonote_ner/ontonote_all_ngram_from_tencent_get_by_char.json"
    # tgt = "ontonote_ner/ot_tencent_ngram.txt"
    # choose_ngram(src, tgt, k=5, seed=66)
Use the dataset together with the Tencent pretrained word vectors to extract the matching embeddings.
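As a quick sanity check on step 3's output, the extracted file can be loaded back with gensim; a minimal sketch, assuming gensim >= 4.0 (the file written by get_word_from_json() has no vocab/dimension header line, hence no_header=True):

from gensim.models import KeyedVectors

vecs = KeyedVectors.load_word2vec_format(
    "new_ot_cws/tencent_words_emb_for_ot.txt", binary=False, no_header=True)
print(vecs.most_similar(vecs.index_to_key[0], topn=5))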
Reposted from blog.csdn.net/tailonh/article/details/115001220