算法原理:
算法原理可以参考该链接
超参数
# Training hyper-parameters for the skip-gram word2vec model defined below.
learning_rate = 0.1  # SGD learning rate
batch_size = 128  # (center, context) pairs per training step
num_steps = 3000000  # total SGD steps
display_step = 10000  # print the running average loss every this many steps
eval_step = 200000  # run the nearest-neighbour evaluation every this many steps
valid_size = 20  # number of words used in the similarity evaluation
valid_window = 100  # evaluation words are drawn from the `valid_window` smallest (most frequent) ids
eval_words = np.random.choice(valid_window, valid_size, replace=False)  # random evaluation word ids
embedding_size = 200  # dimensionality of each word vector
max_vocabulary_size = 50000  # vocabulary cap, including the 'UNK' slot
min_occurrence = 10  # tokens rarer than this are mapped to 'UNK'
skip_window = 3  # how many context words to consider on each side of the center word
num_skips = 2  # how many context words are sampled per center word
num_sampled = 64  # negative samples per batch for the NCE loss
解释:
- 我们模型的验证是:计算与 eval_words 数组中的词最近似的几个词
词典生成模块
def make_vocabulary(data, max_vocab_size=None, min_count=None):
    """Build a vocabulary and an id-encoded corpus from a flat token list.

    Parameters
    ----------
    data : list of str
        One-dimensional list of tokens. Each element may be a single
        character or a word produced by segmentation; sentences are
        simply concatenated into this list.
    max_vocab_size : int, optional
        Maximum vocabulary size including the 'UNK' slot. Defaults to
        the module-level ``max_vocabulary_size``.
    min_count : int, optional
        Tokens occurring fewer than this many times are dropped and
        mapped to 'UNK'. Defaults to the module-level ``min_occurrence``.

    Returns
    -------
    tuple
        ``(data_id, word2id, id2word, word2count, vocabulary_size)`` where
        ``data_id`` is ``data`` re-encoded as ids (0 == 'UNK').
    """
    if max_vocab_size is None:
        max_vocab_size = max_vocabulary_size
    if min_count is None:
        min_count = min_occurrence
    # Reserve id 0 for the out-of-vocabulary token; its count is patched below.
    word2count = [('UNK', -1)]
    # BUG FIX: count whole tokens. The original used
    # Counter("".join(data)), which collapses every token into single
    # characters and is wrong whenever `data` holds segmented words.
    word2count.extend(collections.Counter(data).most_common(max_vocab_size - 1))
    # most_common() is sorted by descending count, so rare tokens can be
    # trimmed from the tail until the first frequent-enough one is met.
    for i in range(len(word2count) - 1, -1, -1):
        if word2count[i][1] < min_count:
            word2count.pop(i)
        else:
            break
    vocabulary_size = len(word2count)
    word2id = dict()
    for i, (word, _) in enumerate(word2count):
        word2id[word] = i
    data_id = list()
    unk_count = 0
    for word in data:
        index = word2id.get(word, 0)  # unknown tokens map to 'UNK' (id 0)
        if index == 0:
            unk_count += 1
        data_id.append(index)
    word2count[0] = ('UNK', unk_count)  # patch the real UNK count
    id2word = dict(zip(word2id.values(), word2id.keys()))
    return data_id, word2id, id2word, word2count, vocabulary_size
batch生成模块
data_index = 0  # cursor into `data`; persists across next_batch() calls


def next_batch(batch_size, num_skips, skip_window, data):
    """Produce one skip-gram training batch from the id-encoded corpus.

    Slides a window of ``2 * skip_window + 1`` tokens over ``data``
    (wrapping around at the end) and, for each center word, samples
    ``num_skips`` distinct context positions from the window.

    Returns a pair ``(centers, contexts)``: an int32 vector of shape
    ``(batch_size,)`` holding the center-word ids, and an int32 array of
    shape ``(batch_size, 1)`` holding the matching context-word ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    centers = np.ndarray(shape=(batch_size), dtype=np.int32)
    contexts = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [skip_window left .. center .. skip_window right]
    window = collections.deque(maxlen=span)
    # Restart from the beginning when a full window no longer fits.
    if data_index + span > len(data):
        data_index = 0
    window.extend(data[data_index:data_index + span])
    data_index += span
    for group in range(batch_size // num_skips):
        # Every window position except the center is a context candidate.
        candidates = [pos for pos in range(span) if pos != skip_window]
        picked = random.sample(candidates, num_skips)
        for offset, pos in enumerate(picked):
            row = group * num_skips + offset
            centers[row] = window[skip_window]
            contexts[row, 0] = window[pos]
        # Advance the window by one token, wrapping at the corpus end.
        if data_index == len(data):
            window.extend(data[0:span])
            data_index = span
        else:
            window.append(data[data_index])
            data_index += 1
    # Step the cursor back by `span` so consecutive batches overlap,
    # avoiding skipped tokens at batch boundaries.
    data_index = (data_index + len(data) - span) % len(data)
    return centers, contexts
样本生成图:
其中 skip_window=2,且图中 num_skips 取整个窗口大小(即 2*skip_window),这与上面代码中 num_skips=2(每个中心词只采样 2 个上下文词)的设置略有不同。
训练模型模块
def train():
    """Train skip-gram word embeddings with NCE loss (TensorFlow 1.x).

    Reads the corpus from "./序列标注文件.txt", builds the vocabulary,
    trains for ``num_steps`` SGD steps, periodically prints the average
    loss and the nearest neighbours of ``eval_words``, then saves the
    vocabulary to "./vocabulary_text" and the embedding matrix to
    "./vocabulary_vec.npy" / "./vocabulary_vec.txt".

    NOTE(review): uses ``xrange`` and TF 1.x APIs — this file targets
    Python 2 with TensorFlow 1.x.
    """
    s = time.time()
    print("加载数据....")
    data = read_file("./序列标注文件.txt")
    print("cost:%s" % str(time.time() - s))
    s = time.time()
    print("创建字典...")
    data_id, word2id, id2word, word2count, vocabulary_size = make_vocabulary(data)
    print("cost:%s" % str(time.time() - s))
    print("保存词汇表...")
    print("词汇表大小: %d" % vocabulary_size)
    # Persist words sorted by id so that line number == word id.
    with codecs.open("./vocabulary_text", "w", "utf8") as fout:
        pairs = sorted(list(word2id.items()), key=lambda x: int(x[1]))
        for word, wid in pairs:  # renamed from `id` to avoid shadowing the builtin
            fout.write("%s\t%s\n" % (word, str(wid)))

    # ---- Build the graph ----
    X = tf.placeholder(tf.int32, shape=[None])      # center word ids
    Y = tf.placeholder(tf.int32, shape=[None, 1])   # context word ids
    embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    X_embed = tf.nn.embedding_lookup(embedding, X)
    # NCE output-layer parameters.
    nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    loss_op = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=Y,
                       inputs=X_embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(loss_op)
    # Cosine similarity between evaluation words and every embedding row.
    # FIX: normalise X_embed per row (the original divided every row by the
    # norm of the whole batch; per-row ranking was unchanged, but the values
    # were not true cosines).
    X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed), 1, keepdims=True))
    embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
    cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)

    init = tf.global_variables_initializer()
    config = tf.ConfigProto()
    config.allow_soft_placement = True  # FIX: duplicate assignment removed
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    with tf.Session(config=config) as sess:
        sess.run(init)
        x_test = eval_words
        print(x_test)
        average_loss = 0
        s = time.time()
        for step in xrange(1, num_steps + 1):
            batch_x, batch_y = next_batch(batch_size, num_skips, skip_window, data_id)
            _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
            average_loss += loss
            if step % display_step == 0 or step == 1:
                if step > 1:
                    average_loss /= display_step
                print("Step " + str(step) + ", Average Loss= " +
                      "{:.4f}".format(average_loss) + " cost: %s" % str(time.time() - s))
                average_loss = 0
                s = time.time()
            if step % eval_step == 0 or step == 1:
                print("Evaluation...")
                sim = sess.run(cosine_sim_op, feed_dict={X: x_test})
                for i in xrange(len(eval_words)):
                    top_k = 8
                    # Skip rank 0 of the argsort: presumably the query word
                    # itself ranks first — TODO confirm.
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = '"%s" nearest neighbors:' % id2word[eval_words[i]]
                    for k in xrange(top_k):
                        log_str = '%s %s,' % (log_str, id2word[nearest[k]])
                    print(log_str)
        print("保存词向量...")
        embedding_array = np.array(sess.run(embedding), dtype=np.float32)
        np.save("./vocabulary_vec", embedding_array)
        np.savetxt("./vocabulary_vec.txt", embedding_array)
完整代码
完整代码GitHub链接