5-RNN-04 Word Embeddings



# Skip-gram word embeddings (word2vec)

import time
import numpy as np
import tensorflow as tf
from utils import utils
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile
from collections import Counter
import random
import os

# dataset_folder_path = '../datas/text8'
# dataset_filename = 'text8.zip'
# dataset_name = 'Text8 Dataset'
#
#
# class DLProgress(tqdm):
#     last_block = 0
#
#     def hook(self, block_num=1, block_size=1, total_size=None):
#         self.total = total_size
#         self.update((block_num - self.last_block) * block_size)
#         self.last_block = block_num
#
#
# if not isfile(dataset_filename):
#     with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:
#         urlretrieve(
#             'http://mattmahoney.net/dc/text8.zip',
#             dataset_filename,
#             pbar.hook)
#
# if not isdir(dataset_folder_path):
#     with zipfile.ZipFile(dataset_filename) as zip_ref:
#         zip_ref.extractall(dataset_folder_path)

with open('../datas/text8/text8') as f:
    text = f.read()

# NOTE: utils.preprocess handles the data preprocessing. All punctuation marks are replaced with
#       special tokens, e.g. a period becomes ` <PERIOD> `. Words that appear fewer than 5 times
#       are also removed, which greatly reduces noise and improves the quality of the word vectors.

words = utils.preprocess(text)  # see utils for the details of this preprocessing step
print(words[:100])
print("Total words: {}".format(len(words)))
print("Unique words: {}".format(len(set(words))))

# 2. Create two lookup dictionaries: {word: int} and {int: word}
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
print(vocab_to_int)
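
# Likewise, a plausible sketch of create_lookup_tables (an assumption about the utils module,
# not its actual code): words are ranked by frequency, so the most common word gets index 0.
#
# def create_lookup_tables(words):
#     word_counts = Counter(words)
#     sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
#     int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
#     vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
#     return vocab_to_int, int_to_vocab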

# Convert the text to a list of integer word ids using the vocabulary.
int_words = [vocab_to_int[word] for word in words]

# Subsampling: randomly discard very frequent words ('the', 'of', ...). Such words carry little
# information about the words around them, so dropping some of them reduces noise and speeds up training.
threshold = 1e-5
word_counts = Counter(int_words)
total_count = len(int_words)

freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1-np.sqrt(threshold / freqs[word]) for word in word_counts}
# random.random() returns a float in [0, 1); keep a word when the draw falls below its keep probability
train_words = [word for word in int_words if random.random() < (1-p_drop[word])]
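
# The drop probability above is the subsampling formula from Mikolov et al. (2013):
#     P_drop(w) = 1 - sqrt(t / f(w)),   with threshold t = 1e-5 and f(w) the word's relative frequency.
# For example, a word making up 1% of the corpus (f = 0.01) is dropped with probability
# 1 - sqrt(1e-5 / 0.01) ≈ 0.968, while a word with f <= 1e-5 is always kept.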


def get_target(words, idx, window_size=5):
    """
    按照随机窗口的大小,获取目标词(targets)
    :param words:  单词列表
    :param idx:    中心词的索引号
    :param window_size:   取词的窗口大小
    :return:
    """
    R = np.random.randint(1, window_size+1)
    # 如果(中心词的索引 - R) 小于0,那么start设置为0(即从words列表的开头开始取词);
    # 相反,大于0,那么start = 中心词的索引 - R
    start = idx - R if (idx - R) > 0 else 0
    end = idx + R
    targets_words = set(words[start:idx] + words[idx+1: end+1])
    return list(targets_words)
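
# Illustrative example (the exact result depends on the random window draw R): with
# words = [10, 11, 12, 13, 14, 15, 16] and idx = 3, a drawn R of 2 returns the context
# words {11, 12, 14, 15}; the centre word 13 itself is never included.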


def get_batches(words, batch_size, window_size=5):
    """
    创建获取批量数据的生成器 (inputs, targets)
    :param words:   整个文本
    :param batch_size:
    :param window_size:
    :return:
    """
    n_batches = len(words) // batch_size
    # 只取整数倍个批量
    words = words[:n_batches*batch_size]

    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx: idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            bctch_y = get_target(words=batch, idx=ii, window_size=window_size)
            y.extend(bctch_y)
            x.extend([batch_x]*len(bctch_y))
        yield x, y
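
# Illustrative shape of one batch: each centre word is repeated once per context word, e.g.
#     x = [a, a, b, b, b, c, ...]
#     y = [ctx_a1, ctx_a2, ctx_b1, ctx_b2, ctx_b3, ctx_c1, ...]
# so len(x) == len(y) and every element forms one (input, target) skip-gram training pair.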


# Build the model graph
train_graph = tf.Graph()
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')


# Embedding layer
n_vocab = len(vocab_to_int)  # vocabulary size
n_embedding = 200  # dimensionality of the word vectors
with train_graph.as_default():
    # Embedding matrix, initialised uniformly in [-1, 1)
    embeded_matrix = tf.Variable(
        initial_value=tf.random_uniform(shape=[n_vocab, n_embedding], minval=-1, maxval=1)
    )
    embed = tf.nn.embedding_lookup(params=embeded_matrix, ids=inputs)
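
# Note (not in the original script): tf.nn.embedding_lookup simply gathers rows of the matrix,
# i.e. embed[i] == embeded_matrix[inputs[i]]. It is equivalent to multiplying a one-hot encoding
# of `inputs` by `embeded_matrix`, just far cheaper than materialising the one-hot vectors.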

# Negative sampling
"""
Negative sampling: instead of computing the full softmax over the whole vocabulary, each training
step updates the output weights for the true (positive) target word and for only a small random
sample of negative words.
"""
n_sampled = 100  # number of negative samples drawn per training example
with train_graph.as_default():
    # Weights and biases from the hidden (embedding) layer to the output layer
    softmax_w = tf.get_variable(
        'w', shape=[n_vocab, n_embedding], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.1)
    )
    softmax_b = tf.get_variable(
        'b', shape=[n_vocab], dtype=tf.float32, initializer=tf.zeros_initializer()
    )
    # Sampled softmax loss: an approximation of the full softmax loss based on negative sampling
    loss = tf.nn.sampled_softmax_loss(
        weights=softmax_w, biases=softmax_b, labels=labels, inputs=embed,
        num_sampled=n_sampled, num_classes=n_vocab
    )
    cost = tf.reduce_mean(loss)
    train_opt = tf.train.AdamOptimizer().minimize(cost)
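
# Optional sanity check (not part of the original script): sampled_softmax_loss is a training-time
# approximation, so a common way to monitor the embeddings is cosine similarity between a few
# validation words and the whole vocabulary. A minimal sketch, reusing `embeded_matrix` from above:
#
# with train_graph.as_default():
#     norm = tf.sqrt(tf.reduce_sum(tf.square(embeded_matrix), axis=1, keepdims=True))
#     normalized_embedding = embeded_matrix / norm
#     valid_examples = np.array(random.sample(range(100), 16))   # 16 frequent words
#     valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
#     valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
#     similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))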


epochs = 20
batch_size = 1000
window_size = 10
checkpoint_dir = './models/word2vec'

def train():
    with train_graph.as_default():
        saver = tf.train.Saver()
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())
        step = 1
        for e in range(1, epochs + 1):  # range(1, epochs) would run one epoch too few
            for x, y in get_batches(train_words, batch_size, window_size=window_size):
                feed = {inputs: x, labels: np.array(y)[:, None]}
                # Run one optimisation step
                sess.run(train_opt, feed_dict=feed)
                if step % 10 == 0:
                    train_loss = sess.run(cost, feed)
                    print('Epochs:{} - step:{} - Train loss:{:.5f}'.format(
                        e, step, train_loss))
                step += 1
        # Persist (save) the trained model
        files = 'model.ckpt'
        save_files = os.path.join(checkpoint_dir, files)
        saver.save(sess, save_path=save_files)


def visualize():
    # Restore the trained model and fetch the learned embedding matrix.
    with train_graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=train_graph) as sess:
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
        embed_mat = sess.run(embeded_matrix)

    # Visualisation: use t-SNE (from scikit-learn) to project the high-dimensional
    # word vectors down to 2 dimensions.
    import matplotlib.pyplot as plt
    from sklearn.manifold import TSNE
    viz_words = 500
    tsne = TSNE()
    embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])
    fig, ax = plt.subplots(figsize=(14, 14))
    for idx in range(viz_words):
        plt.scatter(*embed_tsne[idx, :], color='steelblue')
        plt.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
    plt.show()


if __name__ == '__main__':
    train()
    visualize()





