word2vec实现cbow和skip-gram

skip-gram
这里写图片描述

cbow
这里写图片描述
1.CBOW实现


"""
学习参考：
http://www.hankcs.com/ml/cbow-word2vec.html
https://blog.csdn.net/layumi1993/article/details/72866235
https://blog.csdn.net/linxuheng/article/details/70170888
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

os.environ["CUDA_VISIBLE_DEVICES"] = "9"

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Download the data.
# Base URL; maybe_download() appends the requested file name to it.
url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
    """Return a verified local copy of `filename`, downloading it if absent.

    The file is fetched from the module-level `url` prefix only when nothing
    exists at `filename` yet.  Its on-disk size is then compared with
    `expected_bytes`.

    Args:
        filename: File name, used both as the remote name appended to `url`
            and as the local path.
        expected_bytes: Size in bytes that the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size differs (the actual size is printed first).
    """
    if not os.path.exists(filename):
        # urlretrieve returns a (local_path, headers) pair; keep the path only.
        filename = urllib.request.urlretrieve(url + filename, filename)[0]

    # st_size reports the file size in bytes.
    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename


# Fetch (or reuse) the text8 archive; the second argument is the size in bytes
# that maybe_download() checks the local file against.
filename = maybe_download('text8.zip', 31344016)


# Read the data into a list of strings.
def read_data(filename):
    """Return the words stored in the first member of a zip archive.

    Args:
        filename: Path of the zip archive to open.

    Returns:
        A list of str tokens obtained by decoding the first archive member
        and splitting it on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # Decode the raw bytes to str, then tokenize on whitespace.
    return tf.compat.as_str(raw_bytes).split()


words = read_data(filename)
# Author's note: the token count printed here was 17005207.
print('Data size', len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Total number of ids, including the 'UNK' placeholder (see build_dataset).
vocabulary_size = 50000


def build_dataset(words, vocab_size=None):
    """Encode a token sequence as integer ids over a frequency-capped vocabulary.

    Id 0 is reserved for the 'UNK' placeholder.  The remaining ids are assigned
    in the order returned by Counter.most_common, so the more frequent a word
    is, the smaller its id.  Every token outside the vocabulary is encoded as 0.

    Args:
        words: Sequence of tokens.
        vocab_size: Total vocabulary size including 'UNK'.  When omitted, the
            module-level `vocabulary_size` is used.

    Returns:
        A tuple (data, count, dictionary, reverse_dictionary):
          data: list with one id per input token.
          count: [['UNK', unk_count], (word, frequency), ...].
          dictionary: maps word -> id.
          reverse_dictionary: maps id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    # The UNK count is a placeholder until the corpus has been scanned.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    dictionary = {}
    for word, _ in count:
        # Ids follow insertion order: UNK -> 0, most frequent word -> 1, ...
        dictionary[word] = len(dictionary)

    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data key:%sSample data:%s' % (data[:10], [reverse_dictionary[i] for i in data[:10]]))

# Read position inside `data`; generate_batch() advances it on every call.
data_index = 0

cbow_window = 1  # How many words to consider left and right.
# Step 3: Function to generate a training batch for the CBOW model.
def generate_batch(batch_size,  cbow_window):
    """Produce one CBOW training batch from the module-level `data` stream.

    A window of `2 * cbow_window + 1` consecutive word ids slides over `data`.
    For every window position the center id becomes the label and all other
    ids in the window become the context row.

    Reads and updates the module-level `data_index`, wrapping around at the end
    of `data`, so consecutive calls continue where the previous one stopped.

    Args:
        batch_size: Number of (context, center) examples to produce.
        cbow_window: Number of context words taken on each side of the center.

    Returns:
        A tuple (batch, labels):
          batch: int32 array of shape (batch_size, 2 * cbow_window) holding the
            context ids in text order (left context first, then right).
          labels: int32 array of shape (batch_size, 1) holding the center ids.
    """
    global data_index
    span = 2 * cbow_window + 1  # [ cbow_window | center | cbow_window ]
    batch = np.ndarray(shape=(batch_size, cbow_window * 2), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Bounded deque: appending a new id on the right automatically drops the
    # oldest id on the left, which moves the window one word forward.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size):
        labels[i, 0] = buffer[cbow_window]
        # Every non-center position is a context word.  The model averages the
        # context embeddings, so the column order carries no information and
        # no random shuffling is needed.
        batch[i, :] = [word for position, word in enumerate(buffer)
                       if position != cbow_window]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Backtrack by one window: the window currently in `buffer` has not been
    # used as a training example yet.  Without this step the next call would
    # start `span` words later and those words would never become centers.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


# Sanity check: draw a small batch and print it in readable form.
# Each row of `batch` holds the ids of the two context words and the matching
# row of `labels` holds the id of the center word.  For the text
# "anarchism originated as a term of abuse first used" a printed line looks like
#   [5243   12] ['anarchism', 'as'] -> 3084 originated
# (the two context words may be listed in either order).
batch, labels = generate_batch(batch_size=8,cbow_window=1)

for i in range(8):
    context_ids = batch[i, :]
    center_id = labels[i, 0]
    context_words = [reverse_dictionary[batch[i, 0]],
                     reverse_dictionary[batch[i, 1]]]
    print(context_ids, context_words,
          '->', center_id, reverse_dictionary[center_id])

# Step 4: Build and train a CBOW model.

batch_size = 128  # Number of training examples per batch.
embedding_size = 128  # Dimension of the embedding vector.


# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
# The bare string below is the author's original note; it is a string statement
# rather than a comment, so it is kept verbatim.  In short: validation words
# are drawn at random from the most frequent words, 16 distinct ids out of the
# first 100.
'''
通过测量向量空间中最接近的向量来建立验证集，并使用英语知识以确保这些词确实是相似的。
这将在下一节中进行具体讨论。不过我们可以先暂时使用另一种方法，从词汇表最常用的词中随机提取验证单词，
上面的代码从 0 到 100 中随机选择了 16 个整数——这些整数与文本数据中最常用的 100 个单词的整数索引相对应。
https://blog.csdn.net/IAMoldpan/article/details/78707140
'''
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# replace=False draws without repetition (the default, True, would allow
# duplicates), e.g. array([45, 85,  5, 59, 26, 75, 95, 84, 56,  7, 16, 78, 66, ...])
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64  # Number of negative examples to sample.

# Build the CBOW computation graph: embedding lookup for the context words,
# NCE loss against the center word, an SGD step, and a cosine-similarity op
# used to inspect nearest neighbours of the validation words.
graph = tf.Graph()

with graph.as_default():
    # Input data.
    # One row of 2 * cbow_window context ids per example; one center id per example.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size,cbow_window*2])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        # Background on embedding_lookup: https://blog.csdn.net/u013041398/article/details/60955847
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        print('shape:',embed.get_shape())# author's note: shape (128, 2, 128)
        # Alternative kept by the author (disabled): look up each context
        # column separately, concatenate along a new axis and average.
        # embeds = None
        # for i in range(2 * cbow_window):
        #     embedding_i = tf.nn.embedding_lookup(embeddings, train_inputs[:, i])
        #     print('embedding %d shape: %s' % (i, embedding_i.get_shape().as_list()))
        #     emb_x, emb_y = embedding_i.get_shape().as_list()
        #     if embeds is None:
        #         embeds = tf.reshape(embedding_i, [emb_x, emb_y, 1])
        #     else:
        #         embeds = tf.concat([embeds, tf.reshape(embedding_i, [emb_x, emb_y, 1])],2)
        #
        # assert embeds.get_shape().as_list()[2] == 2 * cbow_window
        # print("Concat embedding size: %s" % embeds.get_shape().as_list())
        # avg_embed = tf.reduce_mean(embeds, 2, keep_dims=False)
        # print("Avg embedding size: %s" % avg_embed.get_shape().as_list())

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        # Alternative kept by the author (disabled): full softmax with
        # cross-entropy instead of NCE; the author notes it takes somewhat longer.
        # (batch_size=128, vocabulary_size=50000)
        # hidden_out = tf.matmul(embed, tf.transpose(nce_weights)) + nce_biases
        # one_hot_labels=tf.one_hot(tf.reshape(train_labels,[batch_size]),depth=vocabulary_size)
        # loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=hidden_out,
        #                                   name=None))
    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # The context embeddings are averaged over axis 1 before being passed in.
    # NOTE(review): the integer labels are cast to float32 here; the TensorFlow
    # docs describe `labels` as an int64 tensor - confirm the cast is needed.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(nce_weights, nce_biases, tf.cast(train_labels, tf.float32), tf.reduce_mean(embed,1),
                       num_sampled, vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    # NOTE(review): `keep_dims` and `initialize_all_variables` (below) look like
    # older TF spellings - confirm against the installed TensorFlow version.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    # Result is a (valid_size, vocabulary_size) tensor, i.e. (16, 50000): one
    # row per validation word, one column per vocabulary word, each entry being
    # the similarity between the two.
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.initialize_all_variables()

# Step 5: Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
    # Variables have to be initialized before the first training step.
    init.run()
    print("Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, cbow_window)

        # Running the optimizer op performs one update; the loss is fetched in
        # the same call so it can be accumulated for reporting.
        _, loss_val = session.run(
            [optimizer, loss],
            feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 != 0:
            continue

        sim = similarity.eval()
        top_k = 8  # number of nearest neighbors
        for i in xrange(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            # argsort sorts ascending, so the similarities are negated to get
            # the best matches first.  Position 0 is skipped because the word
            # most similar to a word is the word itself.
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            neighbours = "".join(
                " %s," % reverse_dictionary[nearest[k]] for k in xrange(top_k))
            print(("Nearest to %s:" % valid_word) + neighbours)
    final_embeddings = normalized_embeddings.eval()


# Step 6: Visualize the embeddings.

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points, annotate each with its label and save the figure.

    Relies on the module-level name `plt` being bound before the call.

    Args:
        low_dim_embs: Array whose row i holds the (x, y) position of labels[i].
        labels: Labels to draw; there must not be more labels than rows.
        filename: Path the figure is saved to.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches

    # Shared placement of the text relative to its point.
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')
    for index in range(len(labels)):
        x, y = low_dim_embs[index, :]
        plt.scatter(x, y)
        plt.annotate(labels[index], xy=(x, y), **annotation_style)

    plt.savefig(filename)


# Project the embeddings of the first `plot_only` ids to 2-D with t-SNE and
# save the plot.  If an import fails, a hint is printed instead.
try:
    # Imported here, after training, so a missing package only disables the plot.
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt  # plot_with_labels() reads this module-level name

    # NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
    # `max_iter` - confirm against the installed version.
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500  # ids 0..499; low ids are the most frequent words
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")

Average loss at step  92000 :  4.13854655159
Average loss at step  94000 :  4.20275877428
Average loss at step  96000 :  4.14698807251
Average loss at step  98000 :  3.88556081307
Average loss at step  100000 :  3.90406033587
Nearest to years: internet, days, stages, whitcomb, alban, analyzing, positivism, alley,
Nearest to have: had, has, were, are, require, be, never, ifrcs,
Nearest to called: gel, martens, given, fin, tsar, predetermined, comnenus, believers,
Nearest to use: mesoplodon, measure, electrical, most, study, welt, gysin, thereafter,
Nearest to state: producer, soil, accomplishments, callithrix, duchy, government, mississippi, story,
Nearest to or: and, structuring, mythos, than, insular, scratch, hint, prix,
Nearest to it: he, she, this, there, usages, they, herbert, what,
Nearest to some: many, several, all, these, lucid, blended, functionally, those,
Nearest to also: still, often, always, generally, steep, never, adopted, originally,
Nearest to after: before, when, during, ddrmax, kerr, aurangzeb, from, climbing,
Nearest to would: will, could, can, may, should, must, might, cannot,
Nearest to be: been, is, was, become, are, have, were, matter,
Nearest to other: different, arthritis, various, pertwee, plasma, guderian, herbaceous, lagrange,
Nearest to than: or, facing, shrugged, much, nyu, annealing, elmo, inducing,
Nearest to this: which, it, our, albums, joaquin, bet, any, grass,
Nearest to these: those, many, such, some, various, all, enchanted, they,
Please install sklearn, matplotlib, and scipy to visualize embeddings.

2.skip-gram实现
与 TensorFlow 自带的 word2vec_basic 示例相同，在其基础上添加了注释，并补充了 softmax + 交叉熵损失的写法（代码中已注释掉），在英文文本上进行训练。
这里写图片描述

"""
学习参考：
https://blog.csdn.net/sweetcandy2/article/details/73351031
https://www.jianshu.com/p/f682066f0586
https://www.jiqizhixin.com/articles/2017-11-20-3
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile
os.environ["CUDA_VISIBLE_DEVICES"] = "9"

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Download the data.
# Base URL; maybe_download() appends the requested file name to it.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Return a verified local copy of `filename`, downloading it if absent.

  The file is fetched from the module-level `url` prefix only when nothing
  exists at `filename` yet.  Its on-disk size is then compared with
  `expected_bytes`.

  Args:
    filename: File name, used both as the remote name appended to `url` and
      as the local path.
    expected_bytes: Size in bytes that the local file must have.

  Returns:
    The local path of the verified file.

  Raises:
    Exception: If the size differs (the actual size is printed first).
  """
  if not os.path.exists(filename):
    # urlretrieve returns a (local_path, headers) pair; keep the path only.
    filename = urllib.request.urlretrieve(url + filename, filename)[0]

  # st_size reports the file size in bytes.
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
        'Failed to verify ' + filename + '. Can you get to it with a browser?')

  print('Found and verified', filename)
  return filename

# Fetch (or reuse) the text8 archive; the second argument is the size in bytes
# that maybe_download() checks the local file against.
filename = maybe_download('text8.zip', 31344016)


# Read the data into a list of strings.
def read_data(filename):
  """Return the words stored in the first member of a zip archive.

  Args:
    filename: Path of the zip archive to open.

  Returns:
    A list of str tokens obtained by decoding the first archive member and
    splitting it on whitespace.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
  # Decode the raw bytes to str, then tokenize on whitespace.
  return tf.compat.as_str(raw_bytes).split()

words = read_data(filename)
# Author's note: the token count printed here was 17005207.
print('Data size', len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Total number of ids, including the 'UNK' placeholder (see build_dataset).
vocabulary_size = 50000

def build_dataset(words, vocab_size=None):
  """Encode a token sequence as integer ids over a frequency-capped vocabulary.

  Id 0 is reserved for the 'UNK' placeholder.  The remaining ids are assigned
  in the order returned by Counter.most_common, so the more frequent a word is,
  the smaller its id.  Every token outside the vocabulary is encoded as 0.

  Args:
    words: Sequence of tokens.
    vocab_size: Total vocabulary size including 'UNK'.  When omitted, the
      module-level `vocabulary_size` is used.

  Returns:
    A tuple (data, count, dictionary, reverse_dictionary):
      data: list with one id per input token.
      count: [['UNK', unk_count], (word, frequency), ...].
      dictionary: maps word -> id.
      reverse_dictionary: maps id -> word.
  """
  if vocab_size is None:
    vocab_size = vocabulary_size

  # The UNK count is a placeholder until the corpus has been scanned.
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocab_size - 1))

  dictionary = {}
  for word, _ in count:
    # Ids follow insertion order: UNK -> 0, most frequent word -> 1, ...
    dictionary[word] = len(dictionary)

  data = []
  unk_count = 0
  for word in words:
    index = dictionary.get(word)
    if index is None:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count

  reverse_dictionary = {index: word for word, index in dictionary.items()}
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data key:%sSample data:%s'% (data[:10], [reverse_dictionary[i] for i in data[:10]]))

# Read position inside `data`; generate_batch() advances it on every call.
data_index = 0


# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  """Produce one skip-gram training batch from the module-level `data` stream.

  A window of `2 * skip_window + 1` consecutive word ids slides over `data`.
  For every window position the center id is emitted `num_skips` times as an
  input, each time paired with a different, randomly chosen context id from
  the same window.

  Reads and updates the module-level `data_index`, wrapping around at the end
  of `data`, so consecutive calls continue where the previous one stopped.

  Args:
    batch_size: Number of (input, label) pairs; must be a multiple of
      `num_skips`.
    num_skips: Number of context words sampled per center word; at most
      `2 * skip_window`.
    skip_window: Number of context words available on each side of the center.

  Returns:
    A tuple (batch, labels):
      batch: int32 array of shape (batch_size,) holding the center ids.
      labels: int32 array of shape (batch_size, 1) holding the context ids.
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]

  # Bounded deque: appending a new id on the right automatically drops the
  # oldest id on the left, which moves the window one word forward.
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)

  for i in range(batch_size // num_skips):
    target = skip_window  # start at the center so the first draw is forced
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      # Redraw until a window position is found that is neither the center
      # nor a context position already used for this center word.
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)  # inclusive on both ends
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)

  # Backtrack by one window: the window currently in `buffer` has not been
  # used as a training example yet.  Without this step the next call would
  # start `span` words later and those words would never become centers.
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels
# Sanity check: draw a small batch and print it in readable form.
# batch[i] is the id of a center word and labels[i, 0] the id of one of its
# context words.  For the text
# "anarchism originated as a term of abuse first used" printed lines look like
#   3083 originated -> 5239 anarchism
#   12 as -> 6 a
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  center_id = batch[i]
  context_id = labels[i, 0]
  print(center_id, reverse_dictionary[center_id],
      '->', context_id, reverse_dictionary[context_id])

# Step 4: Build and train a skip-gram model.

batch_size = 128 # Number of training examples per batch.
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
# The bare string below is the author's original note; it is a string statement
# rather than a comment, so it is kept verbatim.  In short: validation words
# are drawn at random from the most frequent words, 16 distinct ids out of the
# first 100.
'''
通过测量向量空间中最接近的向量来建立验证集，并使用英语知识以确保这些词确实是相似的。
这将在下一节中进行具体讨论。不过我们可以先暂时使用另一种方法，从词汇表最常用的词中随机提取验证单词，
上面的代码从 0 到 100 中随机选择了 16 个整数——这些整数与文本数据中最常用的 100 个单词的整数索引相对应。
https://blog.csdn.net/IAMoldpan/article/details/78707140
'''
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# replace=False draws without repetition (the default, True, would allow
# duplicates), e.g. array([45, 85,  5, 59, 26, 75, 95, 84, 56,  7, 16, 78, 66, ...])
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

# Build the skip-gram computation graph: embedding lookup for the center words,
# NCE loss against the context words, an SGD step, and a cosine-similarity op
# used to inspect nearest neighbours of the validation words.
graph = tf.Graph()

with graph.as_default():

  # Input data.
  # One center id per example and one context id per example.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    # Background on embedding_lookup: https://blog.csdn.net/u013041398/article/details/60955847
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Alternative kept by the author (disabled): full softmax with
    # cross-entropy instead of NCE; the author notes it takes somewhat longer.
    # (batch_size=128, vocabulary_size=50000)
    # hidden_out = tf.matmul(embed, tf.transpose(nce_weights)) + nce_biases
    # one_hot_labels=tf.one_hot(tf.reshape(train_labels,[batch_size]),depth=vocabulary_size)
    # loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=hidden_out,
    #                                   name=None))
  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # NOTE(review): the integer labels are cast to float32 here; the TensorFlow
  # docs describe `labels` as an int64 tensor - confirm the cast is needed.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(nce_weights, nce_biases, tf.cast( train_labels,tf.float32),embed,
                     num_sampled, vocabulary_size))

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  # NOTE(review): `keep_dims` and `initialize_all_variables` (below) look like
  # older TF spellings - confirm against the installed TensorFlow version.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  # Result is a (valid_size, vocabulary_size) tensor, i.e. (16, 50000): one row
  # per validation word, one column per vocabulary word, each entry being the
  # similarity between the two.
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Add variable initializer.
  init = tf.initialize_all_variables()

# Step 5: Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
  # Variables have to be initialized before the first training step.
  init.run()
  print("Initialized")

  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)

    # Running the optimizer op performs one update; the loss is fetched in the
    # same call so it can be accumulated for reporting.
    _, loss_val = session.run(
        [optimizer, loss],
        feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
    average_loss += loss_val

    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 != 0:
      continue

    sim = similarity.eval()
    top_k = 8  # number of nearest neighbors
    for i in xrange(valid_size):
      valid_word = reverse_dictionary[valid_examples[i]]
      # argsort sorts ascending, so the similarities are negated to get the
      # best matches first.  Position 0 is skipped because the word most
      # similar to a word is the word itself.
      nearest = (-sim[i, :]).argsort()[1:top_k+1]
      neighbours = "".join(
          " %s," % reverse_dictionary[nearest[k]] for k in xrange(top_k))
      print(("Nearest to %s:" % valid_word) + neighbours)
  final_embeddings = normalized_embeddings.eval()

# Step 6: Visualize the embeddings.

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  """Scatter-plot 2-D points, annotate each with its label and save the figure.

  Relies on the module-level name `plt` being bound before the call.

  Args:
    low_dim_embs: Array whose row i holds the (x, y) position of labels[i].
    labels: Labels to draw; there must not be more labels than rows.
    filename: Path the figure is saved to.
  """
  assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
  plt.figure(figsize=(18, 18))  # in inches

  # Shared placement of the text relative to its point.
  annotation_style = dict(xytext=(5, 2),
                          textcoords='offset points',
                          ha='right',
                          va='bottom')
  for index in range(len(labels)):
    x, y = low_dim_embs[index, :]
    plt.scatter(x, y)
    plt.annotate(labels[index], xy=(x, y), **annotation_style)

  plt.savefig(filename)

# Project the embeddings of the first `plot_only` ids to 2-D with t-SNE and
# save the plot.  If an import fails, a hint is printed instead.
try:
  # Imported here, after training, so a missing package only disables the plot.
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt  # plot_with_labels() reads this module-level name

  # NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
  # `max_iter` - confirm against the installed version.
  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
  plot_only = 500  # ids 0..499; low ids are the most frequent words
  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, labels)

except ImportError:
  print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")

Average loss at step  92000 :  4.70792403448
Average loss at step  94000 :  4.61834581614
Average loss at step  96000 :  4.73397651279
Average loss at step  98000 :  4.61884291029
Average loss at step  100000 :  4.68271846807
Nearest to however: but, and, although, that, which, abitibi, when, though,
Nearest to first: last, agouti, second, stench, next, formidable, cerebral, abet,
Nearest to was: is, has, had, were, became, been, by, being,
Nearest to world: mitral, commuters, swims, erectus, dmd, peculiarities, largest, ssbn,
Nearest to three: four, five, two, six, seven, eight, callithrix, zero,
Nearest to in: during, at, on, abitibi, within, and, throughout, from,
Nearest to during: in, after, at, from, into, against, under, when,
Nearest to d: b, thaler, dasyprocta, section, immolation, alum, connecticut, groves,
Nearest to six: seven, eight, four, five, nine, three, two, zero,
Nearest to UNK: callithrix, dasyprocta, tamarin, upanija, reginae, iit, cebus, four,
Nearest to its: their, his, the, her, sheridan, microcebus, galvani, hooke,
Nearest to only: agouti, quine, callithrix, casuistry, microcebus, hypotension, prism, tamarin,
Nearest to united: redfern, successor, clodius, provocation, touring, upanija, treachery, of,
Nearest to for: dasyprocta, agouti, prism, with, during, tamarin, while, towards,
Nearest to been: be, was, were, by, become, pontificia, had, always,
Nearest to th: eight, seven, six, four, nine, frud, two, allotropes,
Please install sklearn, matplotlib, and scipy to visualize embeddings.

与 TensorFlow 自带的 word2vec_basic 示例相同，在其基础上添加了注释，并补充了 softmax + 交叉熵损失的写法，这次在中文文本上进行训练；中文需要先分词，这里选用哈工大的 LTP 工具。

这里写图片描述

Average loss at step  92000 :  4.19266751575
Average loss at step  94000 :  4.10320964313
Average loss at step  96000 :  4.11487320387
Average loss at step  98000 :  4.06696178246
Average loss at step  100000 :  4.10072786987
nearest: [ 5049 25731   169  2964   508 28401    38   347]
Nearest to 有着: 收取, 饺子, 因为, 不乏, 掠过, 一步之遥, 有, 拥有,
nearest: [  652   639   321   435 17367 18282   324    79]
Nearest to 道: 丝, 问道, 一道道, 怔, 反之, 落日, 枚, 声,
nearest: [  178   865   364 11195 16746  1153 22879 20820]
Nearest to 有些: 极为, 异常, 颇为, 对得起, 安安稳稳, 显, 魂殿待, 假体,
nearest: [  123  2706   629  1082  2614    47  7785 45052]
Nearest to 点: 摇, ☆, 松, 一番, ♀, 笑, ┅, ......................................,
nearest: [   71  5245  4610  3547  9404  6424  8918 11426]
Nearest to 上: 之上, 干瘦, 气浪, 绝, 黄莲精, 漠城, 天上, ︴,
nearest: [ 2706 11715 11426 23616  7785  6967  5496 10970]
Nearest to UNK: ☆, ♂, ︴, 第两百九十八, ┅, 功, 黑夜, 攻击性,
nearest: [  485   389  1009   912   160 11797  5185   159]
Nearest to 目光: 视线, 眼睛, 美眸, 眼神, 声音, ①, 怒喝道, 脸色,
nearest: [   1  116 2706    3   77 7785    9 2614]
Nearest to ,: ，, ’, ☆, 。, …, ┅, ”, ♀,
nearest: [ 142  334  105  148   76 2614  102 5124]
Nearest to 这种: 这个, 那种, 这些, 这般, 一些, ♀, 什么, 逛,
nearest: [  175   150 11715   663 10929 18793    57  2706]
Nearest to 出: 出来, 进, ♂, 幕, 囦体, 燃, 去, ☆,
nearest: [  31   18  260   59   94  166  266 2706]
Nearest to 你: 我, 他, 你们, 他们, 她, 我们, 它, ☆,
nearest: [  28  154  257 2706    9  126  140  211]
Nearest to “: ！, 听, 闻, ☆, ”, 见到, 对于, 随着,
nearest: [  403   472   673 11426   366   991   435   610]
Nearest to 笑: 苦笑, 微笑, 叹, ︴, 冷笑, 挥手, 怔, 咬,
nearest: [  132   438  8028   142 12533 36092 15307  8744]
Nearest to 一个: 个, 那个, 不动声色, 这个, 印射, 魂断涧, 大斗篷, 阻挡,
nearest: [  86 5732  143 1505 6795 7554   63 9404]
Nearest to 最后: 然后, 左, 如今, 长枪, 皱褶, 裔民, 旋即, 黄莲精,
nearest: [  118   162 16797 14389   256   318 14924  2947]
Nearest to 般: 一般, 犹如, 煞殿, 迈进, 如, 如同, 诀窍, 身材,

word2vec实现cbow和skip-gram

word2vec实现cbow和skip-gram

猜你喜欢