Deep learning language models (3): the word2vec negative sampling model (Keras version)

Copyright notice: this is an original post by the author and may not be reposted without permission. https://blog.csdn.net/u011311291/article/details/82778490

Contents:
Deep learning language models (1): the development of word2vec
Deep learning language models (2): word vectors and the neural probabilistic language model (Keras version)
Deep learning language models (3): the word2vec negative sampling model (Keras version)

The code is adapted from https://spaces.ac.cn/archives/4515.
That post draws its samples uniformly at random, whereas here I use negative sampling. A few details are still left unimplemented, but the overall framework is as follows.

# coding=utf-8
'''
Created on 2018-09-15

@author: admin
'''

from gensim import corpora, models, similarities
import numpy as np
import keras.backend as K
from keras.engine.topology import Layer
class NegativeLayer(Layer):
    '''Custom Keras layer that draws nb_negative negative word ids per sample
    by looking up uniform indices in the pre-built negative-sampling table M.'''
    def __init__(self, nb_negative, M, M_num, **kwargs):
        self.nb_negative = nb_negative
        self.M = M
        self.M_num = M_num
        super(NegativeLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        super(NegativeLayer, self).build(input_shape)

    def call(self, x, mask=None):
        batch = 0
        #The batch size is not known statically at graph-build time, so it is
        #hardcoded to 4 here (each sentence of the toy corpus yields 4 samples);
        #this is one of the unimplemented details mentioned above
        if not str(x.shape[0]).isdigit():
            batch = 4
        else:
            batch = x.shape[0]
        #Negative sampling: draw bucket indices uniformly and map them to word ids
        #through the table, so frequent words are drawn more often
        final_output = np.array([[self.M[i] for i in j] for j in np.random.randint(0, self.M_num+1, size=(batch, self.nb_negative))])
        #Convert to a tensor (K.constant would also work here)
        final_output = K.tensorflow_backend._to_tensor(final_output, dtype=np.int32)
        return final_output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.nb_negative)
if __name__ == '__main__':
    text = [["我","今天","打","篮球"],
            ["我","今天","打","足球"],
            ["我","今天","打","羽毛球"],
            ["我","今天","打","网球"],
            ["我","今天","打","排球"],
            ["我","今天","打","气球"],
            ["我","今天","打","游戏"],
            ["我","今天","打","冰球"],
            ["我","今天","打","人"],
            ["我","今天","打","台球"],
            ["我","今天","打","桌球"],
            ["我","今天","打","水"],
            ["我","今天","打","篮球"],
            ["我","今天","打","足球"],
            ["我","今天","打","羽毛球"],
            ["我","今天","打","网球"],
            ["我","今天","打","排球"],
            ["我","今天","打","气球"],
            ]
    #Build the dictionary (vocabulary) with gensim
    dictionary = corpora.Dictionary(text,prune_at=2000000)
    
    #Print the words in the dictionary
    for key in dictionary.iterkeys():
        print(key,dictionary.get(key),dictionary.dfs[key])
    #Save the dictionary
    dictionary.save_as_text('word_dict.dict',  sort_by_word=True)
    #Load the dictionary
    dictionary = dictionary.load_from_text('word_dict.dict')
    
    L = {}
    #Total number of word occurrences; dictionary.dfs maps word id -> number of documents the word appears in
    allword_num = np.sum(list(dictionary.dfs.values()))
    print(allword_num)
    #72
    
    #Build the negative-sampling table
    #Normalize the counts, lay the cumulative probabilities out on [0,1], then split [0,1] into M_num equal buckets so every bucket index maps back to a word id
    sum = 0
    M = {}
    M_num = 1000
    for id,num in dictionary.dfs.items():
        #Round up
        left = int(np.ceil(sum/(1/M_num)))
        sum = sum + num/allword_num
        L[id] = sum
        #Round down
        right = int(sum/(1/M_num))
        print(id,left,right)
#         11 0 13
#         0 14 263
#         10 264 277
#         12 278 291
#         1 292 541
#         2 542 791
#         7 792 819
#         13 820 833
#         8 834 861
#         14 862 875
#         9 875 888
#         3 889 916
#         6 917 944
#         5 945 972
#         4 973 1000
        for i in range(left,right+1):
            M[i] = id
    print(L)
    #{11: 0.013888888888888888, 0: 0.25, 10: 0.013888888888888888, 12: 0.013888888888888888, 1: 0.25, 2: 0.25, 7: 0.027777777777777776, 13: 0.013888888888888888, 8: 0.027777777777777776, 14: 0.013888888888888888, 9: 0.013888888888888888, 3: 0.027777777777777776, 6: 0.027777777777777776, 5: 0.027777777777777776, 4: 0.027777777777777776}
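
    #A quick sanity check of the sampling table (an illustrative sketch added here,
    #not part of the original post): draw a few bucket indices uniformly and map
    #them back to word ids. Frequent words own more buckets, so they come back more
    #often, which gives the unigram-frequency negative-sampling distribution.
    #Note that word2vec itself usually raises the counts to the 3/4 power before
    #normalizing; that detail is not implemented here.
    sample_ids = [M[i] for i in np.random.randint(0, M_num+1, size=10)]
    print("sampled negative word ids:", sample_ids)
    print("sampled words:", [dictionary.get(i) for i in sample_ids])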
        
    #Vocabulary size
    word_num = len(dictionary.keys())
    #Number of sentences used to generate each batch
    sentence_batch_size = 1
    #Sliding window size
    window = 3
    def data_generator(): #Training data generator
        while True:
            x,y = [],[]
            _ = 0
            for sentence in text:
                #Use word_num as the padding id
                sentence = [word_num]*window + [dictionary.token2id[w] for w in sentence if w in dictionary.token2id] + [word_num]*window
                for i in range(window, len(sentence)-window):
                    x.append(sentence[i-window:i]+sentence[i+1:i+1+window])
                    #The loss is sparse_categorical_crossentropy, so the target does not need to be one-hot encoded
                    y.append([sentence[i]])
                _ += 1
                if _ == sentence_batch_size:
                    x,y = np.array(x),np.array(y)
                    #The positive word always sits in the first output neuron, so every label is 0 (again because the loss is sparse_categorical_crossentropy)
                    z = np.zeros((len(x), 1))
                    print("input data :", x.shape)
                    print("target words :", y.shape)
                    print("labels :", z.shape)
                    yield [x,y],z
                    x,y = [],[]
                    _ = 0
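
    #A small shape check (an illustrative addition, not in the original post): with
    #window=3 and four in-vocabulary words per sentence, each sentence produces four
    #training positions, so one generated batch should be
    #x:(4,6) context ids, y:(4,1) target ids, z:(4,1) zero labels.
    [x_check, y_check], z_check = next(data_generator())
    print("x:", x_check.shape, "y:", y_check.shape, "z:", z_check.shape)
    #x: (4, 6) y: (4, 1) z: (4, 1)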
        
    from keras.models import Sequential
    from keras.layers import Dense, Activation, Embedding, Reshape, Flatten, Input, Lambda

    from keras.models import Model
    #Word vector dimension
    word_size = 100
    #Number of negative samples
    nb_negative = 16
    
    input_words = Input(shape=(window*2,), dtype='int32')
    input_vecs = Embedding(word_num+1, word_size, name='word2vec')(input_words)
    input_vecs_sum = Lambda(lambda x: K.sum(x, axis=1))(input_vecs) #CBOW: simply sum the context word vectors
    
    #Draw random negative samples and put them together with the target word
    target_word = Input(shape=(1,), dtype='int32')
    negatives = NegativeLayer(nb_negative,M,M_num)(target_word)
    samples = Lambda(lambda x: K.concatenate(x))([target_word,negatives]) #Candidate set: the target first, then the random negatives. A negative can occasionally collide with the target, but the probability is small.
    
    #An Embedding layer is used instead of a Dense layer so that only the output weights belonging to the target and the sampled negatives are updated, which greatly reduces memory use and computation
    softmax_weights = Embedding(word_num+1, word_size, name='W')(samples)
    softmax_biases = Embedding(word_num+1, 1, name='b')(samples)
    softmax = Lambda(lambda x: 
                        K.softmax((K.batch_dot(x[0], K.expand_dims(x[1],2))+x[2])[:,:,0])
                    )([softmax_weights,input_vecs_sum,softmax_biases]) #Store the parameters in Embedding layers and do the matrix multiplication with the backend, reproducing what a Dense layer would compute
    
    #Note that the target word was placed first when building the candidate set, so the softmax label is always 0; this matches how the z variable is built in data_generator
    
    model = Model(inputs=[input_words,target_word], outputs=softmax)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
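
    #What the sampled softmax above computes (explanatory note, not in the original
    #code): for a context vector h (the sum of the six context embeddings), the
    #target word t and negatives n_1..n_16, the Lambda layer evaluates
    #    softmax([h.W_t + b_t, h.W_n1 + b_n1, ..., h.W_n16 + b_n16])
    #over the 17 candidates, and sparse_categorical_crossentropy with label 0
    #minimizes -log of the first entry, i.e. the model learns to rank the true
    #target above the sampled negatives instead of computing a softmax over the
    #full vocabulary.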
    
    

     
    model.summary()
    model.fit_generator(data_generator(),steps_per_epoch =np.ceil(dictionary.num_docs/sentence_batch_size),epochs=100,max_queue_size=1,workers=1)
    #Save the model weights
    model.save_weights("DNNword-vec2.h5")
    #Load the model weights
    model.load_weights("DNNword-vec2.h5",by_name=True)
    #Get the Embedding layer weights, i.e. the word vectors
    embeddings = model.get_weights()[0]
    #Normalize the vectors to unit length
    normalized_embeddings = embeddings / (embeddings**2).sum(axis=1).reshape((-1,1))**0.5
    dictionary.id2token = {j:i for i,j in dictionary.token2id.items()}
    #Return the 15 most similar words
    def most_similar(w,dictionary):
        v = normalized_embeddings[dictionary.token2id[w]]
        #The vectors are already unit length, so cosine similarity reduces to a plain dot product
        sims = np.dot(normalized_embeddings, v)
        sort = sims.argsort()[::-1]
        #Keep only positive indices; this is carried over from the reference implementation (where id 0 was reserved), and here it also drops whichever word happens to have id 0
        sort = sort[sort > 0]
        return [(dictionary.id2token[i],sims[i]) for i in sort[:15] if i in dictionary.id2token]
     
    for sim in most_similar(u'网球',dictionary):
        print(sim[0],sim[1])
# 网球 0.99999994
# 羽毛球 0.9787248
# 篮球 0.978495
# 排球 0.9773369
# 人 0.9761201
# 水 0.9760275
# 气球 0.9753146
# 桌球 0.9731983
# 冰球 0.97278094
# 游戏 0.9711289
# 足球 0.9660615
# 台球 0.96072686
# 我 -0.3409065
# 打 -0.42166257
