A Code Walkthrough of the word2vec Method

The word2vec code is as follows:

import numpy as np
from collections import defaultdict
 
 
class word2vec():
 
    def __init__(self):
        # Hyperparameters are read from the module-level `settings` dict defined further below.
        self.n = settings['n']                  # embedding dimension (hidden-layer size)
        self.lr = settings['learning_rate']     # learning rate
        self.epochs = settings['epochs']        # number of training epochs
        self.window = settings['window_size']   # context window size
 
    def generate_training_data(self, settings, corpus):
        """
        得到训练数据
        """
 
        #defaultdict(int)  一个字典,当所访问的键不存在时,用int类型实例化一个默认值
        word_counts = defaultdict(int)
 
        #遍历语料库corpus
        for row in corpus:
            for word in row:
                #统计每个单词出现的次数
                word_counts[word] += 1
 
        # 词汇表的长度
        self.v_count = len(word_counts.keys())
        # 在词汇表中的单词组成的列表
        self.words_list = list(word_counts.keys())
        # 以词汇表中单词为key,索引为value的字典数据
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        #以索引为key,以词汇表中单词为value的字典数据
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
 
        training_data = []
 
        for sentence in corpus:
            sent_len = len(sentence)
 
            for i, word in enumerate(sentence):
 
                w_target = self.word2onehot(sentence[i])
 
                w_context = []
 
                # NOTE: range(i - window, i + window) excludes position i + window, so the
                # right-hand context is one word shorter than a fully symmetric window
                # (a symmetric window would use range(i - self.window, i + self.window + 1))
                for j in range(i - self.window, i + self.window):
                    if j != i and j <= sent_len - 1 and j >= 0:
                        w_context.append(self.word2onehot(sentence[j]))
 
                training_data.append([w_target, w_context])
 
        # dtype=object because the context lists have different lengths (ragged array)
        return np.array(training_data, dtype=object)
 
    def word2onehot(self, word):
 
        # Encode the word as a one-hot vector
 
        word_vec = [0 for i in range(0, self.v_count)]
 
        word_index = self.word_index[word]
 
        word_vec[word_index] = 1
 
        return word_vec
 
    def train(self, training_data):
 
 
        # Randomly initialise the weight matrices w1 and w2
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
 
        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))
 
        for i in range(self.epochs):
 
            self.loss = 0
 
            # w_t is the one-hot vector of the target word
            # w_t -> w_target, w_c -> w_context
            for w_t, w_c in training_data:
 
                # Forward pass
                y_pred, h, u = self.forward(w_t)
 
                # Compute the error
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
 
                # Backpropagate and update the weights
                self.backprop(EI, h, w_t)
 
                # Accumulate the total loss
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
 
            print('Epoch:', i, "Loss:", self.loss)
 
    def forward(self, x):
        """
        前向传播
        """
 
        h = np.dot(self.w1.T, x)
 
        u = np.dot(self.w2.T, h)
 
        y_c = self.softmax(u)
 
        return y_c, h, u
 
 
    def softmax(self, x):
        """
        """
        e_x = np.exp(x - np.max(x))
 
        return e_x / np.sum(e_x)
 
 
    def backprop(self, e, h, x):
 
        d1_dw2 = np.outer(h, e)
        d1_dw1 = np.outer(x, np.dot(self.w2, e.T))
 
        self.w1 = self.w1 - (self.lr * d1_dw1)
        self.w2 = self.w2 - (self.lr * d1_dw2)
 
    def word_vec(self, word):
 
        """
        获取词向量
        通过获取词的索引直接在权重向量中找
        """
 
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
 
        return v_w
 
    def vec_sim(self, word, top_n):
        """
        找相似的词
        """
 
        v_w1 = self.word_vec(word)
        word_sim = {
    
    }
 
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
 
            # np.linalg.norm(v_w1) computes the norm; the default is the 2-norm (square root of the sum of squares)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_sum / theta_den
 
            word = self.index_word[i]
            word_sim[word] = theta
 
        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
 
        for word, sim in words_sorted[:top_n]:
            print(word, sim)
 
    def get_w(self):
        w1 = self.w1
        return  w1
# Hyperparameters
settings = {
    'window_size': 2,       # context window size m
    'n': 10,                # word embedding dimension, which is also the size of the hidden layer
    'epochs': 50,           # number of passes over the whole training set
    'learning_rate': 0.01   # learning rate
}
 
# Prepare the data
text = "natural language processing and machine learning is fun and exciting"
# Tokenise the corpus by splitting the text on whitespace
corpus = [[word.lower() for word in text.split()]]
print(corpus)
 
# Create a word2vec object
w2v = word2vec()
 
training_data = w2v.generate_training_data(settings,corpus)
 
# Train
w2v.train(training_data)
 
# Get the vector for a word
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)
 
# Find similar words
w2v.vec_sim("machine", 3)

The original sentence is 'natural language processing and machine learning is fun and exciting'.
After counting the word frequencies and building the word/index maps, the (target, context) pairs extracted from the corpus look like this (shown here as words rather than one-hot vectors):

[
['natural', ['language']], ['language', ['natural', 'processing']], 
['processing', ['natural', 'language', 'and']], 
['and', ['language', 'processing', 'machine']], 
['machine', ['processing', 'and', 'learning']], 
['learning', ['and', 'machine', 'is']], 
['is', ['machine', 'learning', 'fun']], 
['fun', ['learning', 'is', 'and']], 
['and', ['is', 'fun', 'exciting']], 
['exciting', ['fun', 'and']]
]
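To make the windowing concrete, here is a minimal standalone sketch (the names sentence, window and pairs are illustrative, not from the original code) that rebuilds the same (target, context) word pairs using the range(i - self.window, i + self.window) loop from generate_training_data:

# Standalone sketch: rebuild the (target, context) word pairs listed above.
sentence = "natural language processing and machine learning is fun and exciting".lower().split()
window = 2

pairs = []
for i, target in enumerate(sentence):
    context = []
    # Same loop bounds as generate_training_data: position i + window itself is excluded,
    # which is why the right-hand context can be one word shorter than the left-hand one.
    for j in range(i - window, i + window):
        if j != i and 0 <= j <= len(sentence) - 1:
            context.append(sentence[j])
    pairs.append([target, context])

print(pairs)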

Calling

training_data = w2v.generate_training_data(settings,corpus)

returns the following training_data:

training_data = 
[[list([1, 0, 0, 0, 0, 0, 0, 0, 0]) list([[0, 1, 0, 0, 0, 0, 0, 0, 0]])]
 [list([0, 1, 0, 0, 0, 0, 0, 0, 0])
  list([[1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]])]
 ..........
 [list([0, 0, 0, 0, 0, 0, 0, 0, 1])
  list([[0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0]])]]
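The one-hot rows come from word2onehot, with indices assigned in first-seen order (Python 3.7+ dicts preserve insertion order, so words_list follows the order in which words appear in the corpus). A small illustrative sketch, with vocab and one_hot as hypothetical names:

# Sketch: one-hot encoding over the 9-word vocabulary, in first-seen order.
vocab = []
for w in "natural language processing and machine learning is fun and exciting".split():
    if w not in vocab:
        vocab.append(w)

def one_hot(word, vocab):
    vec = [0] * len(vocab)
    vec[vocab.index(word)] = 1
    return vec

print(one_hot("natural", vocab))   # [1, 0, 0, 0, 0, 0, 0, 0, 0]
print(one_hot("language", vocab))  # [0, 1, 0, 0, 0, 0, 0, 0, 0]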

Next we step into the training code in train():

self.w1 = np.random.uniform(-1,1,(self.v_count,self.n))
self.w2 = np.random.uniform(-1,1,(self.n,self.v_count))

Here self.v_count = 9 and self.n = 10, so self.w1 is a (9, 10) matrix and self.w2 is a (10, 9) matrix, both initialised uniformly in (-1, 1):

self.w1 = 
[[-0.88093613  0.44287707 -0.20015634 -0.17542098 -0.18688373  0.25044748
   0.86296623  0.85030189  0.78452837 -0.18417995]
   ......
 [-0.52173874  0.2372753  -0.06543664  0.18024424  0.28042927  0.34655803
 0.06426065  0.79247053 -0.60444507  0.45783363]]
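As a quick sanity check on those shapes, a small sketch with v_count and n hard-coded to this example's values:

import numpy as np

v_count, n = 9, 10
w1 = np.random.uniform(-1, 1, (v_count, n))   # one n-dimensional embedding row per vocabulary word
w2 = np.random.uniform(-1, 1, (n, v_count))   # maps the hidden vector back to scores over the vocabulary

print(w1.shape, w2.shape)   # (9, 10) (10, 9)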

Next, look at what happens for each training pair inside the training loop:

for w_t,w_c in training_data:
	y_pred,h,u = self.forward(w_t)
	......

and step into the forward function:

    def forward(self, x):
        """
        前向传播
        """
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u

Written out as formulas:

h = self.w1.T * x
u = self.w2.T * h = self.w2.T * (self.w1.T * x)
y_c = softmax(self.w2.T * (self.w1.T * x))
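As a shape sanity check, here is a minimal standalone sketch of the same three steps with a one-hot input (w1, w2 and x are illustrative random/toy values, not trained weights):

import numpy as np

v_count, n = 9, 10
w1 = np.random.uniform(-1, 1, (v_count, n))
w2 = np.random.uniform(-1, 1, (n, v_count))

x = np.zeros(v_count)
x[0] = 1                                  # one-hot vector for the first word ('natural')

h = np.dot(w1.T, x)                       # shape (10,): row 0 of w1, i.e. the word's embedding
u = np.dot(w2.T, h)                       # shape (9,): one score per vocabulary word
y_c = np.exp(u - np.max(u))
y_c = y_c / y_c.sum()                     # softmax(u): a probability distribution over the vocabulary

print(h.shape, u.shape, y_c.shape, round(float(y_c.sum()), 6))   # (10,) (9,) (9,) 1.0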

Then the prediction error against every context word is computed and summed:

EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

This step is essentially the gradient of a cross-entropy-style loss: the predicted distribution minus each context one-hot vector, summed over the context.
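To see what EI and the per-pair loss look like numerically, here is a toy sketch with made-up values (y_pred, u and w_c below are illustrative, not actual training output):

import numpy as np

# Toy prediction over a 4-word vocabulary and two one-hot context words.
y_pred = np.array([0.1, 0.4, 0.3, 0.2])
u = np.log(y_pred)                        # logits consistent with y_pred (so sum(exp(u)) == 1)
w_c = [np.array([0, 1, 0, 0]), np.array([0, 0, 1, 0])]

# EI: the prediction error summed over all context words (same expression as in train()).
EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
print(EI)                                 # ~[ 0.2 -0.2 -0.4  0.4]

# Loss for this pair, as in train(): -sum of the scores of the true context words
# plus |context| * log(sum(exp(u))) -- a cross-entropy summed over the context.
loss = -np.sum([u[np.argmax(word)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
print(loss)                               # -(log(0.4) + log(0.3)) ~= 2.12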
Next the code backpropagates and updates the parameters; this step deserves a closer look:

self.backprop(EI,h,w_t)

Stepping into the backprop function:

def backprop(self, e, h, x):
    d1_dw2 = np.outer(h, e)
    d1_dw1 = np.outer(x, np.dot(self.w2, e.T))
    self.w1 = self.w1 - (self.lr * d1_dw1)
    self.w2 = self.w2 - (self.lr * d1_dw2)

Let's unpack what d1_dw2 and d1_dw1 compute here.
First, recall where e, h and x come from:

$e = EI = \sum_{c}\bigl(\mathrm{softmax}(u) - w_{c}\bigr)$, the softmax prediction error summed over the context words
$h = w_{1}^{T} \cdot word$
$x = word$ (the one-hot target vector)

so the two outer products computed in the code are

$d1\_dw1 = word \otimes (w_{2} \cdot e)$
$d1\_dw2 = h \otimes e = (w_{1}^{T} \cdot word) \otimes e$

Personally, I think the way the partial derivatives are taken and then backpropagated here is not right.
It feels like it should be the following (taking the partial derivative with respect to $w_{1}^{T}$ should leave only $w_{2}$, and taking it with respect to $w_{2}^{T}$ should leave only $w_{1}$):

def backprop(self, e, h, x):

    #d1_dw2 = np.outer(h, e)
    d1_dw2 = np.outer(x, np.dot(self.w1.T,e.T)).T
    d1_dw1 = np.outer(x, np.dot(self.w2, e.T))
    #d1_dw1 = np.outer(x,np.dot(self.w1.T,e.T))
    
    self.w1 = self.w1 - (self.lr * d1_dw1)
    self.w2 = self.w2 - (self.lr * d1_dw2)
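Whichever version one prefers, a finite-difference check against the loss actually used in train() is a quick, neutral way to test a gradient. Below is a standalone sketch with small random matrices that compares the original d1_dw2 = np.outer(h, e) against a numerical derivative of one weight entry; all names here are illustrative, and the same pattern works for d1_dw1 or for the alternative formulas above by swapping in the corresponding expression.

import numpy as np

np.random.seed(0)
v_count, n = 5, 3
w1 = np.random.uniform(-1, 1, (v_count, n))
w2 = np.random.uniform(-1, 1, (n, v_count))

x = np.zeros(v_count)
x[0] = 1                                            # one-hot target word
w_c = [np.eye(v_count)[1], np.eye(v_count)[2]]      # two one-hot context words

def loss(w1, w2):
    # Same skip-gram loss as in train(): -sum(u[context]) + |context| * log(sum(exp(u)))
    u = w2.T @ (w1.T @ x)
    return -sum(u[np.argmax(c)] for c in w_c) + len(w_c) * np.log(np.sum(np.exp(u)))

# Analytic gradient of w2, as the original backprop() computes it.
h = w1.T @ x
u = w2.T @ h
y_pred = np.exp(u - np.max(u))
y_pred = y_pred / y_pred.sum()
e = np.sum([y_pred - c for c in w_c], axis=0)
grad_w2 = np.outer(h, e)

# Numerical gradient of the entry w2[0, 0] by central differences.
eps = 1e-6
w2_plus, w2_minus = w2.copy(), w2.copy()
w2_plus[0, 0] += eps
w2_minus[0, 0] -= eps
num_grad = (loss(w1, w2_plus) - loss(w1, w2_minus)) / (2 * eps)

print(num_grad, grad_w2[0, 0])   # compare the two numbers to judge the analytic gradient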

In essence, the model learns an embedding for each word together with the matrices w1 and w2, so that multiplying through w1 and w2 ties each centre word to the words around it.
Finally, let's look at how word similarity is computed.

def vec_sim(self, word, top_n):
    """
    找相似的词
    """

    v_w1 = self.word_vec(word)
    word_sim = {
    
    }

    for i in range(self.v_count):
        v_w2 = self.w1[i]
        theta_sum = np.dot(v_w1, v_w2)

        # np.linalg.norm(v_w1) computes the norm; the default is the 2-norm (square root of the sum of squares)
        theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
        theta = theta_sum / theta_den

        word = self.index_word[i]
        word_sim[word] = theta

    words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

    for word, sim in words_sorted[:top_n]:
        print(word, sim)

It walks through every word in the vocabulary, pulls out that word's embedding from w1, computes the cosine similarity between it and the query vector v_w1 (the dot product divided by the product of the two norms), and finally sorts the scores to print the top_n most similar words.
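The similarity used here is plain cosine similarity; a tiny self-contained sketch of the same formula (cosine and the toy vectors are illustrative, not part of the original code):

import numpy as np

def cosine(a, b):
    # Dot product divided by the product of the L2 norms.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])
c = np.array([-1.0, 0.0, 1.0])

print(cosine(a, b))   # 1.0, parallel vectors
print(cosine(a, c))   # ~0.378, partially aligned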


Reposted from blog.csdn.net/znevegiveup1/article/details/121504541