Implementing Word2vec with NumPy

import numpy as np
from collections import defaultdict

getW1 = [[0.236, -0.962, 0.686, 0.785, -0.454, -0.833, -0.744, 0.677, -0.427, -0.066],
		[-0.907, 0.894, 0.225, 0.673, -0.579, -0.428, 0.685, 0.973, -0.070, -0.811],
		[-0.576, 0.658, -0.582, -0.112, 0.662, 0.051, -0.401, -0.921, -0.158, 0.529],
		[0.517, 0.436, 0.092, -0.835, -0.444, -0.905, 0.879, 0.303, 0.332, -0.275],
		[0.859, -0.890, 0.651, 0.185, -0.511, -0.456, 0.377, -0.274, 0.182, -0.237],
		[0.368, -0.867, -0.301, -0.222, 0.630, 0.808, 0.088, -0.902, -0.450, -0.408],
		[0.728, 0.277, 0.439, 0.138, -0.943, -0.409, 0.687, -0.215, -0.807, 0.612],
		[0.593, -0.699, 0.020, 0.142, -0.638, -0.633, 0.344, 0.868, 0.913, 0.429],
		[0.447, -0.810, -0.061, -0.495, 0.794, -0.064, -0.817, -0.408, -0.286, 0.149]]

getW2 = [[-0.868, -0.406, -0.288, -0.016, -0.560, 0.179, 0.099, 0.438, -0.551],
		[-0.395, 0.890, 0.685, -0.329, 0.218, -0.852, -0.919, 0.665, 0.968],
		[-0.128, 0.685, -0.828, 0.709, -0.420, 0.057, -0.212, 0.728, -0.690],
		[0.881, 0.238, 0.018, 0.622, 0.936, -0.442, 0.936, 0.586, -0.020],
		[-0.478, 0.240, 0.820, -0.731, 0.260, -0.989, -0.626, 0.796, -0.599],
		[0.679, 0.721, -0.111, 0.083, -0.738, 0.227, 0.560, 0.929, 0.017],
		[-0.690, 0.907, 0.464, -0.022, -0.005, -0.004, -0.425, 0.299, 0.757],
		[-0.054, 0.397, -0.017, -0.563, -0.551, 0.465, -0.596, -0.413, -0.395],
		[-0.838, 0.053, -0.160, -0.164, -0.671, 0.140, -0.149, 0.708, 0.425],
		[0.096, -0.995, -0.313, 0.881, -0.402, -0.631, -0.660, 0.184, 0.487]]
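
These two matrices are fixed initial weights so that the run below is reproducible (the commented-out random initialisation in train() is the usual alternative). As a quick sanity check on their shapes, assuming the nine-word vocabulary built from the example sentence further down:

# w1 maps one-hot words to embeddings (v_count x n); w2 maps embeddings back to vocabulary scores (n x v_count)
assert np.array(getW1).shape == (9, 10)
assert np.array(getW2).shape == (10, 9)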

class word2vec():

	def __init__(self):
		# Hyperparameters are read from the module-level settings dict defined further down
		self.n = settings['n']
		self.lr = settings['learning_rate']
		self.epochs = settings['epochs']
		self.window = settings['window_size']

	def generate_training_data(self, settings, corpus):

		# Count how often each word appears in the corpus
		word_counts = defaultdict(int)
		for row in corpus:
			for word in row:
				word_counts[word] += 1

		# Vocabulary size
		self.v_count = len(word_counts.keys())

		# List of unique words in the vocabulary
		self.words_list = list(word_counts.keys())

		# Lookup dictionaries: word -> index and index -> word
		self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
		self.index_word = dict((i, word) for i, word in enumerate(self.words_list))

		training_data = []

		for sentence in corpus:
			sent_len = len(sentence)

			for i, word in enumerate(sentence):

				# One-hot vector for the target (center) word
				w_target = self.word2onehot(sentence[i])

				# One-hot vectors for the surrounding context words
				w_context = []

				for j in range(i - self.window, i + self.window + 1):

					if j != i and 0 <= j <= sent_len - 1:
						# Append the one-hot representation of word to w_context
						w_context.append(self.word2onehot(sentence[j]))

				training_data.append([w_target, w_context])

		# Contexts near sentence boundaries have fewer words, so the rows are ragged;
		# dtype=object keeps newer NumPy versions from rejecting the ragged array
		return np.array(training_data, dtype=object)

	def word2onehot(self, word):

		# Start with a vector of zeros, one slot per vocabulary word
		word_vec = [0 for i in range(0, self.v_count)] # Alternative - np.zeros(self.v_count)

		# Set the slot for this word to 1
		word_index = self.word_index[word]
		word_vec[word_index] = 1

		return word_vec
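
	# For the example corpus below, word2onehot("natural") returns
	# [1, 0, 0, 0, 0, 0, 0, 0, 0]: a single 1 at the word's vocabulary index.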

	def train(self, training_data):

		# Fixed weights for reproducibility; swap in the commented lines for random initialisation
		self.w1 = np.array(getW1)
		self.w2 = np.array(getW2)
		# self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
		# self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

		for i in range(self.epochs):

			self.loss = 0

			for w_t, w_c in training_data:

				# Forward pass: y_pred is the softmax prediction, h the hidden layer, u the output scores
				y_pred, h, u = self.forward_pass(w_t)

				# Prediction error summed over all context words of this target
				EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

				# Backpropagate the error and update both weight matrices
				self.backprop(EI, h, w_t)

				# Skip-gram loss: -(sum of scores of the true context words) + C * log(sum(exp(u)))
				self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))

			print('Epoch:', i, "Loss:", self.loss)

	def forward_pass(self, x):

		# Hidden layer: x (1 x v_count) times w1 (v_count x n) -> h (1 x n)
		h = np.dot(x, self.w1)

		# Output scores: h (1 x n) times w2 (n x v_count) -> u (1 x v_count)
		u = np.dot(h, self.w2)

		# Softmax turns the scores into a probability distribution over the vocabulary
		y_c = self.softmax(u)
		return y_c, h, u

	def softmax(self, x):
		# Subtract the max for numerical stability; the result is mathematically unchanged
		e_x = np.exp(x - np.max(x))
		return e_x / e_x.sum()

	def backprop(self, e, h, x):

		# Gradient w.r.t. w2: outer product of the hidden layer and the summed error (n x v_count)
		dl_dw2 = np.outer(h, e)
		# Gradient w.r.t. w1: outer product of the one-hot input and w2·e (v_count x n)
		dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

		# Update weights: gradient descent step scaled by the learning rate
		self.w1 = self.w1 - (self.lr * dl_dw1)
		self.w2 = self.w2 - (self.lr * dl_dw2)

	# Get vector from word
	def word_vec(self, word):
		w_index = self.word_index[word]
		v_w = self.w1[w_index]
		return v_w

	# Input a word, returns the top_n most similar words by cosine similarity
	def vec_sim(self, word, top_n):
		v_w1 = self.word_vec(word)
		word_sim = {}

		for i in range(self.v_count):
			# Find the similarity score for each word in vocab
			v_w2 = self.w1[i]
			theta_sum = np.dot(v_w1, v_w2)
			theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
			theta = theta_sum / theta_den

			vocab_word = self.index_word[i]
			word_sim[vocab_word] = theta

		words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

		for word, sim in words_sorted[:top_n]:
			print(word, sim)

settings = {
	'window_size': 2,			# number of context words on each side of the target word
	'n': 10,					# dimension of the word embeddings (also the hidden-layer size)
	'epochs': 50,				# number of training epochs
	'learning_rate': 0.01		# learning rate
}

text = "natural language processing and machine learning is fun and exciting"


# Tokenise the text: lower-case and split on whitespace (a one-sentence toy corpus)
corpus = [[word.lower() for word in text.split()]]
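
As a quick illustration of what generate_training_data will pair up (a small standalone check with throwaway variables, not part of the original script): with window_size = 2, the target word at index 2, "processing", gets up to two context words on each side.

# Context words for the target at index 2 ("processing") with a window of 2
sentence = corpus[0]
i = 2
context = [sentence[j] for j in range(i - 2, i + 3) if j != i and 0 <= j < len(sentence)]
print(context)   # ['natural', 'language', 'and', 'machine']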

w2v = word2vec()

training_data = w2v.generate_training_data(settings, corpus)

# Training
w2v.train(training_data)
Running the training loop prints the loss at each of the 50 epochs:

Epoch: 0 Loss: 81.99625348926824
Epoch: 1 Loss: 80.37706194534589
Epoch: 2 Loss: 78.9116663644708
Epoch: 3 Loss: 77.57806136236584
Epoch: 4 Loss: 76.35803881618284
Epoch: 5 Loss: 75.23652831371102
Epoch: 6 Loss: 74.20101998340209
Epoch: 7 Loss: 73.24108053436369
Epoch: 8 Loss: 72.34795785304598
Epoch: 9 Loss: 71.51426275735359
Epoch: 10 Loss: 70.73371481522524
Epoch: 11 Loss: 70.00093990321889
Epoch: 12 Loss: 69.31130890283573
Epoch: 13 Loss: 68.66080883957898
Epoch: 14 Loss: 68.04593952257906
Epoch: 15 Loss: 67.46363022458323
Epoch: 16 Loss: 66.91117214136592
Epoch: 17 Loss: 66.38616331680674
Epoch: 18 Loss: 65.88646345805446
Epoch: 19 Loss: 65.41015663657106
Epoch: 20 Loss: 64.95552031189413
Epoch: 21 Loss: 64.5209994553034
Epoch: 22 Loss: 64.10518481350977
Epoch: 23 Loss: 63.70679455597247
Epoch: 24 Loss: 63.324658707226554
Epoch: 25 Loss: 62.95770588813032
Epoch: 26 Loss: 62.60495198520285
Epoch: 27 Loss: 62.265490441321816
Epoch: 28 Loss: 61.938483918707924
Epoch: 29 Loss: 61.623157129993245
Epoch: 30 Loss: 61.31879066812967
Epoch: 31 Loss: 61.02471569320668
Epoch: 32 Loss: 60.74030935570918
Epoch: 33 Loss: 60.46499085277964
Epoch: 34 Loss: 60.19821802777384
Epoch: 35 Loss: 59.939484434684516
Epoch: 36 Loss: 59.68831679851707
Epoch: 37 Loss: 59.44427281092189
Epoch: 38 Loss: 59.206939207669535
Epoch: 39 Loss: 58.97593008112777
Epoch: 40 Loss: 58.750885386909765
Epoch: 41 Loss: 58.53146960938957
Epoch: 42 Loss: 58.31737055585847
Epoch: 43 Loss: 58.108298253732954
Epoch: 44 Loss: 57.903983929414984
Epoch: 45 Loss: 57.70417905114165
Epoch: 46 Loss: 57.50865442143902
Epoch: 47 Loss: 57.3171993076209
Epoch: 48 Loss: 57.12962060116183
Epoch: 49 Loss: 56.94574199875216
# Get vector for word
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)

# Find similar words
w2v.vec_sim("machine", 3)
This prints the 10-dimensional vector for "machine" followed by its three most similar words:

machine [ 0.76702922 -0.95673743  0.49207258  0.16240808 -0.4538815  -0.74678226
  0.42072706 -0.04147312  0.08947326 -0.24245257]
machine 0.9999999999999999
fun 0.6223490454018771
and 0.5190154215400249
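
The rows of w1 are the learned embeddings, so other corpus words can be queried in the same way. A short follow-up sketch (an illustrative addition, not part of the original post; output omitted):

# Collect every word's trained 10-dimensional vector into a dictionary
embeddings = {w: w2v.word_vec(w) for w in w2v.words_list}

# Nearest neighbours of another corpus word
w2v.vec_sim("fun", 3)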


Reprinted from blog.csdn.net/weixin_42052081/article/details/89226421