loaddata.py
# coding=utf-8
import os
import json
import pandas as pd
import pickle
import numpy as np
TPS_DIR = '../data/music'
TP_file = os.path.join(TPS_DIR, 'Digital_Music_5.json')
f = open(TP_file)
users_id = []
items_id = []
ratings = []
reviews = []
np.random.seed(2017)
for line in f:
js = json.loads(line)
if str(js['reviewerID']) == 'unknown':
print("unknown")
continue
if str(js['asin']) == "unknown":
print("unknown")
continue
reviews.append(js['reviewText'])
users_id.append(str(js['reviewerID']) + ",")
items_id.append(str(js['asin']) + ",")
ratings.append(str(js['overall']))
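# Each line of Digital_Music_5.json is one Amazon review record; only the
# reviewerID (user), asin (item), overall (rating) and reviewText fields are used here.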
# assemble the raw rating/review data
# ===================================================
data = pd.DataFrame(
{'user_id': pd.Series(users_id),
'item_id': pd.Series(items_id),
'ratings': pd.Series(ratings),
'reviews': pd.Series(reviews)}
)[['user_id', 'item_id', 'ratings', 'reviews']]
# transform raw ids to integer indices
# ==================================================
def get_count(tp, id):
    # group by the id column; the returned Series of counts is indexed by the raw ids,
    # so its index can be used below to build the id-to-index mappings
    count = tp[[id, 'ratings']].groupby(id).size()
    return count
usercount, itemcount = get_count(data, 'user_id'), get_count(data, 'item_id')
unique_uid = usercount.index
unique_sid = itemcount.index
item2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid))
def numerize(tp):
    # replace raw user/item ids with dense integer indices
    tp['user_id'] = tp['user_id'].map(user2id)
    tp['item_id'] = tp['item_id'].map(item2id)
    return tp
data = numerize(data)
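# After numerize, user_id and item_id hold dense integer indices; a row now looks like,
# for example, (17, 203, '5.0', 'Great album ...') -- illustrative values, and note the
# ratings are still stored as strings at this point.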
tp_rating = data[['user_id', 'item_id', 'ratings']]
# split data
# ===================================================
n_ratings = tp_rating.shape[0]
test = np.random.choice(n_ratings, size=int(0.20 * n_ratings), replace=False)
test_idx = np.zeros(n_ratings, dtype=bool)
test_idx[test] = True
tp_1 = tp_rating[test_idx]
tp_train = tp_rating[~test_idx]
data2 = data[test_idx]
data = data[~test_idx]
n_ratings = tp_1.shape[0]
test = np.random.choice(n_ratings, size=int(0.50 * n_ratings), replace=False)
test_idx = np.zeros(n_ratings, dtype=bool)
test_idx[test] = True
tp_test = tp_1[test_idx]
tp_valid = tp_1[~test_idx]
tp_train.to_csv(os.path.join(TPS_DIR, 'music_train.csv'), index=False, header=False)
tp_valid.to_csv(os.path.join(TPS_DIR, 'music_valid.csv'), index=False, header=False)
tp_test.to_csv(os.path.join(TPS_DIR, 'music_test.csv'), index=False, header=False)
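# Split: 80% train, 10% validation, 10% test; each CSV holds headerless
# (user_idx, item_idx, rating) rows.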
user_reviews = {}
item_reviews = {}
user_rid = {}
item_rid = {}
for i in data.values:
user_id = i[0]
item_id = i[1]
reviews_text = i[3]
if user_id in user_reviews:
user_reviews[user_id].append(reviews_text)
user_rid[user_id].append(item_id)
else:
user_rid[user_id] = [item_id]
user_reviews[user_id] = [reviews_text]
if item_id in item_reviews:
item_reviews[item_id].append(reviews_text)
item_rid[item_id].append(user_id)
else:
item_reviews[item_id] = [reviews_text]
item_rid[item_id] = [user_id]
for i in data2.values:
    user_id = i[0]
    item_id = i[1]
    # users/items that only occur in the held-out 20% get a single placeholder review '0'
    if user_id not in user_reviews:
        user_rid[user_id] = [0]
        user_reviews[user_id] = ['0']
    if item_id not in item_reviews:
        item_rid[item_id] = [0]
        item_reviews[item_id] = ['0']
pickle.dump(user_reviews, open(os.path.join(TPS_DIR, 'user_review'), 'wb'))
pickle.dump(item_reviews, open(os.path.join(TPS_DIR, 'item_review'), 'wb'))
pickle.dump(user_rid, open(os.path.join(TPS_DIR, 'user_rid'), 'wb'))
pickle.dump(item_rid, open(os.path.join(TPS_DIR, 'item_rid'), 'wb'))
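# user_review / user_rid map each user index to the review texts that user wrote in the
# training split and to the item indices those reviews are about; item_review / item_rid
# are the symmetric item-side dictionaries. These pickles are consumed by pro_data.py.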
usercount, itemcount = get_count(data, 'user_id'), get_count(data, 'item_id')
print(np.sort(np.array(usercount.values)))
print(np.sort(np.array(itemcount.values)))
pro_data.py
# coding=utf-8
import numpy as np
import re
import itertools
from collections import Counter
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import csv
import pickle
import os
tf.flags.DEFINE_string("valid_data", "../data/music/music_valid.csv", "Data for validation")
tf.flags.DEFINE_string("test_data", "../data/music/music_test.csv", "Data for testing")
tf.flags.DEFINE_string("train_data", "../data/music/music_train.csv", "Data for training")
tf.flags.DEFINE_string("user_review", "../data/music/user_review", "User's reviews")
tf.flags.DEFINE_string("item_review", "../data/music/item_review", "Item's reviews")
tf.flags.DEFINE_string("user_review_id", "../data/music/user_rid", "user_review_id")
tf.flags.DEFINE_string("item_review_id", "../data/music/item_rid", "item_review_id")
tf.flags.DEFINE_string("stopwords", "../data/stopwords", "stopwords")
def clean_str(string):
"""
Tokenization/string cleaning for all datasets except for SST.
"""
string = re.sub(r"[^A-Za-z]", " ", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n'\t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip().lower()
def pad_sentences(u_text, u_len, u2_len, padding_word="<PAD/>"):
"""
Pads all sentences to teh same length. The length is defined by the longest sentence.
"""
review_num = u_len
review_len = u2_len
u_text2 = {}
for i in u_text.keys():
u_reviews = u_text[i]
padded_u_train = []
for ri in range(review_num):
if ri < len(u_reviews):
sentence = u_reviews[ri]
if review_len > len(sentence):
num_padding = review_len - len(sentence)
new_sentence = sentence + [padding_word] * num_padding
padded_u_train.append(new_sentence)
else:
new_sentence = sentence[:review_len]
padded_u_train.append(new_sentence)
else:
new_sentence = [padding_word] * review_len
padded_u_train.append(new_sentence)
u_text2[i] = padded_u_train
return u_text2
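# pad_sentences leaves every user (or item) with exactly review_num reviews, each exactly
# review_len tokens long, so the data can later be packed into a dense
# [review_num, review_len] integer array per user/item.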
def pad_reviewid(u_train, u_valid, u_len, num):
"""
num is the padding id
"""
pad_u_train = []
for i in range(len(u_train)):
x = u_train[i]
while u_len > len(x):
x.append(num)
if u_len < len(x):
x = x[:u_len]
pad_u_train.append(x)
pad_u_valid = []
for i in range(len(u_valid)):
x = u_valid[i]
while u_len > len(x):
x.append(num)
if u_len < len(x):
x = x[:u_len]
pad_u_valid.append(x)
return pad_u_train, pad_u_valid
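# pad_reviewid pads/truncates each review-id list to u_len entries; the padding id `num`
# is item_num + 1 (resp. user_num + 1), i.e. one past the last valid index, which matches
# the "+ 2" sizing of the id-embedding tables in model.py.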
def build_vocab(sentences1, sentences2):
"""
Builds a vocabulary mapping from word to index based on the sentences.
Returns vocabulary mapping and inverse vocabulary mapping.
"""
# Build vocabulary
word_counts1 = Counter(itertools.chain(*sentences1))
# Mapping from index to word
vocabulary_inv1 = [x[0] for x in word_counts1.most_common()]
vocabulary_inv1 = list(sorted(vocabulary_inv1))
# Mapping from word to index
vocabulary1 = {x: i for i, x in enumerate(vocabulary_inv1)}
word_counts2 = Counter(itertools.chain(*sentences2))
# Mapping from index to word
vocabulary_inv2 = [x[0] for x in word_counts2.most_common()]
vocabulary_inv2 = list(sorted(vocabulary_inv2))
# Mapping from word to index
vocabulary2 = {x: i for i, x in enumerate(vocabulary_inv2)}
return [vocabulary1, vocabulary_inv1, vocabulary2, vocabulary_inv2]
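# build_vocab builds separate word -> index vocabularies for the user-side and item-side
# review corpora; indices are assigned over the alphabetically sorted vocabulary, and the
# "<PAD/>" token is mapped like any ordinary word.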
def build_input_data(u_text, i_text, vocabulary_u, vocabulary_i):
"""
    Maps each review's words to integer indices using the user/item vocabularies.
"""
    u_text2 = {}
    for i in u_text.keys():
        u_reviews = u_text[i]
        u = np.array([[vocabulary_u[word] for word in words] for words in u_reviews])
        u_text2[i] = u
    i_text2 = {}
    for j in i_text.keys():
        i_reviews = i_text[j]
        i_arr = np.array([[vocabulary_i[word] for word in words] for words in i_reviews])
        i_text2[j] = i_arr
return u_text2, i_text2
def load_data(train_data, valid_data, user_review, item_review, user_rid, item_rid, stopwords):
"""
    Loads and preprocesses the review data.
    Returns input vectors, labels, vocabularies, and inverse vocabularies.
"""
# Load and preprocess data
# ===============================================
print("load data")
u_text, i_text, y_train, y_valid, u_len, i_len, u2_len, i2_len, uid_train, iid_train, uid_valid, iid_valid, user_num, item_num \
, reid_user_train, reid_item_train, reid_user_valid, reid_item_valid = \
load_data_and_labels(train_data, valid_data, user_review, item_review, user_rid, item_rid, stopwords)
# padding
# ===============================================
print("pad user")
u_text = pad_sentences(u_text, u_len, u2_len)
reid_user_train, reid_user_valid = pad_reviewid(reid_user_train, reid_user_valid, u_len, item_num + 1)
print("pad item")
i_text = pad_sentences(i_text, i_len, i2_len)
reid_item_train, reid_item_valid = pad_reviewid(reid_item_train, reid_item_valid, i_len, user_num + 1)
# get vocabulary
# ===============================================
user_voc = [xx for x in u_text.values() for xx in x]
item_voc = [xx for x in i_text.values() for xx in x]
vocabulary_user, vocabulary_inv_user, vocabulary_item, vocabulary_inv_item = build_vocab(user_voc, item_voc)
print(len(vocabulary_user))
print(len(vocabulary_item))
u_text, i_text = build_input_data(u_text, i_text, vocabulary_user, vocabulary_item)
y_train = np.array(y_train)
y_valid = np.array(y_valid)
uid_train = np.array(uid_train)
uid_valid = np.array(uid_valid)
iid_train = np.array(iid_train)
iid_valid = np.array(iid_valid)
reid_user_train = np.array(reid_user_train)
reid_user_valid = np.array(reid_user_valid)
reid_item_train = np.array(reid_item_train)
reid_item_valid = np.array(reid_item_valid)
return [u_text, i_text, y_train, y_valid, vocabulary_user, vocabulary_inv_user, vocabulary_item,
vocabulary_inv_item, uid_train, iid_train, uid_valid, iid_valid, user_num, item_num, reid_user_train,
reid_item_train, reid_user_valid, reid_item_valid]
def load_data_and_labels(train_data, valid_data, user_review, item_review, user_rid, item_rid, stopwords):
"""
    Loads data from files, splits reviews into words and generates labels.
"""
# Load data from file
f_train = open(train_data, "r")
f1 = open(user_review, "rb")
f2 = open(item_review, "rb")
f3 = open(user_rid, "rb")
f4 = open(item_rid, "rb")
user_reviews = pickle.load(f1)
item_reviews = pickle.load(f2)
user_rids = pickle.load(f3)
item_rids = pickle.load(f4)
reid_user_train = []
reid_item_train = []
uid_train = []
iid_train = []
y_train = []
u_text = {}
u_rid = {}
i_text = {}
i_rid = {}
i = 0
for line in f_train:
i = i + 1
line = line.split(",")
user_id = int(line[0])
item_id = int(line[1])
uid_train.append(user_id)
iid_train.append(item_id)
# add user_id
if user_id in u_text:
reid_user_train.append(u_rid[user_id])
else:
u_text[user_id] = []
for s in user_reviews[user_id]:
s1 = clean_str(s)
s1 = s1.split(" ")
u_text[user_id].append(s1)
u_rid[user_id] = []
for s in user_rids[user_id]:
u_rid[user_id].append(int(s))
reid_user_train.append(u_rid[user_id])
# add item_id
if item_id in i_text:
reid_item_train.append(i_rid[item_id])
else:
i_text[item_id] = []
for s in item_reviews[item_id]:
s1 = clean_str(s)
s1 = s1.split(" ")
i_text[item_id].append(s1)
i_rid[item_id] = []
for s in item_rids[item_id]:
i_rid[item_id].append(int(s))
reid_item_train.append(i_rid[item_id])
y_train.append(float(line[2]))
print("valid")
reid_user_valid = []
reid_item_valid = []
uid_valid = []
iid_valid = []
y_valid = []
f_valid = open(valid_data)
for line in f_valid:
line = line.split(",")
user_id = int(line[0])
item_id = int(line[1])
uid_valid.append(user_id)
iid_valid.append(item_id)
if user_id in u_text:
reid_user_valid.append(u_rid[user_id])
else:
u_text[user_id] = [['<PAD/>']]
u_rid[user_id] = [int(0)]
reid_user_valid.append(u_rid[user_id])
if item_id in i_text:
reid_item_valid.append(i_rid[item_id])
else:
            i_text[item_id] = [['<PAD/>']]
            i_rid[item_id] = [int(0)]
reid_item_valid.append(i_rid[item_id])
y_valid.append(float(line[2]))
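    # u_len / i_len and u2_len / i2_len below are the 90th-percentile number of reviews
    # per user/item and words per review; longer review lists and reviews are truncated
    # to these lengths during padding.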
review_num_u = np.array([len(x) for x in u_text.values()])
x = np.sort(review_num_u)
u_len = x[int(0.9 * len(review_num_u)) - 1]
review_len_u = np.array([len(j) for i in u_text.values() for j in i])
x2 = np.sort(review_len_u)
u2_len = x2[int(0.9 * len(review_len_u)) - 1]
review_num_i = np.array([len(x) for x in i_text.values()])
y = np.sort(review_num_i)
i_len = y[int(0.9 * len(review_num_i)) - 1]
review_len_i = np.array([len(j) for i in i_text.values() for j in i])
y2 = np.sort(review_len_i)
i2_len = y2[int(0.9 * len(review_len_i)) - 1]
print("u_len:", u_len)
print("i_len:", i_len)
print("u2_len:", u2_len)
print("i2_len:", i2_len)
user_num = len(u_text)
item_num = len(i_text)
print("user_num:", user_num)
print("item_num:", item_num)
return [u_text, i_text, y_train, y_valid, u_len, i_len, u2_len, i2_len, uid_train,
iid_train, uid_valid, iid_valid, user_num, item_num,
reid_user_train, reid_item_train, reid_user_valid, reid_item_valid]
if __name__ == '__main__':
TPS_DIR = "../data/music"
FLAGS = tf.flags.FLAGS
FLAGS.flag_values_dict()
u_text, i_text, y_train, y_valid, vocabulary_user, vocabulary_inv_user, vocabulary_item, \
vocabulary_inv_item, uid_train, iid_train, uid_valid, iid_valid, user_num, item_num, reid_user_train, reid_item_train, reid_user_valid, reid_item_valid = \
load_data(FLAGS.train_data, FLAGS.valid_data, FLAGS.user_review, FLAGS.item_review, FLAGS.user_review_id,
FLAGS.item_review_id, FLAGS.stopwords)
np.random.seed(2017)
shuffle_indices = np.random.permutation(np.arange(len(y_train)))
userid_train = uid_train[shuffle_indices]
itemid_train = iid_train[shuffle_indices]
y_train = y_train[shuffle_indices]
reid_user_train = reid_user_train[shuffle_indices]
reid_item_train = reid_item_train[shuffle_indices]
y_train = y_train[:, np.newaxis]
y_valid = y_valid[:, np.newaxis]
userid_train = userid_train[:, np.newaxis]
itemid_train = itemid_train[:, np.newaxis]
userid_valid = uid_valid[:, np.newaxis]
itemid_valid = iid_valid[:, np.newaxis]
batches_train = list(
zip(userid_train, itemid_train, reid_user_train, reid_item_train, y_train)
)
batches_test = list(zip(userid_valid, itemid_valid, reid_user_valid, reid_item_valid, y_valid))
print("write begin")
output = open(os.path.join(TPS_DIR, 'music.train'), 'wb')
pickle.dump(batches_train, output)
output = open(os.path.join(TPS_DIR, 'music.test'), 'wb')
pickle.dump(batches_test, output)
para = {}
para['user_num'] = user_num
para['item_num'] = item_num
para['review_num_u'] = u_text[0].shape[0]
para['review_num_i'] = i_text[0].shape[0]
para['review_len_u'] = u_text[1].shape[1]
para['review_len_i'] = i_text[1].shape[1]
para['user_vocab'] = vocabulary_user
para['item_vocab'] = vocabulary_item
para['train_length'] = len(y_train)
para['test_length'] = len(y_valid)
para['u_text'] = u_text
para['i_text'] = i_text
# print("user_num", para['user_num'])
# print("item num", para['item_num'])
# print("review_num_u", para["review_num_u"])
# print("review_num_i", para["review_num_i"])
# print("review_len_u", para["review_len_u"])
# print("review_len_i", para["review_len_i"])
# print("user_vocab", para["user_vocab"])
# print("item_vocab", para["item_vocab"])
# print("train_length", para["train_length"])
# print("test_length", para["test_length"])
# print("u_text", u_text[0])
# print("i_text", i_text[0])
output = open(os.path.join(TPS_DIR, 'music.para'), 'wb')
# Pickle the parameter dictionary.
pickle.dump(para, output)
gensim.py
# coding=utf-8
import os
import numpy as np
import pickle
import gensim
EMB_DIR = '../data/embedding'
EMB_file = os.path.join(EMB_DIR, 'google.bin')
W_USER_file = os.path.join(EMB_DIR, 'W_user.pk')
W_ITEM_file = os.path.join(EMB_DIR, 'W_item.pk')
PARA_DIR = '../data/music'
PARA_file = os.path.join(PARA_DIR, "music.para")
def get_embedding(vocab, embedding_dim, emb_file, W_file=None):
w = 0
initW = np.random.uniform(-1.0, 1.0, (len(vocab), embedding_dim))
print("Load word2vec file")
model = gensim.models.KeyedVectors.load_word2vec_format(emb_file, binary=True)
for word in vocab:
if word in model:
idx = vocab[word]
initW[idx] = np.array(model[word])
w += 1
print("number of pre-trained words", w)
print(initW)
return initW
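# Rows for words not found in the word2vec vocabulary keep their random uniform(-1, 1)
# initialisation; the `W_file` argument is unused here and kept only for interface
# compatibility.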
if __name__ == '__main__':
pkl_file = open(PARA_file, 'rb')
para = pickle.load(pkl_file)
vocabulary_user = para['user_vocab']
vocabulary_item = para['item_vocab']
embedding_dim = 300
init_Wu = get_embedding(vocabulary_user, embedding_dim, EMB_file)
init_Wi = get_embedding(vocabulary_item, embedding_dim, EMB_file)
pickle.dump(init_Wu, open(W_USER_file, 'wb'))
pickle.dump(init_Wi, open(W_ITEM_file, 'wb'))
print("get pre-trained words")
train.py
# coding=utf-8
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import pickle
import datetime
from model import NARRE
tf.flags.DEFINE_string("word2vec", "../data/music/google.bin", "Wor2vec file with pre-trained embedings (default: None)")
tf.flags.DEFINE_string("valid_data", "../data/music/music.test", "Data for validation")
tf.flags.DEFINE_string("para_data", "../data/music/music.para", "Data parameters")
tf.flags.DEFINE_string("train_data", "../data/music/music.train", "Data for training")
tf.flags.DEFINE_string("word_weight_user", "../data/embedding/W_user.pk", "word2vec file from user vocabulary")
tf.flags.DEFINE_string("word_weight_item", "../data/embedding/W_item.pk", "word2vec file from item vocabulary")
# ===================================================
# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of character embedding")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes ")
tf.flags.DEFINE_integer("num_filters", 100, "Number of filters per filter size")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability")
tf.flags.DEFINE_float("l2_reg_lambda", 0.001, "L2 regularization lambda")
# Training parameters
tf.flags.DEFINE_integer("batch_size", 50, "Batch Size")
tf.flags.DEFINE_integer("num_epochs", 40, "Number of training epochs ")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
def train_step(u_batch, i_batch, uid, iid, reuid, reiid, y_batch, batch_num):
"""
A single training step
"""
feed_dict = {
narre.input_u: u_batch,
narre.input_i: i_batch,
narre.input_uid: uid,
narre.input_iid: iid,
narre.input_y: y_batch,
narre.input_reuid: reuid,
narre.input_reiid: reiid,
narre.drop0: 0.8,
narre.dropout_keep_prob: FLAGS.dropout_keep_prob
}
_, step, loss, accuracy, mae, u_a, i_a, fm = sess.run(
[train_op, global_step, narre.loss, narre.accuracy, narre.mae, narre.u_a, narre.i_a, narre.score],
feed_dict
)
time_str = datetime.datetime.now().isoformat()
print("{}: step {}, loss {:g}, rmse {:g}, mae {:g}".format(time_str, batch_num, loss, accuracy, mae))
return accuracy, mae, u_a, i_a, fm
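# train_step feeds one mini-batch, runs a single optimisation step and returns the batch
# RMSE (reported as "accuracy"), MAE, both attention-weight tensors and the interaction score.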
def dev_step(u_batch, i_batch, uid, iid, reuid, reiid, y_batch, writer=None):
"""
Evaluates model on a dev set
"""
feed_dict = {
narre.input_u: u_batch,
narre.input_i: i_batch,
narre.input_y: y_batch,
narre.input_uid: uid,
narre.input_iid: iid,
narre.input_reuid: reuid,
narre.input_reiid: reiid,
narre.drop0:1.0,
narre.dropout_keep_prob: 1.0
}
step, loss, accuracy, mae = sess.run(
[global_step, narre.loss, narre.accuracy, narre.mae],
feed_dict
)
time_str = datetime.datetime.now().isoformat()
return loss, accuracy, mae
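# dev_step runs the same graph with dropout disabled (keep probabilities 1.0) and returns
# loss, RMSE ("accuracy") and MAE for the batch.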
if __name__ == '__main__':
FLAGS = tf.flags.FLAGS
FLAGS.flag_values_dict()
print("\nParameters:")
    for attr, value in sorted(FLAGS.flag_values_dict().items()):
        print("{}={}".format(attr.upper(), value))
print("")
print("Loading data...")
pkl_file = open(FLAGS.para_data, 'rb')
para = pickle.load(pkl_file)
user_num = para['user_num']
item_num = para['item_num']
review_num_u = para['review_num_u']
review_num_i = para['review_num_i']
review_len_u = para['review_len_u']
review_len_i = para['review_len_i']
vocabulary_user = para['user_vocab']
vocabulary_item = para['item_vocab']
train_length = para['train_length']
test_length = para['test_length']
u_text = para['u_text']
i_text = para['i_text']
np.random.seed(2017)
random_seed = 2017
print("user_num", user_num)
print("item_num", item_num)
print("review_num_u", review_num_u)
print("review_len_u", review_len_u)
print("review_num_i", review_num_i)
print("review_len_i", review_len_i)
with tf.Graph().as_default():
session_conf = tf.ConfigProto(
allow_soft_placement = FLAGS.allow_soft_placement,
log_device_placement = FLAGS.log_device_placement
)
session_conf.gpu_options.allow_growth = 0
sess = tf.Session(config=session_conf)
with sess.as_default():
narre = NARRE(
review_num_u = review_num_u,
review_num_i = review_num_i,
review_len_u = review_len_u,
review_len_i = review_len_i,
user_num = user_num,
item_num = item_num,
num_classes = 1,
user_vocab_size=len(vocabulary_user),
item_vocab_size=len(vocabulary_item),
embedding_size=FLAGS.embedding_dim,
embedding_id = 32,
filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
num_filters=FLAGS.num_filters,
l2_reg_lambda=FLAGS.l2_reg_lambda,
attention_size=32,
n_latent=32
)
tf.set_random_seed(random_seed)
print(user_num)
print(item_num)
global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.AdamOptimizer(0.002, beta1=0.9, beta2=0.999, epsilon=1e-8)
            train_op = optimizer.minimize(narre.loss, global_step=global_step)
            sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
if FLAGS.word2vec:
# load word weight user
initW = np.random.uniform(-1.0, 1.0, (len(vocabulary_user), FLAGS.embedding_dim))
W_u_file = open(FLAGS.word_weight_user, 'rb')
initW = pickle.load(W_u_file)
sess.run(narre.W1.assign(initW))
# load word weigth item
initW = np.random.uniform(-1.0, 1.0, (len(vocabulary_item), FLAGS.embedding_dim))
W_i_file = open(FLAGS.word_weight_item, 'rb')
initW = pickle.load(W_i_file)
sess.run(narre.W2.assign(initW))
print("get pre-trained initW")
epoch = 1
best_mae = 5
best_rmse = 5
train_mae = 0
train_rmse = 0
pkl_file = open(FLAGS.train_data, "rb")
train_data = pickle.load(pkl_file)
train_data = np.array(train_data)
pkl_file.close()
pkl_file = open(FLAGS.valid_data, "rb")
test_data = pickle.load(pkl_file)
test_data = np.array(test_data)
pkl_file.close()
data_size_train = len(train_data)
data_size_test = len(test_data)
batch_size = FLAGS.batch_size
ll = int(len(train_data) / batch_size)
            for epoch in range(FLAGS.num_epochs):
# Shuffle the data at each epoch
shuffle_indices = np.random.permutation(np.arange(data_size_train))
shuffled_data = train_data[shuffle_indices]
for batch_num in range(ll):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size_train)
data_train = shuffled_data[start_index: end_index]
uid, iid, reuid, reiid, y_batch = zip(*data_train)
u_batch = []
i_batch = []
for i in range(len(uid)):
u_batch.append(u_text[uid[i][0]])
i_batch.append(i_text[iid[i][0]])
u_batch = np.array(u_batch)
i_batch = np.array(i_batch)
t_rmse, t_mae, u_a, i_a, fm = train_step(u_batch, i_batch, uid, iid, reuid, reiid, y_batch, batch_num)
current_step = tf.train.global_step(sess, global_step)
train_rmse += t_rmse
train_mae += t_mae
if batch_num % 100 == 0 and batch_num > 1:
print("\nEvaluation")
print(batch_num)
loss_s = 0
accuracy_s = 0
mae_s = 0
ll_test = int(len(test_data) / batch_size) + 1
for batch_num in range(ll_test):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size_test)
data_test = test_data[start_index: end_index]
userid_valid, itemid_valid, reuid, reiid, y_valid = zip(*data_test)
u_valid = []
i_valid = []
for i in range(len(userid_valid)):
u_valid.append(u_text[userid_valid[i][0]])
i_valid.append(i_text[itemid_valid[i][0]])
u_valid = np.array(u_valid)
i_valid = np.array(i_valid)
loss, accuracy, mae = dev_step(u_valid, i_valid, userid_valid, itemid_valid, reuid, reiid, y_valid)
loss_s = loss_s + len(u_valid) * loss
accuracy_s = accuracy_s + len(u_valid) * np.square(accuracy)
mae_s = mae_s + len(u_valid) * mae
print("loss_valid {:g}, rmse_valid {:g}, mae_valid {:g}".format(loss_s / test_length,
np.sqrt(accuracy_s / test_length),
mae_s / test_length))
rmse = np.sqrt(accuracy_s / test_length)
mae = mae_s / test_length
if best_rmse > rmse:
best_rmse = rmse
if best_mae > mae:
best_mae = mae
print("")
print(str(epoch) + ":\n")
print("\nEvaluation:")
print("train: rmse, mae: ", train_rmse / ll, train_mae / ll)
train_rmse = 0
train_mae = 0
loss_s = 0
accuracy_s = 0
mae_s = 0
ll_test = int(len(test_data) / batch_size) + 1
for batch_num in range(ll_test):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size_test)
data_test = test_data[start_index: end_index]
userid_valid, itemid_valid, reuid, reiid, y_valid = zip(*data_test)
u_valid = []
i_valid = []
for i in range(len(userid_valid)):
u_valid.append(u_text[userid_valid[i][0]])
i_valid.append(i_text[itemid_valid[i][0]])
u_valid = np.array(u_valid)
i_valid = np.array(i_valid)
loss, accuracy, mae = dev_step(u_valid, i_valid, userid_valid, itemid_valid, reuid, reiid, y_valid)
loss_s = loss_s + len(u_valid) * loss
accuracy_s = accuracy_s + len(u_valid) * np.square(accuracy)
mae_s = mae_s + len(u_valid) * mae
print("loss_valid {:g}, rmse_valid {:g}, mae_valid {:g}".format(loss_s / test_length,
np.sqrt(accuracy_s / test_length),
mae_s / test_length))
rmse = np.sqrt(accuracy_s / test_length)
mae = mae_s / test_length
if best_rmse > rmse:
best_rmse = rmse
if best_mae > mae:
best_mae = mae
print("")
print("best rmse:", best_rmse)
print("best mae:", best_mae)
model.py
# coding=utf-8
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
class NARRE(object):
def __init__(
self, review_num_u, review_num_i, review_len_u, review_len_i, user_num, item_num, num_classes,
user_vocab_size, item_vocab_size, n_latent, embedding_id, attention_size,
embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):
        # Hyperparameters:
        # review_num_u : number of reviews kept per user
        # review_num_i : number of reviews kept per item
        # review_len_u : number of words per user review
        # review_len_i : number of words per item review
        # user_num: total number of users
        # item_num: total number of items
        # num_classes: output dimension of the rating prediction (1 for regression)
        # user_vocab_size: vocabulary size of the user-side review text
        # item_vocab_size: vocabulary size of the item-side review text
        # n_latent: dimension of the latent user/item feature vectors
        # embedding_id: dimension of the user/item id embeddings
        # attention_size: dimension of the attention hidden layer
        # embedding_size: dimension of the word embeddings
        # filter_sizes: CNN filter (window) sizes
        # num_filters: number of CNN filters per filter size
        # l2_reg_lambda: weight of the L2 regularization term
self.input_u = tf.placeholder(tf.int32, [None, review_num_u, review_len_u], name="input_u")
self.input_i = tf.placeholder(tf.int32, [None, review_num_i, review_len_i], name="input_i")
self.input_reuid = tf.placeholder(tf.int32, [None, review_num_u], name="input_reuid")
self.input_reiid = tf.placeholder(tf.int32, [None, review_num_i], name="input_reiid")
self.input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
self.input_uid = tf.placeholder(tf.int32, [None, 1], name="input_uid")
self.input_iid = tf.placeholder(tf.int32, [None, 1], name="input_iid")
self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
self.drop0 = tf.placeholder(tf.float32, name="dropout0")
iidW = tf.Variable(tf.random_uniform([item_num + 2, embedding_id], -0.1, 0.1), name="iidW")
uidW = tf.Variable(tf.random_uniform([user_num + 2, embedding_id], -0.1, 0.1), name="uidW")
l2_loss = tf.constant(0.0)
with tf.name_scope("user_embedding"):
self.W1 = tf.Variable(
tf.random_uniform([user_vocab_size, embedding_size], -1.0, 1.0),
name="W1"
)
self.embedded_user = tf.nn.embedding_lookup(self.W1, self.input_u)
self.embedded_users = tf.expand_dims(self.embedded_user, -1)
with tf.name_scope("item_embedding"):
self.W2 = tf.Variable(
tf.random_uniform([item_vocab_size, embedding_size], -1.0, 1.0),
name="W2"
)
self.embedded_item = tf.nn.embedding_lookup(self.W2, self.input_i)
self.embedded_items = tf.expand_dims(self.embedded_item, -1)
pooled_outputs_u = []
for i, filter_size in enumerate(filter_sizes):
with tf.name_scope("user_conv-maxpool-%s" % filter_size):
# Convolution Layer
filter_shape = [filter_size, embedding_size, 1, num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
self.embedded_users = tf.reshape(self.embedded_users, [-1, review_len_u, embedding_size, 1])
conv = tf.nn.conv2d(
self.embedded_users,
W,
strides=[1,1,1,1],
padding="VALID",
name="conv"
)
# Apply nonlinearity
h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
# Maxpooling over the outputs
pooled = tf.nn.max_pool(
h,
ksize=[1, review_len_u - filter_size + 1, 1, 1],
strides=[1,1,1,1],
padding="VALID",
name="pool"
)
pooled_outputs_u.append(pooled)
num_filters_total = num_filters * len(filter_sizes)
self.h_pool_u = tf.concat(pooled_outputs_u, 3)
self.h_pool_flat_u = tf.reshape(self.h_pool_u, [-1, review_num_u, num_filters_total])
pooled_outputs_i = []
for i, filter_size in enumerate(filter_sizes):
with tf.name_scope("item_conv-maxpool-%s" % filter_size):
# Convolution Layer
filter_shape = [filter_size, embedding_size, 1, num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
self.embedded_items = tf.reshape(self.embedded_items, [-1, review_len_i, embedding_size, 1])
conv = tf.nn.conv2d(
self.embedded_items,
W,
strides=[1,1,1,1],
padding="VALID",
name="conv"
)
# Apply nonlinearity
h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
# Maxpooling over the outputs
pooled = tf.nn.max_pool(
h,
ksize=[1, review_len_i - filter_size + 1, 1, 1],
strides=[1,1,1,1],
padding='VALID',
name="pool"
)
pooled_outputs_i.append(pooled)
num_filters_total = num_filters * len(filter_sizes)
self.h_pool_i = tf.concat(pooled_outputs_i, 3)
self.h_pool_flat_i = tf.reshape(self.h_pool_i, [-1, review_num_i, num_filters_total])
with tf.name_scope("dropout"):
self.h_drop_u = tf.nn.dropout(self.h_pool_flat_u, 1.0)
self.h_drop_i = tf.nn.dropout(self.h_pool_flat_i, 1.0)
with tf.name_scope("attention"):
# User attention
Wau = tf.Variable(
tf.random_uniform([num_filters_total, attention_size], -0.1, 0.1),
name="Wau"
)
Wru = tf.Variable(
tf.random_uniform([embedding_id, attention_size], -0.1, 0.1),
name="Wru"
)
Wpu = tf.Variable(
tf.random_uniform([attention_size, 1], -0.1, 0.1),
name="Wpu"
)
bau = tf.Variable(
tf.constant(0.1, shape=[attention_size]),
name="bau"
)
bbu = tf.Variable(
tf.constant(0.1, shape=[1]),
name="bbu"
)
self.iid_a = tf.nn.relu(tf.nn.embedding_lookup(iidW, self.input_reuid))
self.u_j = tf.einsum('ajk,kl->ajl', tf.nn.relu(
tf.einsum('ajk,kl->ajl', self.h_drop_u, Wau) + tf.einsum('ajk,kl->ajl', self.iid_a, Wru) + bau),
Wpu) + bbu
self.u_a = tf.nn.softmax(self.u_j, 1)
print(self.u_a)
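            # Review-level attention (user side): for each review l written by the user,
            # a score u_j[l] = ReLU(O_ul * Wau + iid_emb_l * Wru + bau) * Wpu + bbu is computed
            # from the review's CNN feature O_ul and the id embedding of the item the review
            # is about; u_a is the softmax of these scores over the user's reviews.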
# item attention
Wai = tf.Variable(
tf.random_uniform([num_filters_total, attention_size], -0.1, 0.1),
name="Wai"
)
Wri = tf.Variable(
tf.random_uniform([embedding_id, attention_size], -0.1, 0.1),
name="Wri"
)
Wpi = tf.Variable(
tf.random_uniform([attention_size, 1], -0.1, 0.1),
name="Wpi"
)
bai = tf.Variable(tf.constant(0.1, shape=[attention_size]), name="bai")
bbi = tf.Variable(tf.constant(0.1, shape=[1]), name="bbi")
self.uid_a = tf.nn.relu(tf.nn.embedding_lookup(uidW, self.input_reiid))
#self.i_j = tf.einsum('ajk,kl->ajl', tf.nn.relu(
# tf.einsum('ajk,kl->ajl', self.h_drop_i, Wai) + tf.einsum('ajk,kl->ajl', self.uid_a, Wri) + bai),
# Wpi) + bbi
sm111 = tf.reshape(self.h_drop_i, shape=[-1, num_filters_total])
sm11 = tf.matmul(sm111, Wai)
sm1 = tf.reshape(sm11, shape=[-1, review_num_i, attention_size])
sm2 = tf.reshape(tf.matmul(tf.reshape(self.uid_a, shape=[-1, embedding_id]), Wri),
shape=[-1, review_num_i, attention_size])
sm3 = tf.nn.relu(sm1 + sm2 + bai)
self.i_j = tf.reshape(tf.matmul(tf.reshape(sm3, shape=[-1, attention_size]), Wpi),
shape=[-1, review_num_i, 1]) + bbi
self.i_a = tf.nn.softmax(self.i_j, 1)
l2_loss += tf.nn.l2_loss(Wau)
l2_loss += tf.nn.l2_loss(Wru)
l2_loss += tf.nn.l2_loss(Wri)
l2_loss += tf.nn.l2_loss(Wai)
with tf.name_scope("add_reviews"):
self.u_feas = tf.reduce_sum(tf.multiply(self.u_a, self.h_drop_u), 1)
self.u_feas = tf.nn.dropout(self.u_feas, self.dropout_keep_prob)
self.i_feas = tf.reduce_sum(tf.multiply(self.i_a, self.h_drop_i), 1)
self.i_feas = tf.nn.dropout(self.i_feas, self.dropout_keep_prob)
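            # u_feas / i_feas are the attention-weighted sums of the per-review CNN features
            # (one num_filters_total-dimensional vector per user / item), with dropout applied.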
with tf.name_scope("get_fea"):
# user fusion (text review + dmf)
uidmf = tf.Variable(tf.random_uniform([user_num + 2, embedding_id], -0.1, 0.1), name="uidmf")
self.uid = tf.nn.embedding_lookup(uidmf, self.input_uid)
self.uid = tf.reshape(self.uid, [-1, embedding_id])
Wu = tf.Variable(
tf.random_uniform([num_filters_total, n_latent], -0.1, 0.1),
name="Wu"
)
bu = tf.Variable(
tf.constant(0.1, shape=[n_latent]),
name="bu"
)
self.u_feas = tf.matmul(self.u_feas, Wu) + self.uid + bu
# item fusion (text review + dmf)
iidmf = tf.Variable(tf.random_uniform([item_num + 2, embedding_id], -0.1, 0.1), name="iidmf")
self.iid = tf.nn.embedding_lookup(iidmf, self.input_iid)
self.iid = tf.reshape(self.iid, [-1, embedding_id])
Wi = tf.Variable(
tf.random_uniform([num_filters_total, n_latent], -0.1, 0.1),
name="Wi"
)
bi = tf.Variable(
tf.constant(0.1, shape=[n_latent]),
name="bi"
)
self.i_feas = tf.matmul(self.i_feas, Wi) + self.iid + bi
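            # Final user/item representations: the review-based feature is projected to
            # n_latent dims and added to a free id embedding (uidmf / iidmf), so the model can
            # fall back on collaborative signals when reviews are uninformative. Note the
            # addition requires embedding_id == n_latent (both 32 in train.py).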
with tf.name_scope("ncf"):
self.FM = tf.multiply(self.u_feas, self.i_feas)
self.FM = tf.nn.relu(self.FM)
self.FM = tf.nn.dropout(self.FM, self.dropout_keep_prob)
Wmul = tf.Variable(
tf.random_uniform([n_latent, 1], -0.1, 0.1),
name='wmul'
)
self.mul = tf.matmul(self.FM, Wmul)
self.score = tf.reduce_sum(self.mul, 1, keep_dims=True)
self.uidW2 = tf.Variable(tf.constant(0.1, shape=[user_num + 2]), name="uidW2")
self.iidW2 = tf.Variable(tf.constant(0.1, shape=[item_num + 2]), name="iidW2")
self.u_bias = tf.gather(self.uidW2, self.input_uid)
self.i_bias = tf.gather(self.iidW2, self.input_iid)
self.Feature_bias = self.u_bias + self.i_bias
self.bised = tf.Variable(tf.constant(0.1), name='bias')
self.predictions = self.score + self.Feature_bias + self.bised
with tf.name_scope("loss"):
losses = tf.nn.l2_loss(tf.subtract(self.predictions, self.input_y))
self.loss = losses + l2_reg_lambda * l2_loss
with tf.name_scope("accuracy"):
self.mae = tf.reduce_mean(tf.abs(tf.subtract(self.predictions, self.input_y)))
self.accuracy = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(self.predictions, self.input_y))))
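        # Prediction: r_hat = Wmul^T(dropout(ReLU(u_feas * i_feas))) + b_u + b_i + b_global,
        # i.e. an element-wise interaction of the fused user/item features plus user, item and
        # global bias terms. The loss is the summed squared error (tf.nn.l2_loss) plus
        # l2_reg_lambda times an L2 penalty on the attention weight matrices; "accuracy" here
        # is actually RMSE and "mae" the mean absolute error.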