版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/pp634077956/article/details/53397421
(1)PMF算法
PMF的基本的思路,就是定义两个基本的参数W,U,然后对于任意一个组合(u,m),利用
$W_i \cdot U_j$(即用户隐向量与物品隐向量的内积)
,来获取预测值。这些基本的算法思路网上很多,就不细说了。简单说一下程序
[0]:一开始我们要将训练数据划分为3部分,第一部分用来做普通的SGD训练,第二部分用来训练模型融合,第三部分用来测试RMSE。
[1]:我们利用SGD(随机梯度下降)来训练函数,最后就可以得到W,U,为了更好的效果,还添加了偏置参数bu,bi,也要训练得到
[2]:初始值问题,我们随机生成参数,并且正比于
$1/\sqrt{k}$
.
[3]:最后是学习速率的变化,我简单的每次都乘以0.9。但是这里有好几种办法:
method1:
我们可以用启发式的算法来更新学习速率.当RMSE变大的时候就要减少速率,反之可以增大。
method2:
让学习速率等于
$\alpha_0/(1+iter\cdot d)$
,d是一个常数用来条件减小的速度,随着迭代的增加学习速率会越来越小。
[4]另外为了加快训练的速度,避免每次训练都要重头开始,我将训练的结果保存在文本里面,每次重新读取即可。所以这也增加了编写程序的复杂度
[5]可以优化的地方:为了避免偏导数进入一个长期平滑的区域,我们可以引入动量这个参数,叫做momentum,每次偏导不光等于它的数学表达式,而是等于
$partial = partial \cdot momentum + expression$
,这样可以加快收敛,在这里我就没有实现了。但是实现也比较简单。
下面是代码,分割文件的代码在最后给出
import numpy
from queue import PriorityQueue
from collections import Iterable,Counter,namedtuple,ChainMap,defaultdict
from functools import reduce
from itertools import groupby,chain,compress
from statistics import mean
from code import read_file
def get_train(path=r'smaller_train.txt'):
    """Load a ratings file into a nested mapping {uid: {iid: rating}}."""
    ratings = defaultdict(dict)
    for user, item, score in read_file(path):
        ratings[user][item] = score
    return ratings
def write_file(data, path):
    """Persist a model-parameter dict to *path*, one entry per line.

    Iterable values (latent vectors) are written as
    ``"<key> v1 v2 ..."`` with 2 decimals; scalar values (biases) as
    ``"<key> v"`` with 3 decimals.  Any existing file is overwritten.
    """
    # collections.Iterable was removed in Python 3.10; use collections.abc.
    from collections.abc import Iterable
    # A single 'w'-mode open replaces the original truncate-then-append
    # double open, which did the same thing with an extra syscall pair.
    with open(path, 'w') as file:
        for u_i, modelitems in data.items():
            if isinstance(modelitems, Iterable):
                file.write('{0} '.format(u_i) + ' '.join('{0:.2f}'.format(x) for x in modelitems) + '\n')
            else:
                file.write('{0} {1:.3f}\n'.format(u_i, modelitems))
def LFM(train, F, N, alpha, _lambda):
    """Train a biased matrix-factorization model by SGD.

    train   -- nested dict {uid: {iid: rating}}
    F       -- number of latent factors
    N       -- number of epochs
    alpha   -- initial learning rate (decayed by 0.9 each epoch)
    _lambda -- L2 regularization strength

    Saves p/q/bu/bi to ``p{F}.txt`` etc. via write_file and returns
    (bu, bi, p, q).
    """
    p, q, bu, bi = init(train, F)
    for step in range(N):
        print(step)
        # Debug trace; assumes user 1 exists in the training data.
        print(bu[1], p[1][1])
        for u, user_items in train.items():
            pu = p[u]
            for i, r in user_items.items():
                pui = predict(u, i, p, q, bu, bi)
                eui = r - pui
                # BUG FIX: the original assigned (=) instead of
                # accumulating (+=), discarding the bias learned so far.
                bu[u] += alpha * (eui - _lambda * bu[u])
                bi[i] += alpha * (eui - _lambda * bi[i])
                qi = q[i]
                for f in range(F):
                    # Snapshot p[u][f] so the q-update uses the
                    # pre-update value, as the SGD gradients require.
                    puf = pu[f]
                    pu[f] += alpha * (qi[f] * eui - _lambda * puf)
                    qi[f] += alpha * (puf * eui - _lambda * qi[f])
        alpha *= 0.9  # simple geometric learning-rate decay
    write_file(p, 'p{}.txt'.format(F))
    write_file(q, 'q{}.txt'.format(F))
    write_file(bu, 'bu{}.txt'.format(F))
    write_file(bi, 'bi{}.txt'.format(F))
    return bu, bi, p, q
def predict(u, i, p, q, bu, bi):
    """Predicted rating for (u, i): dot(p[u], q[i]) + bu[u] + bi[i].

    Returns None when the user or item never appeared in training.
    """
    try:
        pu, qi, bu_, bi_ = p[u], q[i], bu[u], bi[i]
    except KeyError:  # was a bare except that could hide real bugs
        return None
    return sum(a * b for a, b in zip(pu, qi)) + bu_ + bi_
def init(train, F):
    """Initialise latent factors (uniform, scaled by 1/sqrt(F)) and zero biases.

    Returns (p, q, bu, bi): user factors, item factors, user biases,
    item biases, keyed by the ids seen in *train*.
    """
    import random
    p, q = {}, {}
    bu, bi = {}, {}
    for user, items in train.items():
        bu[user] = 0
        p[user] = [random.random() / F ** 0.5 for _ in range(F)]
        for item in items:
            if item not in q:
                bi[item] = 0
                q[item] = [random.random() / F ** 0.5 for _ in range(F)]
    return p, q, bu, bi
def get_pq(sep='\t', index=5):
    """Reload model parameters previously saved by LFM.

    Reads ``p{index}.txt``, ``q{index}.txt``, ``bu{index}.txt`` and
    ``bi{index}.txt`` from the working directory.  A line with one
    value becomes a float (bias); longer lines become float lists
    (latent vectors).  Returns (p, q, bu, bi).
    """
    import re
    stores = (dict(), dict(), dict(), dict())
    splitter = re.compile(r'[;,\s\t\n]\s*')
    for store, prefix in zip(stores, ('p', 'q', 'bu', 'bi')):
        with open('{0}{1}.txt'.format(prefix, index)) as handle:
            for raw in handle:
                tokens = [tok for tok in splitter.split(raw) if tok != '']
                key = int(tokens[0])
                if len(tokens) == 2:
                    store[key] = float(tokens[1])
                else:
                    store[key] = list(map(float, tokens[1:]))
    return stores
def REMS(p, q, bu, bi):
    """Print and return the model's RMSE on ``smaller_test.txt``.

    Predictions are clipped to the valid rating range [1, 5];
    pairs with an unseen user or item (predict -> None) are skipped.
    Returns NaN when no pair could be scored.
    """
    error = 0.0
    cnt = 0
    for uid, iid, r in read_file(r'smaller_test.txt'):
        pr = predict(uid, iid, p, q, bu, bi)
        if pr is None:  # was `pr==None`; also removed dead mmin/mmax vars
            continue
        pr = min(5.0, max(1.0, pr))  # clip to rating scale
        error += (r - pr) ** 2
        cnt += 1
    # Guard against division by zero when nothing was scorable.
    rmse = (error / cnt) ** 0.5 if cnt else float('nan')
    print(cnt, error, rmse)
    return rmse
if __name__ == '__main__':
    # Train a k-factor model for 50 epochs, then reload the parameters
    # that LFM saved to disk and report RMSE on the held-out test file.
    k = 50
    LFM(get_train(),k,50,0.02,0.01)
    p,q,bu,bi = get_pq(index=k)
    REMS(p,q,bu,bi)
下面的代码(code 模块)用来分割数据文件和读取数据文件
import numpy
from queue import PriorityQueue
from collections import Iterable,Counter,namedtuple,ChainMap,defaultdict
from functools import reduce
from itertools import groupby,chain,compress
from statistics import mean
import re
def read_file(r_path, sep='\t', num=3):
    """Lazily yield parsed records from a ratings file.

    Fields may be separated by semicolons, commas, tabs or whitespace.
    Yields (uid, iid) when num == 2 (test files without ratings),
    otherwise (uid, iid, rating).
    """
    splitter = re.compile(r'[;,\s\t\n]\s*')
    with open(r_path) as handle:
        for raw in handle:
            fields = [tok for tok in splitter.split(raw) if tok != '']
            uid, iid = int(fields[0]), int(fields[1])
            if num == 2:
                yield uid, iid
            else:
                yield uid, iid, float(fields[2])
def write_file(w_path, data):
    """Write a nested ratings dict {uid: {iid: rating}} to *w_path* as TSV.

    One ``uid\\titem\\trating`` line per rating; any existing file is
    overwritten.
    """
    # A single 'w'-mode open replaces the original truncate-then-append
    # double open, which did the same thing with an extra syscall pair.
    with open(w_path, 'w') as file:
        for u, user in data.items():
            for iid, r in user.items():
                file.write('{0}\t{1}\t{2}\n'.format(u, iid, r))
def split(r_path=r'real_train.txt', M=10):
    """Split the ratings in *r_path* into train/test files.

    Every M-th record (default 10 -> a 10% holdout) is written to
    ``smaller_test.txt``; the rest go to ``smaller_train.txt``.
    Both parameters default to the original hard-coded values, so
    existing ``split()`` calls behave identically.
    """
    train = defaultdict(dict)
    test = defaultdict(dict)
    for index, (uid, iid, r) in enumerate(read_file(r_path), start=1):
        # Deterministic modulo split keeps the ratio exact.
        bucket = test if index % M == 0 else train
        bucket[uid][iid] = r
    write_file(r'smaller_train.txt', train)
    write_file(r'smaller_test.txt', test)
if __name__ == '__main__':
    # Intentionally a no-op; call split() manually to (re)generate
    # the smaller train/test files.
    pass