Implementing a Recommender System with Model Blending (1): PMF with SGD


(1) The PMF algorithm

The basic idea of PMF is to define two parameter matrices W and U; for any (user, movie) pair (u, m), the prediction is the inner product $W_u^\top U_m$. The basics of the algorithm are covered in plenty of places online, so I won't go into detail here. A quick walkthrough of the program:
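For reference, this is the standard regularized squared-error objective that the SGD below minimizes (in the code, W and U appear as p and q, and the regularization weight $\lambda$ is _lambda; the bias terms from [1] are added on top of this):

$$\min_{W,U}\ \sum_{(u,m)\in\text{train}} \left(r_{um} - W_u^\top U_m\right)^2 + \lambda\left(\lVert W\rVert_F^2 + \lVert U\rVert_F^2\right)$$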

[0]: First, split the training data into 3 parts: the first for plain SGD training, the second for training the model blend, and the third for measuring the RMSE.

[1]: We train with SGD (stochastic gradient descent), which eventually gives us W and U. For better results, bias terms bu and bi are added as well, and they are also learned during training. The per-rating updates are written out below.
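Concretely, for each observed rating the inner loop of the code computes ($\alpha$ is the learning rate, $\lambda$ the regularization weight, k = F the number of factors):

$$\hat r_{ui} = b_u + b_i + \sum_{f=1}^{k} p_{uf}\,q_{if},\qquad e_{ui} = r_{ui} - \hat r_{ui}$$

$$b_u \leftarrow b_u + \alpha\,(e_{ui} - \lambda b_u),\qquad b_i \leftarrow b_i + \alpha\,(e_{ui} - \lambda b_i)$$

$$p_{uf} \leftarrow p_{uf} + \alpha\,(e_{ui}\,q_{if} - \lambda p_{uf}),\qquad q_{if} \leftarrow q_{if} + \alpha\,(e_{ui}\,p_{uf} - \lambda q_{if})$$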

[2]: Initialization: the parameters are generated randomly, scaled in proportion to $1/\sqrt{k}$ (k is the number of latent factors, F in the code).

[3]: Finally, the learning-rate schedule. I simply multiply the rate by 0.9 after each pass, but there are several other options (a small sketch of both follows this list):

method 1: update the learning rate heuristically: when the RMSE goes up, decrease the rate; otherwise it can be increased.

method 2: set the learning rate to $\alpha = \frac{\alpha_0}{1 + \text{iter}/d}$, where d is a constant that tunes how fast the rate decays; it keeps shrinking as the iterations go on.
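Neither variant appears in the code below, which just uses the plain ×0.9 decay; here is a minimal sketch of both, with function names and default constants of my own choosing:

def heuristic_rate(alpha, prev_rmse, cur_rmse, down=0.5, up=1.05):
    # method 1: shrink the rate when the RMSE got worse, grow it slightly otherwise
    return alpha*down if cur_rmse > prev_rmse else alpha*up

def inverse_decay(alpha0, step, d=10.0):
    # method 2: alpha0/(1 + step/d); a larger d slows the decay down
    return alpha0/(1.0 + step/d)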

[4] Also, to speed up training and avoid restarting from scratch every time, I save the training results to text files and just reload them on the next run. This does add some complexity to the program.

[5] Room for improvement: to keep the partial derivatives from getting stuck in a long flat region, we can introduce a momentum parameter. Each step then no longer uses the raw gradient expression alone, but rather $\text{partial}_d = \text{momentum}\cdot\text{partial}_d + \text{expression}$, which speeds up convergence. I haven't implemented it here, but it is fairly simple to do.
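For illustration, a minimal sketch of what the momentum update could look like for a single user factor, assuming a per-user velocity list v_pu initialized to zeros (the names and the beta value are mine, not part of the code below):

def momentum_step(pu, v_pu, f, grad, alpha, beta=0.9):
    # v <- momentum*v + raw gradient, then step along the smoothed direction
    v_pu[f] = beta*v_pu[f] + grad
    pu[f] += alpha*v_pu[f]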

Here is the code; the code for splitting the files is given at the end.

import random
import re
from collections.abc import Iterable
from collections import defaultdict
from code import read_file  # read_file comes from the helper module at the end (code.py)

def get_train(path=r'smaller_train.txt'):
    # load the ratings as {uid: {iid: rating}}
    train = defaultdict(dict)
    for uid,iid,r in read_file(path):
        train[uid][iid] = r
    return train
def write_file(data,path):
    # dump a parameter dict to text: one line per id, vectors as space-separated floats
    with open(path,'w') as file:
        for u_i,modelitems in data.items():
            if isinstance(modelitems,Iterable):
                file.write('{0} '.format(u_i)+' '.join('{0:.2f}'.format(x) for x in modelitems)+'\n')
            else:
                file.write('{0} '.format(u_i)+'{0:.3f}'.format(modelitems)+'\n')
def LFM(train,F,N,alpha,_lambda):
    # F: number of latent factors, N: passes, alpha: learning rate, _lambda: regularization weight
    (p,q,bu,bi) = init(train,F)
    for step in range(N):
        print(step)
        for u,user_items in train.items():
            pu = p[u]
            for i,r in user_items.items():
                pui = predict(u,i,p,q,bu,bi)
                eui = r - pui

                bu[u] += alpha*(eui - _lambda*bu[u])
                bi[i] += alpha*(eui - _lambda*bi[i])
                qi = q[i]
                for f in range(F):
                    puf = pu[f]  # keep the old value so the update of qi[f] uses it
                    pu[f] += alpha*(qi[f]*eui - _lambda*puf)
                    qi[f] += alpha*(puf*eui - _lambda*qi[f])
        alpha *= 0.9  # simple learning-rate decay, see [3]
        # checkpoint all parameters after every pass, see [4]
        write_file(p,'p{}.txt'.format(F))
        write_file(q,'q{}.txt'.format(F))
        write_file(bu,'bu{}.txt'.format(F))
        write_file(bi,'bi{}.txt'.format(F))
    return bu,bi,p,q

def predict(u,i,p,q,bu,bi):
    # returns None for users/items never seen during training (cold start)
    try:
        pu,qi,bu_,bi_ = p[u],q[i],bu[u],bi[i]
    except KeyError:
        return None
    return sum(pu[f]*qi[f] for f in range(len(pu)))+bu_+bi_

def init(train,F):
    # random init scaled by 1/sqrt(F), biases start at zero, see [2]
    p,q = dict(),dict()
    bu,bi = dict(),dict()
    for u in train:
        p[u] = [random.random()/F**0.5 for x in range(F)]
        bu[u] = 0
        for i in train[u]:
            if i not in q:
                bi[i] = 0
                q[i] = [random.random()/F**0.5 for x in range(F)]
    return p,q,bu,bi

def get_pq(index=5):
    # reload p, q, bu, bi from the text files written by write_file
    p,q,bu,bi = dict(),dict(),dict(),dict()
    names = ['p{0}.txt'.format(index),'q{0}.txt'.format(index),'bu{0}.txt'.format(index),'bi{0}.txt'.format(index)]
    for params,_name in zip([p,q,bu,bi],names):
        with open(_name) as f:
            for line in f:
                fields = [t for t in re.split(r'[;,\s\t\n]\s*',line) if t!='']
                uid = int(fields[0])
                # bias files hold a single value per id, p/q files hold a vector
                params[uid] = list(map(float,fields[1:])) if len(fields)!=2 else float(fields[1])
    return p,q,bu,bi

def RMSE(p,q,bu,bi):
    # evaluate on the held-out file; predictions are clamped to the 1-5 rating range
    error = 0
    cnt = 0
    for uid,iid,r in read_file(r'smaller_test.txt'):
        pr = predict(uid,iid,p,q,bu,bi)
        if pr is None:continue
        if pr<1:pr = 1.0
        if pr>5:pr = 5.0
        error += (r-pr)**2
        cnt += 1
    print(cnt,error,(error/cnt)**0.5)

if __name__ == '__main__':
    k = 50
    LFM(get_train(),k,50,0.02,0.01)  # k factors, 50 passes, alpha=0.02, lambda=0.01
    p,q,bu,bi = get_pq(index=k)
    RMSE(p,q,bu,bi)
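Note that LFM as written always starts from a fresh random init, so the checkpoints from [4] are only used for evaluation here. A minimal warm-start sketch for resuming training, assuming the p{F}.txt/q{F}.txt/bu{F}.txt/bi{F}.txt files from a previous run exist and cover every user and item in train (LFM_resume is my name, not part of the original code):

def LFM_resume(train,F,N,alpha,_lambda):
    # same SGD loop as LFM, but resume from the checkpoint files
    # written by a previous run instead of calling init()
    p,q,bu,bi = get_pq(index=F)
    for step in range(N):
        for u,user_items in train.items():
            pu = p[u]
            for i,r in user_items.items():
                eui = r - predict(u,i,p,q,bu,bi)
                bu[u] += alpha*(eui - _lambda*bu[u])
                bi[i] += alpha*(eui - _lambda*bi[i])
                qi = q[i]
                for f in range(F):
                    puf = pu[f]
                    pu[f] += alpha*(qi[f]*eui - _lambda*puf)
                    qi[f] += alpha*(puf*eui - _lambda*qi[f])
        alpha *= 0.9
        write_file(p,'p{}.txt'.format(F))
        write_file(q,'q{}.txt'.format(F))
        write_file(bu,'bu{}.txt'.format(F))
        write_file(bi,'bi{}.txt'.format(F))
    return bu,bi,p,q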

The code for splitting and reading the files (this is the code module imported above, so it should be saved as code.py):

from collections import defaultdict
import re

def read_file(r_path,num = 3):
    with open(r_path) as file:
        for line in file:
            fields = [t for t in re.split(r'[;,\s\t\n]\s*',line) if t!='']
            uid,iid = int(fields[0]),int(fields[1])
            if num==2:
                yield uid,iid  # test file: uid, item id
            else:
                yield uid,iid,float(fields[2])  # uid, item id, rating

def write_file(w_path,data):
    # write a nested {uid: {iid: rating}} dict as uid \t iid \t rating lines
    with open(w_path,'w') as file:
        for u,user in data.items():
            for iid,r in user.items():
                file.write('{0}\t{1}\t{2}\n'.format(u,iid,r))
def split():
    # hold out every M-th rating as a test set and keep the rest for training;
    # presumably run once on the full data (the commented-out paths) and once
    # more on real_train.txt to get the 3-way split described in [0]
    index = 0
    M = 10
    train = defaultdict(dict)
    test = defaultdict(dict)
    for uid,iid,r in read_file(r'real_train.txt'):
        index += 1
        if index%M == 0:
            test[uid][iid] = r
        else:
            train[uid][iid] = r

    w_path1,w_path2 = r'real_train.txt',r'real_test.txt'
    w_path3,w_path4 = r'smaller_train.txt',r'smaller_test.txt'
    write_file(w_path3,train)
    write_file(w_path4,test)
    #write_file(w_path1,train)
    #write_file(w_path2,test)
if __name__ == '__main__':
    pass  # call split() here to (re)generate the train/test files

On roughly 7 MB of data the RMSE comes out around 0.88, so there should still be room for further optimization.
