Predicting Ratings from Review Data in Python (Collaborative Filtering, LFM, Word Vectors)

I. Dataset

train.dat contains 393,366 records. In each line, the first field is the user ID, the second is the item ID, the third is the user's rating of the item, the fourth is the review count, and the fifth is the review text.

In test.dat, the first field is the user ID and the second is the item ID.
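For concreteness, a single train.dat line can be parsed like this (the field layout follows the description above; the sample values are made up):

line = "u123 i456 4 1 great phone battery lasts long"   # hypothetical sample record
fields = line.strip('\n').split(' ')
user, item = fields[0], fields[1]   # user ID, item ID
rating = int(fields[2])             # the user's rating of the item
n_reviews = int(fields[3])          # review count
review = ' '.join(fields[4:])       # free-text review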

II. Approach and Implementation

For now I'll just post the code; a detailed walkthrough of each step will be filled in later ~

1. Data preprocessing

Preprocess the review data with the nltk package: remove irrelevant common words and stem the review tokens, then save the result as newdat.dat.

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def textPrecessing(text):
    # Tokenize, drop English stop words, keep only nouns, then stem.
    wordLst = nltk.word_tokenize(text)
    filtered = [w for w in wordLst if w not in stopwords.words('english')]
    refiltered = nltk.pos_tag(filtered)
    filtered = [w for w, pos in refiltered if pos.startswith('NN')]
    ps = PorterStemmer()
    filtered = [ps.stem(w) for w in filtered]
    return " ".join(filtered)

 

def split_word():
    x = []
    with open("E:/project/o/comdata/train.dat", encoding='utf-8') as f:
        for data in f.readlines():
            data = data.strip("\n").split(" ")
            temp = []
            temp.append(' '.join(data[0:4]))   # user ID, item ID, rating, review count
            text = data[4:]                    # review tokens
            temp.append(textPrecessing(" ".join(text)))
            x.append(' '.join(temp))
    with open("E:/project/o/comdata/newdat.dat", "w", encoding='utf-8') as w:
        for i in x:
            w.write(i)
            w.write('\n')
    # the with blocks close both files automatically

 

traindata = []                          # the (user, item, rating, count) fields of each line
commentdata = []                        # the filtered review tokens of each line
stoplist = stopwords.words('english')   # stop-word list (assumed; the post never defines it)

with open('train.dat', encoding='UTF-8') as file_object:   # read the training set
    for j in range(393366):             # train.dat has 393,366 lines (see section I)
        line = file_object.readline()
        if line:
            traindata.append(line.strip('\n').split(' ')[0:4])
            word = line.strip('\n').split(' ')[4:]
            comm = [w for w in word if w not in stoplist]
            commentdata.append(comm)    # collect the review tokens
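A quick sanity check of what the preprocessing keeps (the sample sentence is made up, and the exact output depends on the NLTK data installed):

print(textPrecessing("The battery of this phone lasts very long"))
# only nouns survive, stemmed, e.g. something like: batteri phone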

 

2. Build the user-to-ID and item-to-ID dictionaries and the user-item rating matrix

 

import numpy as np

useridlist = {}   # user name -> row index
itemidlist = {}   # item name -> column index
num = 0
mun = 0
for item in traindata:
    if item[0] not in useridlist:
        useridlist[item[0]] = num
        num += 1
for item in traindata:
    if item[1] not in itemidlist:
        itemidlist[item[1]] = mun
        mun += 1

scorematr = np.zeros((len(useridlist), len(itemidlist)))   # rating matrix: rows = users, columns = items
for item in traindata:
    i = int(useridlist[item[0]])
    j = int(itemidlist[item[1]])
    scorematr[i][j] = int(item[2])
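Since each user rates only a handful of items, scorematr is extremely sparse; a quick check of its density:

observed = np.count_nonzero(scorematr)   # number of observed ratings
print("density: %.4f%%" % (100.0 * observed / scorematr.size))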

 

3. Trying the LFM (latent factor model) approach

import math

def Lfm(T, k):
    # Factorize T ≈ u · v by SGD with learning rate alpha and
    # L2 regularization lambda_, updating only on observed ratings.
    alpha = np.float32(0.05)
    lambda_ = np.float32(0.08)
    m, n = T.shape
    u = np.float32(np.random.rand(m, k))
    v = np.float32(np.random.rand(k, n))
    for step in range(500):
        for i in range(m):
            for j in range(n):
                if math.fabs(T[i][j]) > 1e-4:   # skip missing entries
                    err = T[i][j] - np.dot(u[i], v[:, j])
                    for f in range(k):
                        du = err * v[f][j] - lambda_ * u[i][f]
                        dv = err * u[i][f] - lambda_ * v[f][j]
                        u[i][f] += alpha * du
                        v[f][j] += alpha * dv
    return u, v

# u, v = Lfm(scorematr, k)
# print(np.dot(u, v))
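A quick way to sanity-check the factorization is to compare the reconstruction against the observed entries. A minimal sketch, assuming scorematr from step 2 and an arbitrarily chosen k = 10:

u, v = Lfm(scorematr, 10)
pred = np.dot(u, v)
mask = scorematr > 1e-4   # observed ratings only
rmse = np.sqrt(np.mean((scorematr[mask] - pred[mask]) ** 2))
print("training RMSE:", rmse)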

4. Trying LDA / word2vec prediction based on the reviews

LDA model

from gensim import corpora
from gensim.models.ldamodel import LdaModel

dictionary = corpora.Dictionary(commentdata)   # build the dictionary
# filter_tokens expects token IDs, not raw words, so map the words first
badwords = ['i', 'the', 'is', 'and', 'very', 'a', 'an', 'had', 'about',
            'for', 'it', 'if', 'of', 'to']
dictionary.filter_tokens(bad_ids=[dictionary.token2id[w]
                                  for w in badwords if w in dictionary.token2id])
dictionary.filter_extremes(no_above=40/100)
# dictionary.filter_n_most_frequent(8)
print(dictionary.token2id)   # print the token-to-ID table
corpus = [dictionary.doc2bow(text) for text in commentdata]
print(corpus[3:5])
print("corpus----")
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)   # train the LDA model
lda.save("./lda.model")
# lda = LdaModel.load("./lda.model")
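The getHuik code further down assumes a doc_topic structure holding each review's topic distribution as (topic_id, probability) pairs. The original post never shows how it is built; one plausible way uses gensim's get_document_topics (minimum_probability=0.0 makes it return all topics in topic-ID order, which is what the later indexing relies on):

doc_topic = [lda.get_document_topics(bow, minimum_probability=0.0)
             for bow in corpus]   # one (topic_id, prob) list per review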

 

word2vec model

 model="./pinglun.model"

     if os.path.exists(model):

       model=gensim.models.Word2Vec.load("./pinglun.model")      

       

     else :

model = gensim.models.word2vec.Word2Vec(commentdata, size=200,

workers=7,min_count=2)  #训练 word2vec模型

       model.save("./pinglun.model")
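A quick sanity check of the trained vectors (the query word 'price' is just a hypothetical example and must appear in the vocabulary):

print(model.most_similar('price', topn=5))   # 5 nearest neighbours by cosine similarity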

 

(1) Computing θuij
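The functions below rely on several lookup structures that the original post never defines: Bui maps a (user, item) pair to the index of its review line, Cu counts how many items each user rated, Ci counts how many users rated each item, and u0/i0 are the user and item counts. A minimal sketch of how they could be built (the construction is my assumption; the names follow the code below):

u0, i0 = len(useridlist), len(itemidlist)
Bui = np.zeros((u0, i0))   # (user, item) -> index of the corresponding review line
Cu = np.zeros(u0)          # number of items rated by each user
Ci = np.zeros(i0)          # number of users who rated each item
for comid, item in enumerate(traindata):
    u = int(useridlist[item[0]])
    i = int(itemidlist[item[1]])
    Bui[u][i] = comid
    Cu[u] += 1
    Ci[i] += 1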

θ from the LDA model

def getHuik(doc_topic, n, bo):   # compute Huik, i.e. θuij
    Huikarr = np.zeros((u0, i0, n))
    for ui in traindata:
        u = int(useridlist[ui[0]])
        i = int(itemidlist[ui[1]])
        comid = Bui[u][i]                  # index of the review written by u about i
        item = doc_topic[int(comid)]       # topic distribution of that review
        for topid in range(n):
            Huikarr[u][i][topid] = float(item[topid][1])   # probability of topic topid for review (u, i)
            bo[int(comid)][topid] = float(item[topid][1])
    return Huikarr

θ from the word2vec model

def getHuik(doc_topic):   # compute Huik from word2vec similarities
    # topic and topictokenlist (topic keywords and their indices) are
    # defined elsewhere in the original project and never shown in the post.
    Huikarr = np.zeros((u0, i0, len(topictokenlist)))
    model = gensim.models.Word2Vec.load("./pinglun.model")
    for top in topic:
        topid = topictokenlist[top]
        temp = topic[topid]
        # feature-word set of the topic: its 50 nearest neighbours
        simlar = [w for w, score in model.most_similar(temp, topn=50)]
        for ui in traindata:
            u = int(useridlist[ui[0]])
            i = int(itemidlist[ui[1]])
            comid = Bui[u][i]   # index of the review for (u, i)
            tempcom = 0         # similarity sum of the topic's feature words in this review
            for item in commentdata[int(comid)]:   # walk the review tokens
                if item in simlar:
                    tempcom += model.similarity(item, temp)
            Huikarr[u][i][topid] = tempcom
    return Huikarr   # moved out of the loops: the original returned after the first topic

 

(2) Computing puj0, the unnormalized user-topic preference
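Written out as a formula (my reading of the code below, not something stated in the original post): the unnormalized user-topic preference averages θ over the items the user rated, and step (3) then normalizes it over topics:

\[ p^{0}_{uj} = \frac{1}{C_u}\sum_{i} \theta_{uij}, \qquad p_{uj} = \frac{p^{0}_{uj}}{\sum_{j'} p^{0}_{uj'}} \]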

 

def getpuj0(u, j):   # compute a single puj0
    Huij = getHuik()   # the θuij array (computing this once outside the loops would be much cheaper)
    tempujs = 0
    for item in traindata:
        i = int(itemidlist[item[1]])   # item index
        tempujs += Huij[u][i][j]
    return tempujs / Cu[u]   # puj0

Puj0 = np.zeros((len(useridlist), len(topictokenlist)))

 

 

def getPuj0():   # fill the whole Puj0 array
    for u in range(len(useridlist)):
        for j in range(len(topictokenlist)):
            Puj0[u][j] = getpuj0(u, j)
    return Puj0

(3) Computing puj, the normalized preference

 

Puj1 = np.zeros((len(useridlist), len(topictokenlist)))

def getPuj1():   # normalize Puj0 over topics
    bottom = np.zeros((len(useridlist)))
    for u in range(len(useridlist)):
        temp = 0
        for j in range(len(topictokenlist)):
            temp += Puj0[u][j]
        bottom[u] = temp
    for u in range(len(useridlist)):
        for j in range(len(topictokenlist)):
            Puj1[u][j] = Puj0[u][j] / bottom[u]
    return Puj1

 

(4) Computing qij0, the unnormalized item-topic weight

 

def getqij0(i, j):   # compute a single qij0
    Huij = getHuik()   # the θuij array
    tempu = 0
    for u in range(len(useridlist)):
        tempu += Huij[u][i][j]
    return tempu / Ci[i]   # qij0

Qij0 = np.zeros((len(itemidlist), len(topictokenlist)))

 

def getQij0():   # fill the whole Qij0 array
    for i in range(len(itemidlist)):
        for j in range(len(topictokenlist)):
            Qij0[i][j] = getqij0(i, j)
    return Qij0

 

 

(5) Computing qij, the normalized weight

Qij1 = np.zeros((len(itemidlist), len(topictokenlist)))

def getQij1():   # normalize Qij0 over topics
    bottom = np.zeros((len(itemidlist)))
    for i in range(len(itemidlist)):
        temp = 0
        for j in range(len(topictokenlist)):
            temp += Qij0[i][j]
        bottom[i] = temp
    for i in range(len(itemidlist)):
        for j in range(len(topictokenlist)):
            Qij1[i][j] = Qij0[i][j] / bottom[i]
    return Qij1

(6) Generating the topic distribution of an item i that user u has not reviewed
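Written out (again my reading of the code): the topic distribution of an unreviewed (u, i) pair is the elementwise product of the user's and the item's topic preferences, renormalized over topics:

\[ T^{0}_{uij} = p_{uj}\, q_{ij}, \qquad T_{uij} = \frac{T^{0}_{uij}}{\sum_{j'} T^{0}_{uij'}} \]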

def getTuij0(Huij):
    Qij1 = getQij1()
    Puj1 = getPuj1()
    Tuij0 = np.zeros((393366, 5))   # indexed by review-line index, one row per (u, i) pair
    for u in range(len(useridlist)):
        for i in range(len(itemidlist)):
            mm = int(Bui[u][i])
            for j in range(5):
                Tuij0[mm][j] = Puj1[u][j] * Qij1[i][j]
    return Tuij0

 

 

def getTuij_input(Huij):
    Tuij_input = np.zeros((393366, 5))
    bottom = np.zeros((393366, 1))
    Tuij0 = getTuij0(Huij)
    for u in range(len(useridlist)):
        for i in range(len(itemidlist)):
            temp = 0
            hh = int(Bui[u][i])
            for j in range(5):
                temp += Tuij0[hh][j]
            bottom[hh] = temp
    for u in range(len(useridlist)):
        for i in range(len(itemidlist)):
            tok = int(Bui[u][i])
            for j in range(5):
                Tuij_input[tok][j] = Tuij0[tok][j] / bottom[tok]
    return Tuij_input

 

(7) Linear regression and prediction
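The regression below needs a target vector mark (the known ratings, aligned with the review-line index used by bo) and the parsed testdata pairs; neither is defined in the original post. One hedged way to build them:

mark = np.array([int(item[2]) for item in traindata])   # ratings, aligned with review lines

testdata = []
with open('test.dat', encoding='UTF-8') as f:
    for line in f:
        testdata.append(line.strip('\n').split(' ')[0:2])   # (user, item) pairs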

import sklearn.linear_model

bo = np.zeros((393366, 5))
m = getHuik(doc_topic, 5, bo)   # fills bo with the per-review topic features
clf = sklearn.linear_model.LinearRegression()
clf.fit(bo, mark)               # regress the known ratings on the topic features

Test_input = getTuij_input(m)
print("Results:")
for item in testdata:
    print(item)
    u = int(useridlist[item[0]])
    i = int(itemidlist[item[1]])
    row = Test_input[int(Bui[u][i])].reshape(1, -1)   # predict expects a 2-D array;
                                                      # the original indexed Test_input[u][i], which mixes up the axes
    print(clf.predict(row))   # predicted rating

 


Reposted from blog.csdn.net/m0_37783096/article/details/80837001