版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/lily960427/article/details/78616904
LINE 论文中采用了 alias 采样算法进行优化,其源码为 C++,现用 Python 实现一遍,加深一下印象。
网上有人已经用C++ 脱离LINE算法,单独实现了这个算法并且测试,可以先看看。
而 alias 算法的原理可以看我的上一篇博客,其中包括 C++ 源码的分析,所以 Python 代码就不加注释了。
python代码:
from gensim.models import Word2Vec
import numpy as np
# Number of edges; 0 until the main script stores the result of readdata() here.
edg_num = 0
# Parallel per-edge columns filled by readdata():
#   u - start node of each edge, v - target node, w - edge weight.
u, v, w = [], [], []
def readdata():
    """Load the weighted edge list from ``weight.txt`` into the module lists.

    Every non-blank line must start with three whitespace-separated integers
    ``u v weight``; any further fields on the line are ignored. The three
    values are appended to the module-level lists ``u``, ``v`` and ``w``.

    Returns:
        int: the number of edges read by this call.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if one of the first three fields is not an integer.
    """
    edge_count = 0
    # The context manager closes the file even when a line fails to parse.
    with open("weight.txt", "r", encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (typically a trailing newline) carry no edge.
                continue
            # Parse the whole row before appending so a malformed row never
            # leaves the three lists with different lengths.
            src, dst, weight = int(fields[0]), int(fields[1]), int(fields[2])
            u.append(src)
            v.append(dst)
            w.append(weight)
            edge_count += 1
    return edge_count
def initAliasTable(weights=None):
    """Build alias-method tables for sampling an index in proportion to its weight.

    Args:
        weights: optional sequence of non-negative numbers. When omitted, the
            first ``edg_num`` entries of the module-level list ``w`` are used,
            which is what the zero-argument call has always done.

    Returns:
        tuple: ``(prob, alias)``, two lists with one slot per weight.
        ``prob[k]`` is the probability of keeping slot ``k`` once it has been
        picked uniformly; otherwise the sample is ``alias[k]``. Slots with
        ``prob[k] == 1.0`` never use their alias, which stays ``-1``.

    Raises:
        ValueError: if a weight is negative or the weights do not sum to a
            positive value.
    """
    if weights is None:
        weights, count = w, edg_num
    else:
        count = len(weights)

    alias = [-1] * count
    prob = [0.0] * count
    if count == 0:
        return prob, alias

    total = 0
    for i in range(count):
        if weights[i] < 0:
            raise ValueError("edge weights must be non-negative")
        total += weights[i]
    if total <= 0:
        raise ValueError("edge weights must sum to a positive value")

    # Scale the weights so that the average slot holds exactly 1.
    norm_prob = [weights[i] * count / total for i in range(count)]

    # Work stacks: slots holding less than 1 need a donor, slots holding more
    # than 1 can donate. Slots at exactly 1 are already complete. Indices are
    # pushed in descending order so the lowest index is popped first.
    small = []
    large = []
    for i in reversed(range(count)):
        if norm_prob[i] < 1:
            small.append(i)
        elif norm_prob[i] > 1:
            large.append(i)
        else:
            prob[i] = 1.0

    while small and large:
        small_cur = small.pop()
        large_cur = large.pop()
        prob[small_cur] = norm_prob[small_cur]
        alias[small_cur] = large_cur
        # The donor gives away exactly the part the small slot was missing.
        norm_prob[large_cur] = norm_prob[large_cur] + norm_prob[small_cur] - 1.0
        if norm_prob[large_cur] < 1:
            small.append(large_cur)
        elif norm_prob[large_cur] > 1:
            large.append(large_cur)
        else:
            prob[large_cur] = 1.0

    # Anything still queued differs from 1 only by floating-point rounding.
    # Finalise it as a full slot so sampling can never return the -1 alias.
    for leftover in small + large:
        prob[leftover] = 1.0
    return prob, alias
def sampleanedg(prob,alias):
    """Draw one index from an alias table.

    Args:
        prob: per-slot keep probabilities, as returned by ``initAliasTable``.
        alias: per-slot fallback indices, the same length as ``prob``.

    Returns:
        The uniformly chosen slot ``k`` when a second uniform draw falls
        below ``prob[k]``, otherwise ``alias[k]``.

    Raises:
        IndexError: if the table is empty.
    """
    # Size the draw from the table itself, not from a module-level counter,
    # so the function works for whatever table it is handed.
    table_size = len(prob)
    if table_size == 0:
        raise IndexError("cannot sample from an empty alias table")
    rand1 = np.random.uniform(0, 1)
    rand2 = np.random.uniform(0, 1)
    k = int(rand1 * table_size)
    return k if rand2 < prob[k] else alias[k]
if __name__ == '__main__':
    # Load the edges, build the alias tables once, then draw 60 samples.
    edg_num = readdata()
    prob, alias = initAliasTable()
    print("prob=", prob)
    print("alias=", alias)
    for _ in range(60):
        edg_cur = sampleanedg(prob, alias)
        # Sampled indices are 0-based; the report numbers edges from 1.
        print("采样的边序号为", edg_cur + 1)
采样了60次,其中18次为权重最大的边,可见权重越大的边越可能被采样。