Manually reproduce DeepWalk code and realize visual analysis
- Environment configuration
- import toolkit
- retrieve data
- Build an undirected graph
- Generate random walk node sequence function
- Generate a random walk sequence
- training word2vec model
- Analyzing word2vec results
- PCA dimensionality reduction, visualization of two-dimensional embedding of all entries
- Visualize the two-dimensional embedding of some terms
- Better dimensionality reduction, TSNE dimensionality reduction visualization
Original link: DeepWalk: Online Learning of Social Representations
paper intensive reading: DeepWalk, the art of random walk graph embedding, the first integration of natural language processing and graph processing
Environment configuration
Toolkit | Purpose |
---|---|
networkx | Graph network visualization analysis |
pandas | data analysis |
numpy | data analysis |
tqdm | progress bar |
gensim | Natural language processing related tools (word2vec) |
scikit-learn | Machine Learning (PCA Dimensionality Reduction, t-SNE Dimensionality Reduction) |
!pip install networkx pandas numpy tqdm gensim scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple
import toolkit
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import random
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Use the SimHei font for sans-serif text so that Chinese characters in plot
# labels can be rendered (assumes SimHei is installed on this machine).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
retrieve data
Graph Relationship Crawler Website
Instructions for crawling website information:
https://en.wikipedia.org/wiki/Deep_learning
https://en.wikipedia.org/wiki/Brain_science
https://en.wikipedia.org/wiki/Granger_Analysis
# Load the crawled edge list (tab-separated values) and preview the first rows.
data_path = 'seealsology-data.tsv'
df = pd.read_csv(data_path, sep='\t')
df.head()
Build an undirected graph
# Build an undirected graph from the edge list: the 'source' and 'target'
# columns name the two endpoints, and edge_attr=True attaches the remaining
# columns of df to each edge as attributes.
G = nx.from_pandas_edgelist(
    df,
    source='source',
    target='target',
    edge_attr=True,
    create_using=nx.Graph(),
)
# nx.draw(G)
Generate random walk node sequence function
def get_randomwalk(node, path_length, graph=None):
    """
    Generate one random walk that starts at ``node`` and never revisits a node.

    At every step the walk moves to a randomly chosen neighbor of the current
    node that is not yet on the walk; it stops early when no such neighbor
    exists.

    node: starting node
    path_length: maximum number of nodes in the returned walk
    graph: object exposing ``neighbors(node)``; defaults to the module-level
        graph ``G``
    return: list of the visited nodes, beginning with ``node``
    """
    if graph is None:
        graph = G

    # The walk always begins with the starting node.
    random_walk = [node]
    # Same nodes as random_walk, kept as a set for fast membership checks.
    visited = {node}

    for _ in range(path_length - 1):
        # Neighbors not yet on the walk, in the order the graph yields them,
        # so the candidate order does not depend on set/hash ordering.
        candidates = [nbr for nbr in graph.neighbors(node) if nbr not in visited]
        # Dead end: nowhere new to go, so the walk ends here.
        if not candidates:
            break
        # Pick one candidate at random and make it the current node.
        node = random.choice(candidates)
        random_walk.append(node)
        visited.add(node)

    return random_walk
Test function:
get_randomwalk('deep learning', 5)
>>['deep learning',
'vector quantization',
'rate-distortion function',
'white noise']
Generate a random walk sequence
gamma = 10       # number of random walks started from each node
walk_length = 5  # maximum number of nodes in one walk

# Every node of the graph is used as a starting point.
all_nodes = list(G.nodes())

random_walks = []
# Visit each node ...
for n in tqdm(all_nodes):
    # ... and start gamma independent walks from it.
    for _ in range(gamma):
        random_walks.append(get_randomwalk(n, walk_length))
random_walks[1]
>>['brain science',
'neurophysiology',
'computational neuroscience',
'galves–löcherbach model']
training word2vec model
# Configure a skip-gram Word2Vec model trained with negative sampling; the
# corpus itself is supplied afterwards through build_vocab() and train().
model = Word2Vec(
    sg=1,              # skip-gram mode
    hs=0,              # no hierarchical softmax
    negative=10,       # negative sampling
    vector_size=256,   # embedding dimensionality
    window=4,          # context window width on each side
    alpha=0.03,        # learning rate
    min_alpha=0.0007,  # minimum learning rate
    seed=14,           # random seed
)

# Build the vocabulary from the random-walk sequences.
model.build_vocab(random_walks, progress_per=2)

# Train on the same sequences for 50 epochs.
model.train(
    random_walks,
    total_examples=model.corpus_count,
    epochs=50,
    report_delay=1,
)
Analyzing word2vec results
query_term = 'brain science'
# Show the learned embedding vector of the query node.
print(model.wv.get_vector(query_term))
# Show the terms most similar to the query node.
print(model.wv.similar_by_word(query_term))
PCA dimensionality reduction, visualization of two-dimensional embedding of all entries
# Reduce to 2 output dimensions.
pca = PCA(n_components=2)
# Embedding vectors of all nodes in the vocabulary.
X = model.wv.vectors
embed_2d = pca.fit_transform(X)
embed_2d.shape
>>(2880, 2)
Visualization:
# Scatter plot of every node in the 2-D PCA space.
plt.figure(figsize=(14, 14))
xs, ys = embed_2d[:, 0], embed_2d[:, 1]
plt.scatter(xs, ys)
plt.show()
Visualize the two-dimensional embedding of some terms
Use PageRank to find the top 30 nodes of importance:
# PageRank score of every node in the graph.
pagerank = nx.pagerank(G)
# (node, score) pairs ordered from the highest score to the lowest.
node_importamce = sorted(
    pagerank.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Keep the names of the n highest-ranked nodes.
n = 30
term_chosen = [term for term, _score in node_importamce[:n]]

# Maps a term to its index in the model vocabulary.
term2index = model.wv.key_to_index
visualization
plt.figure(figsize=(14, 14))
# All nodes form the background of the plot.
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])

# Highlight each selected term in red and label it with its name.
for item in term_chosen:
    px, py = embed_2d[term2index[item]]
    plt.scatter(px, py, c='r', s=50)
    plt.annotate(item, xy=(px, py), c='k', fontsize=12)
plt.show()
Better dimensionality reduction, TSNE dimensionality reduction visualization
# Project the embeddings to 2-D with t-SNE. The iteration count is left at
# its default (1000): the keyword for it was renamed from n_iter to max_iter
# in scikit-learn 1.5, so omitting it keeps this cell working on both sides
# of the rename.
tsne = TSNE(n_components=2)
embed_2d = tsne.fit_transform(X)

# Scatter plot of every node in the 2-D t-SNE space.
plt.figure(figsize=(14, 14))
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])
plt.show()
plt.figure(figsize=(14, 14))
# All nodes form the background of the plot.
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])

# Highlight each selected term in red and label it with its name.
for item in term_chosen:
    px, py = embed_2d[term2index[item]]
    plt.scatter(px, py, c='r', s=50)
    plt.annotate(item, xy=(px, py), c='k', fontsize=12)
plt.show()
Visualize in 3D space
# Project the embeddings to 3-D with t-SNE. The iteration count is left at
# its default (1000): the keyword for it was renamed from n_iter to max_iter
# in scikit-learn 1.5, so omitting it keeps this cell working on both sides
# of the rename.
tsne = TSNE(n_components=3)
embed_3d = tsne.fit_transform(X)

# Scatter plot of every node on 3-D axes.
plt.figure(figsize=(14, 14))
ax = plt.axes(projection="3d")
ax.scatter3D(embed_3d[:, 0], embed_3d[:, 1], embed_3d[:, 2])
plt.show()