Manually reproducing the DeepWalk code with visual analysis

Original paper: DeepWalk: Online Learning of Social Representations
Paper close reading: DeepWalk, the art of random-walk graph embedding and the first fusion of natural language processing with graph processing

Environment configuration

Toolkit         Purpose
networkx        graph construction and network visual analysis
pandas          data analysis
numpy           numerical computation
tqdm            progress bars
gensim          natural language processing tools (word2vec)
scikit-learn    machine learning (PCA dimensionality reduction, t-SNE dimensionality reduction)
!pip install networkx pandas numpy tqdm gensim scikit-learn -i https://pypi.tuna.tsinghua.edu.cn/simple

Import toolkits

import networkx as nx

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
import random
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False    # render minus signs properly when using this font

Retrieve the data

(Screenshot: the graph-relationship crawler website, Seealsology.)

Seed Wikipedia pages used for the crawl:
https://en.wikipedia.org/wiki/Deep_learning
https://en.wikipedia.org/wiki/Brain_science
https://en.wikipedia.org/wiki/Granger_Analysis

df = pd.read_csv('seealsology-data.tsv', sep='\t')  # tab-separated edge list from the crawler
df.head()

(Screenshot: the first rows of the crawled edge list.)

Build an undirected graph

# build an undirected graph; edge_attr=True keeps the remaining columns as edge attributes
G = nx.from_pandas_edgelist(df, 'source', 'target', edge_attr=True, create_using=nx.Graph())
# nx.draw(G)  # drawing the full graph directly is slow, so it is skipped here
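
Before generating walks, a quick check of the graph size helps (a minimal sketch; the exact counts depend on the crawl):

print(len(G))               # number of nodes
print(G.number_of_edges())  # number of edges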

Function to generate a random walk node sequence

def get_randomwalk(node, path_length):
    """
    Given a start node and a path length, generate a random walk node sequence.
    node: current (start) node
    path_length: maximum sequence length
    return: random walk node sequence
    """
    # keep the start node
    random_walk = [node]
    
    for i in range(path_length-1):
        # collect neighbors that have not been visited yet
        temp = list(G.neighbors(node))
        temp = list(set(temp)-set(random_walk))
        
        # if there are no unvisited neighbors, the walk hit a dead end; stop early
        if len(temp) == 0:
            break
        
        # pick a random neighbor and walk to it
        random_node = random.choice(temp)
        random_walk.append(random_node)
        # the new node becomes the current node
        node = random_node
    
    return random_walk

Test the function. Because already-visited nodes are excluded from the candidates, a walk can hit a dead end and stop early; that is why only four nodes come back here despite requesting a length of 5:

get_randomwalk('deep learning', 5)
>>['deep learning',
 'vector quantization',
 'rate-distortion function',
 'white noise']
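
The walks are stochastic, so repeated calls return different sequences. Fixing Python's random seed makes a run reproducible (a sketch; the seed value is arbitrary):

random.seed(14)  # arbitrary seed, chosen only for reproducibility
print(get_randomwalk('deep learning', 5))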

Generate a random walk sequence

gamma = 10 # number of random walk sequences generated per start node
walk_length = 5 # maximum length of each walk

all_nodes = list(G.nodes()) # every node serves as a walk start point

random_walks = []

# iterate over every node
for n in tqdm(all_nodes):
    # generate gamma random walk sequences from this node
    for i in range(gamma):
        random_walks.append(get_randomwalk(n, walk_length))

random_walks[1]
>>['brain science',
 'neurophysiology',
 'computational neuroscience',
 'galves–löcherbach model']
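
A quick sanity check (sketch): each node spawns gamma walks, so the total count should be gamma times the number of nodes.

print(len(random_walks))  # expected: gamma * len(all_nodes)
print(len(random_walks) == gamma * len(all_nodes))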

Train the word2vec model

model = Word2Vec(
                vector_size=256, # embedding dimensionality
                window=4,        # context window width on each side
                sg=1,            # use the skip-gram architecture
                hs=0,            # no hierarchical softmax (use negative sampling instead)
                negative=10,     # number of negative samples
                alpha=0.03,      # initial learning rate
                min_alpha=0.0007,# minimum learning rate
                seed=14,         # random seed
)
model.build_vocab(random_walks, progress_per=2)
model.train(random_walks, total_examples=model.corpus_count, epochs=50, report_delay=1)
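
The same training can also be done in a single call by passing the walks to the constructor (an equivalent sketch; epochs is set explicitly because gensim's default is 5):

# one-shot alternative: gensim builds the vocabulary and trains immediately
# when sentences are supplied to the constructor
model = Word2Vec(
    sentences=random_walks,
    vector_size=256, window=4, sg=1, hs=0,
    negative=10, alpha=0.03, min_alpha=0.0007,
    seed=14, epochs=50,
)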

Analyze the word2vec results

# look up the embedding of a single node
print(model.wv.get_vector('brain science'))

# find the most similar terms
print(model.wv.similar_by_word('brain science'))
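
Pairwise cosine similarity between two nodes can be queried directly (a sketch; both term names are assumed to be present in this particular crawl):

# cosine similarity between two node embeddings
print(model.wv.similarity('brain science', 'computational neuroscience'))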

PCA dimensionality reduction: visualize the two-dimensional embedding of all terms

# get the embedding matrix of all nodes
X = model.wv.vectors

pca = PCA(n_components=2) # reduce to 2 dimensions
embed_2d = pca.fit_transform(X)

embed_2d.shape
>>(2880, 2)
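
How much of the original structure the 2D projection keeps can be estimated from the explained variance (sketch):

# fraction of total variance captured by the two principal components
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())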

Visualization:

plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])
plt.show()

(Scatter plot: 2D PCA embedding of all terms.)

Visualize the two-dimensional embedding of some terms

Use PageRank to find the 30 most important nodes:

pagerank = nx.pagerank(G)
node_importance = sorted(pagerank.items(), key=lambda x:x[1], reverse=True)

# keep the 30 most important terms
n = 30
term_chosen = []
for each in node_importance[:n]:
    term_chosen.append(each[0])

# map a term to its index in the word2vec vocabulary
term2index = model.wv.key_to_index

Visualization:

plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:,0], embed_2d[:,1])

for item in term_chosen:
    idx = term2index[item]
    plt.scatter(embed_2d[idx, 0], embed_2d[idx, 1], c='r', s=50)
    plt.annotate(item, xy=(embed_2d[idx,0], embed_2d[idx,1]), c='k', fontsize=12)
plt.show()

(Scatter plot: PCA embedding with the top-30 PageRank terms highlighted and labeled.)

Better dimensionality reduction: t-SNE visualization

tsne = TSNE(n_components=2, n_iter=1000) # 2D t-SNE with 1000 optimization iterations
embed_2d = tsne.fit_transform(X)

plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:,0], embed_2d[:,1])
plt.show()

(Scatter plot: 2D t-SNE embedding of all terms.)

plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:,0], embed_2d[:,1])

for item in term_chosen:
    idx = term2index[item]
    plt.scatter(embed_2d[idx, 0], embed_2d[idx, 1], c='r', s=50)
    plt.annotate(item, xy=(embed_2d[idx,0], embed_2d[idx,1]), c='k', fontsize=12)
plt.show()

(Scatter plot: t-SNE embedding with the top-30 PageRank terms highlighted and labeled.)
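
t-SNE on thousands of 256-dimensional vectors is slow; initializing the layout with PCA often converges faster and more stably (an optional variant, not in the original):

# variant: PCA initialization for a faster, more stable 2D t-SNE layout
tsne = TSNE(n_components=2, init='pca', n_iter=1000)
embed_2d = tsne.fit_transform(X)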

Visualize in 3D space

tsne = TSNE(n_components=3, n_iter=1000)
embed_3d = tsne.fit_transform(X)

plt.figure(figsize=(14,14))

ax = plt.axes(projection="3d")
ax.scatter3D(embed_3d[:,0], embed_3d[:,1], embed_3d[:,2])
plt.show()

(Scatter plot: 3D t-SNE embedding of all terms.)

Source: blog.csdn.net/D_Ddd0701/article/details/131151272