Natural language processing: visualizing semantic word relationships on a two-dimensional plane (a scatter plot of the relationships between word vectors)

Semantic relationships between words are very useful, and visualizing them can reveal some interesting findings.
Here we place US city names on a two-dimensional semantic map based on the distances between their Word2vec vectors, that is, we project the Word2vec distances onto a two-dimensional plane. Using the pretrained Google News model (googlenews-vectors-negative300.bin.gz), we find that some cities of similar size and culture, although geographically far apart, cluster tightly together in the semantic space.

from nlpia.data.loaders import get_data  # only needed for the optional cached dataset below
from gensim.models.keyedvectors import KeyedVectors
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load all word vectors from the Word2vec model trained on the Google News corpus
wv = KeyedVectors.load_word2vec_format('xxx\\googlenews-vectors-negative300.bin.gz',
                                       binary=True)
# The Google News Word2vec model is huge: 3 million words, each with a 300-dimensional vector
print(len(wv.vocab))

# Word2vec word frequencies
vocab = pd.Series(wv.vocab)
# The values in this key-value mapping are gensim Vocab objects; each one holds not only the
# index of the word's Word2vec vector but also how many times the word appeared in the
# Google News corpus.
print(vocab.iloc[1000000:1000006])
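
# (Optional sketch, assuming gensim 3.x Vocab objects with .index and .count attributes;
# note that for vectors loaded from the binary format the count may be a rank-based
# placeholder rather than a true corpus frequency.)
print(wv.vocab['Illinois'].index, wv.vocab['Illinois'].count)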

# Retrieve the 300-dimensional vector of a word or n-gram
print(wv['Illini'])

# How far apart are Illinois and Illini?
# Euclidean distance
print(np.linalg.norm(wv['Illinois'] - wv['Illini']))
# Cosine similarity is the normalized dot product
cos_similarity = np.dot(wv['Illinois'], wv['Illini']) / (
    np.linalg.norm(wv['Illinois']) * np.linalg.norm(wv['Illini']))
print(cos_similarity)
# Cosine distance
print(1 - cos_similarity)
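
# (Optional cross-check, a sketch using gensim's built-in helpers: KeyedVectors.similarity
# returns the cosine similarity and KeyedVectors.distance returns 1 minus it, so both
# should match the manual values computed above.)
print(wv.similarity('Illinois', 'Illini'))
print(wv.distance('Illinois', 'Illini'))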

# Plot the Word2vec vectors of US city names on a two-dimensional semantic map
# US city and state data
us = pd.read_csv('xxx\\cities_us.csv')
states = pd.read_csv('xxx\\america-states.csv')
states = dict(zip(states.Abbreviation, states.State))
us['_city'] = us.city.copy()
us['st'] = us.admin1_code.copy()
us['_state'] = us.st.map(states)
print(us[us.columns[-3:]].head())

# Check which state and city names are present in the Word2vec vocabulary
vocab = np.concatenate([us._city, us.st, us._state])
vocab = np.array([word for word in vocab if word in wv.vocab])
print(vocab[:5])

# City word vectors augmented with state word vectors:
# add each state's Word2vec vector to the city's vector so the combined meaning
# disambiguates cities that share the same name
city_plus_state = []
for c, state, st in zip(us._city, us._state, us.st):
    if c not in vocab:
        continue
    row = []
    if state in vocab:
        row.extend(wv[c] + wv[state])
    else:
        row.extend(wv[c] + wv[st])
    city_plus_state.append(row)

us_300D = pd.DataFrame(city_plus_state)
print("us_300D:\n", us_300D.shape, '\n', us_300D.head())

# Use PCA to project the vectors onto a two-dimensional plot
pca = PCA(n_components=2)
# The nlpia package ships these augmented city word vectors as a cached dataset
# us_300D = get_data('cities_us_wordvectors')
us_2D = pca.fit_transform(us_300D.iloc[:, :300])
print("us_2D:\n", us_2D)

# Scatter plot of US city word vectors
# These two lines configure a CJK-capable font for matplotlib (only needed for non-ASCII labels)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Remove the plot frame
fig, ax = plt.subplots()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
# Scatter plot; alpha=0.6 sets the point opacity to 0.6
plt.scatter(us_2D[:1500, 0], us_2D[:1500, 1], alpha=0.6, c="g")
# Horizontal and vertical reference lines through the origin
plt.axhline(y=0.0, c="y", ls="--", lw=2)
plt.axvline(x=0.0, c="y", ls="--", lw=2)
plt.xlabel('x')  # x-axis label
plt.ylabel('y')  # y-axis label
plt.title("Scatter plot of US city word vectors")
plt.grid()
plt.show()
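
# (Optional illustration, assuming the `names` list built above: annotate a thinned-out
# sample of points so individual cities can be identified on the map.)
n = min(300, len(us_2D))
fig, ax = plt.subplots()
ax.scatter(us_2D[:n, 0], us_2D[:n, 1], alpha=0.6, c="g")
for i in range(0, n, 30):  # label every 30th city in the sample
    ax.annotate(names[i], (us_2D[i, 0], us_2D[i, 1]), fontsize=8)
plt.title("US city word vectors (annotated sample)")
plt.show()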

Source: blog.csdn.net/fgg1234567890/article/details/112974997