埋め込み後の元のテキストの位置をグラフィカルに表示するには、umap_plot を使用します。
1. 表示結果
2. ユーティリティ関数
import umap
import altair as alt
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
import warnings
# Ignore both Numba deprecation warning categories so they do not clutter the output.
for _numba_warning in (NumbaDeprecationWarning, NumbaPendingDeprecationWarning):
    warnings.simplefilter('ignore', category=_numba_warning)
def umap_plot(text, emb, n_neighbors=2):
    """Project embeddings to 2-D with UMAP and return an Altair scatter chart.

    Args:
        text: DataFrame with one row per embedding. It is copied, not
            modified; every one of its columns is shown in the point tooltip.
        emb: Embedding vectors, one per row of ``text``, in a form accepted
            by ``umap.UMAP.fit_transform``.
        n_neighbors: Neighborhood size passed to ``umap.UMAP``. Defaults to 2,
            the value this function used before it became configurable.

    Returns:
        An ``alt.Chart`` of circle marks positioned at the 2-D UMAP
        coordinates, sized 700x400.
    """
    # Capture the tooltip columns before the plot coordinates are added,
    # so the tooltip shows only the caller's own columns.
    tooltip_cols = list(text.columns)

    # Reduce the embeddings to the two coordinates that get plotted.
    reducer = umap.UMAP(n_neighbors=n_neighbors)
    coords = reducer.fit_transform(emb)

    # Work on a copy so the caller's DataFrame is left untouched.
    df_explore = text.copy()
    df_explore['x'] = coords[:, 0]
    df_explore['y'] = coords[:, 1]

    # zero=False lets each axis fit the data instead of forcing 0 into view.
    return (
        alt.Chart(df_explore)
        .mark_circle(size=60)
        .encode(
            x=alt.X('x', scale=alt.Scale(zero=False)),
            y=alt.Y('y', scale=alt.Scale(zero=False)),
            tooltip=tooltip_cols,
        )
        .properties(width=700, height=400)
    )
def umap_plot_big(text, emb, n_neighbors=100):
    """Project embeddings to 2-D with UMAP using a large neighborhood.

    Builds the same kind of scatter chart as a small-neighborhood UMAP plot,
    but with a default ``n_neighbors`` of 100.

    Args:
        text: DataFrame with one row per embedding. It is copied, not
            modified; every one of its columns is shown in the point tooltip.
        emb: Embedding vectors, one per row of ``text``, in a form accepted
            by ``umap.UMAP.fit_transform``.
        n_neighbors: Neighborhood size passed to ``umap.UMAP``. Defaults to
            100, the value this function used before it became configurable.

    Returns:
        An ``alt.Chart`` of circle marks positioned at the 2-D UMAP
        coordinates, sized 700x400.
    """
    # Capture the tooltip columns before the plot coordinates are added,
    # so the tooltip shows only the caller's own columns.
    tooltip_cols = list(text.columns)

    # Reduce the embeddings to the two coordinates that get plotted.
    reducer = umap.UMAP(n_neighbors=n_neighbors)
    coords = reducer.fit_transform(emb)

    # Work on a copy so the caller's DataFrame is left untouched.
    df_explore = text.copy()
    df_explore['x'] = coords[:, 0]
    df_explore['y'] = coords[:, 1]

    # zero=False lets each axis fit the data instead of forcing 0 into view.
    return (
        alt.Chart(df_explore)
        .mark_circle(size=60)
        .encode(
            x=alt.X('x', scale=alt.Scale(zero=False)),
            y=alt.Y('y', scale=alt.Scale(zero=False)),
            tooltip=tooltip_cols,
        )
        .properties(width=700, height=400)
    )
3. サンプルコード1
依存ライブラリをインストールし、
pip install cohere umap-learn altair datasets
環境変数をロードし、
import os
from dotenv import find_dotenv, load_dotenv

# Find the local .env file and read it.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)
cohereクライアントを作成し、
import cohere

# Read the API key from the environment; a missing COHERE_API_KEY raises KeyError.
api_key = os.environ['COHERE_API_KEY']
co = cohere.Client(api_key)
サンプルデータを設定し、
import pandas as pd

# Sample data: question/answer sentences held in a single 'text' column.
sentences = pd.DataFrame(
    [
        'Where is the world cup?',
        'The world cup is in Qatar',
        'What color is the sky?',
        'The sky is blue',
        'Where does the bear live?',
        'The bear lives in the the woods',
        'What is an apple?',
        'An apple is a fruit',
    ],
    columns=['text'],
)
テキストをベクトル化(埋め込み)し、
# Request embeddings for every sentence in the 'text' column.
response = co.embed(
    texts=list(sentences['text']),
    model='embed-english-v2.0',
)
emb = response.embeddings

# Preview the first 3 values of each embedding.
for vector in emb:
    print(vector[:3])
UMAP でグラフィカルに表示します。
# Build the 2-D UMAP scatter chart for the sample sentences and their embeddings.
chart = umap_plot(sentences, emb)
# Bare last expression: presumably renders the interactive chart when run as a notebook cell.
chart.interactive()
結果は以下の通りです。
4. サンプルコード2
データをダウンロードして読み込み、
import pandas as pd
# Load the Wikipedia articles data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load
# 'wikipedia.pkl' if it comes from a trusted source.
wiki_articles = pd.read_pickle('wikipedia.pkl')
# Bare expression: presumably displays the loaded data when run as a notebook cell.
wiki_articles
UMAP でグラフィカルに表示します。
import numpy as np

# Keep only the columns passed on to the chart.
articles = wiki_articles[['title', 'text']]
# Collect the per-article values of the 'emb' column into a single array.
embeds = np.array(list(wiki_articles['emb']))
chart = umap_plot_big(articles, embeds)
chart.interactive()
結果は以下の通りです。
終わりました!