Reference: https://zhuanlan.zhihu.com/p/40236865 (note that this post reaches a different conclusion from that article).
Faiss is Facebook's open-source library for fast distance computation over massive sets of vectors. It does not provide a cosine distance directly, yet cosine distance is still very widely used, so how can this be solved?
import faiss
from faiss import normalize_L2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def faiss_cos_similar_search(x, k=None):
    """Find each row's nearest rows via faiss inner product on L2-normalized data.

    Every row of ``x`` is scaled to unit L2 norm and the rows are then
    searched against themselves with ``faiss.IndexFlatIP``.

    Author's note (translated): this is not literally "computing cosine with
    faiss"; the neighbors found are very close to the ones obtained from a
    cosine-similarity ranking, but the results are not guaranteed identical.

    Args:
        x: 2-D array of shape ``(nb, d)``; it is copied, never modified.
        k: number of neighbors to return per row. A falsy value (``None`` or
            ``0``) means "all ``nb`` rows".

    Returns:
        Integer array of shape ``(nb, k)`` holding, per query row, the indices
        of the retrieved rows as returned by ``index.search``.

    Raises:
        AssertionError: if ``x`` is not 2-dimensional.
    """
    assert len(x.shape) == 2, "仅支持2维向量的距离计算"
    nb, d = x.shape
    # normalize_L2 rewrites its argument in place and (per the faiss Python
    # bindings) needs a C-contiguous float32 buffer. ``astype`` alone keeps
    # the input's memory layout, so Fortran-ordered / transposed inputs would
    # be rejected; force a fresh C-ordered copy, which also guarantees the
    # caller's array is left untouched.
    x = np.array(x, dtype='float32', order='C')
    k_search = k if k else nb
    normalize_L2(x)
    # A flat index stores raw vectors, so no training step is needed before add().
    index = faiss.IndexFlatIP(d)
    index.add(x)
    _, neighbors = index.search(x, k=k_search)
    return neighbors
def sklearn_cos_search(x, k=None):
    """Rank every row's neighbors by sklearn cosine similarity (reference result).

    Args:
        x: 2-D array of shape ``(nb, d)``.
        k: number of neighbors to return per row. A falsy value (``None`` or
            ``0``) means "all ``nb`` rows".

    Returns:
        Integer array of shape ``(nb, k)``: for each row, the indices of the
        rows sorted by decreasing cosine similarity, truncated to ``k``.

    Raises:
        AssertionError: if ``x`` is not 2-dimensional.
    """
    assert len(x.shape) == 2, "仅支持2维向量的距离计算"
    nb = x.shape[0]
    similarity = cosine_similarity(x)
    k_search = k if k else nb
    # argsort is ascending, so sort the negated similarities to get the most
    # similar rows first. (The original sorted twice and threw the first
    # result away; one sort is enough.)
    return np.argsort(-similarity, axis=1)[:, :k_search]
def test_IndexFlatIP_only(nb=1000, d=100, kr=0.005, n_times=10):
    """Compare plain IndexFlatIP neighbors (no normalization) with the sklearn ranking.

    For each of ``n_times`` trials a random float32 matrix of shape
    ``(nb, d)`` is drawn, searched against itself with ``faiss.IndexFlatIP``
    and compared element-wise with ``sklearn_cos_search``. One summary line
    is printed per trial.

    Args:
        nb: number of vectors per trial.
        d: vector dimensionality.
        kr: fraction of ``nb`` used as the neighbor count ``k``.
        n_times: number of random trials.
    """
    k = int(nb * kr)
    print("recall count is %d" % (k))
    report = "is all correct: %s, correct batch rate: %d/%d, correct sample rate: %d/%d"
    for _ in range(n_times):
        x = np.random.random((nb, d)).astype('float32')
        # Raw inner-product search: the vectors are deliberately NOT normalized here.
        ip_index = faiss.IndexFlatIP(d)
        ip_index.train(x)
        ip_index.add(x)
        _dist, faiss_I = ip_index.search(x, k=k)
        sklearn_I = sklearn_cos_search(x, k)
        matches = faiss_I == sklearn_I
        everything_matches = np.all(matches)
        # Rows whose whole neighbor list is identical in both results.
        rows_matching = np.all(matches, axis=1).sum()
        n_rows = matches.shape[0]
        # Individual (row, rank) positions that agree.
        cells_matching = matches.sum()
        n_cells = matches.shape[0] * matches.shape[1]
        print(report % (everything_matches, rows_matching, n_rows,
                        cells_matching, n_cells))
def test_embedding(nb=1000, d=100, kr=0.005, n_times=10):
    """Compare normalize_L2 + IndexFlatIP with the sklearn ranking on dense random vectors.

    For each of ``n_times`` trials a random float32 matrix of shape
    ``(nb, d)`` is drawn and the neighbor indices from
    ``faiss_cos_similar_search`` are compared element-wise with those from
    ``sklearn_cos_search``. One summary line is printed per trial.

    Args:
        nb: number of vectors per trial.
        d: vector dimensionality.
        kr: fraction of ``nb`` used as the neighbor count ``k``.
        n_times: number of random trials.
    """
    k = int(nb * kr)
    print("recall count is %d" % (k))
    report = "is all correct: %s, correct batch rate: %d/%d, correct sample rate: %d/%d"
    for _ in range(n_times):
        x = np.random.random((nb, d)).astype('float32')
        faiss_I = faiss_cos_similar_search(x, k)
        sklearn_I = sklearn_cos_search(x, k)
        matches = faiss_I == sklearn_I
        everything_matches = np.all(matches)
        # Rows whose whole neighbor list is identical in both results.
        rows_matching = np.all(matches, axis=1).sum()
        n_rows = matches.shape[0]
        # Individual (row, rank) positions that agree.
        cells_matching = matches.sum()
        n_cells = matches.shape[0] * matches.shape[1]
        print(report % (everything_matches, rows_matching, n_rows,
                        cells_matching, n_cells))
def test_one_hot(nb=1000, d=100, kr=0.005, n_times=10):
    """Compare normalize_L2 + IndexFlatIP with the sklearn ranking on 0/1 vectors.

    For each of ``n_times`` trials a random integer matrix of shape
    ``(nb, d)`` with entries in {0, 1} is drawn (so rows are random binary
    vectors rather than strictly one-hot) and the neighbor indices from
    ``faiss_cos_similar_search`` are compared element-wise with those from
    ``sklearn_cos_search``. One summary line is printed per trial.

    Args:
        nb: number of vectors per trial.
        d: vector dimensionality.
        kr: fraction of ``nb`` used as the neighbor count ``k``.
        n_times: number of random trials.
    """
    k = int(nb * kr)
    print("recall count is %d" % (k))
    report = "is all correct: %s, correct batch rate: %d/%d, correct sample rate: %d/%d"
    for _ in range(n_times):
        x = np.random.randint(0, 2, (nb, d))
        faiss_I = faiss_cos_similar_search(x, k)
        sklearn_I = sklearn_cos_search(x, k)
        matches = faiss_I == sklearn_I
        everything_matches = np.all(matches)
        # Rows whose whole neighbor list is identical in both results.
        rows_matching = np.all(matches, axis=1).sum()
        n_rows = matches.shape[0]
        # Individual (row, rank) positions that agree.
        cells_matching = matches.sum()
        n_cells = matches.shape[0] * matches.shape[1]
        print(report % (everything_matches, rows_matching, n_rows,
                        cells_matching, n_cells))
if __name__ == "__main__":
    # Run the three experiments in order, each preceded by its title and
    # followed by a dashed separator line plus two blank lines.
    separator = "-"*100 + "\n\n"
    experiments = (
        ("test use IndexFlatIP only", test_IndexFlatIP_only),
        ("test when one hot", test_one_hot),
        ("test use normalize_L2 + IndexFlatIP", test_embedding),
    )
    for title, run_experiment in experiments:
        print(title)
        run_experiment()
        print(separator)
The results are as follows.
Analysis: In the first block of results (blocks are separated by dashed lines), only IndexFlatIP is used, and the results differ greatly from those of cosine distance.
In the second block, the data is one-hot and normalize_L2 + IndexFlatIP is used; the results largely agree with those of cosine distance, but there are still many mismatches.
In the third block, the data are embedding vectors and normalize_L2 + IndexFlatIP is used; the results largely agree with those of cosine distance, with very few mismatches.
Note that preprocessing the data in this way and then using the Euclidean distance to simulate a cosine distance is not exactly equivalent: judging from the results, the two are similar but still differ in places, and the difference becomes larger as the recall count is increased.