FAISS does not provide cosine distance: what to do?

Reference: https://zhuanlan.zhihu.com/p/40236865 (my final conclusion differs from that post)

Facebook's faiss is an open-source library for fast distance computation and similarity search over massive collections of vectors, but it does not provide cosine distance directly, even though cosine distance is very widely used. So how do we work around this?
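The usual workaround is to L2-normalize the vectors and then search with inner product, because for unit vectors the inner product is exactly the cosine similarity: cos(x, y) = (x·y) / (||x||·||y||) = (x/||x||)·(y/||y||). A minimal NumPy sketch of that identity (my illustration, separate from the script below):

import numpy as np

x, y = np.random.rand(100), np.random.rand(100)
cos = x @ y / (np.linalg.norm(x) * np.linalg.norm(y))
ip_normed = (x / np.linalg.norm(x)) @ (y / np.linalg.norm(y))
assert np.isclose(cos, ip_normed)  # inner product of unit vectors == cosine

The full script below compares this workaround against scikit-learn's cosine_similarity: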

import faiss
from faiss import normalize_L2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def faiss_cos_similar_search(x, k=None):
    # This does not literally compute cosine with faiss; it returns neighbors
    # that are very close to those found with cosine similarity, although the
    # distances themselves differ.
    assert len(x.shape) == 2, "only 2-D inputs are supported"
    nb, d = x.shape
    x = x.astype('float32')       # astype copies, so the caller's array is untouched
    k_search = k if k else nb
    normalize_L2(x)               # in-place L2 normalization
    index = faiss.IndexFlatIP(d)  # exact inner-product index
    index.train(x)                # a no-op for flat indexes, kept for uniformity
    # index = faiss.IndexFlatL2(d)  # the Euclidean alternative discussed below
    index.add(x)
    D, I = index.search(x, k_search)
    return I

def sklearn_cos_search(x, k=None):
    assert len(x.shape) == 2, "only 2-D inputs are supported"
    nb = x.shape[0]
    sim = cosine_similarity(x)
    k_search = k if k else nb
    # Sort each row by descending similarity and keep the first k indices
    return np.argsort(-sim, axis=1)[:, :k_search]

def test_IndexFlatIP_only(nb=1000, d=100, kr=0.005, n_times=10):
    k = int(nb * kr)
    print("recall count is %d" % k)
    for i in range(n_times):
        x = np.random.random((nb, d)).astype('float32')
        # Inner product on UNNORMALIZED vectors: not cosine at all
        index = faiss.IndexFlatIP(d)
        index.train(x)
        index.add(x)
        D, faiss_I = index.search(x, k)

        sklearn_I = sklearn_cos_search(x, k)

        cmp_result = faiss_I == sklearn_I
        print("is all correct: %s, correct batch rate: %d/%d, correct sample rate: %d/%d" %
              (np.all(cmp_result),
               np.all(cmp_result, axis=1).sum(), cmp_result.shape[0],
               cmp_result.sum(), cmp_result.shape[0] * cmp_result.shape[1]))

def test_embedding(nb=1000, d=100, kr=0.005, n_times=10):
    k = int(nb * kr)
    print("recall count is %d" % k)
    for i in range(n_times):
        # Dense float vectors, as produced by typical embedding models
        x = np.random.random((nb, d)).astype('float32')
        faiss_I = faiss_cos_similar_search(x, k)
        sklearn_I = sklearn_cos_search(x, k)

        cmp_result = faiss_I == sklearn_I
        print("is all correct: %s, correct batch rate: %d/%d, correct sample rate: %d/%d" %
              (np.all(cmp_result),
               np.all(cmp_result, axis=1).sum(), cmp_result.shape[0],
               cmp_result.sum(), cmp_result.shape[0] * cmp_result.shape[1]))

def test_one_hot(nb=1000, d=100, kr=0.005, n_times=10):
    k = int(nb * kr)
    print("recall count is %d" % k)
    for i in range(n_times):
        # Binary (multi-hot) vectors: many similarity ties are expected
        x = np.random.randint(0, 2, (nb, d))
        faiss_I = faiss_cos_similar_search(x, k)
        sklearn_I = sklearn_cos_search(x, k)

        cmp_result = faiss_I == sklearn_I
        print("is all correct: %s, correct batch rate: %d/%d, correct sample rate: %d/%d" %
              (np.all(cmp_result),
               np.all(cmp_result, axis=1).sum(), cmp_result.shape[0],
               cmp_result.sum(), cmp_result.shape[0] * cmp_result.shape[1]))
if __name__ == "__main__":
    print("test use IndexFlatIP only")
    test_IndexFlatIP_only()
    print("-" * 100 + "\n\n")
    print("test when one hot")
    test_one_hot()
    print("-" * 100 + "\n\n")
    print("test use normalize_L2 + IndexFlatIP")
    test_embedding()
    print("-" * 100 + "\n\n")

The results of running the script are discussed below.

Analysis: in the first block of output (blocks are separated by the dashed lines), using IndexFlatIP alone, the neighbors differ greatly from those obtained with cosine distance.

In the second block, with one-hot (binary) data and normalize_L2 + IndexFlatIP, the neighbors largely agree with the cosine results, but there are still many mismatches.

In the third block, with embedding-style (dense float) vectors and normalize_L2 + IndexFlatIP, the neighbors largely agree with the cosine results, with very few mismatches.

Note that preprocessing the data this way and then using inner-product (or Euclidean) search to simulate cosine distance is not strictly equivalent in practice: as the results show, the neighbor lists are similar but not identical, and the discrepancy grows as the recall count k is increased. (Part of the mismatch likely comes from float32 precision and from ties being ordered differently by the two implementations.)
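Since the element-wise comparison in the tests also counts a reordered tie as an error, an order-insensitive overlap measure can give a fairer picture. A small sketch of one (my addition; topk_overlap is a hypothetical helper, not from the original post):

import numpy as np

def topk_overlap(I_a, I_b):
    # Mean fraction of top-k neighbors shared per query, ignoring order
    hits = [len(set(a) & set(b)) for a, b in zip(I_a, I_b)]
    return np.mean(hits) / I_a.shape[1]

For example, topk_overlap(faiss_I, sklearn_I) reports 1.0 whenever the two methods retrieve the same neighbor sets, even if ties are ordered differently.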

Source: www.cnblogs.com/paiandlu/p/12123859.html