1. Cosine similarity
For an accelerated computation of cosine similarity, refer to this article.
from math import *
def square_rooted(x):
    """Return the Euclidean (L2) norm of ``x``, rounded to three decimals.

    Args:
        x: Iterable of numbers.

    Returns:
        The square root of the sum of the squared elements, rounded to
        3 decimal places.
    """
    sum_of_squares = sum(value * value for value in x)
    magnitude = sqrt(sum_of_squares)
    return round(magnitude, 3)
def cosine_similarity(x, y):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    The similarity is ``dot(x, y) / (|x| * |y|)``, rounded to 3 decimals.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers; elements are paired with ``x`` via ``zip``.

    Returns:
        The cosine similarity rounded to 3 decimal places.

    Raises:
        ZeroDivisionError: If either vector has zero magnitude, in which
            case the similarity is undefined.
    """
    numerator = sum(a * b for a, b in zip(x, y))
    # Compute the norms at full precision. Rounding each norm to 3 decimals
    # before multiplying turns small-magnitude vectors such as
    # [0.0001, 0.0002] into a 0.0 denominator and loses accuracy in general;
    # only the final result is rounded.
    norm_x = sqrt(sum(a * a for a in x))
    norm_y = sqrt(sum(b * b for b in y))
    return round(numerator / float(norm_x * norm_y), 3)
# Demo: [1, 0] and [0, 1] are orthogonal, so their dot product is 0.
res = cosine_similarity([1, 0], [0,1])
print('==res:', res)
2. Euclidean distance
from math import *
def euclidean_distance(x, y):
    """Return the Euclidean (straight-line) distance between ``x`` and ``y``.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers; elements are paired with ``x`` via ``zip``.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    squared_gaps = (pow(u - v, 2) for u, v in zip(x, y))
    return sqrt(sum(squared_gaps))
# Demo: distance between the points (0, 1) and (1, 0).
res = euclidean_distance([0, 1], [1, 0])
print('res:', res)
3. Manhattan distance
from math import *
def manhattan_distance(x, y):
    """Return the Manhattan (L1, city-block) distance between ``x`` and ``y``.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers; elements are paired with ``x`` via ``zip``.

    Returns:
        The sum of the absolute coordinate differences.
    """
    absolute_gaps = (abs(u - v) for u, v in zip(x, y))
    return sum(absolute_gaps)
# Demo: |1 - 0| + |0 - 1| = 2.
print(manhattan_distance([1, 0], [0, 1]))
4. Hamming distance
The Hamming distance is the number of positions at which the corresponding characters of two equal-length strings differ.
def hamming_distance(s1, s2):
    """Count the positions at which two equal-length sequences differ.

    Args:
        s1: First sequence (for example a string).
        s2: Second sequence, of the same length as ``s1``.

    Returns:
        The number of positions whose elements are not equal.

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    same_length = len(s1) == len(s2)
    if not same_length:
        raise ValueError("Undefined for sequences of unequal length")
    mismatches = (left != right for left, right in zip(s1, s2))
    return sum(mismatches)
# Demo: '12' and '13' differ only in the second character.
res = hamming_distance('12','13')
print('res:', res)
5. Chebyshev distance
The Chebyshev distance originates from the king's move in chess. The king can move one step at a time to any of the 8 surrounding squares. How many steps does it need, at minimum, to walk from square A (x1, y1) to square B (x2, y2)? The minimum number of steps is always max(|x2-x1|, |y2-y1|). The distance measure defined this way is called the Chebyshev distance.
def chebyshev_distance(p, q):
    """Return the Chebyshev (L-infinity) distance between ``p`` and ``q``.

    Args:
        p: Sequence of numbers.
        q: Sequence of numbers with the same length as ``p``.

    Returns:
        The largest absolute difference over all coordinates.

    Raises:
        AssertionError: If ``p`` and ``q`` have different lengths.
        ValueError: If both sequences are empty (``max`` of no values).
    """
    assert len(p) == len(q)
    coordinate_gaps = (abs(a - b) for a, b in zip(p, q))
    return max(coordinate_gaps)
# Demo: the largest per-coordinate gap between (0, 0) and (1, 3) is 3.
res = chebyshev_distance([0,0], [1,3])
print('res:', res)
6. Canberra distance
def canberra_distance(p, q):
    """Return the Canberra distance between ``p`` and ``q``.

    Each coordinate contributes ``|p_i - q_i| / (|p_i| + |q_i|)``. A
    coordinate where both values are zero contributes nothing, which avoids
    a 0/0 division.

    Args:
        p: Indexable sequence of numbers; its length sets the number of
            coordinates that are compared.
        q: Indexable sequence of numbers, read at the same indices as ``p``.

    Returns:
        The sum of the per-coordinate contributions.
    """
    total = 0
    for idx in range(len(p)):
        p_val, q_val = p[idx], q[idx]
        if p_val == 0 and q_val == 0:
            # Both values are zero: skip to avoid dividing by zero.
            continue
        total += abs(p_val - q_val) / (abs(p_val) + abs(q_val))
    return total
# Demo: each of the two coordinates contributes |1 - 0| / (1 + 0) = 1.
res = canberra_distance([1,0], [0,1])
print('res:', res)
7. Minkowski distance
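The Minkowski distance of order p generalizes the previous measures: p=2 gives the Euclidean distance and p=1 gives the Manhattan distance. In the limit where p goes to infinity, it becomes the Chebyshev distance. (In the code below the order is the parameter `n`.)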
def minkowski_distance(p, q, n):
    """Return the Minkowski distance of order ``n`` between ``p`` and ``q``.

    ``n=1`` gives the Manhattan distance and ``n=2`` the Euclidean distance.
    ``n=float('inf')`` returns the Chebyshev distance, i.e. the largest
    absolute coordinate difference.

    Args:
        p: Sequence of numbers.
        q: Sequence of numbers with the same length as ``p``.
        n: Order of the distance (a non-zero number, or ``float('inf')``).

    Returns:
        The distance as a float.

    Raises:
        AssertionError: If ``p`` and ``q`` have different lengths. Note that
            assertions are skipped when Python runs with ``-O``.
        ZeroDivisionError: If ``n`` is 0.
    """
    assert len(p) == len(q)
    gaps = [abs(x - y) for x, y in zip(p, q)]
    if n == float('inf'):
        # The generic formula cannot be used for infinite order: it would
        # evaluate ``total ** (1 / inf)``, i.e. ``total ** 0.0``, which is
        # 1.0 instead of the limit value max(gaps).
        return float(max(gaps, default=0))
    return sum(gap ** n for gap in gaps) ** (1. / n)
# Demo: order n=2 between the points (1, 0) and (0, 1).
res = minkowski_distance([1, 0], [0, 1], n=2.)
print('res:', res)
8. Edit distance
Edit distance, also known as Levenshtein distance, is the minimum number of edit operations required to transform one string into the other. The larger the distance, the more the two strings differ. The permitted edit operations are replacing one character with another, inserting a character, and deleting a character.
Method 1: Use an existing package
import Levenshtein
# Demo inputs: two short strings that have no character in common.
texta = '者記聞新'
textb = '浪(第'
# Uses the third-party Levenshtein package imported above.
print(Levenshtein.distance(texta, textb))
Method 2: dynamic programming
import os
import numpy as np
def edit_distance(S1, S2):
    """Compute the Levenshtein (edit) distance between ``S1`` and ``S2``.

    Uses a dynamic-programming table in which the columns follow ``S1`` and
    the rows follow ``S2``. Insertions, deletions and substitutions each
    cost 1; matching characters cost 0.

    Args:
        S1: First sequence (for example a string).
        S2: Second sequence.

    Returns:
        The minimum number of single-element edits that turn one sequence
        into the other. The value is also printed as ``dis: <value>``.
    """
    n_cols = len(S1) + 1
    n_rows = len(S2) + 1
    # table[r][c] holds the edit distance between S2[:r] and S1[:c].
    table = [[0] * n_cols for _ in range(n_rows)]

    # First column / first row: distance from or to the empty prefix.
    for r in range(1, n_rows):
        table[r][0] = r
    for c in range(1, n_cols):
        table[0][c] = c

    for r, ch2 in enumerate(S2, start=1):
        for c, ch1 in enumerate(S1, start=1):
            # Equal characters cost nothing; differing ones cost 1.
            substitution_cost = 0 if ch2 == ch1 else 1
            table[r][c] = min(
                table[r - 1][c - 1] + substitution_cost,
                table[r][c - 1] + 1,
                table[r - 1][c] + 1,
            )

    dis = table[-1][-1]
    print('dis:', dis)
    return dis
# Alternative ASCII example inputs:
# S1 = 'iva1'
# S2 = 'iva'
# Demo inputs: two short strings that have no character in common.
S2 = '者記聞新'
S1 = '浪(第'
dis = edit_distance(S1, S2)
print('dis:', dis)
9. Jaccard similarity
def jaccard_sim(a, b):
    """Return the Jaccard similarity of the element sets of ``a`` and ``b``.

    The similarity is ``|A & B| / |A | B|`` where ``A = set(a)`` and
    ``B = set(b)``; duplicates in the inputs are ignored.

    Args:
        a: Iterable of hashable items.
        b: Iterable of hashable items.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs are treated as
        identical and yield 1.0.
    """
    set_a, set_b = set(a), set(b)
    union_size = len(set_a | set_b)
    if not union_size:
        # Both inputs are empty: the ratio would be 0/0. Follow the common
        # convention that two empty sets are identical instead of raising
        # ZeroDivisionError.
        return 1.0
    return len(set_a & set_b) / union_size
# Demo: as sets the inputs are {'1', '0'} and {'1'}.
a = ['1', '0']
b = ['1', '1', '1']
res = jaccard_sim(a, b)
print('res:', res)
10. Dice coefficient
def dice_coefficient(a, b):
    """Return the Dice coefficient of the element sets of ``a`` and ``b``.

    The coefficient is ``2 * |A & B| / (|A| + |B|)`` where ``A = set(a)``
    and ``B = set(b)``; duplicates in the inputs are ignored.

    Args:
        a: Iterable of hashable items.
        b: Iterable of hashable items.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs are treated as
        identical and yield 1.0.
    """
    set_a, set_b = set(a), set(b)
    total_size = len(set_a) + len(set_b)
    if not total_size:
        # Both inputs are empty: the ratio would be 0/0. Treat two empty
        # sets as identical instead of raising ZeroDivisionError.
        return 1.0
    return len(set_a & set_b) * 2.0 / total_size
# Demo: both inputs contain the same elements, 0 and 1.
res = dice_coefficient(a = [1, 0], b =[0, 1])
print('===res:',res)