1.相关性
先写几个相关性的基础算法和相应的api吧
欧式距离
from scipy import spatial
import math


def d_euclidean(*args):
    """Euclidean distance from the first vector to each of the others.

    args: two or more equal-length numeric vectors; args[0] is the
    reference every other vector is compared against.

    Returns a list of distances, one per non-reference vector, in input
    order.  Returns -1 when no comparison vector was supplied (original
    contract kept for backward compatibility).

    Formula: d(x, y) = sqrt((y1-x1)^2 + (y2-x2)^2 + ...), which is
    exactly what math.dist computes.
    """
    base_vector = args[0]
    # Skip by identity (not equality), matching the original behavior:
    # a vector equal to the base but a different object is still compared.
    d_values = [math.dist(base_vector, other)
                for other in args
                if other is not base_vector]
    return d_values if d_values else -1


vector1 = [0, 1, 2, 3]
vector2 = [1, 2, 3, 4]
vector3 = [4, 3, 2, 1]
print(d_euclidean(vector1, vector2, vector3))  # [2.0, 4.898979485566356]
# scipy gives the same pairwise results:
print(spatial.distance.euclidean(vector1, vector2))  # 2.0
print(spatial.distance.euclidean(vector1, vector3))  # 4.898979485566356
曼哈顿距离
#曼哈顿距离
import numpy as np from scipy.spatial.distance import pdist
def d_manhattan(*args):
    """Manhattan (city-block) distance from the first vector to the others.

    The distance is simply the sum of the absolute coordinate-wise
    differences: d(x, y) = sum(|y_i - x_i|).

    args: two or more equal-length numeric vectors; args[0] is the
    reference vector.

    Returns a list with one distance per non-reference vector, or -1
    when no comparison vector was supplied (original contract kept).
    """
    base_vector = args[0]
    # Identity check matches the original: only the exact base object
    # is skipped, not vectors that merely compare equal to it.
    d_values = [
        sum(abs(b - a) for a, b in zip(base_vector, other))
        for other in args
        if other is not base_vector
    ]
    return d_values if d_values else -1


vector1 = [0, 1, 2, 3]
vector2 = [1, 2, 3, 4]
vector3 = [4, 3, 2, 1]
print(d_manhattan(vector1, vector2, vector3))  # [4, 8]
print(d_manhattan(vector2, vector3))  # [8]
# pdist returns every pairwise distance as an ndarray: [4. 8. 8.]
print(pdist(np.vstack([vector1, vector2, vector3]), 'cityblock'))
余弦相似度
# Cosine distance (1 - cosine similarity)
import numpy as np
from scipy.spatial.distance import pdist


def d_cos(*args):
    """Cosine distance from the first vector to each of the others.

    Cosine distance = 1 - (x . y) / (|x| * |y|), so identical directions
    give 0 and opposite directions give 2.

    args: two or more equal-length numeric vectors; args[0] is the
    reference vector.

    Returns a list with one distance per remaining vector, or -1 when
    only the reference vector was supplied (original contract kept).
    """
    base = np.asarray(args[0], dtype=float)
    # Squared norm of the base is loop-invariant; hoist it out.
    base_sq = np.dot(base, base)
    d_values = []
    for raw in args[1:]:
        other = np.asarray(raw, dtype=float)
        sim = np.dot(base, other) / np.sqrt(base_sq * np.dot(other, other))
        d_values.append(1 - sim)
    return d_values if d_values else -1


vector1 = [0, 1, 2, 3]
vector2 = [1, 2, 3, 4]
vector3 = [4, 3, 2, 1]
print(d_cos(vector1, vector2, vector3))  # [0.024099927051466796, 0.5120499635257334]
print(d_cos(vector2, vector3))  # [0.33333333333333337]
# pdist gives the same pairwise values: [0.02409993 0.51204996 0.33333333]
print(pdist(np.vstack([vector1, vector2, vector3]), 'cosine'))
皮尔森相关系数
import numpy as np
from scipy.stats import pearsonr


def get_pearsonr(b, o):
    """Pearson correlation coefficient between vectors b and o.

    Uses the computational formula
        r = (sum(b*o) - sum(b)*sum(o)/n)
            / (sqrt(sum(b^2) - sum(b)^2/n) * sqrt(sum(o^2) - sum(o)^2/n))

    Returns 0 when either vector has zero variance (the denominator is
    zero), matching the original guard.
    """
    b = np.asarray(b, dtype=float)
    o = np.asarray(o, dtype=float)
    n = len(b)
    numerator = np.dot(b, o) - b.sum() * o.sum() / n
    denominator = (np.sqrt(np.square(b).sum() - np.square(b.sum()) / n)
                   * np.sqrt(np.square(o).sum() - np.square(o.sum()) / n))
    if denominator:
        return numerator / denominator
    return 0


def d_pearsonr(*args):
    """Pearson correlation of each later vector with the first one.

    Was an unimplemented stub; filled in to mirror d_euclidean /
    d_manhattan: args[0] is the reference vector, and the return value
    is a list with one coefficient per remaining vector, or -1 when
    only the reference vector was supplied.
    """
    base = args[0]
    values = [get_pearsonr(base, other) for other in args[1:]]
    return values if values else -1


vector1 = np.random.normal(1, 100, 50)
vector2 = np.random.normal(2, 10, 50)
vector3 = [4, 3, 2, 1]
# Random inputs, so the printed value differs on every run.
print(get_pearsonr(vector1, vector2))
# scipy's pearsonr returns (coefficient, p-value); the coefficient should
# agree with get_pearsonr up to floating-point rounding — the two formulas
# are algebraically equal but accumulate error differently.
print(pearsonr(vector1, vector2))