In statistics and related fields, a similarity measure (or similarity function) is a real-valued function that quantifies the similarity between two objects. In short, it measures how alike two data objects are [1].
# Load the Python libraries
from math import *
from decimal import Decimal
from scipy import stats as ss
import sklearn.metrics.pairwise as sm
import math
# (1) Euclidean distance function
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two numeric iterables.

    Coordinates are paired positionally with ``zip``, so if the inputs
    differ in length the surplus elements of the longer one are ignored.
    """
    squared_gaps = [pow(u - v, 2) for u, v in zip(x, y)]
    return sqrt(sum(squared_gaps))
# (2) Manhattan distance function
def manhattan_distance(x, y):
    """Return the Manhattan (L1) distance between two numeric iterables.

    The result is the sum of absolute coordinate differences. Coordinates
    are paired with ``zip``, so extra elements of a longer input are ignored.
    """
    total = 0
    for u, v in zip(x, y):
        total += abs(u - v)
    return total
# (3) Minkowski distance function
def _nth_root(value, n_root):
    """Return ``value ** (1 / n_root)`` as a Decimal rounded to 3 places."""
    exponent = Decimal(1 / float(n_root))
    return round(Decimal(value) ** exponent, 3)


def minkowski_distance(x, y, p=3):
    """Return the Minkowski distance of order ``p`` between ``x`` and ``y``.

    The p-th powers of the absolute coordinate differences are summed and
    the p-th root of that sum is taken with Decimal arithmetic. The result
    is rounded to three decimal places and returned as a float.
    """
    powered_gaps = [pow(abs(u - v), p) for u, v in zip(x, y)]
    return float(_nth_root(sum(powered_gaps), p))
# (4) Cosine similarity function
def _square_rooted(x):
    """Return the Euclidean norm of ``x`` at full float precision.

    The norm is deliberately NOT rounded here: rounding it before the
    division in ``cosine_similarity`` distorts the ratio, and for vectors
    with a norm below 0.0005 it collapses the denominator to zero.
    """
    return math.sqrt(sum(a * a for a in x))


def cosine_similarity(x, y):
    """Return the cosine similarity of ``x`` and ``y``, rounded to 3 places.

    Computed as the dot product divided by the product of the two
    Euclidean norms. Only the final result is rounded.

    Raises:
        ZeroDivisionError: if either vector has a zero norm.
    """
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = _square_rooted(x) * _square_rooted(y)
    return round(numerator / denominator, 3)
# (5) Pearson similarity function
def _avg(x):
    """Return the arithmetic mean of the non-empty sequence ``x`` as a float.

    Raises:
        ValueError: if ``x`` is empty.
    """
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if len(x) == 0:
        raise ValueError("cannot average an empty sequence")
    return float(sum(x)) / len(x)


def pearson_similarity(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Computed as the sum of products of deviations from the means, divided
    by the square root of the product of the summed squared deviations.

    Raises:
        ValueError: if the inputs differ in length or are empty.
        ZeroDivisionError: if either input has zero variance (all of its
            values are equal), since the denominator is then zero.
    """
    # Validation is raised explicitly: ``assert`` is stripped under ``-O``,
    # after which a length mismatch could silently give a wrong answer.
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    if len(x) == 0:
        raise ValueError("x and y must not be empty")

    avg_x = _avg(x)
    avg_y = _avg(y)
    diffprod = 0
    xdiff2 = 0
    ydiff2 = 0
    for x_i, y_i in zip(x, y):
        xdiff = x_i - avg_x
        ydiff = y_i - avg_y
        diffprod += xdiff * ydiff
        xdiff2 += xdiff * xdiff
        ydiff2 += ydiff * ydiff
    return diffprod / math.sqrt(xdiff2 * ydiff2)
# (6) Jaccard similarity function
def jaccard_similarity(x, y):
    """Return the Jaccard similarity of the distinct elements of ``x`` and ``y``.

    This is the size of the intersection divided by the size of the union.
    Each input is converted to a set exactly once, so one-shot iterables
    (generators, iterators) work as well as lists.

    Raises:
        ZeroDivisionError: if both inputs are empty (the union is empty).
    """
    set_x = set(x)
    set_y = set(y)
    intersection_cardinality = len(set_x & set_y)
    union_cardinality = len(set_x | set_y)
    return intersection_cardinality / float(union_cardinality)
# Vectors
# Sample feature vectors, one per player.
x = [-4.593481, -5.478033, 1.127111, 1.252885, -2.286953]  # Messi
y = [-4.080334, -3.406618, 4.334073, -0.485612, -2.817897]  # CR
z = [-4.048185, -5.546171, 0.505673, 0.616553, -1.730906]  # Neymar

# Each call below is followed by the value recorded when it was run.
euclidean_distance(x, y)  # 4.259455195846412
euclidean_distance(x, z)  # 1.1841800466723797

manhattan_distance(x, y)  # 8.060965
manhattan_distance(x, z)  # 2.4272509999999996

minkowski_distance(x, y)  # 3.619
minkowski_distance(x, z)  # 0.941

cosine_similarity(x, y)  # 0.842
cosine_similarity(x, z)  # 0.99

pearson_similarity(x, y)  # 0.8214001476231276
pearson_similarity(x, z)  # 0.9888645775446726

# Jaccard similarity works on sets of items, so it gets its own inputs.
a = [0, 1, 2, 3, 4, 5]
b = [-1, 1, 2, 0, 3, 5]
jaccard_similarity(a, b)  # 0.7142857142857143
# Cross-check against scikit-learn and SciPy. The sklearn pairwise functions
# are given one row each here, so the single result is read with
# ``corr[0, 0]``. Calling ``float()`` on the 1-element row ``corr[0]``
# is deprecated in recent NumPy releases.
corr = sm.euclidean_distances([x], [y])
float(corr[0, 0])  # 4.259455195846413

corr = sm.manhattan_distances([x], [y])
float(corr[0, 0])  # 8.060965

corr = sm.cosine_similarity([x], [y])
float(corr[0, 0])  # 0.841904969009294

# pearsonr returns the correlation together with its p-value.
corr, p_value = ss.pearsonr(x, y)
corr  # 0.8214001476231275
[1] Wikipedia - Similarity measure.