Packages Installation
conda config --set pip_interop_enabled True
pip install similaritymeasures
Methods:
# Import libraries
import pandas as pd
import numpy as np
import math
import similaritymeasures
from collections import Counter
# Load visualization libraries
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
# Reading data from CSV file
# The path is relative, so it is resolved against the current working directory.
dataURL = "../data/country_cases_by_days.csv"
raw_data = pd.read_csv(dataURL)
# Show first 10 rows of header dataframe
# (bare expression: its value is only rendered in an interactive/notebook session)
raw_data.head(10)
# Control variables
top_country = 10  # size of the "top N" list taken from each similarity method
n_days = 90       # number of daily observations compared per country
n_methods = 6     # number of similarity methods that cast votes into cnt
cnt = Counter()   # country -> number of top-N lists it appeared in

# Analysis variables
exp_country = 'Colombia'
countries_region = {}
countries_data = {}
# Column 0 holds the day index, column 1 the value for that day.
exp_data = np.zeros((n_days, 2))
# Split data between expected and reality datasets.
# Every CSV row is one country. Columns "1".."n_days" are read in order and
# the series stops at the first missing value.
day_columns = [str(day) for day in range(1, n_days + 1)]
day_axis = np.arange(n_days)
exp_found = False

for _, row in raw_data.iterrows():
    country_name = row["country"]
    countries_region[country_name] = row["region"]

    # Collect the leading run of non-missing daily values.
    y = []
    for column in day_columns:
        value = row[column]
        if pd.isna(value):
            break
        y.append(int(value))

    if country_name == exp_country:
        # The reference curve must be complete; fail with a readable message
        # instead of NumPy's shape-broadcast error.
        if len(y) != n_days:
            raise ValueError(
                'Reference country %s has only %d of the %d required days'
                % (exp_country, len(y), n_days)
            )
        exp_data[:, 0] = day_axis
        exp_data[:, 1] = y
        exp_found = True
    elif len(y) == n_days:
        # Only countries with a full n_days series are comparable.
        temp_data = np.zeros((n_days, 2))
        temp_data[:, 0] = day_axis
        temp_data[:, 1] = y
        countries_data[country_name] = temp_data

# Without this check a missing reference country would leave exp_data as all
# zeros and every similarity score below would be silently meaningless.
if not exp_found:
    raise ValueError(
        'Reference country %s was not found in the input data' % exp_country
    )
# Quantify the difference between the two curves using Mean Absolute Error (MAE)
mae_result = {}
for country, num_data in countries_data.items():
    # Vectorized |expected - observed| over the value column, averaged over
    # n_days (replaces the element-by-element Python accumulation loop).
    mae_result[country] = np.abs(exp_data[:, 1] - num_data[:, 1]).sum() / n_days
# Get and save top N (smallest error first)
output = sorted(mae_result, key=mae_result.get)[:top_country]
cnt.update(output)
for country in output:
    print('%s: %s' % (country, mae_result[country]))
# Quantify the difference between the two curves using PCM:
# score every candidate country's curve against the reference curve.
pcm_result = {
    name: similaritymeasures.pcm(exp_data, curve)
    for name, curve in countries_data.items()
}
# Get and save top N: smallest scores first, one vote in cnt per listed country
output = sorted(pcm_result, key=pcm_result.get)[:top_country]
cnt.update(output)
for country in output:
    print('%s: %s' % (country, pcm_result[country]))
# Quantify the difference between the two curves using Discrete Frechet distance:
# score every candidate country's curve against the reference curve.
df_result = {
    name: similaritymeasures.frechet_dist(exp_data, curve)
    for name, curve in countries_data.items()
}
# Get and save top N: smallest scores first, one vote in cnt per listed country
output = sorted(df_result, key=df_result.get)[:top_country]
cnt.update(output)
for country in output:
    print('%s: %s' % (country, df_result[country]))
# Quantify the difference between the two curves using Area between two Curves:
# score every candidate country's curve against the reference curve.
area_result = {
    name: similaritymeasures.area_between_two_curves(exp_data, curve)
    for name, curve in countries_data.items()
}
# Get and save top N: smallest scores first, one vote in cnt per listed country
output = sorted(area_result, key=area_result.get)[:top_country]
cnt.update(output)
for country in output:
    print('%s: %s' % (country, area_result[country]))
# Quantify the difference between the two curves using Curve Length based similarity measure:
# score every candidate country's curve against the reference curve.
cl_result = {
    name: similaritymeasures.curve_length_measure(exp_data, curve)
    for name, curve in countries_data.items()
}
# Get and save top N: smallest scores first, one vote in cnt per listed country
output = sorted(cl_result, key=cl_result.get)[:top_country]
cnt.update(output)
for country in output:
    print('%s: %s' % (country, cl_result[country]))
# Quantify the difference between the two curves using Dynamic Time Warping distance.
# similaritymeasures.dtw returns a pair; only its first element is kept as the score.
dtw_result = {
    name: similaritymeasures.dtw(exp_data, curve)[0]
    for name, curve in countries_data.items()
}
# Get and save top N: smallest scores first, one vote in cnt per listed country
output = sorted(dtw_result, key=dtw_result.get)[:top_country]
cnt.update(output)
for country in output:
    print('%s: %s' % (country, dtw_result[country]))
# Vote totals, most frequent first (bare expression: only rendered in a notebook)
cnt.most_common()
# Color palette, keyed by region name
palette = {"Africa": "#FF7F0E", "Americas": "#D62728", "Asia": "#2CA02C", "Europe": "#1F77B4", "Oceania": "#9467BD"}
# Split the ranked (country, votes) pairs into bar labels and bar heights
ranking = cnt.most_common()
bars = [name for name, _ in ranking]
values = [votes for _, votes in ranking]
# Plot similarity results
plt.figure(figsize=(14, 8))
barlist = plt.bar(bars, values)
# Paint each bar with the color of its country's region
for bar, name in zip(barlist, bars):
    bar.set_color(palette[countries_region[name]])
# Add custom legend: one patch per palette entry
legend_list = [
    mpatches.Patch(color=shade, label=region)
    for region, shade in palette.items()
]
# Plot setup
plt.xticks(rotation=45)
plt.title("Countries Similarity Results", fontsize=18)
plt.xlabel("Country", fontsize=12)
plt.ylabel("Total Occurrences", fontsize=12)
plt.legend(handles=legend_list, loc="best")
plt.show()
# Plot the reference country's curve (solid) together with every country that
# appeared in the top N list of all n_methods similarity methods (dashed).
# The first legend entry comes from exp_country rather than a hard-coded name,
# so the label always matches the curve stored in exp_data.
country_list = [exp_country]
plt.figure(figsize=(18, 8))
plt.plot(exp_data[:, 0], exp_data[:, 1])
for country, votes in cnt.items():
    # Keep only countries selected by every method.
    if votes != n_methods:
        continue
    country_list.append(country)
    num_data = countries_data[country]
    plt.plot(num_data[:, 0], num_data[:, 1], linestyle='--')
# Plot setup
plt.title("Total Cases from day 1 of Infection", fontsize=18)
plt.xlabel("N Days from first infection", fontsize=12)
plt.ylabel("Total cases", fontsize=12)
# Legend labels are matched to the plotted lines in the order they were drawn.
plt.legend(country_list, loc="best")
plt.show()
Jekel, C. F., Venter, G., Venter, M. P., Stander, N., & Haftka, R. T. (2018). Similarity measures for identifying material parameters from hysteresis loops using inverse analysis. International Journal of Material Forming. https://doi.org/10.1007/s12289-018-1421-8