# Import libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
# Load the per-country metadata table
dataURL = "../data/country_info.csv"
countries = pd.read_csv(dataURL)

# Load the current measurements, leaving out the row_index, date and datestamp columns
dataURL = "../data/current_data.csv"
raw_data = pd.read_csv(
    dataURL,
    usecols=lambda column: column not in ["row_index", "date", "datestamp"],
)

# Preview the first 10 rows of the country metadata
countries.head(10)

# Collect the names of the countries that belong to the selected region.
# NOTE: the list is called `america_list` but it holds whichever region is set in `region`.
region = "Europe"
in_region = countries["region"] == region
america_list = list(countries.loc[in_region, "country"])
len(america_list)

# Restrict the measurements to those countries and renumber the rows from 0
raw_data = raw_data.loc[raw_data["country"].isin(america_list)].reset_index(drop=True)
raw_data.head(10)
# Build the numeric feature table by dropping the country label column
dataset = raw_data.drop(columns=["country"])
col_list = dataset.columns

# Keep only the rows whose total_deaths reaches the minimum threshold
min_deaths = 200
x = dataset[dataset["total_deaths"] >= min_deaths]
len(x)

# Standardize the features; from here on `x` is the array returned by the scaler
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Wrap the standardized values in a temporary DataFrame with the original column names
norm_data = pd.DataFrame(x, columns=col_list)
norm_data.head()

# Pairwise correlations between the standardized features
corr = norm_data.corr()
corr
# Build a boolean mask that hides the upper triangle (diagonal included), so that
# every pair of features is drawn only once in the heatmap.
# The builtin `bool` replaces `np.bool`, an alias that was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
fig, ax1 = plt.subplots(figsize=(18, 18))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 240, n=9)
# Draw the heatmap with the mask and a square aspect ratio; the target axes are
# passed explicitly instead of relying on matplotlib's "current axes" state
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, ax=ax1)
# Rotate the x tick labels so long column names do not overlap
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, horizontalalignment='right')
# Add title
# NOTE(review): the title says "Skills" although the matrix holds the dataset's
# feature correlations — confirm whether the wording should be updated.
ax1.set_title("Skills Correlation Triangle", fontsize=20)
plt.show()
# Principal Component Analysis on the standardized features
pca = PCA(n_components=5)
pca_data = pca.fit_transform(x)
len(pca_data)

# Create and show principal components DataFrame
pc_names = ["PC1", "PC2", "PC3", "PC4", "PC5"]
df_pca = pd.DataFrame(data=pca_data, columns=pc_names)
# The rows of `pca_data` follow the rows of `dataset` that passed the `min_deaths`
# filter, in order. Select the country labels with that same condition and attach
# them by position: joining on the index of the unfiltered `raw_data` would pair
# components with the wrong countries whenever the filter removed a row.
kept_countries = raw_data.loc[dataset["total_deaths"] >= min_deaths, "country"]
df_pca["country"] = kept_countries.values
df_pca.head(10)

# Show correlation between components. Only the numeric PC columns are correlated:
# since pandas 2.0 DataFrame.corr() no longer drops the string `country` column
# silently and raises on it instead.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df_pca[pc_names].corr(), square=True, annot=True, ax=ax)
ax.set_title("Correlation between Components", fontsize=16)
plt.show()
# Share of the total variance attributed to each principal component
list(pca.explained_variance_ratio_)

# Chart data: per-component percentages and their running total
bars = ("PC1", "PC2", "PC3", "PC4", "PC5")
y_pos = np.arange(len(bars))  # despite the name, these are the bar positions on the x axis
values = pca.explained_variance_ratio_ * 100
cum = np.cumsum(values)

# Bars show the individual percentages, the line shows the cumulative percentage
fig, ax2 = plt.subplots(figsize=(12, 10))
ax2.bar(y_pos, values, align="center", alpha=0.7)
ax2.set_xticks(y_pos)
ax2.set_xticklabels(bars)
ax2.plot(y_pos, cum, color="orange", linewidth=2, marker="o")
ax2.set_title("Variance Ratio By Component", fontsize=20)

# Write the cumulative percentage slightly above each point of the line
for position, running_total in enumerate(cum):
    label = str(round(running_total, 1)) + "%"
    ax2.text(position - .15, running_total + 1, label,
             color="black", fontweight="normal", fontsize=11)

# Axis titles and legend
ax2.set_xlabel("Components", fontsize=12)
ax2.set_ylabel("Explained variance in percent", fontsize=12)
ax2.legend(("Cum", "Var"), loc="best")
plt.show()
# Show the PCA component matrix as an image: one row per principal component,
# one column per original feature
pc_row_labels = ["PC1", "PC2", "PC3", "PC4", "PC5"]
fig = plt.figure(figsize=(16, 2))
plt.matshow(pca.components_, cmap="viridis", fignum=fig.number, aspect="auto")
plt.yticks([0, 1, 2, 3, 4], pc_row_labels, fontsize=10)
plt.colorbar()
# Name every column after its feature; rotated so the labels stay readable
feature_positions = range(len(col_list))
plt.xticks(feature_positions, col_list, rotation=65, ha="left")
plt.show()
# Percentage of the total variance explained by the first two components
n_components = 2
sum(pca.explained_variance_ratio_[:n_components]) * 100

# Values used further down to draw the component arrows on the 2D plot
n_vectors = 2
lengths = pca.explained_variance_[:n_vectors]
# NOTE(review): this keeps only the first `n_vectors` feature coordinates of the
# first `n_components` components, and `means` holds the fitted means of those same
# features. They are later drawn on a PC1/PC2 scatter — confirm that this is the
# intended coordinate system.
vectors = pca.components_[:n_components, :n_vectors]
means = pca.mean_[:n_vectors]
# Function to draw vectors on plane
def draw_vector(v0, v1, ax=None):
    """Draw an orange arrow from point ``v0`` to point ``v1``.

    Args:
        v0: Coordinates of the arrow tail.
        v1: Coordinates of the arrow head.
        ax: Axes-like object whose ``annotate`` method is used. When ``None``,
            the current matplotlib axes (``plt.gca()``) are used.
    """
    # Compare against None explicitly: `ax or plt.gca()` would silently discard
    # any axes-like object that happens to evaluate as falsy.
    if ax is None:
        ax = plt.gca()
    arrowprops = dict(arrowstyle="->", linewidth=2, shrinkA=0, shrinkB=0, color="#ff7f0e")
    # An empty annotation text leaves only the arrow, pointing at `xy` from `xytext`
    ax.annotate("", xy=v1, xytext=v0, arrowprops=arrowprops)
fig, ax3 = plt.subplots(figsize=(14, 14))
# Create 2D scatter plot of the first two principal components (no regression line)
plot = sns.regplot(ax=ax3, data=df_pca, x="PC1", y="PC2", fit_reg=False,
                   marker="o", color="#1f77b4", scatter_kws={"s": 75})
# Label every point with its country name, slightly offset from the marker.
# The columns are iterated directly rather than looked up by integer label, so
# the loop does not depend on df_pca having a 0..n-1 index.
for pc1, pc2, label in zip(df_pca["PC1"], df_pca["PC2"], df_pca["country"]):
    plot.text(pc1 + 0.1, pc2 - 0.03, label,
              horizontalalignment="left", size="medium", color="black", weight="normal")
# Drawing the eigen-vectors, each scaled to three times the square root of its
# explained variance; the target axes are passed explicitly
for length, vector in zip(lengths, vectors):
    v = vector * 3 * np.sqrt(length)
    draw_vector(means, means + v, ax=ax3)
# Plot setup
ax3.set_xlabel("PC 1", fontsize=12)
ax3.set_ylabel("PC 2", fontsize=12)
ax3.set_title("2D Covid-19 Similarity by Country", fontsize=20)
ax3.legend(["Countries (total deaths >= " + str(min_deaths) + ")"])
ax3.grid()
# Show the total explained variance ratio of model: Only 3 components
sum(pca.explained_variance_ratio_[0:3]) * 100
# Create 3D scatter plot
fig = plt.figure(figsize=(16, 16))
ax4 = fig.add_subplot(111, projection="3d")
# Get (x, y, z) axis values as (n, 1) column arrays.
# Note the assignment: PC3 goes on the Y axis and PC2 on the Z axis.
xx = df_pca.loc[:, ["PC1"]].values
zz = df_pca.loc[:, ["PC2"]].values
yy = df_pca.loc[:, ["PC3"]].values
# Plot values
ax4.scatter(xx, yy, zz, c="#1f77b4", marker="o", s=75)
# Label every point with its country name. The single column of each array is
# taken with [:, 0] so float() receives scalars: converting a 1-element array
# with float() is deprecated since NumPy 1.25. Iterating the plotted data itself
# also removes the dependency on the length of the unrelated `x` array.
for px, py, pz, label in zip(xx[:, 0], yy[:, 0], zz[:, 0], df_pca["country"]):
    ax4.text(float(px), float(py), float(pz), label,
             horizontalalignment="left", size="medium", color="black", weight="normal")
# Plot setup: the axis labels follow the data actually plotted above
# (Y shows PC3 and Z shows PC2; the labels used to be swapped)
ax4.set_xlabel("PC 1", fontsize=12)
ax4.set_ylabel("PC 3", fontsize=12)
ax4.set_zlabel("PC 2", fontsize=12)
ax4.set_title("3D Covid-19 Similarity by Country", fontsize=20)
ax4.legend(["Countries (total deaths >= " + str(min_deaths) + ")"])
ax4.grid()
df_pca.head()
# Turn the component table on its side: one column per country, one row per component
df_temp = df_pca.copy()
component_values = df_temp.drop(columns=["country"])
df_trans = component_values.transpose()
df_trans.columns = list(df_temp["country"])
df_trans.head()

# Pearson correlation between every pair of country columns
corr = df_trans.corr(method="pearson")
# Preview the top-left 10 x 10 corner of the matrix
corr.iloc[:10, :10]
# Create the correlation triangle plot between countries
def plotCorrTriangle(corr):
    """Draw a lower-triangle heatmap of a square correlation matrix.

    Args:
        corr: Square correlation matrix; this script passes the DataFrame
            returned by ``DataFrame.corr``.

    Returns:
        None. A new 16x16 figure is created and drawn on; ``plt.show()`` is not
        called here.
    """
    # Hide the upper triangle (diagonal included) so each pair is drawn only once.
    # The builtin `bool` replaces `np.bool`, an alias that was removed in NumPy 1.24.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=(16, 16))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(10, 240, n=9)
    # Draw the heatmap on the axes created above, passed explicitly instead of
    # relying on matplotlib's "current axes" state
    sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    # Rotate the x tick labels so the names do not overlap
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    # Add title
    ax.set_title("Correlation Triangle between Countries", fontsize=20)


# Plot Correlation Triangle
plotCorrTriangle(corr)