Similarity of the Curve Slopes

  • Created by: Andrés Segura Tinoco
  • Created on: May 19, 2020
  • Data: Covid 19

1. Read C19 data by country

In [1]:
# # Import libraries
import pandas as pd
import numpy as np
import datetime
In [2]:
# Reading historical data
dataURL = "../data/historical_data.csv"
column_list = ["country", "region", "subregion", "date", "total_cases", "total_deaths", "diff_total_cases", "diff_total_deaths"]
raw_data = pd.read_csv(dataURL, usecols = lambda column : column in column_list)
raw_data
Out[2]:
country region subregion date total_cases total_deaths diff_total_cases diff_total_deaths
0 China Asia Eastern Asia 01/22/2020 571 17 0 0
1 Japan Asia Eastern Asia 01/22/2020 2 0 0 0
2 China Asia Eastern Asia 01/23/2020 830 25 259 8
3 Japan Asia Eastern Asia 01/23/2020 2 0 0 0
4 China Asia Eastern Asia 01/24/2020 1287 41 457 16
... ... ... ... ... ... ... ... ...
22127 Vietnam Asia South-Eastern Asia 05/28/2020 327 0 0 0
22128 Western Sahara Africa Northern Africa 05/28/2020 9 1 0 0
22129 Yemen Asia Western Asia 05/28/2020 256 53 0 0
22130 Zambia Africa Eastern Africa 05/28/2020 1057 7 0 0
22131 Zimbabwe Africa Eastern Africa 05/28/2020 132 4 0 0

22132 rows × 8 columns

In [3]:
# Apply data type quality
raw_data["date"] = pd.to_datetime(raw_data["date"])
raw_data.dtypes
Out[3]:
country                      object
region                       object
subregion                    object
date                 datetime64[ns]
total_cases                   int64
total_deaths                  int64
diff_total_cases              int64
diff_total_deaths             int64
dtype: object

2. Get Country List

In [4]:
today = pd.Timestamp('today').floor('D')
min_total_cases = 1000
min_deaths = 50
country_data = raw_data[(raw_data["date"] >= today) &
                        (raw_data["total_cases"] >= min_total_cases) &
                        (raw_data["total_deaths"] >= min_deaths)]
country_dict = dict(zip(country_data.country, country_data.region))
len(country_dict)
Out[4]:
80

3. Calculate Curve Slope by Country

In [5]:
# Calculate the curve slope of each country
def calc_curve_slope(raw_data, country_list, var_name, top_date, norm=False):
    curve_slope = {}

    for country in country_list:
        
        # Filter data by country
        country_fulldata = raw_data[raw_data["country"] == country]
        country_data = country_fulldata[country_fulldata["date"] >= top_date]
        
        # Get x and y values
        y_values = np.array(country_data[var_name])
        x_values = np.arange(0, len(y_values))
        
        # Normalize curves
        if norm:
            y_max = max(np.array(country_fulldata[var_name]))
            if y_max > 0:
                y_values = y_values / y_max
            else:
                print('Error with country %s, max value is zero for %s.' % (country, var_name))
        
        # Calculate curve slope
        X = x_values - x_values.mean()
        Y = y_values - y_values.mean()
        slope = (X.dot(Y)) / (X.dot(X))
        
        curve_slope[country] = slope
    
    return curve_slope
In [6]:
# Filtering data
x_var_name = "total_cases"
y_var_name = "total_deaths"
norm = False
last_days = 15
top_date = datetime.datetime.today() - datetime.timedelta(days=last_days)
top_date
Out[6]:
datetime.datetime(2020, 5, 13, 11, 12, 3, 678732)
In [7]:
# Calculate X and Y slope by country
x_data_slope = calc_curve_slope(raw_data, country_dict.keys(), x_var_name, top_date, norm)
y_data_slope = calc_curve_slope(raw_data, country_dict.keys(), y_var_name, top_date, norm)

4. Select Top N by Variable

In [8]:
# Filter X variable
top_country = 10
output = sorted(x_data_slope, key=x_data_slope.get, reverse=True)[:top_country]
In [9]:
# Show Top 10 countries with the highest Curve Slope
print('country, curve_slope')
for country in output:
    print('%s, %s' % (country, x_data_slope[country]))
country, curve_slope
USA, 21711.864285714284
Brazil, 16132.3
Russia, 9013.903571428571
India, 6026.303571428572
Peru, 4084.782142857143
Chile, 3636.8892857142855
Mexico, 2680.285714285714
UK, 2472.589285714286
Saudi Arabia, 2444.8464285714285
Iran, 2117.1892857142857
In [10]:
# Filter Y variable
output = sorted(y_data_slope, key=y_data_slope.get, reverse=True)[:top_country]
In [11]:
# Show Top 10 countries with the highest Curve Slope
print('country, curve_slope')
for country in output:
    print('%s, %s' % (country, y_data_slope[country]))
country, curve_slope
USA, 1114.825
Brazil, 888.6857142857143
Mexico, 311.42857142857144
UK, 291.7964285714286
India, 145.7357142857143
Russia, 129.83214285714286
Peru, 124.80714285714286
Italy, 122.45714285714286
Canada, 97.17857142857143
France, 77.075

5. Select Top N by Quadrant

In [12]:
# Quadrant variables
top_country = 10
x_median = np.median(list(x_data_slope.values()))
y_median = np.median(list(y_data_slope.values()))
x_median, y_median
Out[12]:
(198.72321428571428, 5.601785714285715)
In [13]:
# Showing data
print('country,region,%s,%s' % (x_var_name, y_var_name))
for country, region in country_dict.items():
    x = x_data_slope[country]
    y = y_data_slope[country]
    if norm or (x > 0 and y > 0):
        print("%s,%s,%.4f,%.4f" % (country, region, x, y))
country,region,total_cases,total_deaths
Afghanistan,Asia,541.6750,6.5214
Algeria,Africa,181.2571,7.1786
Argentina,Americas,533.0036,11.5179
Armenia,Asia,313.3000,3.9464
Australia,Oceania,10.0536,0.4179
Austria,Europe,39.1607,2.0107
Azerbaijan,Asia,129.8500,1.5321
Bangladesh,Asia,1565.0607,20.5107
Belarus,Europe,935.8964,4.8321
Belgium,Europe,251.2786,33.6786
Bolivia,Americas,335.1857,10.4857
Bosnia and Herzegovina,Europe,16.3429,1.9964
Brazil,Americas,16132.3000,888.6857
Bulgaria,Europe,27.6893,2.6464
Cameroon,Africa,202.3107,3.0821
Canada,Americas,1067.6107,97.1786
Chile,Americas,3636.8893,37.9964
China,Asia,4.4179,0.0643
Colombia,Americas,790.3643,21.0821
Croatia,Europe,1.9679,0.5821
Cuba,Americas,10.5857,0.2893
Czechia,Europe,58.3179,2.0500
Denmark,Europe,57.8429,2.2071
Dominican Republic,Americas,332.4429,4.2321
DRC,Africa,101.2036,1.1143
Ecuador,Americas,526.3750,59.6321
Egypt,Africa,684.7929,18.3286
Estonia,Europe,6.4464,0.2286
Finland,Europe,38.2750,1.6071
France,Europe,322.3214,77.0750
Germany,Europe,513.4821,44.9107
Greece,Europe,8.1750,1.1964
Guatemala,Americas,208.3464,2.9893
Honduras,Americas,180.4857,5.1393
Hungary,Europe,31.4214,5.0429
India,Asia,6026.3036,145.7357
Indonesia,Asia,629.9500,32.3464
Iran,Asia,2117.1893,56.3107
Iraq,Asia,160.7464,4.8214
Ireland,Europe,70.9393,8.9464
Israel,Asia,15.7071,1.1214
Italy,Europe,593.4214,122.4571
Japan,Asia,38.2857,11.6571
Kenya,Africa,56.5786,0.6893
Kuwait,Asia,878.4393,6.6857
Lithuania,Europe,10.7964,0.9536
Luxembourg,Europe,6.5893,0.4857
Malaysia,Asia,62.3571,0.2357
Mali,Africa,24.8536,1.9571
Mexico,Americas,2680.2857,311.4286
Moldova,Europe,148.6643,6.0643
Morocco,Africa,79.0464,0.9714
Netherlands,Europe,176.7893,20.3750
Nigeria,Africa,266.3286,6.6429
North Macedonia,Europe,25.3929,1.7679
Norway,Europe,15.1643,0.3000
Pakistan,Asia,1778.0750,33.7000
Panama,Americas,195.1357,4.1571
Peru,Americas,4084.7821,124.8071
Philippines,Asia,245.6179,7.9429
Poland,Europe,379.4071,10.6214
Portugal,Europe,228.7357,13.6607
Romania,Europe,177.7929,12.4357
Russia,Europe,9013.9036,129.8321
S. Korea,Asia,22.9500,0.6571
Saudi Arabia,Asia,2444.8464,11.1857
Serbia,Europe,70.6429,1.1821
Slovenia,Europe,0.4964,0.4143
Somalia,Africa,33.1250,1.0643
South Africa,Africa,991.6679,24.6000
Sudan,Africa,188.9214,8.1500
Sweden,Europe,497.2357,49.1071
Switzerland,Europe,22.0143,3.4643
Thailand,Asia,2.5107,0.0786
Turkey,Asia,1075.4429,30.0857
UAE,Asia,843.2500,3.6857
UK,Europe,2472.5893,291.7964
Ukraine,Europe,385.9893,14.7786
USA,Americas,21711.8643,1114.8250