# Load the Pandas libraries
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
# Load visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
# Read FIFA 19 players data only using the current skill values
# NOTE: the path is relative, so it is resolved against the current working directory
dataURL = "../data/fifa19_overall_data.csv"
# Load the whole CSV file into a DataFrame
data = pd.read_csv(dataURL)
Only the numeric variables of the dataset are used to perform the PCA. The players are filtered by their Overall rating (the code below uses a fixed threshold of 85 rather than the median), and the variables that should not be included in the analysis are discarded.
# UTIL FUNCTION - Get Dataframe Columns List By Type
def getColumnsByType(data, colType, equals=True, show=False):
    """Return the labels of the columns selected by their dtype name.

    Args:
        data: DataFrame whose columns are inspected.
        colType: dtype name compared against ``str(dtype)`` of each column
            (for example ``"object"`` or ``"int64"``).
        equals: when true, keep the columns whose dtype name equals
            ``colType``; when false, keep the columns whose dtype name differs.
        show: when true, print ``"<position> - <label> - <dtype>"`` for every
            selected column.

    Returns:
        List with the selected column labels, in DataFrame column order.
    """
    colList = []
    # ``dtypes`` yields every column label together with its dtype, so the
    # position comes from enumerate() instead of a per-column get_loc() lookup
    # (which does not return an integer when labels are duplicated).
    for position, (col, dtype) in enumerate(data.dtypes.items()):
        if (str(dtype) == colType) == bool(equals):
            colList.append(col)
            if show:
                print(f"{position} - {col} - {dtype}")
    return colList
# Collect (and print) the columns stored with the object dtype (string or date)
objColList = getColumnsByType(data, "object", equals=True, show=True)
2 - Name - object 4 - Photo - object 5 - Nationality - object 6 - Flag - object 9 - Club - object 10 - Club Logo - object 14 - Preferred Foot - object 18 - Work Rate - object 19 - Body Type - object 20 - Real Face - object 21 - Position - object 23 - Joined - object 24 - Loaned_From - object 25 - Contract Valid Until - object
# Collect (and print) every column that is not of object dtype: the numeric (float or integer) ones
numColList = getColumnsByType(data, "object", equals=False, show=True)
0 - Order - int64 1 - ID - int64 3 - Age - int64 7 - Overall - int64 8 - Potential - int64 11 - Value € - float64 12 - Wage € - float64 13 - Special - int64 15 - International Reputation - float64 16 - Weak Foot - float64 17 - Skill Moves - float64 22 - Jersey Number - float64 26 - HeightMts - float64 27 - WeightLbs - float64 28 - LS - int64 29 - ST - int64 30 - RS - int64 31 - LW - int64 32 - LF - int64 33 - CF - int64 34 - RF - int64 35 - RW - int64 36 - LAM - int64 37 - CAM - int64 38 - RAM - int64 39 - LM - int64 40 - LCM - int64 41 - CM - int64 42 - RCM - int64 43 - RM - int64 44 - LWB - int64 45 - LDM - int64 46 - CDM - int64 47 - RDM - int64 48 - RWB - int64 49 - LB - int64 50 - LCB - int64 51 - CB - int64 52 - RCB - int64 53 - RB - int64 54 - Crossing - float64 55 - Finishing - float64 56 - HeadingAccuracy - float64 57 - ShortPassing - float64 58 - Volleys - float64 59 - Dribbling - float64 60 - Curve - float64 61 - FKAccuracy - float64 62 - LongPassing - float64 63 - BallControl - float64 64 - Acceleration - float64 65 - SprintSpeed - float64 66 - Agility - float64 67 - Reactions - float64 68 - Balance - float64 69 - ShotPower - float64 70 - Jumping - float64 71 - Stamina - float64 72 - Strength - float64 73 - LongShots - float64 74 - Aggression - float64 75 - Interceptions - float64 76 - Positioning - float64 77 - Vision - float64 78 - Penalties - float64 79 - Composure - float64 80 - Marking - float64 81 - StandingTackle - float64 82 - SlidingTackle - float64 83 - GKDiving - float64 84 - GKHandling - float64 85 - GKKicking - float64 86 - GKPositioning - float64 87 - GKReflexes - float64 88 - Release Clause € - float64
# Split data into header (object columns) and skills (all remaining columns) dataframes,
# keeping the original column order in both of them
header = data[[col for col in data.columns if col in objColList]].copy()
skills = data[[col for col in data.columns if col not in objColList]].copy()
# The original dataset is no longer needed: release it
del data
# Preview of the header dataframe (first 5 rows)
header.head()
Name | Photo | Nationality | Flag | Club | Club Logo | Preferred Foot | Work Rate | Body Type | Real Face | Position | Joined | Loaned_From | Contract Valid Until | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | L. Messi | https://cdn.sofifa.org/players/4/19/158023.png | Argentina | https://cdn.sofifa.org/flags/52.png | FC Barcelona | https://cdn.sofifa.org/teams/2/light/241.png | Left | Medium/ Medium | Messi | Yes | RF | 1-Jul-04 | NaN | 2021 |
1 | Cristiano Ronaldo | https://cdn.sofifa.org/players/4/19/20801.png | Portugal | https://cdn.sofifa.org/flags/38.png | Juventus | https://cdn.sofifa.org/teams/2/light/45.png | Right | High/ Low | C. Ronaldo | Yes | ST | 10-Jul-18 | NaN | 2022 |
2 | Neymar Jr | https://cdn.sofifa.org/players/4/19/190871.png | Brazil | https://cdn.sofifa.org/flags/54.png | Paris Saint-Germain | https://cdn.sofifa.org/teams/2/light/73.png | Right | High/ Medium | Neymar | Yes | LW | 3-Aug-17 | NaN | 2022 |
3 | De Gea | https://cdn.sofifa.org/players/4/19/193080.png | Spain | https://cdn.sofifa.org/flags/45.png | Manchester United | https://cdn.sofifa.org/teams/2/light/11.png | Right | Medium/ Medium | Lean | Yes | GK | 1-Jul-11 | NaN | 2020 |
4 | K. De Bruyne | https://cdn.sofifa.org/players/4/19/192985.png | Belgium | https://cdn.sofifa.org/flags/7.png | Manchester City | https://cdn.sofifa.org/teams/2/light/10.png | Right | High/ High | Normal | Yes | RCM | 30-Aug-15 | NaN | 2023 |
# Show first 5 rows of skills dataframe
# (head() returns the preview; as the last expression of the cell it is rendered as its output)
skills.head()
Order | ID | Age | Overall | Potential | Value € | Wage € | Special | International Reputation | Weak Foot | ... | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | Release Clause € | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 158023 | 31 | 94 | 94 | 110500000.0 | 565000.0 | 2202 | 5.0 | 4.0 | ... | 96.0 | 33.0 | 28.0 | 26.0 | 6.0 | 11.0 | 15.0 | 14.0 | 8.0 | 226500000.0 |
1 | 1 | 20801 | 33 | 94 | 94 | 77000000.0 | 405000.0 | 2228 | 5.0 | 4.0 | ... | 95.0 | 28.0 | 31.0 | 23.0 | 7.0 | 11.0 | 15.0 | 14.0 | 11.0 | 127100000.0 |
2 | 2 | 190871 | 26 | 92 | 93 | 118500000.0 | 290000.0 | 2143 | 5.0 | 5.0 | ... | 94.0 | 27.0 | 24.0 | 33.0 | 9.0 | 9.0 | 15.0 | 15.0 | 11.0 | 228100000.0 |
3 | 3 | 193080 | 27 | 91 | 93 | 72000000.0 | 260000.0 | 1471 | 4.0 | 3.0 | ... | 68.0 | 15.0 | 21.0 | 13.0 | 90.0 | 85.0 | 87.0 | 88.0 | 94.0 | 138600000.0 |
4 | 4 | 192985 | 27 | 91 | 92 | 102000000.0 | 355000.0 | 2281 | 4.0 | 5.0 | ... | 88.0 | 68.0 | 58.0 | 51.0 | 15.0 | 13.0 | 5.0 | 10.0 | 13.0 | 196400000.0 |
5 rows × 75 columns
# Keep only the players whose overall reaches the threshold (players below 85 are dropped)
var_filter = "Overall"
threshold = 85
skills = skills[skills[var_filter] >= threshold]
# Number of players left after filtering
skills.shape[0]
110
# Columns that are not relevant for the analysis
nonColumns = ["Order", "ID", "Overall", "Potential", "Value €", "Wage €", "Release Clause €"]
# Drop them from the skills dataframe and from the list of numeric column names
skills = skills.drop(columns = nonColumns)
numColList = [name for name in numColList if name not in nonColumns]
# Validation: both must describe the same number of columns
skills.shape[1] == len(numColList)
True
# Fill the missing values of every column with the mean of that column
skills = skills.fillna(skills.mean())
# Standardize the data: take the numeric columns as an array, then fit and apply the scaler
x = skills[numColList].values
x = StandardScaler().fit_transform(x)
# Wrap the standardized values in a temporary dataframe and preview it
dfSkills = pd.DataFrame(x, columns = numColList)
dfSkills.head()
Age | Special | International Reputation | Weak Foot | Skill Moves | Jersey Number | HeightMts | WeightLbs | LS | ST | ... | Penalties | Composure | Marking | StandingTackle | SlidingTackle | GKDiving | GKHandling | GKKicking | GKPositioning | GKReflexes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.776706 | 0.724004 | 2.310325 | 0.743626 | 0.831174 | -0.173639 | -1.639945 | -0.770754 | 0.818270 | 0.818270 | ... | 0.595888 | 2.111447 | -1.062921 | -1.027494 | -0.951904 | -0.517153 | -0.308874 | -0.119577 | -0.178222 | -0.423847 |
1 | 1.361895 | 0.822752 | 2.310325 | 0.743626 | 1.693714 | -0.548156 | 0.723295 | 0.560044 | 0.938657 | 0.938657 | ... | 1.145324 | 1.955673 | -1.267257 | -0.919025 | -1.059078 | -0.475901 | -0.308874 | -0.119577 | -0.178222 | -0.303270 |
2 | -0.686267 | 0.499921 | 2.310325 | 2.084591 | 1.693714 | -0.173639 | -0.983490 | -1.269804 | 0.657753 | 0.657753 | ... | 0.925549 | 1.799899 | -1.308124 | -1.172118 | -0.701830 | -0.393397 | -0.393602 | -0.119577 | -0.136510 | -0.303270 |
3 | -0.393673 | -2.052339 | 0.882596 | -0.597339 | -1.756444 | -1.297189 | 1.379750 | -0.271705 | -2.713095 | -2.713095 | ... | -1.327137 | -2.250227 | -1.798531 | -1.280587 | -1.416327 | 2.948037 | 2.826078 | 3.146120 | 2.908430 | 3.032701 |
4 | -0.393673 | 1.024046 | 0.882596 | 2.084591 | 0.831174 | -0.548156 | -0.327034 | -1.048004 | 0.577495 | 0.577495 | ... | 0.815662 | 0.865254 | 0.367434 | 0.057193 | -0.058784 | -0.145883 | -0.224145 | -0.573146 | -0.345068 | -0.222885 |
5 rows × 68 columns
# Calculate skills correlations
corr = dfSkills.corr()
# Generate a mask for the upper triangle
# The builtin bool is used as dtype: the np.bool alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError
mask = np.zeros_like(corr, dtype = bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
fig, ax1 = plt.subplots(figsize = (18, 18))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 240, n = 9)
# Draw the heatmap with the mask and correct aspect ratio
# (the target axes is passed explicitly instead of relying on the current axes)
sns.heatmap(corr, mask = mask, cmap = cmap, vmin = -1, vmax = 1, center = 0,
            square = True, linewidths = .5, cbar_kws = {"shrink": .5}, ax = ax1)
ax1.set_xticklabels(ax1.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
# Add title
ax1.set_title("Skills Correlation Triangle", fontsize = 20)
plt.show()
Definition: Principal component analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. Source: Wikipedia
# Principal Component Analysis
pca = PCA(n_components = 5)
pcaData = pca.fit_transform(x)
# Create and show principal components DataFrame
pcaDF = pd.DataFrame(data = pcaData, columns = ["PC1", "PC2", "PC3", "PC4", "PC5"])
# Attach the player names row by row. The rows of pcaData follow the rows of the
# filtered skills dataframe, so the names are looked up through the skills index.
# Joining on the fresh 0..n-1 index of pcaDF would only be correct when the
# filtered players happen to be the first rows of the original file.
pcaDF["Name"] = header.loc[skills.index, "Name"].values
pcaDF
PC1 | PC2 | PC3 | PC4 | PC5 | Name | |
---|---|---|---|---|---|---|
0 | -4.593481 | -5.478033 | 1.127111 | 1.252885 | -2.286953 | L. Messi |
1 | -4.080334 | -3.406618 | 4.334073 | -0.485612 | -2.817897 | Cristiano Ronaldo |
2 | -4.048185 | -5.546171 | 0.505673 | 0.616553 | -1.730906 | Neymar Jr |
3 | 17.658339 | -2.364764 | -0.405911 | 1.214861 | -1.087192 | De Gea |
4 | -4.982714 | -1.635134 | 0.422099 | 1.768055 | -0.096850 | K. De Bruyne |
... | ... | ... | ... | ... | ... | ... |
105 | -1.458081 | -2.661787 | 2.633530 | -0.396837 | 0.622312 | K. Benzema |
106 | -2.714554 | 2.392151 | -0.847898 | 0.752160 | -1.100449 | Filipe Luís |
107 | 0.784593 | 4.941324 | 1.491644 | 1.390592 | -0.776510 | V. Kompany |
108 | 1.749388 | 5.147898 | 0.692962 | 0.374109 | -1.701941 | Pepe |
109 | -1.101967 | -1.384143 | 5.518075 | 1.973968 | 0.789597 | Z. Ibrahimović |
110 rows × 6 columns
# Show correlation between components
fig, ax = plt.subplots(figsize = (10, 10))
# Only the component columns are correlated: pcaDF also holds the string column
# "Name", and DataFrame.corr() raises on non-numeric columns in pandas 2.0+
componentsCorr = pcaDF[["PC1", "PC2", "PC3", "PC4", "PC5"]].corr()
sns.heatmap(componentsCorr, square = True, annot = True, ax = ax)
ax.set_title("Correlation between Components", fontsize = 16)
plt.show()
As you can see, there is no correlation between the principal components because they are orthogonal.
# The explained variance tells us how much information (variance) can be attributed to each of the principal components
# The ratio array holds one fraction per fitted component; list() makes it display as a plain Python list
list(pca.explained_variance_ratio_)
[0.6559769839499195, 0.14857826650344874, 0.04305208015677819, 0.03029528547335026, 0.02500385682900391]
# Create bar chart data: one vertical bar per principal component
bars = ("PC1", "PC2", "PC3", "PC4", "PC5")
y_pos = np.arange(len(bars))
# Explained variance of each component in percent, and its running total
values = pca.explained_variance_ratio_ * 100
cum = np.cumsum(values)
# Set up the matplotlib figure
fig, ax2 = plt.subplots(figsize = (12, 10))
# Every artist carries its own label, so the legend does not depend on the
# order in which matplotlib enumerates the artists of the axes
ax2.bar(y_pos, values, align = "center", alpha = 0.7, label = "Var")
ax2.set_xticks(y_pos)
ax2.set_xticklabels(bars)
ax2.plot(y_pos, cum, color = "orange", linewidth = 2, marker = "o", label = "Cum")
ax2.set_title("Variance Ratio By Component", fontsize = 20)
# Label every point of the cumulative line with its percentage
for i, v in enumerate(cum):
    ax2.text(i - .15, v + 1, f"{v:.1f}%", color = "black", fontweight = "normal", fontsize = 11)
# Plot setup
ax2.set_xlabel("Components", fontsize = 12)
ax2.set_ylabel("Explained variance in percent", fontsize = 12)
ax2.legend(loc = "best")
plt.show()
# Create a matshow plot of the Principal Components dependencies
# A wide, short figure: the matrix has one row per component and one column per skills column
fig = plt.figure(figsize = (16, 2))
# fignum = fig.number draws into the figure created above instead of opening a new one
plt.matshow(pca.components_, cmap = "viridis", fignum = fig.number, aspect = "auto")
# One y tick per principal component
plt.yticks([0, 1, 2, 3, 4], ["PC1", "PC2", "PC3", "PC4", "PC5"], fontsize = 10)
plt.colorbar()
# One x tick per column of the skills dataframe, labelled with the column name
plt.xticks(range(len(skills.columns)), skills.columns, rotation = 65, ha = "left")
plt.show()
# Total explained variance (in percent) when only the first 2 components are kept
n_components = 2
pca.explained_variance_ratio_[:n_components].sum() * 100
80.45552504533683
# Eigen-vectors data
n_vectors = 2
# Explained variance of the first n_vectors components (used later to scale the arrows)
lengths = pca.explained_variance_[0:n_vectors]
# First n_components rows of the components matrix, restricted to their first n_vectors columns
# NOTE(review): this keeps only the entries for the first two input columns, while the arrows
# are later drawn on the PC1/PC2 scatter plot - confirm this is the intended geometry
vectors = pca.components_[0:n_components, 0:n_vectors]
# First n_vectors entries of the mean estimated by the PCA (used as the origin of the arrows)
means = pca.mean_[0:n_vectors]
# Function to draw vectors on plane
def draw_vector(v0, v1, ax = None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    Args:
        v0: coordinates of the tail of the arrow.
        v1: coordinates of the head of the arrow.
        ax: axes-like object providing ``annotate``; when None, the current
            pyplot axes is used.
    """
    # Explicit None check: the fallback must not depend on the truthiness of
    # the object passed as ``ax``
    if ax is None:
        ax = plt.gca()
    arrowprops = dict(arrowstyle = "->", linewidth = 2, shrinkA = 0, shrinkB = 0, color = "#ff7f0e")
    # An annotation with empty text draws only the arrow (from v0 to v1)
    ax.annotate("", v1, v0, arrowprops = arrowprops)
# Create scatter plot with players label
fig, ax3 = plt.subplots(figsize = (14, 14))
# Create 2D scatter plot (regplot without the regression fit draws only the points)
plot = sns.regplot(ax = ax3, data = pcaDF, x = "PC1", y = "PC2", fit_reg = False,
                   marker = "o", color = "#1f77b4", scatter_kws = {"s": 75})
# Add one annotation per player. The columns are iterated in lockstep, so the
# labels do not depend on the dataframe index being exactly 0..n-1
for pc1, pc2, name in zip(pcaDF["PC1"], pcaDF["PC2"], pcaDF["Name"]):
    plot.text(pc1 + 0.2, pc2 - 0.05, name,
              horizontalalignment = "left", size = "medium", color = "black", weight = "normal")
# Drawing the eigen-vectors, each one scaled by 3 times the square root of its length value
for length, vector in zip(lengths, vectors):
    v = vector * 3 * np.sqrt(length)
    # The target axes is passed explicitly instead of relying on the current axes
    draw_vector(means, means + v, ax = ax3)
# Plot setup
ax3.set_xlabel("PC 1", fontsize = 12)
ax3.set_ylabel("PC 2", fontsize = 12)
ax3.set_title("2D Best Players FIFA 19", fontsize = 20)
ax3.legend(["Best Players (overall >= " + str(threshold) + ")"])
ax3.grid()
# Render the figure explicitly, like the other plotting cells do
plt.show()
# Total explained variance (in percent) when only the first 3 components are kept
pca.explained_variance_ratio_[:3].sum() * 100
84.76073306101465
# Create 3D scatter plot
fig = plt.figure(figsize = (16, 16))
ax4 = fig.add_subplot(111, projection = "3d")
# Get (x, y, z) axis values (each one is a single-column 2-D array)
xx = pcaDF.loc[:,["PC1"]].values
yy = pcaDF.loc[:,["PC2"]].values
zz = pcaDF.loc[:,["PC3"]].values
# Plot values, flattened to 1-D so every point is a plain scalar coordinate
ax4.scatter(xx.ravel(), yy.ravel(), zz.ravel(), c = "#1f77b4", marker = "o", s = 75)
# Add one annotation per player. The loop runs over the plotted data itself
# (not over the length of the unrelated array x), and float() receives scalars:
# converting 1-element arrays to float is deprecated since NumPy 1.25
for px, py, pz, name in zip(xx.ravel(), yy.ravel(), zz.ravel(), pcaDF["Name"]):
    ax4.text(float(px), float(py), float(pz), name,
             horizontalalignment = "left", size = "medium", color = "black", weight = "normal")
# Plot setup
ax4.set_xlabel("PC 1", fontsize = 12)
ax4.set_ylabel("PC 2", fontsize = 12)
ax4.set_zlabel("PC 3", fontsize = 12)
ax4.set_title("3D Best Players FIFA 19", fontsize = 20)
ax4.legend(["Best Players (overall >= " + str(threshold) + ")"])
ax4.grid()
# Render the figure explicitly, like the other plotting cells do
plt.show()