# Load the Pandas libraries
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np


# Load scikit-learn library for K-Means
from sklearn.cluster import KMeans


# Load visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns


# Load Interact libraries
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets


# Read FIFA 19 players data only using the current skill values
# (the path is relative, so it is resolved against the current working directory)
dataURL = "../data/fifa19_overall_data.csv"
rawdata = pd.read_csv(dataURL)


# Loading FIFA 19 positions dataset
# NOTE: dataURL is reassigned here; from this point on it names the positions
# file, not the players file. replacePositionByZone reads the "Position" and
# "Zone" columns of this table.
dataURL = "../data/fifa19_positions.csv"
positions = pd.read_csv(dataURL)


# Function that obtains the numerical data from the data frame
def getNumericalData(data, quality):
    """Return the numeric feature columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is not modified.
    quality : bool
        When true, missing values are replaced by the mean of their column.

    Returns
    -------
    pandas.DataFrame
        The numeric (and boolean) columns of ``data`` without the
        identifier/valuation columns listed in ``nonColumns``. Row labels are
        those of ``data``.

    Side effect: prints the shape of the returned frame.
    """
    # Columns that are not relevant for the analysis
    nonColumns = ["Order", "ID", "Overall", "Potential", "Value €", "Wage €", "Release Clause €"]

    # Select by dtype in one step instead of growing a frame column by column.
    # Asking for numbers/booleans (rather than "anything that is not object")
    # also keeps string-dtype columns out, which would break the mean below.
    numData = data.select_dtypes(include=["number", "bool"])

    # errors="ignore": a frame that lacks some of these columns must not fail
    numData = numData.drop(columns=nonColumns, errors="ignore")

    # Data Quality process
    if quality and len(numData.columns) > 0:
        numData = numData.fillna(numData.mean())

    print(numData.shape)
    return numData


# Set an Overall threshold: the median of the "Overall" column, so the filter
# that follows keeps the players rated at or above the median.
var_filter = "Overall"
threshold = rawdata[var_filter].median()
# Bare expression: shows the value when run as a notebook cell
threshold

66.0


# Filter/Delete players with an overall below the threshold.
# The surviving rows keep their original index labels (no reset is done here).
rawdata = rawdata.loc[rawdata[var_filter] >= threshold]
# Bare expression: shows the number of remaining rows when run as a notebook cell
len(rawdata)

9927


# Get only numeric columns/variables.
# The second argument (True) asks getNumericalData to fill missing values
# with the column mean; the call also prints the resulting shape.
numData = getNumericalData(rawdata, True)
numData.head()

(9927, 68)


# Function that applies Principal Component Analysis
def applyPCA(data, std):
    """Project ``data`` onto its first two principal components.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose ``.values`` are fed to PCA.
    std : bool
        When true, every column is standardized with ``StandardScaler``
        before PCA is fitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``"PC1"`` and ``"PC2"``, one row per input row, carrying the
        same index labels as ``data``.

    Side effect: prints the summed explained-variance ratio of the two
    components, multiplied by 100.
    """
    # Standardize the Data
    x = StandardScaler().fit_transform(data.values) if std else data.values

    # Create a DataFrame from PCA
    pca = PCA(n_components = 2)
    pcaData = pca.fit_transform(x)

    # Reuse the input's index: with a fresh 0..n-1 index, any label-aligned
    # join back to the source rows (e.g. pd.concat(axis=1)) would pair the
    # components with the wrong rows whenever the input index is not contiguous.
    pcaDF = pd.DataFrame(data = pcaData, columns = ["PC1", "PC2"], index = data.index)

    # Show the total explained variance ratio of model
    print('Explained Variance Ratio:', sum(pca.explained_variance_ratio_) * 100)

    return pcaDF


# Function that replaces the player position by the zone
def replacePositionByZone(data):
    """Add a ``"Zone"`` column derived from the ``"Position"`` column.

    Each position is translated through the module-level ``positions`` table
    (columns ``"Position"`` and ``"Zone"``). Values with no entry in that
    table are copied through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"Position"`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now with the ``"Zone"`` column.
    """
    # Build the Position -> Zone lookup once. Iterating the column values
    # (instead of positions.Position[ix]) does not depend on the table's index
    # labels; setdefault keeps the first zone if a position is listed twice.
    zoneByPosition = {}
    for position, zone in zip(positions["Position"], positions["Zone"]):
        zoneByPosition.setdefault(position, zone)

    # One pass over the column instead of one full replace per position
    data["Zone"] = data["Position"].replace(zoneByPosition)

    return data


# Apply the PCA algorithm (True = standardize the columns first)
pcaDF = applyPCA(numData, True)

# Create the PCA data
# NOTE: pd.concat with axis = 1 aligns rows by index label, so this step relies
# on pcaDF and rawdata sharing the same index labels.
pcaDF = pd.concat([pcaDF, rawdata[["Position"]]], axis = 1)
# Drop rows that ended up without principal components after the alignment
pcaDF = pcaDF[pcaDF["PC1"].notnull()]
# Adds the "Zone" column derived from "Position"
pcaDF = replacePositionByZone(pcaDF)
pcaDF.head()

Explained Variance Ratio: 75.88965738363859


# Build the clustering input from the two principal components
x = pcaDF['PC1'].values
y = pcaDF['PC2'].values
# column_stack pairs the two 1-D arrays into an (n, 2) array directly, without
# materialising a Python list of tuples, and keeps two columns when n is 0.
train = np.column_stack((x, y))


# Calculating the Jambu Elbow: fit one K-Means model per candidate number of
# clusters (1..19) and record its score on the training data.
Nc = range(1, 20)
# random_state = 0 matches runKMeans and makes the curve reproducible across runs
kmeans = [KMeans(n_clusters = i, random_state = 0) for i in Nc]
score = [model.fit(train).score(train) for model in kmeans]


# Draw the elbow curve: score against number of clusters
fig, ax0 = plt.subplots(figsize = (14, 6))
ax0.plot(Nc, score, marker = 'o')
# Dashed marker at the number of clusters chosen for the analysis
ax0.axvline(x = 4, color = "#8b0000", linestyle = "--")
ax0.set_xticks(np.arange(1, 20, 1))
ax0.set_xlabel("Number of Clusters", fontsize = 12)
ax0.set_ylabel("Score", fontsize = 12)
ax0.set_title("Jambu Elbow Curve", fontsize = 20)
plt.show()


# Calculates the K-Means for (x, y) dataset
def runKMeans(k_clusters):
    """Cluster the module-level ``train`` points and plot the outcome.

    Parameters
    ----------
    k_clusters : int
        Number of clusters to fit.

    Fits a seeded K-Means model on ``train``, predicts a cluster label for
    every point and hands points, labels and centroids to ``plotKMeansData``.
    Returns nothing.
    """
    model = KMeans(n_clusters = k_clusters, algorithm = "elkan", random_state = 0).fit(train)

    # Cluster label for every training point
    labels = model.predict(train)

    # Draw the points coloured by cluster together with the fitted centroids
    plotKMeansData(train, k_clusters, model.cluster_centers_, labels)


# Create scatter plot with K-Means data
def plotKMeansData(data, k_clusters, centroids, clusters):
    """Scatter-plot clustered 2-D points, one colour per cluster.

    Parameters
    ----------
    data : array-like
        Points to draw; column 0 is used as x and column 1 as y.
    k_clusters : int
        Number of clusters; labels ``0 .. k_clusters - 1`` are drawn.
    centroids : numpy.ndarray
        Cluster centres; column 0 is x and column 1 is y. Drawn as black
        diamonds.
    clusters : array-like
        Cluster label of each row of ``data``.

    The legend text uses the module-level ``threshold``. Returns nothing.
    """
    fig, ax1 = plt.subplots(figsize = (14, 14))

    # Plotting vars
    colors = ["#1f77b4", "#2ca02c", "#d62728", "#ff7f0e", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
    data = np.asarray(data)
    clusters = np.asarray(clusters)

    # Create scatter plot
    for i in range(k_clusters):
        # Boolean mask instead of a Python loop over every point per cluster
        points = data[clusters == i]
        if len(points) == 0:
            # Nothing to draw for an empty cluster (and no columns to index)
            continue
        # Cycle through the palette so more clusters than colours still works
        sns.scatterplot(ax = ax1, x = points[:, 0], y = points[:, 1], size = 5, color = colors[i % len(colors)])
    ax1.scatter(centroids[:, 0], centroids[:, 1], s = 20, color = "black", marker = "D")

    # Plot setup
    ax1.set_xlabel("PC 1", fontsize = 12)
    ax1.set_ylabel("PC 2", fontsize = 12)
    ax1.set_title("Players by Zones", fontsize = 20)
    ax1.legend(["K-Means: Players with overall >= " + str(threshold)])
    ax1.grid()


# Create interactive control to control k value.
# The slider variant (k from 1 to 7) is kept commented out; the static call
# below uses k = 4, the value marked by the dashed line on the elbow plot.
# interactive(runKMeans, k_clusters = widgets.IntSlider(min = 1, max = 7, step = 1, value = 4))
runKMeans(k_clusters = 4)


# Colour assigned to each zone in the plots.
# The keys must match the zone names in the data exactly ("GoalKeper" included).
posPalette = {
    "GoalKeper": "#dc3912",
    "Defense": "#3366cc",
    "Midfield": "#ff9900",
    "Attack": "#109618",
}


# Create scatter plot of the players coloured by their zone
fig, ax2 = plt.subplots(figsize = (14, 14))

# Create 2D scatter plot
plot = sns.scatterplot(ax = ax2, data = pcaDF, x = "PC1", y = "PC2", hue = "Zone", palette = posPalette)

# Plot setup
ax2.set_xlabel("PC 1", fontsize = 12)
ax2.set_ylabel("PC 2", fontsize = 12)
ax2.set_title("Players by Zones", fontsize = 20)
# Keep the per-zone legend produced by `hue` and only give it a title.
# Calling ax2.legend([...]) with a single label replaced that colour key with
# one entry, and its "K-Means" text did not describe this plot.
zoneLegend = ax2.get_legend()
if zoneLegend is not None:
    zoneLegend.set_title("Players with overall >= " + str(threshold))
ax2.grid()

	Age	Special	International Reputation	Weak Foot	Skill Moves	Jersey Number	HeightMts	WeightLbs	LS	ST	...	Penalties	Composure	Marking	StandingTackle	SlidingTackle	GKDiving	GKHandling	GKKicking	GKPositioning	GKReflexes
0	31	2202	5.0	4.0	4.0	10.0	1.70	159.0	88	88	...	75.0	96.0	33.0	28.0	26.0	6.0	11.0	15.0	14.0	8.0
1	33	2228	5.0	4.0	5.0	7.0	1.88	183.0	91	91	...	85.0	95.0	28.0	31.0	23.0	7.0	11.0	15.0	14.0	11.0
2	26	2143	5.0	5.0	5.0	10.0	1.75	150.0	84	84	...	81.0	94.0	27.0	24.0	33.0	9.0	9.0	15.0	15.0	11.0
3	27	1471	4.0	3.0	1.0	1.0	1.93	168.0	0	0	...	40.0	68.0	15.0	21.0	13.0	90.0	85.0	87.0	88.0	94.0
4	27	2281	4.0	5.0	4.0	7.0	1.80	154.0	82	82	...	79.0	88.0	68.0	58.0	51.0	15.0	13.0	5.0	10.0	13.0

	PC1	PC2	Position	Zone
0	-9.391933	-7.024874	RF	Attack
1	-8.935503	-4.993332	ST	Attack
2	-8.717719	-6.896612	LW	Midfield
3	17.673012	-2.497028	GK	GoalKeper
4	-10.027496	-2.585399	RCM	Midfield

Visual and Data Analysis - FIFA 19 Players

4. Clustering Data

Loading main libraries and data

Apply PCA with Standardization

Clustering: K-Means

Compare results with K-Means grouping

Compare results with natural grouping

Insights