# Load data handling and modelling libraries (pandas, scikit-learn, NumPy)
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import OrderedDict


# Load Similarity libraries
from scipy import stats as ss
import sklearn.metrics.pairwise as sm


# Load visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D


# Read FIFA 19 players data only using the current skill values
# The path is relative, so it is resolved against the current working directory
dataURL = "../data/fifa19_overall_data.csv"
rawdata = pd.read_csv(dataURL)
# Report the (rows, columns) of the loaded table
print(rawdata.shape)

(18207, 89)


# Function that obtains the numerical data from the data frame
def getNumericalData(data, quality):
    """Return the numeric feature columns of ``data`` used by the analysis.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is not modified.
    quality : bool
        When truthy, missing values are replaced by the mean of their column.

    Returns
    -------
    pandas.DataFrame
        The numeric columns of ``data`` (same row index) without the
        identifier, rating and monetary columns listed in ``nonColumns``.
        The resulting shape is printed before returning.
    """
    # Keep numeric dtypes only, so text columns are excluded whichever dtype
    # pandas uses to store them (object or a dedicated string dtype)
    numData = data.select_dtypes(include = "number")

    # Remove columns that are not relevant for the analysis; listed columns
    # that are absent from the data are skipped instead of raising a KeyError
    nonColumns = ["Order", "ID", "Overall", "Potential", "Value €", "Wage €", "Release Clause €"]
    numData = numData.drop(columns = nonColumns, errors = "ignore")

    # Data Quality process: fill missing values with the column mean
    if quality and len(numData.columns) > 0:
        numData = numData.fillna(numData.mean())

    print(numData.shape)
    return numData


# Set an Overall threshold
# Minimum "Overall" value a player needs in order to be kept by the filter below
threshold = 80
# Bare expression: echoes the value when evaluated as a notebook cell
threshold

80


# Remove the players whose overall is below the threshold
var_filter = "Overall"
# Keep the rows at or above the threshold; the original row labels are retained
rawdata = rawdata.loc[rawdata[var_filter] >= threshold]
# Number of players that remain after filtering
len(rawdata)

555


# Get only numeric columns/variables
# The second argument (True) asks for missing values to be filled with column means
numData = getNumericalData(rawdata, True)
numData.head()

(555, 68)


# Function that applies Principal Component Analysis
def applyPCA(data, std, n_components = 5):
    """Project ``data`` onto its first principal components.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table, one row per observation.
    std : bool
        When truthy, the values are standardized with ``StandardScaler``
        before the PCA is fitted.
    n_components : optional
        Passed to ``PCA(n_components=...)``. Defaults to 5, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns "PC1", "PC2", ... (one per fitted component), indexed like
        ``data`` so the result can be aligned with the source rows by label.
        The total explained variance (in percent) is printed as a side effect.
    """
    # Standardize the Data
    if std:
        x = StandardScaler().fit_transform(data.values)
    else:
        x = data.values

    # Fit the model and project the observations
    pca = PCA(n_components = n_components)
    pcaData = pca.fit_transform(x)

    # Create a DataFrame from PCA; column names follow the number of components
    # actually produced, and the input row labels are preserved
    pcNames = ["PC" + str(i + 1) for i in range(pcaData.shape[1])]
    pcaDF = pd.DataFrame(data = pcaData, columns = pcNames, index = data.index)

    # Show the total explained variance ratio of model
    print('Explained Variance Ratio:', sum(pca.explained_variance_ratio_) * 100)

    return pcaDF


# Apply the PCA algorithm
pcaDF = applyPCA(numData, True)

# Create the PCA data
# numData was derived from rawdata row by row, so both have the same rows in the
# same order. The names are attached by position, which stays correct even when
# the row labels of rawdata are not 0..n-1 after filtering.
pcaDF["Name"] = rawdata["Name"].values
pcaDF = pcaDF[pcaDF["PC1"].notnull()]
pcaDF.head(10)

Explained Variance Ratio: 88.99078649524127


# Returns the similarity between 2 vectors
def getSimilarity(func, x, y):
    """Return the similarity (or distance) between the vectors ``x`` and ``y``.

    Parameters
    ----------
    func : str
        'cosine' or 'pearson' (higher means more similar), or 'euclidean' or
        'manhattan' (distances: lower means more similar).
    x, y : sequence of numbers
        Vectors of equal length.

    Returns
    -------
    float
        The requested measure, or 0 when ``func`` is not one of the names above.
    """
    similarity = 0

    # The pairwise helpers take 2D inputs and return a 1x1 matrix; its single
    # element is read explicitly, because converting a 1-element array to a
    # scalar is deprecated in recent NumPy versions.
    if func == 'cosine':
        result = sm.cosine_similarity([x], [y])
        similarity = float(result[0][0])

    elif func == 'pearson':
        # Only the correlation coefficient is needed; the p-value is discarded
        corr, _ = ss.pearsonr(x, y)
        similarity = float(corr)

    elif func == 'euclidean':
        result = sm.euclidean_distances([x], [y])
        similarity = float(result[0][0])

    elif func == 'manhattan':
        result = sm.manhattan_distances([x], [y])
        similarity = float(result[0][0])

    return similarity


# Get the N instances most similar to the target
def getMoreSimilarInstances(df, func, n, target):
    """Return the ``n`` rows of ``df`` that are most similar to ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Name' column and the 'PC1' to 'PC5' columns.
    func : str
        Measure name understood by ``getSimilarity``.
    n : int
        Maximum number of neighbours to return.
    target : str
        Value of 'Name' identifying the reference row; the first match is used.

    Returns
    -------
    dict
        Maps neighbour name to its score rounded to 4 decimals, best match
        first. Empty when ``target`` is not in ``df`` or has no neighbours.
        Scores are keyed by name, so rows sharing a name keep only the last
        score computed.
    """
    solution = dict()
    pcCols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    # For these measures a smaller value means a closer match
    distanceFuncs = ('euclidean', 'manhattan')

    targetRows = df[df['Name'] == target]
    if len(targetRows) == 0:
        return solution

    # Read the target by column name, exactly like the neighbours below
    currVector = targetRows[pcCols].values[0]

    simList = dict()
    for currName, neighVector in zip(df['Name'], df[pcCols].values):
        if currName != target:
            simList[currName] = getSimilarity(func, currVector, neighVector)

    # Best matches first: descending for similarities, ascending for distances
    ranked = sorted(simList.items(), key = lambda kv: kv[1], reverse = func not in distanceFuncs)
    for ix, (key, value) in enumerate(ranked):
        if ix >= n:
            break
        solution[key] = round(value, 4)

    return solution


# Colour used for each point type ("Type" column) in the similarity scatter plot
posPalette = {
    "Neighbor": "#3366cc",
    "Current": "#109618",
}

# Create 2D and 3D scatter plots with player labels
# Text shown next to a point: the name alone for the target, name plus score otherwise
def _similarPlayerLabel(name, row, target):
    if name == target:
        return name
    return name + '\nR: ' + str(row["Corr"])


def plot2DSimilarPlayers(df, corr, target):
    """Plot the target and its neighbours on PC1/PC2 (2D) and PC1/PC2/PC3 (3D).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Name' column and the 'PC1', 'PC2' and 'PC3' columns.
    corr : dict
        Maps neighbour name to its similarity score. It is not modified.
    target : str
        Name of the reference player; it is drawn with a score of 1.0.
    """
    # Create temp data
    # Work on a copy so the caller's dictionary does not gain the target entry
    scores = dict(corr)
    scores[target] = 1.0
    df1 = df.set_index('Name').copy()
    df2 = pd.DataFrame(data = list(scores.values()), index = list(scores.keys()), columns = ['Corr'])
    # Inner join keeps only the players that have a score
    data = pd.concat([df1, df2], axis = 1, join = 'inner')
    data['Type'] = 'Neighbor'
    # Guarded so that a target missing from df does not create an all-NaN row
    if target in data.index:
        data.loc[target, 'Type'] = 'Current'

    # Create Grid
    fig = plt.figure(figsize = (18, 8))
    fig.subplots_adjust(hspace = 0.2, wspace = 0.2)
    fig.suptitle("Players similar to '" + target + "' in the plane", fontsize = 20)

    # Create 2D scatter plot
    ax0 = fig.add_subplot(121)
    plot = sns.scatterplot(ax = ax0, data = data, x = "PC1", y = "PC2", hue = "Type", size = 'Corr', palette = posPalette)

    # Add annotations one by one with a loop
    for ix, row in data.iterrows():
        # Small offset so the label does not sit on top of the marker
        plot.text(row["PC1"] + 0.02, row["PC2"] - 0.03, _similarPlayerLabel(ix, row, target)
                  , horizontalalignment = "left", size = "medium", color = "black", weight = "normal")

    # Plot setup
    ax0.set_xlabel("PC 1", fontsize = 11)
    ax0.set_ylabel("PC 2", fontsize = 11)
    ax0.legend(["Target"], fontsize = 10, loc = 'lower right', frameon = True)
    ax0.grid()

    # Create 3D scatter plot
    ax1 = fig.add_subplot(122, projection = "3d")
    xx = data.loc[:,["PC1"]].values
    yy = data.loc[:,["PC2"]].values
    zz = data.loc[:,["PC3"]].values

    # Plot values
    ax1.scatter(xx, yy, zz, c = "#1f77b4", marker = "o", s = 50)

    # Add annotations one by one with a loop
    for ix, row in data.iterrows():
        ax1.text(row["PC1"], row["PC2"], row["PC3"], _similarPlayerLabel(ix, row, target)
                  , horizontalalignment = "left", size = "medium", color = "black", weight = "normal")

    # Plot setup
    ax1.set_xlabel("PC 1", fontsize = 11)
    ax1.set_ylabel("PC 2", fontsize = 11)
    ax1.set_zlabel("PC 3", fontsize = 11)
    ax1.legend(["Players"], fontsize = 10, loc = 'lower right', frameon = True)
    ax1.grid()


# Global analysis params
# fSimilarity: measure name passed to getMoreSimilarInstances
# nNeighbors: number of similar players requested per target
fSimilarity = 'cosine'
nNeighbors = 10


# Each pair of cells below reassigns the shared `target` and `solution` names,
# so every plot uses the solution computed immediately before it

# Show the N players most similar to 'L. Messi' using cosine metric
target = 'L. Messi'
solution = getMoreSimilarInstances(pcaDF, fSimilarity, nNeighbors, target)
pd.DataFrame.from_dict(solution, orient='index', columns=['Similarity'])


# Plot the N players most similar to 'L. Messi' using cosine metric
plot2DSimilarPlayers(pcaDF, solution, target)


# Show the N players most similar to 'L. Modrić' using cosine metric
target = 'L. Modrić'
solution = getMoreSimilarInstances(pcaDF, fSimilarity, nNeighbors, target)
pd.DataFrame.from_dict(solution, orient='index', columns=['Similarity'])


# Plot the N players most similar to 'L. Modrić' using cosine metric
plot2DSimilarPlayers(pcaDF, solution, target)


# Show the N players most similar to 'D. Godín' using cosine metric
target = 'D. Godín'
solution = getMoreSimilarInstances(pcaDF, fSimilarity, nNeighbors, target)
pd.DataFrame.from_dict(solution, orient='index', columns=['Similarity'])


# Plot the N players most similar to 'D. Godín' using cosine metric
plot2DSimilarPlayers(pcaDF, solution, target)


# Show the N players most similar to 'De Gea' using cosine metric
target = 'De Gea'
solution = getMoreSimilarInstances(pcaDF, fSimilarity, nNeighbors, target)
pd.DataFrame.from_dict(solution, orient='index', columns=['Similarity'])


# Plot the N players most similar to 'De Gea' using cosine metric
plot2DSimilarPlayers(pcaDF, solution, target)


# Preview of the PCA table (principal components plus player name)
pcaDF.head()


# Build a components-by-players table for the first nPlayers rows
nPlayers = 50
tempDF = pcaDF.head(nPlayers).copy()
pcaDFT = tempDF.drop(columns = ['Name']).transpose()
pcaDFT.columns = tempDF['Name'].tolist()
pcaDFT.head()


# Pearson correlation between every pair of player columns
corr = pcaDFT.corr(method = 'pearson')
corr.iloc[:10, :10]


# Create Players Correlation Triangle plot
def plotCorrTriangle(corr):
    """Draw the lower triangle of a square correlation matrix as a heatmap.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix; its labels are used as tick labels.
    """
    # Generate a mask for the upper triangle (diagonal included), so each pair
    # is drawn once. The builtin bool is used because the np.bool alias was
    # removed from NumPy.
    mask = np.zeros_like(corr, dtype = bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize = (18, 18))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(10, 240, n=9)

    # Draw the heatmap with the mask and correct aspect ratio, explicitly on the
    # axes created above rather than on whichever axes is current
    sns.heatmap(corr, mask = mask, cmap = cmap, vmin = -1, vmax = 1, center = 0,
                square = True, linewidths = .5, cbar_kws = {"shrink": .5}, ax = ax)

    ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right')

    # Add title
    ax.set_title("Players Correlation Triangle", fontsize = 20)


# Plot Correlation Triangle
# Uses the player-by-player Pearson matrix computed above
plotCorrTriangle(corr)

	Age	Special	International Reputation	Weak Foot	Skill Moves	Jersey Number	HeightMts	WeightLbs	LS	ST	...	Penalties	Composure	Marking	StandingTackle	SlidingTackle	GKDiving	GKHandling	GKKicking	GKPositioning	GKReflexes
0	31	2202	5.0	4.0	4.0	10.0	1.70	159.0	88	88	...	75.0	96.0	33.0	28.0	26.0	6.0	11.0	15.0	14.0	8.0
1	33	2228	5.0	4.0	5.0	7.0	1.88	183.0	91	91	...	85.0	95.0	28.0	31.0	23.0	7.0	11.0	15.0	14.0	11.0
2	26	2143	5.0	5.0	5.0	10.0	1.75	150.0	84	84	...	81.0	94.0	27.0	24.0	33.0	9.0	9.0	15.0	15.0	11.0
3	27	1471	4.0	3.0	1.0	1.0	1.93	168.0	0	0	...	40.0	68.0	15.0	21.0	13.0	90.0	85.0	87.0	88.0	94.0
4	27	2281	4.0	5.0	4.0	7.0	1.80	154.0	82	82	...	79.0	88.0	68.0	58.0	51.0	15.0	13.0	5.0	10.0	13.0

	PC1	PC2	PC3	PC4	PC5	Name
0	-5.526306	-5.808015	2.076504	2.350692	3.071319	L. Messi
1	-5.033655	-3.568653	4.896926	0.254852	4.140921	Cristiano Ronaldo
2	-4.986200	-5.803934	1.127417	1.682914	2.750608	Neymar Jr
3	16.489354	-1.978795	0.312424	2.100684	2.824599	De Gea
4	-6.030921	-1.962398	1.298546	2.143308	1.320396	K. De Bruyne
5	-4.936598	-4.930870	0.985261	1.042246	1.841818	E. Hazard
6	-6.034160	-1.682930	-0.712501	3.359507	1.979452	L. Modrić
7	-4.291498	-2.126353	3.635059	1.426361	2.758876	L. Suárez
8	-3.698536	3.403521	1.141814	0.939394	2.542291	Sergio Ramos
9	17.572773	-1.278344	0.143841	0.435352	2.526036	J. Oblak

	Similarity
Neymar Jr	0.9924
M. Reus	0.9855
E. Hazard	0.9825
A. Robben	0.9645
S. Agüero	0.9623
A. Sánchez	0.9548
A. Di María	0.9482
K. Gameiro	0.9309
David Silva	0.9294
A. Griezmann	0.9274

	Similarity
Thiago	0.9687
Dani Alves	0.9648
M. Götze	0.9605
David Silva	0.9481
Pizzi	0.9456
K. De Bruyne	0.9435
M. Pjanić	0.9397
A. Guardado	0.9378
Iniesta	0.9330
G. Bonaventura	0.9286

	Similarity
G. Chiellini	0.9775
Miranda	0.9738
N. Otamendi	0.9694
S. Mustafi	0.9674
R. Varane	0.9621
Pepe	0.9592
V. van Dijk	0.9555
T. Alderweireld	0.9514
J. Boateng	0.9512
M. Benatia	0.9506

Visual and Data Analysis - FIFA 19 Players

5. Analysis of Similarity

Loading main libraries and data

Apply PCA with Standardization

Apply Similarity Functions

Similarity function used

Correlation matrix between Players

Step 1 - Transpose the matrix by Players

Step 2 - Calculate Pearson correlation matrix

Step 3 - Plot the correlation matrix

Insights

	Similarity
Casillas	0.9978
K. Navas	0.9975
H. Lloris	0.9964
J. Oblak	0.9936
Rui Patrício	0.9919
G. Buffon	0.9917
Pepe Reina	0.9916
M. Neuer	0.9913
Y. Sommer	0.9902
T. Courtois	0.9901

	L. Messi	Cristiano Ronaldo	Neymar Jr	De Gea	K. De Bruyne	E. Hazard	L. Modrić	L. Suárez	Sergio Ramos	J. Oblak
L. Messi	1.000000	0.908333	0.996637	-0.385730	0.885932	0.999402	0.814337	0.934233	0.332183	-0.470835
Cristiano Ronaldo	0.908333	1.000000	0.892121	-0.503009	0.830084	0.918503	0.653822	0.977140	0.454924	-0.544258
Neymar Jr	0.996637	0.892121	1.000000	-0.329280	0.855786	0.995704	0.800578	0.909131	0.295869	-0.416183
De Gea	-0.385730	-0.503009	-0.329280	1.000000	-0.752732	-0.405158	-0.677299	-0.615761	-0.945480	0.992382
K. De Bruyne	0.885932	0.830084	0.855786	-0.752732	1.000000	0.891966	0.941467	0.925753	0.677389	-0.818650
E. Hazard	0.999402	0.918503	0.995704	-0.405158	0.891966	1.000000	0.818155	0.942326	0.357576	-0.487456
L. Modrić	0.814337	0.653822	0.800578	-0.677299	0.941467	0.818155	1.000000	0.774384	0.665971	-0.755309
L. Suárez	0.934233	0.977140	0.909131	-0.615761	0.925753	0.942326	0.774384	1.000000	0.538398	-0.668353
Sergio Ramos	0.332183	0.454924	0.295869	-0.945480	0.677389	0.357576	0.665971	0.538398	1.000000	-0.926766
J. Oblak	-0.470835	-0.544258	-0.416183	0.992382	-0.818650	-0.487456	-0.755309	-0.668353	-0.926766	1.000000