# Load the Python libraries
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
# Load Surprise libraries
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV
# Load plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Path to dataset file
file_path = os.path.expanduser('../data/u.data')
# Read current ratings of the users
rawdata = pd.read_csv(file_path, sep = '\t', names = ['user_id','item_id','rating','timestamp'])
rawdata.head()
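Assuming the classic MovieLens 100K u.data file, a quick sanity check of the parsed ratings (a minimal sketch that only uses the rawdata frame defined above) can confirm the expected counts of users, items and ratings.
# Basic statistics of the raw ratings (MovieLens 100K should show 943 users, 1682 items, 100000 ratings)
print('Ratings:', len(rawdata))
print('Users:', rawdata['user_id'].nunique())
print('Items:', rawdata['item_id'].nunique())
print('Rating distribution:')
print(rawdata['rating'].value_counts().sort_index())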
# Split data in training and test
train_data, test_data = train_test_split(rawdata, test_size = 0.2)
print("Train size:", train_data.shape) # 80.00%
print("Test size:", test_data.shape) # 20.00%
# Read the data into a Surprise dataset
reader = Reader(rating_scale = (1, 5))
data_train = Dataset.load_from_df(train_data[['user_id', 'item_id', 'rating']], reader)
data_test = Dataset.load_from_df(test_data[['user_id', 'item_id', 'rating']], reader)
# Build full trainset
data_train = data_train.build_full_trainset()
data_test = data_test.build_full_trainset()
# Create the trainset and testset
data_trainset = data_train.build_testset()
data_testset = data_test.build_testset()
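In Surprise, build_testset() converts a Trainset back into a plain list of (user, item, rating) tuples, which is the format algo.test() consumes. A quick check of the two sets built above (a sketch; exact sizes depend on the random split) could be:
# Each set is a list of (raw user id, raw item id, rating) tuples
print('Train tuples:', len(data_trainset))  # roughly 80% of the ratings
print('Test tuples:', len(data_testset))    # roughly 20% of the ratings
print('Example tuple:', data_testset[0])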
# Plot the model RMSE
def plot_model_rmse(xs, ys, title, x_label, y_label):
    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize = (10, 5))
    ax.plot(xs, ys, marker = 'o')
    for x, y in zip(xs, ys):
        label = "{:.2f}".format(y)
        plt.annotate(label, (x, y), textcoords = "offset points", xytext = (0, 10), ha = 'center')
    plt.title(title, fontsize = 12)
    plt.xlabel(x_label, fontsize = 10)
    plt.ylabel(y_label, fontsize = 10)
    plt.draw()
First, we study how the RMSE behaves as the number of latent factors (k) varies.
# Factors list
k_factors = [5, 10, 25, 50, 75, 100]
# CV results
train_rmse = []
test_rmse = []
# Loop in which errors are calculated
for k in k_factors:
    algo = SVD(n_factors=k, n_epochs=200, biased=True, lr_all=0.005, reg_all=0, init_mean=0, init_std_dev=0.01, verbose=False)
    algo.fit(data_train)
    # The error on the training data is calculated and saved
    predictions = algo.test(data_trainset)
    error = accuracy.rmse(predictions, verbose = False)
    train_rmse.append(error)
    # The error on the test data is calculated and saved
    predictions_test = algo.test(data_testset)
    error = accuracy.rmse(predictions_test, verbose = False)
    test_rmse.append(error)
# Train RMSE dataframe
error_data = {'k': k_factors, 'error': train_rmse}
pd.DataFrame(error_data)
# Plotting the RMSE behaviour
plot_model_rmse(error_data['k'], error_data['error'], 'Train Model Errors', 'k', 'rmse')
# Test RMSE dataframe
error_data = {'k': k_factors, 'error': test_rmse}
pd.DataFrame(error_data)
# Plotting the RMSE behaviour
plot_model_rmse(error_data['k'], error_data['error'], 'Test Model Errors', 'k', 'rmse')
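To compare the two curves directly, the train and test RMSE lists can also be combined into a single DataFrame (a small sketch reusing k_factors, train_rmse and test_rmse from the loop above). With reg_all = 0 and 200 epochs, the gap between the two columns typically widens as k grows, which is a sign of overfitting.
# Side-by-side view of train and test RMSE per number of factors
rmse_comparison = pd.DataFrame({'k': k_factors, 'train_rmse': train_rmse, 'test_rmse': test_rmse})
print(rmse_comparison)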
Next, keeping k = 5 fixed, we vary reg_all, the regularization term applied to all parameters (its default value is 0.02).
# List of regularization values
k = 5
reg_all = [0.01, 0.02, 0.05, 0.1, 0.5]
# CV results
train_rmse = []
test_rmse = []
# Loop in which errors are calculated
for reg in reg_all:
    algo = SVD(n_factors=k, n_epochs=200, biased=True, lr_all=0.005, reg_all=reg, init_mean=0, init_std_dev=0.01, verbose=False)
    algo.fit(data_train)
    # The error on the training data is calculated and saved
    predictions = algo.test(data_trainset)
    error = accuracy.rmse(predictions, verbose = False)
    train_rmse.append(error)
    # The error on the test data is calculated and saved
    predictions_test = algo.test(data_testset)
    error = accuracy.rmse(predictions_test, verbose = False)
    test_rmse.append(error)
# Train RMSE dataframe
error_data = {'reg_all': reg_all, 'error': train_rmse}
pd.DataFrame(error_data)
# Plotting the RMSE behaviour
plot_model_rmse(error_data['reg_all'], error_data['error'], 'Train Model Errors', 'reg all', 'rmse')
# Test RMSE dataframe
error_data = {'reg_all': reg_all, 'error': test_rmse}
pd.DataFrame(error_data)
# Plotting the RMSE behaviour
plot_model_rmse(error_data['reg_all'], error_data['error'], 'Test Model Errors', 'reg all', 'rmse')
The GridSearchCV class computes accuracy metrics for an algorithm on various combinations of parameters, over a cross-validation procedure. This is useful for finding the best set of parameters for a prediction algorithm.
# Read the raw data into a Surprise dataset
reader = Reader(rating_scale = (1, 5))
dataset = Dataset.load_from_df(rawdata[['user_id', 'item_id', 'rating']], reader)
# SVD params: 3 * 3 * 3 * 3 combinations
param_grid = {'n_factors': [5, 10, 20],
'n_epochs': [20, 30, 50],
'lr_all': [0.002, 0.005, 0.01],
'reg_all': [0.02, 0.05, 0.1]}
# Tune algorithm parameters with GridSearchCV and k=4 cross-validation
gs = GridSearchCV(SVD, param_grid, measures = ['rmse', 'mae'], cv = 4)
gs.fit(dataset)
# Best RMSE and MAE scores
gs.best_score
# Combination of parameters that gave the best scores
gs.best_params
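GridSearchCV also stores the full cross-validation results in its cv_results attribute, which converts directly to a DataFrame; the column names below assume Surprise's standard cv_results keys.
# Inspect every tested parameter combination and its cross-validated scores
results_df = pd.DataFrame(gs.cv_results)
results_df[['params', 'mean_test_rmse', 'mean_test_mae', 'rank_test_rmse']].head()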
An item is considered relevant if its true rating $r_{ui}$ is greater than a given threshold. An item is considered recommended if its estimated rating $\hat{r}_{ui}$ is greater than the threshold, and if it is among the k highest estimated ratings.
# Return precision and recall at k metrics for each user
def precision_recall_at_k(predictions, k = 10, threshold = 3.5):
    # First map the predictions to each user
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    return precisions, recalls
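As a quick illustration of these definitions, the function can be run on a handful of hand-made prediction tuples (hypothetical values, one user only): with k = 2 and threshold = 3.5, one of the two recommended items is relevant and one of the two relevant items is recommended, so both precision and recall come out as 0.5.
# Toy check with made-up values: tuples follow the (uid, iid, true_r, est, details) layout used above
toy_predictions = [
    ('u1', 'i1', 5.0, 4.6, None),  # relevant and recommended
    ('u1', 'i2', 2.0, 4.2, None),  # not relevant but recommended
    ('u1', 'i3', 4.0, 3.1, None),  # relevant but not recommended
]
precisions, recalls = precision_recall_at_k(toy_predictions, k = 2, threshold = 3.5)
print(precisions['u1'], recalls['u1'])  # expected: 0.5 0.5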
# Return the precision, recall and F1 score of the model for each k up to k_max
def get_precision_vs_recall(algo, k_max = 10, verbose = False):
    precision_list = []
    recall_list = []
    f1_score_list = []
    if algo:
        # Fit the model and generate the predictions once; only the cutoff k varies below
        algo.fit(data_train)
        predictions = algo.test(data_testset)
        for k_curr in range(1, k_max + 1):
            # Get precision and recall at k metrics for each user
            precisions, recalls = precision_recall_at_k(predictions, k = k_curr, threshold = 4)
            # Precision and recall can then be averaged over all users
            precision = sum(prec for prec in precisions.values()) / len(precisions)
            recall = sum(rec for rec in recalls.values()) / len(recalls)
            f1_score = 2 * (precision * recall) / (precision + recall)
            # Save measures
            precision_list.append(precision)
            recall_list.append(recall)
            f1_score_list.append(f1_score)
            if verbose:
                print('K =', k_curr, '- Precision:', precision, ', Recall:', recall, ', F1 score:', f1_score)
    return {'precision': precision_list, 'recall': recall_list, 'f1_score': f1_score_list}
# Show best params for SVD algo
gs.best_params['rmse']
# Use the SVD algorithm (default params here; the grid-search winner is sketched below)
algo = SVD() # alternatively: gs.best_estimator['rmse']
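If the grid-search winner is preferred over the defaults, the best parameter set found above can be reused directly, either through gs.best_estimator or by unpacking gs.best_params (a sketch of both options, left commented out so the default model above is the one evaluated):
# Option 1: take the already-configured estimator for the best RMSE
# algo = gs.best_estimator['rmse']
# Option 2: build a fresh SVD model from the best parameter dict
# algo = SVD(**gs.best_params['rmse'])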
# Calculate the precision and recall of the model at k metrics
k_max = 20
metrics = get_precision_vs_recall(algo, k_max, True)
# Get data
c1 = metrics['precision']
c2 = metrics['recall']
c3 = metrics['f1_score']
x = np.arange(1, len(c1) + 1) # k values run from 1 to k_max
# Set up the matplotlib figure
fig, ax1 = plt.subplots(figsize = (10, 5))
plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
plt.ylim(0, 1)
ax1.plot(x, c1, marker = 'o')
ax1.plot(x, c2, marker = 'o')
ax1.plot(x, c3, marker = 'o')
ax1.axvline(x = 10, color = "#8b0000", linestyle = "--")
# Chart setup
plt.title("Model's metrics", fontsize = 12)
plt.xlabel("k", fontsize = 10)
plt.ylabel("Precision and Recall", fontsize = 10)
plt.legend(("Precision", "Recall", "F1 score"), loc = "best")
plt.draw()
Based on this graph, we can select k = 10, since beyond that value the model's metrics barely change.
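For reference, the exact values at the chosen cutoff can be read straight from the metrics dictionary returned above (the lists start at k = 1, so we index with k - 1):
# Metrics at the selected cutoff k = 10
k_selected = 10
print('Precision@10:', metrics['precision'][k_selected - 1])
print('Recall@10:', metrics['recall'][k_selected - 1])
print('F1 score@10:', metrics['f1_score'][k_selected - 1])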
# Get data
x = metrics['recall']
y = metrics['precision']
# Create scatter plot with the precision and recall results
fig, ax2 = plt.subplots(figsize = (10, 10))
# Create 2D scatter plot
sns.regplot(ax = ax2, x = x, y = y, fit_reg = False, marker = "o", color = "#1f77b4", scatter_kws = {"s": 30})
# Plot setup
ax2.set_title("Precision vs Recall", fontsize = 12)
ax2.set_xlabel("Recall", fontsize = 10)
ax2.set_ylabel("Precision", fontsize = 10)
ax2.grid()