# Load the Pandas libraries
import os
import io
import numpy as np
from collections import defaultdict
# Load Surprise libraries
from surprise import KNNBasic
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import cross_validate
# Load Plotting libraries
# %matplotlib inline  # IPython magic — only valid inside a Jupyter notebook; uncomment there
import matplotlib.pyplot as plt
# Location of the MovieLens ratings file (tab-separated u.data)
file_path = os.path.expanduser('../data/u.data')
# Parse each row as "user item rating timestamp" on a 1-5 rating scale
reader = Reader(line_format='user item rating timestamp', sep='\t', rating_scale=(1, 5))
# Load the ratings into a Surprise dataset object
data = Dataset.load_from_file(file_path, reader=reader)
# KNNBasic is a basic collaborative filtering algorithm: the prediction $\hat{r}_{ui}$ is set as a similarity-weighted aggregate of the ratings of the k nearest neighbors.
# Configure user-based k-NN collaborative filtering with cosine similarity
neighborhood_size = 50
similarity_config = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(k=neighborhood_size, sim_options=similarity_config, verbose=True)
# Evaluate with 5-fold cross-validation on RMSE and MAE, printing a summary
cv = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# Extract the per-fold error arrays from the cross-validation results
rmse_scores = cv['test_rmse']
mae_scores = cv['test_mae']
folds = np.arange(len(rmse_scores))
# One integer tick per CV fold, y-axis clipped to the interesting range
fig, ax = plt.subplots(figsize=(10, 5))
plt.xticks(np.arange(folds.min(), folds.max() + 1, 1.0))
plt.ylim(0.5, 1.3)
# Draw one line per metric
ax.plot(folds, rmse_scores, marker='o', label="rmse")
ax.plot(folds, mae_scores, marker='o', label="mae")
# Titles, axis labels and legend
plt.title("Model Errors", fontsize=12)
plt.xlabel("CV", fontsize=10)
plt.ylabel("Error", fontsize=10)
plt.legend()
plt.show()
# Predict a rating for a (user, item) pair with no known true rating
p1 = algo.predict(uid='13', iid='181', verbose=True)
# Predict for a pair whose true rating (4) is supplied, so the error is shown
p2 = algo.predict(uid='196', iid='302', r_ui=4, verbose=True)
# Return two mappings to convert raw ids into movie names and movie names into raw ids
def read_item_names(file_path):
    """Build both directions of the raw-id <-> movie-title mapping.

    Each line of the item file is pipe-delimited, with the raw item id in
    the first field and the movie title in the second.

    Parameters:
        file_path: path to the u.item-style file.

    Returns:
        A (rid_to_name, name_to_rid) tuple of dicts.
    """
    rid_to_name = {}
    name_to_rid = {}
    # MovieLens item files are Latin-1 encoded, not UTF-8
    with io.open(file_path, 'r', encoding='ISO-8859-1') as f:
        for raw_line in f:
            fields = raw_line.split('|')
            raw_id, title = fields[0], fields[1]
            rid_to_name[raw_id] = title
            name_to_rid[title] = raw_id
    return rid_to_name, name_to_rid
# Read the mappings raw id <-> movie name
item_filepath = '../data/u.item'
rid_to_name, name_to_rid = read_item_names(item_filepath)
# Target movie
target_movie = 'Toy Story (1995)'
# Look up the raw id of the movie, then the inner id used by the trained model
toy_story_raw_id = name_to_rid[target_movie]
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
print(target_movie + ':', toy_story_inner_id)
# Retrieve inner ids of the 10 nearest neighbors of Toy Story
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k = 10)
toy_story_neighbors  # notebook-style cell display of the neighbor ids
# Map each neighbor's inner id back to its raw id and title
# Fix: the closing single quote around the movie title was missing,
# which produced an unbalanced quote in the output.
print("The movies most similar to '" + target_movie + "' are:")
for inner_id in toy_story_neighbors:
    raw_id = algo.trainset.to_raw_iid(inner_id)
    movie = rid_to_name[raw_id]
    print(raw_id, '-', movie)
# Return the top-N recommendation for each user from a set of predictions.
def get_top_n(predictions, n = 10):
    """Compute the top-N highest-estimated items per user.

    Parameters:
        predictions: iterable of (uid, iid, true_r, est, details) tuples,
            as produced by a Surprise algorithm's test() method.
        n: number of recommendations to keep per user (default 10).

    Returns:
        defaultdict mapping each uid to a list of (iid, est) pairs sorted
        by estimated rating, highest first, truncated to n entries.
    """
    # Group estimated ratings by user
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))
    # Keep only the n best-estimated items for each user
    for uid in top_n:
        top_n[uid] = sorted(top_n[uid], key=lambda pair: pair[1], reverse=True)[:n]
    return top_n
# Build a trainset from the full dataset, plus an "anti" testset made of
# every (user, item) pair that has NO rating in the training data
train_set = data.build_full_trainset()
test_set = train_set.build_anti_testset()
# Fit the k-NN model on everything, then score all the unseen pairs
algo.fit(train_set)
predictions = algo.test(test_set)
# NOTE: this RMSE is optimistic — the model was trained on the full dataset
accuracy.rmse(predictions, verbose = True)
# Keep the n highest-estimated unseen items per user
top_n = 10
top_pred = get_top_n(predictions, n = top_n)
# Raw ids of the users we want recommendations for
uid_list = ['196']
# Print the recommended movies for each requested user
for uid, user_ratings in top_pred.items():
    if uid not in uid_list:
        continue
    for iid, rating in user_ratings:
        movie = rid_to_name[iid]
        print('Movie:', iid, '-', movie, ', rating:', str(rating))