# Load the Python libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load Surprise libraries
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
# Load plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Load the user-item ratings: a tab-separated file read with explicit column names
file_path = os.path.expanduser('../data/u.data')
rating_columns = ['user_id','item_id','rating','timestamp']
data = pd.read_csv(file_path, sep='\t', names=rating_columns)
data.head()
# Load the item catalogue, keeping only the first three pipe-separated fields
file_path = os.path.expanduser('../data/u.item')
items = pd.read_csv(
    file_path,
    sep='|',
    header=None,
    usecols=[0, 1, 2],
    encoding='ISO-8859-1',
)
items.columns = ['item_id', 'item_name', 'item_date']
items.head()
# Attach the item description to every rating row (join key: item_id)
data_item = data.merge(items, on='item_id')
data_item.head()
# Per-title statistics: mean rating ('rating') and number of ratings ('count')
ratings_by_title = data_item.groupby('item_name')['rating']
ratings = ratings_by_title.mean().to_frame()
ratings['count'] = ratings_by_title.count()
ratings.head()
# The ten titles with the most ratings
ratings.sort_values('count', ascending=False).head(10)
# Histogram of the number of ratings per title
plt.figure(figsize=(10, 4))
ratings['count'].hist(bins=70)
plt.show()
# Histogram of the mean rating per title
plt.figure(figsize=(10, 4))
ratings['rating'].hist(bins=70)
plt.show()
# Split data into training (80%) and test (20%) sets.
# A fixed random_state makes the split, and every metric and plot derived
# from it further down, reproducible from run to run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print("Train size:", train_data.shape) # 80.00%
print("Test size:", test_data.shape) # 20.00%
# Plot a ratings histogram of training data
plt.figure(figsize=(10, 4))
train_data.rating.plot.hist(bins=10)
plt.show()
# Plot a ratings histogram of test data
plt.figure(figsize=(10, 4))
test_data.rating.plot.hist(bins=10)
plt.show()
Note: The two histograms look similar. Both datasets have a similar distribution of the rating variable.
# Wrap both splits as Surprise trainsets; ratings are declared on a 1-5 scale
reader = Reader(rating_scale=(1, 5))
surprise_columns = ['user_id', 'item_id', 'rating']
data_train = Dataset.load_from_df(train_data[surprise_columns], reader).build_full_trainset()
data_test = Dataset.load_from_df(test_data[surprise_columns], reader).build_full_trainset()
# Report the global mean rating of each split
mean = data_train.global_mean
print('Train rating', mean)
mean = data_test.global_mean
print('Test rating', mean)
# Convert each trainset into the testset form that is later passed to algo.test()
data_trainset = data_train.build_testset()
data_testset = data_test.build_testset()
We use the well-known SVD algorithm, as popularized by Simon Funk during the Netflix Prize.
# Configure a biased SVD with 5 latent factors (reg_all=0: no regularization term)
k_factors = 5
svd_params = dict(
    n_factors=k_factors,
    n_epochs=200,
    biased=True,
    lr_all=0.005,
    reg_all=0,
    init_mean=0,
    init_std_dev=0.01,
)
algo = SVD(**svd_params)
# Fit the model on the training trainset
algo.fit(data_train)
# RMSE on the ratings the model was trained on
train_pred = algo.test(data_trainset)
accuracy.rmse(train_pred)
# RMSE on the held-out test ratings
test_pred = algo.test(data_testset)
accuracy.rmse(test_pred)
# Show the first 5 training rows
train_data.head(5)
# Predict for the first training row without passing its true rating
first_row = train_data.iloc[0]
p1 = algo.predict(uid=first_row.user_id, iid=first_row.item_id, verbose=True)
# Predict for a fixed (user, item) pair, passing 4 as the true rating
p2 = algo.predict(uid=196, iid=302, r_ui=4, verbose=True)
# Rebuild the dense user x item rating matrix from the training set.
# Cells for which the trainset holds no rating keep their initial value of 0.
original = np.zeros((data_train.n_users, data_train.n_items))
for (u, i, r) in data_train.all_ratings():
    # (u, i) are the indices yielded by the trainset; r is the rating value
    original[u, i] = r
# Plot matrix
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(original, ax=ax)
ax.set_title("Original Matrix", fontsize=16)
ax.set_xlabel('item', fontsize=12)
ax.set_ylabel('user', fontsize=12)
plt.show()
# Latent factor matrices learned by the model (5 factors each)
pu = algo.pu  # users factors
qi = algo.qi  # items factors
# Draw one heatmap per factor matrix: users first, then items
factor_plots = (
    (pu, "Users Factors Matrix", 'user'),
    (qi, "Items Factors Matrix", 'item'),
)
for factor_matrix, plot_title, row_label in factor_plots:
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(factor_matrix, ax=ax)
    ax.set_title(plot_title, fontsize=16)
    ax.set_xlabel('factors', fontsize=12)
    ax.set_ylabel(row_label, fontsize=12)
    plt.show()
You can also view the biases of users $b_u$ and items $b_i$.
# Bias vectors reshaped into single-column matrices so they can be drawn as heatmaps
bu = algo.bu.reshape(-1, 1)  # users bias
bi = algo.bi.reshape(-1, 1)  # items bias
# Draw one heatmap per bias vector: users first, then items
bias_plots = (
    (bu, "Users Bias", 'user'),
    (bi, "Items Bias", 'item'),
)
for bias_column, plot_title, row_label in bias_plots:
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(bias_column, ax=ax)
    ax.set_title(plot_title, fontsize=16)
    ax.set_xlabel('bias', fontsize=12)
    ax.set_ylabel(row_label, fontsize=12)
    plt.show()
Reconstruction of the prediction matrix: $$ \hat{r}_{ui} = \mu + b_{u} + b_{i} + p_{u} \cdot q_{i}^{T} \tag{1} $$
# Build the full prediction matrix:
# global mean + user bias (column) + item bias (row) + user factors x item factors
mean = data_train.global_mean
reconstruct = mean + bu + bi.T + pu @ qi.T
# Plot the raw predictions
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(reconstruct, ax=ax)
ax.set_title("Prediction Matrix with Noise", fontsize=16)
ax.set_xlabel('item', fontsize=12)
ax.set_ylabel('user', fontsize=12)
plt.show()
# Raw predictions can fall below 1 or above 5, so clamp them to the rating scale
reconstruct = reconstruct.clip(1, 5)
# Plot the clamped predictions
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(reconstruct, ax=ax)
ax.set_title("Prediction Matrix w/o Noise", fontsize=16)
ax.set_xlabel('item', fontsize=12)
ax.set_ylabel('user', fontsize=12)
plt.show()
# Keep predictions only where the original matrix holds a rating.
# NOTE: despite its name, `known_entries` is True for the cells WITHOUT a
# rating (original == 0); those are the cells that get zeroed out below.
known_entries = np.equal(original, 0)
reconstruct[known_entries] = 0
# Plot the remaining predictions
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(reconstruct, ax=ax)
ax.set_title("Known Predictions Matrix", fontsize=16)
ax.set_xlabel('item', fontsize=12)
ax.set_ylabel('user', fontsize=12)
plt.show()
# Element-wise absolute difference between the original and the prediction matrix
diff_matrix = np.absolute(original - reconstruct)
# Plot the difference as a heatmap
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(diff_matrix, ax=ax)
ax.set_title("Original vs Prediction", fontsize=16)
ax.set_xlabel('item', fontsize=12)
ax.set_ylabel('user', fontsize=12)
plt.show()
Average Absolute Difference between Original ratings and Prediction ratings.
# Total absolute difference divided by the number of rows in the training data
np.sum(diff_matrix) / train_data.shape[0]