%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict, deque
import torch
import torch.nn as nn
import torch.optim as optim
# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
Load data
ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')
# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)
ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]
display(HTML(ml_movies_df.head(10).to_html()))
print("Number of interactions left: {}".format(len(ml_ratings_df)))
  | item_id | title | genres |
---|---|---|---|
118 | 145 | Bad Boys (1995) | Action|Comedy|Crime|Drama|Thriller |
143 | 171 | Jeffrey (1995) | Comedy|Drama |
194 | 228 | Destiny Turns on the Radio (1995) | Comedy |
199 | 233 | Exotica (1994) | Drama |
230 | 267 | Major Payne (1995) | Comedy |
313 | 355 | Flintstones, The (1994) | Children|Comedy|Fantasy |
379 | 435 | Coneheads (1993) | Comedy|Sci-Fi |
419 | 481 | Kalifornia (1993) | Drama|Thriller |
615 | 780 | Independence Day (a.k.a. ID4) (1996) | Action|Adventure|Sci-Fi|Thriller |
737 | 959 | Of Human Bondage (1934) | Drama |
Number of interactions left: 1170
Generalized Matrix Factorization (GMF)
from livelossplot import PlotLosses
from recommenders.recommender import Recommender
class GMFModel(nn.Module):
def __init__(self, n_items, n_users, embedding_dim, seed):
super().__init__()
self.seed = torch.manual_seed(seed)
self.item_embedding = nn.Embedding(n_items, embedding_dim)
self.user_embedding = nn.Embedding(n_users, embedding_dim)
self.fc = nn.Linear(embedding_dim, 1, bias=False)
def forward(self, x):
user_ids = x[:, 0]
item_ids = x[:, 1]
user_embedding = self.user_embedding(user_ids)
item_embedding = self.item_embedding(item_ids)
x = self.fc(user_embedding * item_embedding)
x = torch.sigmoid(x)
return x
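GMFModel expects a batch in which the first column holds (mapped) user ids and the second column holds (mapped) item ids, and it returns one interaction probability per row. A minimal smoke test of the forward pass (the model sizes and ids below are made up for illustration only):
# Illustrative smoke test: three (user_id, item_id) pairs -> three scores in (0, 1)
_test_model = GMFModel(n_items=100, n_users=50, embedding_dim=4, seed=6789)
_test_batch = torch.tensor([[0, 10], [1, 42], [2, 7]])  # column 0: user ids, column 1: item ids
with torch.no_grad():
    print(_test_model(_test_batch))  # tensor of shape (3, 1), values squashed by the sigmoid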
class GMFRecommender(Recommender):
"""
Generalized Matrix Factorization (GMF) recommender, i.e. the GMF component of Neural Collaborative Filtering, as described in:
- He X., Liao L., Zhang H., Nie L., Hu X., Chua T., Neural Collaborative Filtering, WWW Conference, 2017
"""
def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):
super().__init__()
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
self.interactions_df = None
self.item_id_mapping = None
self.user_id_mapping = None
self.item_id_reverse_mapping = None
self.user_id_reverse_mapping = None
self.r = None
self.most_popular_items = None
self.nn_model = None
self.optimizer = None
self.n_neg_per_pos = n_neg_per_pos
if 'n_epochs' in params: # number of epochs (each epoch goes through the entire training set)
self.n_epochs = params['n_epochs']
else:
self.n_epochs = 10
if 'lr' in params: # learning rate
self.lr = params['lr']
else:
self.lr = 0.01
if 'weight_decay' in params: # weight decay (L2 regularization)
self.weight_decay = params['weight_decay']
else:
self.weight_decay = 0.001
if 'embedding_dim' in params:
self.embedding_dim = params['embedding_dim']
else:
self.embedding_dim = 4
if 'batch_size' in params:
self.batch_size = params['batch_size']
else:
self.batch_size = 64
if 'device' in params:
self.device = params['device']
else:
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if 'should_recommend_already_bought' in params:
self.should_recommend_already_bought = params['should_recommend_already_bought']
else:
self.should_recommend_already_bought = False
if 'train' in params:
self.train = params['train']
else:
self.train = False
self.validation_set_size = 0.2
self.seed = seed
self.rng = np.random.RandomState(seed=seed)
torch.manual_seed(seed)
if 'should_save_model' in params:
self.should_save_model = params['should_save_model']
else:
self.should_save_model = False
self.print_type = print_type
def fit(self, interactions_df, users_df, items_df):
"""
Training of the recommender.
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
defined by user_id, item_id and features of the interaction.
:param pd.DataFrame users_df: DataFrame with users and their features defined by
user_id and the user feature columns.
:param pd.DataFrame items_df: DataFrame with items and their features defined
by item_id and the item feature columns.
"""
del users_df, items_df
# Shift item ids and user ids so that they are consecutive
unique_item_ids = interactions_df['item_id'].unique()
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
interactions_df = interactions_df.copy()
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
# Get the number of items and users
self.interactions_df = interactions_df.copy()
n_users = np.max(interactions_df['user_id']) + 1
n_items = np.max(interactions_df['item_id']) + 1
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
self.r = r
# Indicate positive interactions
interactions_df.loc[:, 'interacted'] = 1
# Generate negative interactions
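# (rejection sampling: repeatedly draw random user-item pairs and keep only those without an observed interaction, until n_neg_per_pos negatives have been collected per positive)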
negative_interactions = []
i = 0
while i < self.n_neg_per_pos * len(interactions_df):
sample_size = 1000
user_ids = self.rng.choice(np.arange(n_users), size=sample_size)
item_ids = self.rng.choice(np.arange(n_items), size=sample_size)
j = 0
while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):
if r[user_ids[j]][item_ids[j]] == 0:
negative_interactions.append([user_ids[j], item_ids[j], 0])
i += 1
j += 1
interactions_df = pd.concat(
[interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])
interactions_df = interactions_df.reset_index(drop=True)
# Initialize losses and loss visualization
if self.print_type is not None and self.print_type == 'live':
liveloss = PlotLosses()
training_losses = deque(maxlen=50)
training_avg_losses = []
training_epoch_losses = []
validation_losses = deque(maxlen=50)
validation_avg_losses = []
validation_epoch_losses = []
last_training_total_loss = 0.0
last_validation_total_loss = 0.0
# Initialize the network
self.nn_model = GMFModel(n_items, n_users, self.embedding_dim, self.seed)
self.nn_model.train()
self.nn_model.to(self.device)
self.optimizer = optim.Adam(self.nn_model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
# Split the data
if self.train:
interaction_ids = self.rng.permutation(len(interactions_df))
train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))
training_ids = interaction_ids[:train_validation_slice_idx]
validation_ids = interaction_ids[train_validation_slice_idx:]
else:
interaction_ids = self.rng.permutation(len(interactions_df))
training_ids = interaction_ids
validation_ids = []
# Train the model
for epoch in range(self.n_epochs):
if self.print_type is not None and self.print_type == 'live':
logs = {}
# Train
training_losses.clear()
training_total_loss = 0.0
self.rng.shuffle(training_ids)
batch_idx = 0
n_batches = int(np.ceil(len(training_ids) / self.batch_size))
for batch_idx in range(n_batches):
batch_ids = training_ids[(batch_idx * self.batch_size):((batch_idx + 1) * self.batch_size)]
batch = interactions_df.loc[batch_ids]
batch_input = torch.from_numpy(batch.loc[:, ['user_id', 'item_id']].values).long().to(self.device)
y_target = torch.from_numpy(batch.loc[:, ['interacted']].values).float().to(self.device)
# Create responses
y = self.nn_model(batch_input).clip(0.000001, 0.999999)
# Define loss and backpropagate
self.optimizer.zero_grad()
loss = -(y_target * y.log() + (1 - y_target) * (1 - y).log()).sum()
loss.backward()
self.optimizer.step()
training_total_loss += loss.item()
if self.print_type is not None and self.print_type == 'text':
print("\rEpoch: {}\tBatch: {}\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}".format(
epoch, batch_idx, last_training_total_loss, last_validation_total_loss, loss), end="")
training_losses.append(loss.item())
training_avg_losses.append(np.mean(training_losses))
# Validate
validation_total_loss = 0.0
batch = interactions_df.loc[validation_ids]
batch_input = torch.from_numpy(batch.loc[:, ['user_id', 'item_id']].values).long().to(self.device)
y_target = torch.from_numpy(batch.loc[:, ['interacted']].values).float().to(self.device)
# Create responses
y = self.nn_model(batch_input).clip(0.000001, 0.999999)
# Calculate validation loss
loss = -(y_target * y.log() + (1 - y_target) * (1 - y).log()).sum()
validation_total_loss += loss.item()
# Save and print epoch losses
training_last_avg_loss = training_total_loss / len(training_ids)
validation_last_avg_loss = validation_total_loss / len(validation_ids) if len(validation_ids) > 0 else 0.0
if self.print_type is not None and self.print_type == 'live' and epoch >= 0:
# A bound on epoch prevents showing extremely high losses in the first epochs
logs['loss'] = training_last_avg_loss
logs['val_loss'] = validation_last_avg_loss
liveloss.update(logs)
liveloss.send()
# Find the most popular items for the cold start problem
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
offers_count = offers_count.sort_values('user_id', ascending=False)
self.most_popular_items = offers_count.index
def recommend(self, users_df, items_df, n_recommendations=1):
"""
Serving of recommendations. Scores items in items_df for each user in users_df and returns
top n_recommendations for each user.
:param pd.DataFrame users_df: DataFrame with users and their features for which
recommendations should be generated.
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
:param int n_recommendations: Number of recommendations to be returned for each user.
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
for each user.
:rtype: pd.DataFrame
"""
# Clean previous recommendations (iloc could be used alternatively)
self.recommender_df = self.recommender_df[:0]
# Handle users not in the training data
# Map item ids
items_df = items_df.copy()
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
# Generate recommendations
for idx, user in users_df.iterrows():
recommendations = []
user_id = user['user_id']
if user_id in self.user_id_mapping:
mapped_user_id = self.user_id_mapping[user_id]
ids_list = items_df['item_id'].tolist()
id_to_pos = np.array([0]*len(ids_list))
for k in range(len(ids_list)):
id_to_pos[ids_list[k]] = k
net_input = torch.tensor(list(zip([mapped_user_id]*len(ids_list), ids_list))).to(self.device)
scores = self.nn_model(net_input).flatten().detach().cpu().numpy()
# Choose n recommendations based on highest scores
if not self.should_recommend_already_bought:
x_list = self.interactions_df.loc[
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
scores[id_to_pos[x_list]] = -np.inf
chosen_pos = np.argsort(-scores)[:n_recommendations]
for item_pos in chosen_pos:
recommendations.append(
{
'user_id': self.user_id_reverse_mapping[mapped_user_id],
'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],
'score': scores[item_pos]
}
)
else: # For new users recommend most popular items
for i in range(n_recommendations):
recommendations.append(
{
'user_id': user['user_id'],
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
'score': 1.0
}
)
user_recommendations = pd.DataFrame(recommendations)
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
return self.recommender_df
def get_user_repr(self, user_id):
mapped_user_id = self.user_id_mapping[user_id]
return self.nn_model.user_embedding(torch.tensor(mapped_user_id).to(self.device)).detach().cpu().numpy()
def get_item_repr(self, item_id):
mapped_item_id = self.item_id_mapping[item_id]
return self.nn_model.item_embedding(torch.tensor(mapped_item_id).to(self.device)).detach().cpu().numpy()
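The loss computed by hand in fit is the summed binary cross-entropy over the positive interactions and the sampled negatives. As a quick sanity check (with made-up predictions and targets), it coincides with PyTorch's built-in nn.BCELoss with sum reduction:
# Sanity check with made-up values: the hand-written loss in fit() equals summed binary cross-entropy
y_pred = torch.tensor([[0.9], [0.2], [0.6]]).clip(0.000001, 0.999999)
y_true = torch.tensor([[1.0], [0.0], [1.0]])
manual_bce = -(y_true * y_pred.log() + (1 - y_true) * (1 - y_pred).log()).sum()
builtin_bce = nn.BCELoss(reduction='sum')(y_pred, y_true)
print(manual_bce.item(), builtin_bce.item())  # the two values agree up to floating point error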
class MLPModel(nn.Module):
def __init__(self, n_items, n_users, embedding_dim, seed):
super().__init__()
self.seed = torch.manual_seed(seed)
self.item_embedding = nn.Embedding(n_items, embedding_dim)
self.user_embedding = nn.Embedding(n_users, embedding_dim)
self.fc1 = nn.Linear(2 * embedding_dim, 32, bias=False)
self.fc2 = nn.Linear(32, 16, bias=False)
self.fc3 = nn.Linear(16, 1, bias=False)
def forward(self, x):
user = x[:, 0]
item = x[:, 1]
user_embedding = self.user_embedding(user)
item_embedding = self.item_embedding(item)
x = torch.cat([user_embedding, item_embedding], dim=1)
x = torch.relu(self.fc1(x))
x = torch.relu(self.fc2(x))
x = torch.sigmoid(self.fc3(x))
return x
class NeuMFModel(nn.Module):
def __init__(self, n_items, n_users, gmf_embedding_dim, mlp_embedding_dim, seed):
super().__init__()
self.seed = torch.manual_seed(seed)
# GMF
self.gmf_user_embedding = nn.Embedding(n_users, gmf_embedding_dim)
self.gmf_item_embedding = nn.Embedding(n_items, gmf_embedding_dim)
# MLP
self.mlp_user_embedding = nn.Embedding(n_users, mlp_embedding_dim)
self.mlp_item_embedding = nn.Embedding(n_items, mlp_embedding_dim)
self.mlp_fc1 = nn.Linear(2 * mlp_embedding_dim, 32, bias=False)
self.mlp_fc2 = nn.Linear(32, 16, bias=False)
# Merge
self.fc = nn.Linear(gmf_embedding_dim + 16, 1, bias=False)  # GMF branch output (gmf_embedding_dim) concatenated with the last MLP layer (16)
def forward(self, x):
user = x[:, 0]
item = x[:, 1]
# GMF
gmf_user_embedding = self.gmf_user_embedding(user)
gmf_item_embedding = self.gmf_item_embedding(item)
gmf_x = gmf_user_embedding * gmf_item_embedding
# MLP
mlp_user_embedding = self.mlp_user_embedding(user)
mlp_item_embedding = self.mlp_item_embedding(item)
mlp_x = torch.cat([mlp_user_embedding, mlp_item_embedding], dim=1)
mlp_x = torch.relu(self.mlp_fc1(mlp_x))
mlp_x = torch.relu(self.mlp_fc2(mlp_x))
# Final score
x = torch.cat([gmf_x, mlp_x], dim=1)
x = torch.sigmoid(self.fc(x))
return x
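MLPModel and NeuMFModel are defined here but not wrapped in a recommender in this section; both consume the same (user_id, item_id) batches as GMFModel. An illustrative instantiation (all sizes chosen arbitrarily):
# Illustrative only: both models score (user_id, item_id) batches just like GMFModel
_mlp = MLPModel(n_items=100, n_users=50, embedding_dim=4, seed=6789)
_neumf = NeuMFModel(n_items=100, n_users=50, gmf_embedding_dim=16, mlp_embedding_dim=8, seed=6789)
_batch = torch.tensor([[0, 10], [1, 42]])
with torch.no_grad():
    print(_mlp(_batch).shape, _neumf(_batch).shape)  # torch.Size([2, 1]) for each model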
Quick test of the recommender (training)
gmf_recommender = GMFRecommender(print_type='live', n_neg_per_pos=10, batch_size=16,
embedding_dim=6, lr=0.001, weight_decay=0.0001, n_epochs=20, seed=1)
gmf_recommender.fit(ml_ratings_df, None, ml_movies_df)
[Training curve] Loss: training (min: 0.130, max: 0.706, cur: 0.130), validation (min: 0.224, max: 0.696, cur: 0.226)
Quick test of the recommender (recommending)
recommendations = gmf_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)
recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))
Recommendations
  | user_id | item_id | score | title | genres |
---|---|---|---|---|---|
0 | 1 | 4896 | 0.768898 | Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) | Adventure|Children|Fantasy |
1 | 1 | 435 | 0.650600 | Coneheads (1993) | Comedy|Sci-Fi |
2 | 1 | 41566 | 0.609373 | Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005) | Adventure|Children|Fantasy |
3 | 1 | 6502 | 0.535332 | 28 Days Later (2002) | Action|Horror|Sci-Fi |
4 | 1 | 145 | 0.441272 | Bad Boys (1995) | Action|Comedy|Crime|Drama|Thriller |
5 | 1 | 6537 | 0.432268 | Terminator 3: Rise of the Machines (2003) | Action|Adventure|Sci-Fi |
6 | 1 | 355 | 0.421626 | Flintstones, The (1994) | Children|Comedy|Fantasy |
7 | 1 | 5673 | 0.242538 | Punch-Drunk Love (2002) | Comedy|Drama|Romance |
8 | 1 | 481 | 0.218651 | Kalifornia (1993) | Drama|Thriller |
9 | 1 | 267 | 0.213728 | Major Payne (1995) | Comedy |
10 | 4 | 780 | 0.858898 | Independence Day (a.k.a. ID4) (1996) | Action|Adventure|Sci-Fi|Thriller |
11 | 4 | 435 | 0.634766 | Coneheads (1993) | Comedy|Sci-Fi |
12 | 4 | 41566 | 0.597829 | Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005) | Adventure|Children|Fantasy |
13 | 4 | 6502 | 0.531417 | 28 Days Later (2002) | Action|Horror|Sci-Fi |
14 | 4 | 145 | 0.447853 | Bad Boys (1995) | Action|Comedy|Crime|Drama|Thriller |
15 | 4 | 6537 | 0.439573 | Terminator 3: Rise of the Machines (2003) | Action|Adventure|Sci-Fi |
16 | 4 | 355 | 0.430258 | Flintstones, The (1994) | Children|Comedy|Fantasy |
17 | 4 | 5673 | 0.266561 | Punch-Drunk Love (2002) | Comedy|Drama|Romance |
18 | 4 | 481 | 0.243838 | Kalifornia (1993) | Drama|Thriller |
19 | 4 | 267 | 0.239114 | Major Payne (1995) | Comedy |
20 | 6 | 4896 | 0.687780 | Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) | Adventure|Children|Fantasy |
21 | 6 | 41566 | 0.572620 | Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005) | Adventure|Children|Fantasy |
22 | 6 | 1500 | 0.572483 | Grosse Pointe Blank (1997) | Comedy|Crime|Romance |
23 | 6 | 6502 | 0.523220 | 28 Days Later (2002) | Action|Horror|Sci-Fi |
24 | 6 | 6537 | 0.455307 | Terminator 3: Rise of the Machines (2003) | Action|Adventure|Sci-Fi |
25 | 6 | 5673 | 0.321320 | Punch-Drunk Love (2002) | Comedy|Drama|Romance |
26 | 6 | 481 | 0.302354 | Kalifornia (1993) | Drama|Thriller |
27 | 6 | 4890 | 0.270704 | Shallow Hal (2001) | Comedy|Fantasy|Romance |
28 | 6 | 5954 | 0.261981 | 25th Hour (2002) | Crime|Drama |
29 | 6 | 3468 | 0.239384 | Hustler, The (1961) | Drama |
User and item representations
user_id = 1
user_repr = gmf_recommender.get_user_repr(user_id=user_id)
print("User id={}".format(user_id))
print(user_repr)
print()
print("User watched")
print(ml_df.loc[ml_df['user_id'] == user_id, 'title'].tolist())
print()
print('User history item representations')
for item_id in ml_df.loc[ml_df['user_id'] == user_id, 'item_id'].tolist():
item_repr = gmf_recommender.get_item_repr(item_id=item_id)
print("Item id = {}\titem title = {}".format(
item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))
print(item_repr)
scalar_product = np.dot(user_repr, item_repr)
print("Scalar product={:.6f}".format(scalar_product))
score = gmf_recommender.nn_model(
torch.tensor([[gmf_recommender.user_id_mapping[user_id],
gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()
print("Score={:.6f}".format(score))
print()
print("===============")
item_id = 145
item_repr = gmf_recommender.get_item_repr(item_id=item_id)
print("Item id = {}\titem title = {}".format(item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))
print(item_repr)
score = np.dot(user_repr, item_repr)
print("Scalar product={:.6f}".format(score))
score = gmf_recommender.nn_model(
torch.tensor([[gmf_recommender.user_id_mapping[user_id],
gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()
print("Score={:.6f}".format(score))
print()
item_id = 171
item_repr = gmf_recommender.get_item_repr(item_id=item_id)
print("Item id = {}\titem title = {}".format(item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))
print(item_repr)
score = np.dot(user_repr, item_repr)
print("Scalar product={:.6f}".format(score))
score = gmf_recommender.nn_model(
torch.tensor([[gmf_recommender.user_id_mapping[user_id],
gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()
print("Score={:.6f}".format(score))
User id=1
[ 8.8694301e-03 -1.1293894e-09  7.6482260e-01  6.5688614e-06  6.1402158e-03 -3.4989858e-10  3.0581679e-05  1.6342730e-05]

User watched
['Independence Day (a.k.a. ID4) (1996)', 'Grosse Pointe Blank (1997)', 'Ladyhawke (1985)']

User history item representations
Item id = 780	item title = Independence Day (a.k.a. ID4) (1996)
[-2.0800237e-01 -3.2530998e-08 -7.2467870e-01 -7.6390163e-04  6.0946174e-02 -1.0309565e-09 -1.6934791e-03 -3.3520073e-02]
Scalar product=-0.555722
Score=0.884161

Item id = 1500	item title = Grosse Pointe Blank (1997)
[-4.7350328e-02 -1.4992246e-09 -1.5850608e-01 -2.9982104e-05  6.0663655e-02  4.1064720e-08  1.5929480e-04  1.2831817e-03]
Scalar product=-0.121276
Score=0.609364

Item id = 3479	item title = Ladyhawke (1985)
[-2.8682781e-02  6.1106755e-09  6.3241005e-01 -3.3657509e-06  9.6770316e-02  9.6757424e-10 -6.0637249e-05  1.5274031e-03]
Scalar product=0.484021
Score=0.145174

===============
Item id = 145	item title = Bad Boys (1995)
[-9.6727222e-02  1.2952676e-09  8.4303088e-02  1.5707446e-05  9.7245917e-02 -9.5372132e-10 -9.6978983e-05  1.0601738e-02]
Scalar product=0.064216
Score=0.441272

Item id = 171	item title = Jeffrey (1995)
[ 7.6405336e-03 -6.6923184e-10  9.0268552e-01 -5.7306852e-06 -1.5152089e-02 -9.7515729e-10 -1.3149886e-04  4.9494698e-08]
Scalar product=0.690369
Score=0.073709
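Note that the raw scalar product of the user and item representations does not have to agree with the model's score (for example, Independence Day has a negative scalar product but the highest score). GMF multiplies the two embeddings elementwise and then passes the result through the learned linear layer fc before the sigmoid, so each embedding dimension is reweighted (and possibly sign-flipped). The score can be reconstructed from the representations as follows (a sketch reusing the objects from the cell above; it should reproduce the last Score printed, for Jeffrey (1995)):
# Reconstruct the GMF score: sigmoid(w . (p_u * q_i)), where w are the weights of the model's final linear layer
fc_weights = gmf_recommender.nn_model.fc.weight.detach().cpu().numpy().flatten()
reconstructed_score = 1.0 / (1.0 + np.exp(-np.dot(fc_weights, user_repr * item_repr)))
print("Reconstructed score={:.6f}".format(reconstructed_score))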
Training-test split evaluation
from evaluation_and_testing.testing import evaluate_train_test_split_implicit
gmf_recommender = GMFRecommender(n_neg_per_pos=10, batch_size=16,
embedding_dim=6, lr=0.001, weight_decay=0.0001, n_epochs=20)
gmf_tts_results = [['GMFRecommender'] + list(evaluate_train_test_split_implicit(
gmf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]
gmf_tts_results = pd.DataFrame(
gmf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(gmf_tts_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | GMFRecommender | 0.292208 | 0.487013 | 0.662338 | 0.805195 | 0.292208 | 0.404914 | 0.477292 | 0.52351 |
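HR@k (hit ratio) measures the fraction of test cases in which the held-out item appears among the top-k recommendations, while NDCG@k additionally rewards ranking it higher (a hit at position p contributes 1/log2(p + 1)). A hedged sketch of both metrics for a single test case, assuming one relevant item per case; the actual protocol is implemented in evaluation_and_testing.testing:
# Hedged sketch: HR@k and NDCG@k for one test case with a single relevant item
def hit_ratio_at_k(ranked_item_ids, relevant_item_id, k):
    return 1.0 if relevant_item_id in ranked_item_ids[:k] else 0.0

def ndcg_at_k(ranked_item_ids, relevant_item_id, k):
    if relevant_item_id in ranked_item_ids[:k]:
        position = ranked_item_ids.index(relevant_item_id) + 1  # 1-based rank of the hit
        return 1.0 / np.log2(position + 1)
    return 0.0

ranked = [780, 145, 435]  # hypothetical recommendation order
print(hit_ratio_at_k(ranked, 145, 3), ndcg_at_k(ranked, 145, 3))  # 1.0 and 1/log2(3) ≈ 0.631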
from recommenders.netflix_recommender import NetflixRecommender
netflix_recommender = NetflixRecommender(n_epochs=150)
netflix_tts_results = [['NetflixRecommender'] + list(evaluate_train_test_split_implicit(
netflix_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]
netflix_tts_results = pd.DataFrame(
netflix_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(netflix_tts_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | NetflixRecommender | 0.292208 | 0.538961 | 0.733766 | 0.948052 | 0.292208 | 0.434289 | 0.514203 | 0.583217 |
from recommenders.amazon_recommender import AmazonRecommender
amazon_recommender = AmazonRecommender()
amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]
amazon_tts_results = pd.DataFrame(
amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(amazon_tts_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.181818 | 0.311688 | 0.402597 | 0.551948 | 0.181818 | 0.257806 | 0.294682 | 0.34147 |
from recommenders.tfidf_recommender import TFIDFRecommender
tfidf_recommender = TFIDFRecommender()
tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]
tfidf_tts_results = pd.DataFrame(
tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(tfidf_tts_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | TFIDFRecommender | 0.025974 | 0.090909 | 0.136364 | 0.318182 | 0.025974 | 0.064393 | 0.083685 | 0.140799 |
tts_results = pd.concat([gmf_tts_results, netflix_tts_results, amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)
display(HTML(tts_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | GMFRecommender | 0.292208 | 0.487013 | 0.662338 | 0.805195 | 0.292208 | 0.404914 | 0.477292 | 0.523510 |
1 | NetflixRecommender | 0.292208 | 0.538961 | 0.733766 | 0.948052 | 0.292208 | 0.434289 | 0.514203 | 0.583217 |
2 | AmazonRecommender | 0.181818 | 0.311688 | 0.402597 | 0.551948 | 0.181818 | 0.257806 | 0.294682 | 0.341470 |
3 | TFIDFRecommender | 0.025974 | 0.090909 | 0.136364 | 0.318182 | 0.025974 | 0.064393 | 0.083685 | 0.140799 |
Leave-one-out evaluation
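In the leave-one-out protocol one interaction per tested user is held out, the recommender is trained on the remaining interactions, and the held-out item is checked against the returned recommendations; max_evals caps the number of evaluated cases and seed fixes the sampling. The exact procedure is implemented in evaluate_leave_one_out_implicit; below is a simplified sketch of the split alone, under the assumption that the held-out interaction is chosen at random per user:
# Simplified sketch of a leave-one-out split (one randomly chosen interaction held out per user)
def leave_one_out_split(interactions_df, rng):
    held_out_idx = interactions_df.groupby('user_id').apply(lambda user_rows: rng.choice(user_rows.index)).values
    test_df = interactions_df.loc[held_out_idx]
    train_df = interactions_df.drop(index=held_out_idx)
    return train_df, test_df

train_df, test_df = leave_one_out_split(ml_ratings_df.loc[:, ['user_id', 'item_id']], np.random.RandomState(6789))
print(len(train_df), len(test_df))  # test_df holds exactly one interaction per user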
from evaluation_and_testing.testing import evaluate_leave_one_out_implicit
gmf_recommender = GMFRecommender(n_epochs=10)
gmf_loo_results = [['GMFRecommender'] + list(evaluate_leave_one_out_implicit(
gmf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]
gmf_loo_results = pd.DataFrame(
gmf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(gmf_loo_results.to_html()))
netflix_recommender = NetflixRecommender(n_epochs=10)
netflix_loo_results = [['NetflixRecommender'] + list(evaluate_leave_one_out_implicit(
netflix_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]
netflix_loo_results = pd.DataFrame(
netflix_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(netflix_loo_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | UserBasedCosineNearestNeighborsRecommender | 0.096667 | 0.146667 | 0.186667 | 0.306667 | 0.096667 | 0.124285 | 0.140782 | 0.178962 |
from recommenders.amazon_recommender import AmazonRecommender
amazon_recommender = AmazonRecommender()
amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(
amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]
amazon_loo_results = pd.DataFrame(
amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(amazon_loo_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.166667 | 0.256667 | 0.32 | 0.426667 | 0.166667 | 0.219086 | 0.245486 | 0.279978 |
tfidf_recommender = TFIDFRecommender()
tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]
tfidf_loo_results = pd.DataFrame(
tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(tfidf_loo_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | TFIDFRecommender | 0.006667 | 0.053333 | 0.123333 | 0.233333 | 0.006667 | 0.033491 | 0.062178 | 0.096151 |
loo_results = pd.concat([gmf_loo_results, netflix_loo_results, amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)
display(HTML(loo_results.to_html()))
  | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | UserBasedCosineNearestNeighborsRecommender | 0.096667 | 0.146667 | 0.186667 | 0.306667 | 0.096667 | 0.124285 | 0.140782 | 0.178962 |
1 | UserBasedCosineNearestNeighborsRecommender | 0.100000 | 0.150000 | 0.180000 | 0.313333 | 0.100000 | 0.127182 | 0.139518 | 0.181748 |
2 | UserBasedCosineNearestNeighborsRecommender | 0.266667 | 0.420000 | 0.513333 | 0.650000 | 0.266667 | 0.357736 | 0.396033 | 0.440599 |
3 | UserBasedCosineNearestNeighborsRecommender | 0.173333 | 0.280000 | 0.336667 | 0.420000 | 0.173333 | 0.234522 | 0.257759 | 0.284723 |
4 | AmazonRecommender | 0.166667 | 0.256667 | 0.320000 | 0.426667 | 0.166667 | 0.219086 | 0.245486 | 0.279978 |
5 | TFIDFRecommender | 0.006667 | 0.053333 | 0.123333 | 0.233333 | 0.006667 | 0.033491 | 0.062178 | 0.096151 |