meh/recommender-systems-class-master/class_13_generalized_matrix_factorization.ipynb

%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict, deque

import torch
import torch.nn as nn
import torch.optim as optim

# Fix the dying kernel problem (only an issue in some installations - remove these lines if everything works without them)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

Load data

ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

display(HTML(ml_movies_df.head(10).to_html()))

print("Number of interactions left: {}".format(len(ml_ratings_df)))
item_id title genres
118 145 Bad Boys (1995) Action|Comedy|Crime|Drama|Thriller
143 171 Jeffrey (1995) Comedy|Drama
194 228 Destiny Turns on the Radio (1995) Comedy
199 233 Exotica (1994) Drama
230 267 Major Payne (1995) Comedy
313 355 Flintstones, The (1994) Children|Comedy|Fantasy
379 435 Coneheads (1993) Comedy|Sci-Fi
419 481 Kalifornia (1993) Drama|Thriller
615 780 Independence Day (a.k.a. ID4) (1996) Action|Adventure|Sci-Fi|Thriller
737 959 Of Human Bondage (1934) Drama
Number of interactions left: 1170

Generalized Matrix Factorization (GMF)
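
In GMF the predicted preference of user $u$ for item $i$ is obtained from the element-wise (Hadamard) product of the user and item embeddings, passed through a learned linear layer and a sigmoid:

$$\hat{y}_{ui} = \sigma\left(\mathbf{h}^T (\mathbf{p}_u \odot \mathbf{q}_i)\right)$$

where $\mathbf{p}_u$ and $\mathbf{q}_i$ are the user and item embedding vectors and $\mathbf{h}$ is the weight vector of the final linear layer. This is what GMFModel.forward below computes; with $\mathbf{h}$ fixed to a vector of ones it would reduce to classic matrix factorization with a sigmoid on top.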

from livelossplot import PlotLosses

from recommenders.recommender import Recommender


class GMFModel(nn.Module):
    def __init__(self, n_items, n_users, embedding_dim, seed):
        super().__init__()

        self.seed = torch.manual_seed(seed)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.fc = nn.Linear(embedding_dim, 1, bias=False)

    def forward(self, x):
        user_ids = x[:, 0]
        item_ids = x[:, 1]
        user_embedding = self.user_embedding(user_ids)
        item_embedding = self.item_embedding(item_ids)
        x = self.fc(user_embedding * item_embedding)
        x = torch.sigmoid(x)

        return x


class GMFRecommender(Recommender):
    """
    Generalized Matrix Factorization (GMF) recommender as described in:
    - He X., Liao L., Zhang H., Nie L., Hu X., Chua T., Neural Collaborative Filtering, WWW Conference, 2017
    """

    def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.r = None
        self.most_popular_items = None
        
        self.nn_model = None
        self.optimizer = None
        
        self.n_neg_per_pos = n_neg_per_pos
        if 'n_epochs' in params:  # number of epochs (each epoch goes through the entire training set)
            self.n_epochs = params['n_epochs']
        else:
            self.n_epochs = 10
        if 'lr' in params:  # learning rate
            self.lr = params['lr']
        else:
            self.lr = 0.01
        if 'weight_decay' in params:  # weight decay (L2 regularization)
            self.weight_decay = params['weight_decay']
        else:
            self.weight_decay = 0.001
        if 'embedding_dim' in params:
            self.embedding_dim = params['embedding_dim']
        else:
            self.embedding_dim = 4
        if 'batch_size' in params:
            self.batch_size = params['batch_size']
        else:
            self.batch_size = 64
        if 'device' in params:
            self.device = params['device']
        else:
            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']
        else:
            self.should_recommend_already_bought = False
        
        if 'train' in params:
            self.train = params['train']
        else:
            self.train = False
        self.validation_set_size = 0.2
        
        self.seed = seed
        self.rng = np.random.RandomState(seed=seed)
        torch.manual_seed(seed)
        
        if 'should_save_model' in params:
            self.should_save_model = params['should_save_model']
        else:
            self.should_save_model = False
        self.print_type = print_type

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns.
        """

        del users_df, items_df

        # Shift item ids and user ids so that they are consecutive

        unique_item_ids = interactions_df['item_id'].unique()
        self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
        self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
        self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

        interactions_df = interactions_df.copy()
        interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)

        # Get the number of items and users

        self.interactions_df = interactions_df.copy()
        n_users = np.max(interactions_df['user_id']) + 1
        n_items = np.max(interactions_df['item_id']) + 1

        # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
        r = np.zeros(shape=(n_users, n_items))
        for idx, interaction in interactions_df.iterrows():
            r[int(interaction['user_id'])][int(interaction['item_id'])] = 1

        self.r = r
        
        # Indicate positive interactions
        
        interactions_df.loc[:, 'interacted'] = 1

        # Generate negative interactions
        negative_interactions = []

        i = 0
        while i < self.n_neg_per_pos * len(interactions_df):
            sample_size = 1000
            user_ids = self.rng.choice(np.arange(n_users), size=sample_size)
            item_ids = self.rng.choice(np.arange(n_items), size=sample_size)

            j = 0
            while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):
                if r[user_ids[j]][item_ids[j]] == 0:
                    negative_interactions.append([user_ids[j], item_ids[j], 0])
                    i += 1
                j += 1
        
        interactions_df = pd.concat(
            [interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])
        interactions_df = interactions_df.reset_index(drop=True)
        
        # Initialize losses and loss visualization
        
        if self.print_type is not None and self.print_type == 'live':
            liveloss = PlotLosses()

        training_losses = deque(maxlen=50)
        training_avg_losses = []
        training_epoch_losses = []
        validation_losses = deque(maxlen=50)
        validation_avg_losses = []
        validation_epoch_losses = []
        last_training_total_loss = 0.0
        last_validation_total_loss = 0.0
        
        # Initialize the network
        
        self.nn_model = GMFModel(n_items, n_users, self.embedding_dim, self.seed)
        self.nn_model.train()
        self.nn_model.to(self.device)
        self.optimizer = optim.Adam(self.nn_model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        
        # Split the data
        
        if self.train:
            interaction_ids = self.rng.permutation(len(interactions_df))
            train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))
            training_ids = interaction_ids[:train_validation_slice_idx]
            validation_ids = interaction_ids[train_validation_slice_idx:]
        else:
            interaction_ids = self.rng.permutation(len(interactions_df))
            training_ids = interaction_ids
            validation_ids = []
        
        # Train the model
        
        for epoch in range(self.n_epochs):
            if self.print_type is not None and self.print_type == 'live':
                logs = {}
                
            # Train
            
            training_losses.clear()
            training_total_loss = 0.0
            
            self.rng.shuffle(training_ids)
            
            batch_idx = 0
            n_batches = int(np.ceil(len(training_ids) / self.batch_size))
            
            for batch_idx in range(n_batches):
                
                batch_ids = training_ids[(batch_idx * self.batch_size):((batch_idx + 1) * self.batch_size)]
                
                batch = interactions_df.loc[batch_ids]
                batch_input = torch.from_numpy(batch.loc[:, ['user_id', 'item_id']].values).long().to(self.device)
                y_target = torch.from_numpy(batch.loc[:, ['interacted']].values).float().to(self.device)
                
                # Create responses

                y = self.nn_model(batch_input).clip(0.000001, 0.999999)

                # Define loss and backpropagate

                self.optimizer.zero_grad()
                loss = -(y_target * y.log() + (1 - y_target) * (1 - y).log()).sum()
                
                loss.backward()
                self.optimizer.step()
                
                training_total_loss += loss.item()
                
                if self.print_type is not None and self.print_type == 'text':
                    print("\rEpoch: {}\tBatch: {}\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}".format(
                        epoch, batch_idx, last_training_total_loss, last_validation_total_loss, loss), end="")
                
                training_losses.append(loss.item())
                training_avg_losses.append(np.mean(training_losses))
                
            # Validate

            validation_total_loss = 0.0
            
            batch = interactions_df.loc[validation_ids]
            batch_input = torch.from_numpy(batch.loc[:, ['user_id', 'item_id']].values).long().to(self.device)
            y_target = torch.from_numpy(batch.loc[:, ['interacted']].values).float().to(self.device)
            
            # Create responses

            y = self.nn_model(batch_input).clip(0.000001, 0.999999)

            # Calculate validation loss

            loss = -(y_target * y.log() + (1 - y_target) * (1 - y).log()).sum()
            validation_total_loss += loss.item()
                
            # Save and print epoch losses
            
            training_last_avg_loss = training_total_loss / len(training_ids)
            # Guard against an empty validation set (used when self.train is False)
            validation_last_avg_loss = validation_total_loss / len(validation_ids) if len(validation_ids) > 0 else 0.0
            last_training_total_loss = training_last_avg_loss
            last_validation_total_loss = validation_last_avg_loss

            if self.print_type is not None and self.print_type == 'live' and epoch >= 0:
                # Raise the bound on epoch above to skip the extremely high losses of the first epochs in the plot
                logs['loss'] = training_last_avg_loss
                logs['val_loss'] = validation_last_avg_loss
                liveloss.update(logs)
                liveloss.send()

        # Find the most popular items for the cold start problem
        # (count only positive interactions, not the generated negative samples)

        offers_count = interactions_df.loc[
            interactions_df['interacted'] == 1, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]

        # Handle users not in the training data

        # Map item ids

        items_df = items_df.copy()
        items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
        items_df.replace({'item_id': self.item_id_mapping}, inplace=True)

        # Generate recommendations

        for idx, user in users_df.iterrows():
            recommendations = []

            user_id = user['user_id']

            if user_id in self.user_id_mapping:
                
                mapped_user_id = self.user_id_mapping[user_id]
                
                ids_list = items_df['item_id'].tolist()
                # Map each (remapped) item id to its position in ids_list; size the array by the largest id,
                # because items_df may contain only a subset of the training items
                id_to_pos = np.zeros(max(ids_list) + 1, dtype=int)
                for k in range(len(ids_list)):
                    id_to_pos[ids_list[k]] = k
                
                net_input = torch.tensor(list(zip([mapped_user_id]*len(ids_list), ids_list))).to(self.device)
                
                scores = self.nn_model(net_input).flatten().detach().cpu().numpy()
                
                # Choose n recommendations based on highest scores
                if not self.should_recommend_already_bought:
                    x_list = self.interactions_df.loc[
                        self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                    scores[id_to_pos[x_list]] = -np.inf

                chosen_pos = np.argsort(-scores)[:n_recommendations]

                for item_pos in chosen_pos:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],
                            'score': scores[item_pos]
                        }
                    )
            else:  # For new users recommend most popular items
                for i in range(n_recommendations):
                    recommendations.append(
                        {
                            'user_id': user['user_id'],
                            'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
                            'score': 1.0
                        }
                    )

            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df
    
    def get_user_repr(self, user_id):
        mapped_user_id = self.user_id_mapping[user_id]
        return self.nn_model.user_embedding(torch.tensor(mapped_user_id).to(self.device)).detach().cpu().numpy()
    
    def get_item_repr(self, item_id):
        mapped_item_id = self.item_id_mapping[item_id]
        return self.nn_model.item_embedding(torch.tensor(mapped_item_id).to(self.device)).detach().cpu().numpy()

    
class MLPModel(nn.Module):
    def __init__(self, n_items, n_users, embedding_dim, seed):
        super().__init__()

        self.seed = torch.manual_seed(seed)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.fc1 = nn.Linear(2 * embedding_dim, 32, bias=False)
        self.fc2 = nn.Linear(32, 16, bias=False)
        self.fc3 = nn.Linear(16, 1, bias=False)

    def forward(self, x):
        user = x[:, 0]
        item = x[:, 1]
        user_embedding = self.user_embedding(user)
        item_embedding = self.item_embedding(item)
        x = torch.cat([user_embedding, item_embedding], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))

        return x

    
class NeuMFModel(nn.Module):
    def __init__(self, n_items, n_users, gmf_embedding_dim, mlp_embedding_dim, seed):
        super().__init__()

        self.seed = torch.manual_seed(seed)

        # GMF

        self.gmf_user_embedding = nn.Embedding(n_users, gmf_embedding_dim)
        self.gmf_item_embedding = nn.Embedding(n_items, gmf_embedding_dim)

        # MLP

        self.mlp_user_embedding = nn.Embedding(n_users, mlp_embedding_dim)
        self.mlp_item_embedding = nn.Embedding(n_items, mlp_embedding_dim)
        self.mlp_fc1 = nn.Linear(2 * mlp_embedding_dim, 32, bias=False)
        self.mlp_fc2 = nn.Linear(32, 16, bias=False)

        # Merge

        # The merged vector is the GMF part (gmf_embedding_dim) concatenated with the MLP part (16 after mlp_fc2)
        self.fc = nn.Linear(gmf_embedding_dim + 16, 1, bias=False)

    def forward(self, x):
        user = x[:, 0]
        item = x[:, 1]

        # GMF

        gmf_user_embedding = self.gmf_user_embedding(user)
        gmf_item_embedding = self.gmf_item_embedding(item)
        gmf_x = gmf_user_embedding * gmf_item_embedding

        # MLP

        mlp_user_embedding = self.mlp_user_embedding(user)
        mlp_item_embedding = self.mlp_item_embedding(item)
        mlp_x = torch.cat([mlp_user_embedding, mlp_item_embedding], dim=1)
        mlp_x = torch.relu(self.mlp_fc1(mlp_x))
        mlp_x = torch.relu(self.mlp_fc2(mlp_x))

        # Final score

        x = torch.cat([gmf_x, mlp_x], dim=1)
        x = torch.sigmoid(self.fc(x))

        return x
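
MLPModel and NeuMFModel are defined above but not exercised further in this notebook. A minimal smoke test of NeuMFModel on a random batch (a sketch with arbitrary sizes, not part of the original class material) shows the expected input format and output shape:

# Smoke test of NeuMFModel - the sizes below are arbitrary
n_test_users, n_test_items = 20, 30
neumf = NeuMFModel(n_items=n_test_items, n_users=n_test_users,
                   gmf_embedding_dim=16, mlp_embedding_dim=4, seed=6789)

# A batch of [user_id, item_id] pairs - the same input format as GMFModel and MLPModel
batch = torch.tensor([[0, 5], [3, 17], [19, 29]])
scores = neumf(batch)
print(scores.shape)  # torch.Size([3, 1]) - one sigmoid score per (user, item) pair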

Quick test of the recommender (training)

gmf_recommender = GMFRecommender(print_type='live', n_neg_per_pos=10, batch_size=16, 
                                 embedding_dim=6, lr=0.001, weight_decay=0.0001, n_epochs=20, seed=1)
gmf_recommender.fit(ml_ratings_df, None, ml_movies_df)
Loss
	training         	 (min:    0.130, max:    0.706, cur:    0.130)
	validation       	 (min:    0.224, max:    0.696, cur:    0.226)

Quick test of the recommender (recommending)

recommendations = gmf_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))
Recommendations
user_id item_id score title genres
0 1 4896 0.768898 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) Adventure|Children|Fantasy
1 1 435 0.650600 Coneheads (1993) Comedy|Sci-Fi
2 1 41566 0.609373 Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005) Adventure|Children|Fantasy
3 1 6502 0.535332 28 Days Later (2002) Action|Horror|Sci-Fi
4 1 145 0.441272 Bad Boys (1995) Action|Comedy|Crime|Drama|Thriller
5 1 6537 0.432268 Terminator 3: Rise of the Machines (2003) Action|Adventure|Sci-Fi
6 1 355 0.421626 Flintstones, The (1994) Children|Comedy|Fantasy
7 1 5673 0.242538 Punch-Drunk Love (2002) Comedy|Drama|Romance
8 1 481 0.218651 Kalifornia (1993) Drama|Thriller
9 1 267 0.213728 Major Payne (1995) Comedy
10 4 780 0.858898 Independence Day (a.k.a. ID4) (1996) Action|Adventure|Sci-Fi|Thriller
11 4 435 0.634766 Coneheads (1993) Comedy|Sci-Fi
12 4 41566 0.597829 Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005) Adventure|Children|Fantasy
13 4 6502 0.531417 28 Days Later (2002) Action|Horror|Sci-Fi
14 4 145 0.447853 Bad Boys (1995) Action|Comedy|Crime|Drama|Thriller
15 4 6537 0.439573 Terminator 3: Rise of the Machines (2003) Action|Adventure|Sci-Fi
16 4 355 0.430258 Flintstones, The (1994) Children|Comedy|Fantasy
17 4 5673 0.266561 Punch-Drunk Love (2002) Comedy|Drama|Romance
18 4 481 0.243838 Kalifornia (1993) Drama|Thriller
19 4 267 0.239114 Major Payne (1995) Comedy
20 6 4896 0.687780 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) Adventure|Children|Fantasy
21 6 41566 0.572620 Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005) Adventure|Children|Fantasy
22 6 1500 0.572483 Grosse Pointe Blank (1997) Comedy|Crime|Romance
23 6 6502 0.523220 28 Days Later (2002) Action|Horror|Sci-Fi
24 6 6537 0.455307 Terminator 3: Rise of the Machines (2003) Action|Adventure|Sci-Fi
25 6 5673 0.321320 Punch-Drunk Love (2002) Comedy|Drama|Romance
26 6 481 0.302354 Kalifornia (1993) Drama|Thriller
27 6 4890 0.270704 Shallow Hal (2001) Comedy|Fantasy|Romance
28 6 5954 0.261981 25th Hour (2002) Crime|Drama
29 6 3468 0.239384 Hustler, The (1961) Drama

User and item representations

user_id = 1
user_repr = gmf_recommender.get_user_repr(user_id=user_id)
print("User id={}".format(user_id))
print(user_repr)
print()

print("User watched")
print(ml_df.loc[ml_df['user_id'] == user_id, 'title'].tolist())
print()

print('User history item representations')
for item_id in ml_df.loc[ml_df['user_id'] == user_id, 'item_id'].tolist():
    item_repr = gmf_recommender.get_item_repr(item_id=item_id)
    print("Item id = {}\titem title = {}".format(
        item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))
    print(item_repr)
    scalar_product = np.dot(user_repr, item_repr)
    print("Scalar product={:.6f}".format(scalar_product))
    score = gmf_recommender.nn_model(
        torch.tensor([[gmf_recommender.user_id_mapping[user_id], 
                       gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()
    print("Score={:.6f}".format(score))
    print()

print("===============")
    
item_id = 145
item_repr = gmf_recommender.get_item_repr(item_id=item_id)
print("Item id = {}\titem title = {}".format(item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))
print(item_repr)
score = np.dot(user_repr, item_repr)
print("Scalar product={:.6f}".format(score))
score = gmf_recommender.nn_model(
    torch.tensor([[gmf_recommender.user_id_mapping[user_id], 
                   gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()
print("Score={:.6f}".format(score))
print()

item_id = 171
item_repr = gmf_recommender.get_item_repr(item_id=item_id)
print("Item id = {}\titem title = {}".format(item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))
print(item_repr)
score = np.dot(user_repr, item_repr)
print("Scalar product={:.6f}".format(score))
score = gmf_recommender.nn_model(
    torch.tensor([[gmf_recommender.user_id_mapping[user_id], 
                   gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()
print("Score={:.6f}".format(score))
User id=1
[ 8.8694301e-03 -1.1293894e-09  7.6482260e-01  6.5688614e-06
  6.1402158e-03 -3.4989858e-10  3.0581679e-05  1.6342730e-05]

User watched
['Independence Day (a.k.a. ID4) (1996)', 'Grosse Pointe Blank (1997)', 'Ladyhawke (1985)']

User history item representations
Item id = 780	item title = Independence Day (a.k.a. ID4) (1996)
[-2.0800237e-01 -3.2530998e-08 -7.2467870e-01 -7.6390163e-04
  6.0946174e-02 -1.0309565e-09 -1.6934791e-03 -3.3520073e-02]
Scalar product=-0.555722
Score=0.884161

Item id = 1500	item title = Grosse Pointe Blank (1997)
[-4.7350328e-02 -1.4992246e-09 -1.5850608e-01 -2.9982104e-05
  6.0663655e-02  4.1064720e-08  1.5929480e-04  1.2831817e-03]
Scalar product=-0.121276
Score=0.609364

Item id = 3479	item title = Ladyhawke (1985)
[-2.8682781e-02  6.1106755e-09  6.3241005e-01 -3.3657509e-06
  9.6770316e-02  9.6757424e-10 -6.0637249e-05  1.5274031e-03]
Scalar product=0.484021
Score=0.145174

===============
Item id = 145	item title = Bad Boys (1995)
[-9.6727222e-02  1.2952676e-09  8.4303088e-02  1.5707446e-05
  9.7245917e-02 -9.5372132e-10 -9.6978983e-05  1.0601738e-02]
Scalar product=0.064216
Score=0.441272

Item id = 171	item title = Jeffrey (1995)
[ 7.6405336e-03 -6.6923184e-10  9.0268552e-01 -5.7306852e-06
 -1.5152089e-02 -9.7515729e-10 -1.3149886e-04  4.9494698e-08]
Scalar product=0.690369
Score=0.073709
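
Note that the raw scalar product of the user and item representations does not have to agree with the model's score: GMFModel multiplies the embeddings element-wise and then applies the learned linear layer self.fc followed by a sigmoid. A minimal sketch (reusing the fitted gmf_recommender from above) reconstructs the score from the representations:

# Reconstruct the GMF score as sigmoid(h . (p_u * q_i)), where h is the weight vector
# of the final linear layer (sketch - reuses the gmf_recommender fitted above)
h = gmf_recommender.nn_model.fc.weight.detach().cpu().numpy().flatten()

user_repr = gmf_recommender.get_user_repr(user_id=1)
item_repr = gmf_recommender.get_item_repr(item_id=145)

manual_score = 1.0 / (1.0 + np.exp(-np.dot(h, user_repr * item_repr)))
print("Reconstructed score={:.6f}".format(manual_score))  # should match the Score printed above for this pair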

Training-test split evaluation

from evaluation_and_testing.testing import evaluate_train_test_split_implicit
gmf_recommender = GMFRecommender(n_neg_per_pos=10, batch_size=16, 
                                 embedding_dim=6, lr=0.001, weight_decay=0.0001, n_epochs=20)

gmf_tts_results = [['GMFRecommender'] + list(evaluate_train_test_split_implicit(
    gmf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

gmf_tts_results = pd.DataFrame(
    gmf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(gmf_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 GMFRecommender 0.292208 0.487013 0.662338 0.805195 0.292208 0.404914 0.477292 0.52351
from recommenders.netflix_recommender import NetflixRecommender

netflix_recommender = NetflixRecommender(n_epochs=150)

netflix_tts_results = [['NetflixRecommender'] + list(evaluate_train_test_split_implicit(
    netflix_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

netflix_tts_results = pd.DataFrame(
    netflix_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(netflix_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 NetflixRecommender 0.292208 0.538961 0.733766 0.948052 0.292208 0.434289 0.514203 0.583217
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

amazon_tts_results = pd.DataFrame(
    amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.181818 0.311688 0.402597 0.551948 0.181818 0.257806 0.294682 0.34147
from recommenders.tfidf_recommender import TFIDFRecommender

tfidf_recommender = TFIDFRecommender()

tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

tfidf_tts_results = pd.DataFrame(
    tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 TFIDFRecommender 0.025974 0.090909 0.136364 0.318182 0.025974 0.064393 0.083685 0.140799
tts_results = pd.concat([gmf_tts_results, netflix_tts_results, amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)
display(HTML(tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 GMFRecommender 0.292208 0.487013 0.662338 0.805195 0.292208 0.404914 0.477292 0.523510
1 NetflixRecommender 0.292208 0.538961 0.733766 0.948052 0.292208 0.434289 0.514203 0.583217
2 AmazonRecommender 0.181818 0.311688 0.402597 0.551948 0.181818 0.257806 0.294682 0.341470
3 TFIDFRecommender 0.025974 0.090909 0.136364 0.318182 0.025974 0.064393 0.083685 0.140799

Leave-one-out evaluation

from evaluation_and_testing.testing import evaluate_leave_one_out_implicit
gmf_recommender = GMFRecommender(n_epochs=10)

gmf_loo_results = [['GMFRecommender'] + list(evaluate_leave_one_out_implicit(
    gmf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

gmf_loo_results = pd.DataFrame(
    gmf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(gmf_loo_results.to_html()))
netflix_recommender = NetflixRecommender(n_epochs=10)

netflix_loo_results = [['NetflixRecommender'] + list(evaluate_leave_one_out_implicit(
    netflix_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

netflix_loo_results = pd.DataFrame(
    netflix_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(netflix_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.096667 0.146667 0.186667 0.306667 0.096667 0.124285 0.140782 0.178962
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

amazon_loo_results = pd.DataFrame(
    amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.166667 0.256667 0.32 0.426667 0.166667 0.219086 0.245486 0.279978
tfidf_recommender = TFIDFRecommender()

tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

tfidf_loo_results = pd.DataFrame(
    tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 TFIDFRecommender 0.006667 0.053333 0.123333 0.233333 0.006667 0.033491 0.062178 0.096151
loo_results = pd.concat([gmf_loo_results, netflix_loo_results, amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)
display(HTML(loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.096667 0.146667 0.186667 0.306667 0.096667 0.124285 0.140782 0.178962
1 UserBasedCosineNearestNeighborsRecommender 0.100000 0.150000 0.180000 0.313333 0.100000 0.127182 0.139518 0.181748
2 UserBasedCosineNearestNeighborsRecommender 0.266667 0.420000 0.513333 0.650000 0.266667 0.357736 0.396033 0.440599
3 UserBasedCosineNearestNeighborsRecommender 0.173333 0.280000 0.336667 0.420000 0.173333 0.234522 0.257759 0.284723
4 AmazonRecommender 0.166667 0.256667 0.320000 0.426667 0.166667 0.219086 0.245486 0.279978
5 TFIDFRecommender 0.006667 0.053333 0.123333 0.233333 0.006667 0.033491 0.062178 0.096151