{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "verified-accommodation", "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from IPython.display import Markdown, display, HTML\n", "from collections import defaultdict, deque\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "\n", "# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n", "import os\n", "os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'\n", "os.environ['CUDA_LAUNCH_BLOCKING'] = '1'" ] }, { "cell_type": "markdown", "id": "educated-tourist", "metadata": {}, "source": [ "# Load data" ] }, { "cell_type": "code", "execution_count": 2, "id": "prepared-fraction", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
item_idtitlegenres
118145Bad Boys (1995)Action|Comedy|Crime|Drama|Thriller
143171Jeffrey (1995)Comedy|Drama
194228Destiny Turns on the Radio (1995)Comedy
199233Exotica (1994)Drama
230267Major Payne (1995)Comedy
313355Flintstones, The (1994)Children|Comedy|Fantasy
379435Coneheads (1993)Comedy|Sci-Fi
419481Kalifornia (1993)Drama|Thriller
615780Independence Day (a.k.a. ID4) (1996)Action|Adventure|Sci-Fi|Thriller
737959Of Human Bondage (1934)Drama
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Number of interactions left: 1170\n" ] } ], "source": [ "ml_ratings_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"ratings.csv\")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})\n", "ml_movies_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"movies.csv\")).rename(columns={'movieId': 'item_id'})\n", "ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')\n", "\n", "# Filter the data to reduce the number of movies\n", "seed = 6789\n", "rng = np.random.RandomState(seed=seed)\n", "left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)\n", "\n", "ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]\n", "ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]\n", "ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]\n", "\n", "display(HTML(ml_movies_df.head(10).to_html()))\n", "\n", "print(\"Number of interactions left: {}\".format(len(ml_ratings_df)))" ] }, { "cell_type": "markdown", "id": "opponent-prediction", "metadata": {}, "source": [ "# Generalized Matrix Factorization (GMF)" ] }, { "cell_type": "code", "execution_count": 32, "id": "fancy-return", "metadata": {}, "outputs": [], "source": [ "from livelossplot import PlotLosses\n", "\n", "from recommenders.recommender import Recommender\n", "\n", "\n", "class GMFModel(nn.Module):\n", " def __init__(self, n_items, n_users, embedding_dim, seed):\n", " super().__init__()\n", "\n", " self.seed = torch.manual_seed(seed)\n", " self.item_embedding = nn.Embedding(n_items, embedding_dim)\n", " self.user_embedding = nn.Embedding(n_users, embedding_dim)\n", " self.fc = nn.Linear(embedding_dim, 1, bias=False)\n", "\n", " def forward(self, x):\n", " user_ids = x[:, 0]\n", " item_ids = x[:, 1]\n", " user_embedding = self.user_embedding(user_ids)\n", " item_embedding = self.item_embedding(item_ids)\n", " x = self.fc(user_embedding * item_embedding)\n", " x = torch.sigmoid(x)\n", "\n", " return x\n", "\n", "\n", "class GMFRecommender(Recommender):\n", " \"\"\"\n", " General Matrix Factorization recommender as described in:\n", " - He X., Liao L., Zhang H., Nie L., Hu X., Chua T., Neural Collaborative Filtering, WWW Conference, 2017\n", " \"\"\"\n", "\n", " def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):\n", " super().__init__()\n", " self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n", " self.interactions_df = None\n", " self.item_id_mapping = None\n", " self.user_id_mapping = None\n", " self.item_id_reverse_mapping = None\n", " self.user_id_reverse_mapping = None\n", " self.r = None\n", " self.most_popular_items = None\n", " \n", " self.nn_model = None\n", " self.optimizer = None\n", " \n", " self.n_neg_per_pos = n_neg_per_pos\n", " if 'n_epochs' in params: # number of epochs (each epoch goes through the entire training set)\n", " self.n_epochs = params['n_epochs']\n", " else:\n", " self.n_epochs = 10\n", " if 'lr' in params: # learning rate\n", " self.lr = params['lr']\n", " else:\n", " self.lr = 0.01\n", " if 'weight_decay' in params: # weight decay (L2 regularization)\n", " self.weight_decay = params['weight_decay']\n", " else:\n", " self.weight_decay = 0.001\n", " if 'embedding_dim' in params:\n", " self.embedding_dim = params['embedding_dim']\n", " else:\n", " self.embedding_dim = 4\n", " if 'batch_size' in params:\n", " self.batch_size = params['batch_size']\n", " else:\n", " self.batch_size = 64\n", " if 'device' in params:\n", " self.device = params['device']\n", " else:\n", " self.device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", " \n", " if 'should_recommend_already_bought' in params:\n", " self.should_recommend_already_bought = params['should_recommend_already_bought']\n", " else:\n", " self.should_recommend_already_bought = False\n", " \n", " if 'train' in params:\n", " self.train = params['train']\n", " else:\n", " self.train = False\n", " self.validation_set_size = 0.2\n", " \n", " self.seed = seed\n", " self.rng = np.random.RandomState(seed=seed)\n", " torch.manual_seed(seed)\n", " \n", " if 'should_save_model' in params:\n", " self.should_save_model = params['should_save_model']\n", " self.print_type = print_type\n", "\n", " def fit(self, interactions_df, users_df, items_df):\n", " \"\"\"\n", " Training of the recommender.\n", "\n", " :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items\n", " defined by user_id, item_id and features of the interaction.\n", " :param pd.DataFrame users_df: DataFrame with users and their features defined by\n", " user_id and the user feature columns.\n", " :param pd.DataFrame items_df: DataFrame with items and their features defined\n", " by item_id and the item feature columns.\n", " \"\"\"\n", "\n", " del users_df, items_df\n", "\n", " # Shift item ids and user ids so that they are consecutive\n", "\n", " unique_item_ids = interactions_df['item_id'].unique()\n", " self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))\n", " self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))\n", " unique_user_ids = interactions_df['user_id'].unique()\n", " self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))\n", " self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))\n", "\n", " interactions_df = interactions_df.copy()\n", " interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)\n", "\n", " # Get the number of items and users\n", "\n", " self.interactions_df = interactions_df.copy()\n", " n_users = np.max(interactions_df['user_id']) + 1\n", " n_items = np.max(interactions_df['item_id']) + 1\n", "\n", " # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)\n", " r = np.zeros(shape=(n_users, n_items))\n", " for idx, interaction in interactions_df.iterrows():\n", " r[int(interaction['user_id'])][int(interaction['item_id'])] = 1\n", "\n", " self.r = r\n", " \n", " # Indicate positive interactions\n", " \n", " interactions_df.loc[:, 'interacted'] = 1\n", "\n", " # Generate negative interactions\n", " negative_interactions = []\n", "\n", " i = 0\n", " while i < self.n_neg_per_pos * len(interactions_df):\n", " sample_size = 1000\n", " user_ids = self.rng.choice(np.arange(n_users), size=sample_size)\n", " item_ids = self.rng.choice(np.arange(n_items), size=sample_size)\n", "\n", " j = 0\n", " while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):\n", " if r[user_ids[j]][item_ids[j]] == 0:\n", " negative_interactions.append([user_ids[j], item_ids[j], 0])\n", " i += 1\n", " j += 1\n", " \n", " interactions_df = pd.concat(\n", " [interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])\n", " interactions_df = interactions_df.reset_index(drop=True)\n", " \n", " # Initialize losses and loss visualization\n", " \n", " if self.print_type is not None and self.print_type == 'live':\n", " liveloss = PlotLosses()\n", "\n", " training_losses = deque(maxlen=50)\n", " training_avg_losses = []\n", " training_epoch_losses = []\n", " validation_losses = deque(maxlen=50)\n", " validation_avg_losses = []\n", " validation_epoch_losses = []\n", " last_training_total_loss = 0.0\n", " last_validation_total_loss = 0.0\n", " \n", " # Initialize the network\n", " \n", " self.nn_model = GMFModel(n_items, n_users, self.embedding_dim, self.seed)\n", " self.nn_model.train()\n", " self.nn_model.to(self.device)\n", " self.optimizer = optim.Adam(self.nn_model.parameters(), lr=self.lr, weight_decay=self.weight_decay)\n", " \n", " # Split the data\n", " \n", " if self.train:\n", " interaction_ids = self.rng.permutation(len(interactions_df))\n", " train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))\n", " training_ids = interaction_ids[:train_validation_slice_idx]\n", " validation_ids = interaction_ids[train_validation_slice_idx:]\n", " else:\n", " interaction_ids = self.rng.permutation(len(interactions_df))\n", " training_ids = interaction_ids\n", " validation_ids = []\n", " \n", " # Train the model\n", " \n", " for epoch in range(self.n_epochs):\n", " if self.print_type is not None and self.print_type == 'live':\n", " logs = {}\n", " \n", " # Train\n", " \n", " training_losses.clear()\n", " training_total_loss = 0.0\n", " \n", " self.rng.shuffle(training_ids)\n", " \n", " batch_idx = 0\n", " n_batches = int(np.ceil(len(training_ids) / self.batch_size))\n", " \n", " for batch_idx in range(n_batches):\n", " \n", " batch_ids = training_ids[(batch_idx * self.batch_size):((batch_idx + 1) * self.batch_size)]\n", " \n", " batch = interactions_df.loc[batch_ids]\n", " batch_input = torch.from_numpy(batch.loc[:, ['user_id', 'item_id']].values).long().to(self.device)\n", " y_target = torch.from_numpy(batch.loc[:, ['interacted']].values).float().to(self.device)\n", " \n", " # Create responses\n", "\n", " y = self.nn_model(batch_input).clip(0.000001, 0.999999)\n", "\n", " # Define loss and backpropagate\n", "\n", " self.optimizer.zero_grad()\n", " loss = -(y_target * y.log() + (1 - y_target) * (1 - y).log()).sum()\n", " \n", " loss.backward()\n", " self.optimizer.step()\n", " \n", " training_total_loss += loss.item()\n", " \n", " if self.print_type is not None and self.print_type == 'text':\n", " print(\"\\rEpoch: {}\\tBatch: {}\\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}\".format(\n", " epoch, batch_idx, last_training_total_loss, last_validation_total_loss, loss), end=\"\")\n", " \n", " training_losses.append(loss.item())\n", " training_avg_losses.append(np.mean(training_losses))\n", " \n", " # Validate\n", "\n", " validation_total_loss = 0.0\n", " \n", " batch = interactions_df.loc[validation_ids]\n", " batch_input = torch.from_numpy(batch.loc[:, ['user_id', 'item_id']].values).long().to(self.device)\n", " y_target = torch.from_numpy(batch.loc[:, ['interacted']].values).float().to(self.device)\n", " \n", " # Create responses\n", "\n", " y = self.nn_model(batch_input).clip(0.000001, 0.999999)\n", "\n", " # Calculate validation loss\n", "\n", " loss = -(y_target * y.log() + (1 - y_target) * (1 - y).log()).sum()\n", " validation_total_loss += loss.item()\n", " \n", " # Save and print epoch losses\n", " \n", " training_last_avg_loss = training_total_loss / len(training_ids)\n", " validation_last_avg_loss = validation_total_loss / len(validation_ids)\n", "\n", " if self.print_type is not None and self.print_type == 'live' and epoch >= 0:\n", " # A bound on epoch prevents showing extremely high losses in the first epochs\n", " logs['loss'] = training_last_avg_loss\n", " logs['val_loss'] = validation_last_avg_loss\n", " liveloss.update(logs)\n", " liveloss.send()\n", "\n", " # Find the most popular items for the cold start problem\n", "\n", " offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()\n", " offers_count = offers_count.sort_values('user_id', ascending=False)\n", " self.most_popular_items = offers_count.index\n", "\n", " def recommend(self, users_df, items_df, n_recommendations=1):\n", " \"\"\"\n", " Serving of recommendations. Scores items in items_df for each user in users_df and returns\n", " top n_recommendations for each user.\n", "\n", " :param pd.DataFrame users_df: DataFrame with users and their features for which\n", " recommendations should be generated.\n", " :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.\n", " :param int n_recommendations: Number of recommendations to be returned for each user.\n", " :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations\n", " for each user.\n", " :rtype: pd.DataFrame\n", " \"\"\"\n", "\n", " # Clean previous recommendations (iloc could be used alternatively)\n", " self.recommender_df = self.recommender_df[:0]\n", "\n", " # Handle users not in the training data\n", "\n", " # Map item ids\n", "\n", " items_df = items_df.copy()\n", " items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]\n", " items_df.replace({'item_id': self.item_id_mapping}, inplace=True)\n", "\n", " # Generate recommendations\n", "\n", " for idx, user in users_df.iterrows():\n", " recommendations = []\n", "\n", " user_id = user['user_id']\n", "\n", " if user_id in self.user_id_mapping:\n", " \n", " mapped_user_id = self.user_id_mapping[user_id]\n", " \n", " ids_list = items_df['item_id'].tolist()\n", " id_to_pos = np.array([0]*len(ids_list))\n", " for k in range(len(ids_list)):\n", " id_to_pos[ids_list[k]] = k\n", " \n", " net_input = torch.tensor(list(zip([mapped_user_id]*len(ids_list), ids_list))).to(self.device)\n", " \n", " scores = self.nn_model(net_input).flatten().detach().cpu().numpy()\n", " \n", " # Choose n recommendations based on highest scores\n", " if not self.should_recommend_already_bought:\n", " x_list = self.interactions_df.loc[\n", " self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()\n", " scores[id_to_pos[x_list]] = -np.inf\n", "\n", " chosen_pos = np.argsort(-scores)[:n_recommendations]\n", "\n", " for item_pos in chosen_pos:\n", " recommendations.append(\n", " {\n", " 'user_id': self.user_id_reverse_mapping[mapped_user_id],\n", " 'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],\n", " 'score': scores[item_pos]\n", " }\n", " )\n", " else: # For new users recommend most popular items\n", " for i in range(n_recommendations):\n", " recommendations.append(\n", " {\n", " 'user_id': user['user_id'],\n", " 'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],\n", " 'score': 1.0\n", " }\n", " )\n", "\n", " user_recommendations = pd.DataFrame(recommendations)\n", "\n", " self.recommender_df = pd.concat([self.recommender_df, user_recommendations])\n", "\n", " return self.recommender_df\n", " \n", " def get_user_repr(self, user_id):\n", " mapped_user_id = self.user_id_mapping[user_id]\n", " return self.nn_model.user_embedding(torch.tensor(mapped_user_id).to(self.device)).detach().cpu().numpy()\n", " \n", " def get_item_repr(self, item_id):\n", " mapped_item_id = self.item_id_mapping[item_id]\n", " return self.nn_model.item_embedding(torch.tensor(mapped_item_id).to(self.device)).detach().cpu().numpy()\n", "\n", " \n", "class MLPModel(nn.Module):\n", " def __init__(self, n_items, n_users, embedding_dim, seed):\n", " super().__init__()\n", "\n", " self.seed = torch.manual_seed(seed)\n", " self.item_embedding = nn.Embedding(n_items, embedding_dim)\n", " self.user_embedding = nn.Embedding(n_users, embedding_dim)\n", " self.fc1 = nn.Linear(2 * embedding_dim, 32, bias=False)\n", " self.fc2 = nn.Linear(32, 16, bias=False)\n", " self.fc3 = nn.Linear(16, 1, bias=False)\n", "\n", " def forward(self, x):\n", " user = x[:, 0]\n", " item = x[:, 1]\n", " user_embedding = self.user_embedding(user)\n", " item_embedding = self.item_embedding(item)\n", " x = torch.cat([user_embedding, item_embedding], dim=1)\n", " x = torch.relu(self.fc1(x))\n", " x = torch.relu(self.fc2(x))\n", " x = torch.sigmoid(self.fc3(x))\n", "\n", " return x\n", "\n", " \n", "class NeuMFModel(nn.Module):\n", " def __init__(self, n_items, n_users, gmf_embedding_dim, mlp_embedding_dim, seed):\n", " super().__init__()\n", "\n", " self.seed = torch.manual_seed(seed)\n", "\n", " # GMF\n", "\n", " self.gmf_user_embedding = nn.Embedding(n_users, gmf_embedding_dim)\n", " self.gmf_item_embedding = nn.Embedding(n_items, gmf_embedding_dim)\n", "\n", " # MLP\n", "\n", " self.mlp_user_embedding = nn.Embedding(n_users, mlp_embedding_dim)\n", " self.mlp_item_embedding = nn.Embedding(n_items, mlp_embedding_dim)\n", " self.mlp_fc1 = nn.Linear(2 * mlp_embedding_dim, 32, bias=False)\n", " self.mlp_fc2 = nn.Linear(32, 16, bias=False)\n", "\n", " # Merge\n", "\n", " self.fc = nn.Linear(32, 1, bias=False)\n", "\n", " def forward(self, x):\n", " user = x[:, 0]\n", " item = x[:, 1]\n", "\n", " # GMF\n", "\n", " gmf_user_embedding = self.gmf_user_embedding(user)\n", " gmf_item_embedding = self.gmf_item_embedding(item)\n", " gmf_x = gmf_user_embedding * gmf_item_embedding\n", "\n", " # MLP\n", "\n", " mlp_user_embedding = self.mlp_user_embedding(user)\n", " mlp_item_embedding = self.mlp_item_embedding(item)\n", " mlp_x = torch.cat([mlp_user_embedding, mlp_item_embedding], dim=1)\n", " mlp_x = torch.relu(self.mlp_fc1(mlp_x))\n", " mlp_x = torch.relu(self.mlp_fc2(mlp_x))\n", "\n", " # Final score\n", "\n", " x = torch.cat([gmf_x, mlp_x], dim=1)\n", " x = torch.sigmoid(self.fc(x))\n", "\n", " return x" ] }, { "cell_type": "markdown", "id": "expensive-offering", "metadata": {}, "source": [ "## Quick test of the recommender (training)" ] }, { "cell_type": "code", "execution_count": 42, "id": "nonprofit-roads", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Loss\n", "\ttraining \t (min: 0.130, max: 0.706, cur: 0.130)\n", "\tvalidation \t (min: 0.224, max: 0.696, cur: 0.226)\n" ] } ], "source": [ "gmf_recommender = GMFRecommender(print_type='live', n_neg_per_pos=10, batch_size=16, \n", " embedding_dim=6, lr=0.001, weight_decay=0.0001, n_epochs=20, seed=1)\n", "gmf_recommender.fit(ml_ratings_df, None, ml_movies_df)" ] }, { "cell_type": "markdown", "id": "incorporated-messaging", "metadata": {}, "source": [ "## Quick test of the recommender (recommending)" ] }, { "cell_type": "code", "execution_count": 5, "id": "accessible-value", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recommendations\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_idscoretitlegenres
0148960.768898Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)Adventure|Children|Fantasy
114350.650600Coneheads (1993)Comedy|Sci-Fi
21415660.609373Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)Adventure|Children|Fantasy
3165020.53533228 Days Later (2002)Action|Horror|Sci-Fi
411450.441272Bad Boys (1995)Action|Comedy|Crime|Drama|Thriller
5165370.432268Terminator 3: Rise of the Machines (2003)Action|Adventure|Sci-Fi
613550.421626Flintstones, The (1994)Children|Comedy|Fantasy
7156730.242538Punch-Drunk Love (2002)Comedy|Drama|Romance
814810.218651Kalifornia (1993)Drama|Thriller
912670.213728Major Payne (1995)Comedy
1047800.858898Independence Day (a.k.a. ID4) (1996)Action|Adventure|Sci-Fi|Thriller
1144350.634766Coneheads (1993)Comedy|Sci-Fi
124415660.597829Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)Adventure|Children|Fantasy
13465020.53141728 Days Later (2002)Action|Horror|Sci-Fi
1441450.447853Bad Boys (1995)Action|Comedy|Crime|Drama|Thriller
15465370.439573Terminator 3: Rise of the Machines (2003)Action|Adventure|Sci-Fi
1643550.430258Flintstones, The (1994)Children|Comedy|Fantasy
17456730.266561Punch-Drunk Love (2002)Comedy|Drama|Romance
1844810.243838Kalifornia (1993)Drama|Thriller
1942670.239114Major Payne (1995)Comedy
20648960.687780Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)Adventure|Children|Fantasy
216415660.572620Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)Adventure|Children|Fantasy
22615000.572483Grosse Pointe Blank (1997)Comedy|Crime|Romance
23665020.52322028 Days Later (2002)Action|Horror|Sci-Fi
24665370.455307Terminator 3: Rise of the Machines (2003)Action|Adventure|Sci-Fi
25656730.321320Punch-Drunk Love (2002)Comedy|Drama|Romance
2664810.302354Kalifornia (1993)Drama|Thriller
27648900.270704Shallow Hal (2001)Comedy|Fantasy|Romance
28659540.26198125th Hour (2002)Crime|Drama
29634680.239384Hustler, The (1961)Drama
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "recommendations = gmf_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)\n", "\n", "recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')\n", "print(\"Recommendations\")\n", "display(HTML(recommendations.to_html()))" ] }, { "cell_type": "markdown", "id": "documentary-barcelona", "metadata": {}, "source": [ "## User and item representations" ] }, { "cell_type": "code", "execution_count": 8, "id": "balanced-detective", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User id=1\n", "[ 8.8694301e-03 -1.1293894e-09 7.6482260e-01 6.5688614e-06\n", " 6.1402158e-03 -3.4989858e-10 3.0581679e-05 1.6342730e-05]\n", "\n", "User watched\n", "['Independence Day (a.k.a. ID4) (1996)', 'Grosse Pointe Blank (1997)', 'Ladyhawke (1985)']\n", "\n", "User history item representations\n", "Item id = 780\titem title = Independence Day (a.k.a. ID4) (1996)\n", "[-2.0800237e-01 -3.2530998e-08 -7.2467870e-01 -7.6390163e-04\n", " 6.0946174e-02 -1.0309565e-09 -1.6934791e-03 -3.3520073e-02]\n", "Scalar product=-0.555722\n", "Score=0.884161\n", "\n", "Item id = 1500\titem title = Grosse Pointe Blank (1997)\n", "[-4.7350328e-02 -1.4992246e-09 -1.5850608e-01 -2.9982104e-05\n", " 6.0663655e-02 4.1064720e-08 1.5929480e-04 1.2831817e-03]\n", "Scalar product=-0.121276\n", "Score=0.609364\n", "\n", "Item id = 3479\titem title = Ladyhawke (1985)\n", "[-2.8682781e-02 6.1106755e-09 6.3241005e-01 -3.3657509e-06\n", " 9.6770316e-02 9.6757424e-10 -6.0637249e-05 1.5274031e-03]\n", "Scalar product=0.484021\n", "Score=0.145174\n", "\n", "===============\n", "Item id = 145\titem title = Bad Boys (1995)\n", "[-9.6727222e-02 1.2952676e-09 8.4303088e-02 1.5707446e-05\n", " 9.7245917e-02 -9.5372132e-10 -9.6978983e-05 1.0601738e-02]\n", "Scalar product=0.064216\n", "Score=0.441272\n", "\n", "Item id = 171\titem title = Jeffrey (1995)\n", "[ 7.6405336e-03 -6.6923184e-10 9.0268552e-01 -5.7306852e-06\n", " -1.5152089e-02 -9.7515729e-10 -1.3149886e-04 4.9494698e-08]\n", "Scalar product=0.690369\n", "Score=0.073709\n" ] } ], "source": [ "user_id = 1\n", "user_repr = gmf_recommender.get_user_repr(user_id=user_id)\n", "print(\"User id={}\".format(user_id))\n", "print(user_repr)\n", "print()\n", "\n", "print(\"User watched\")\n", "print(ml_df.loc[ml_df['user_id'] == user_id, 'title'].tolist())\n", "print()\n", "\n", "print('User history item representations')\n", "for item_id in ml_df.loc[ml_df['user_id'] == user_id, 'item_id'].tolist():\n", " item_repr = gmf_recommender.get_item_repr(item_id=item_id)\n", " print(\"Item id = {}\\titem title = {}\".format(\n", " item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))\n", " print(item_repr)\n", " scalar_product = np.dot(user_repr, item_repr)\n", " print(\"Scalar product={:.6f}\".format(scalar_product))\n", " score = gmf_recommender.nn_model(\n", " torch.tensor([[gmf_recommender.user_id_mapping[user_id], \n", " gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()\n", " print(\"Score={:.6f}\".format(score))\n", " print()\n", "\n", "print(\"===============\")\n", " \n", "item_id = 145\n", "item_repr = gmf_recommender.get_item_repr(item_id=item_id)\n", "print(\"Item id = {}\\titem title = {}\".format(item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))\n", "print(item_repr)\n", "score = np.dot(user_repr, item_repr)\n", "print(\"Scalar product={:.6f}\".format(score))\n", "score = gmf_recommender.nn_model(\n", " torch.tensor([[gmf_recommender.user_id_mapping[user_id], \n", " gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()\n", "print(\"Score={:.6f}\".format(score))\n", "print()\n", "\n", "item_id = 171\n", "item_repr = gmf_recommender.get_item_repr(item_id=item_id)\n", "print(\"Item id = {}\\titem title = {}\".format(item_id, ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]))\n", "print(item_repr)\n", "score = np.dot(user_repr, item_repr)\n", "print(\"Scalar product={:.6f}\".format(score))\n", "score = gmf_recommender.nn_model(\n", " torch.tensor([[gmf_recommender.user_id_mapping[user_id], \n", " gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)).flatten().detach().cpu().item()\n", "print(\"Score={:.6f}\".format(score))" ] }, { "cell_type": "markdown", "id": "framed-negative", "metadata": {}, "source": [ "# Training-test split evaluation" ] }, { "cell_type": "code", "execution_count": 9, "id": "amended-future", "metadata": {}, "outputs": [], "source": [ "from evaluation_and_testing.testing import evaluate_train_test_split_implicit" ] }, { "cell_type": "code", "execution_count": 43, "id": "unsigned-video", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0GMFRecommender0.2922080.4870130.6623380.8051950.2922080.4049140.4772920.52351
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "gmf_recommender = GMFRecommender(n_neg_per_pos=10, batch_size=16, \n", " embedding_dim=6, lr=0.001, weight_decay=0.0001, n_epochs=20)\n", "\n", "gmf_tts_results = [['GMFRecommender'] + list(evaluate_train_test_split_implicit(\n", " gmf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n", "\n", "gmf_tts_results = pd.DataFrame(\n", " gmf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(gmf_tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 14, "id": "romantic-music", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0NetflixRecommender0.2922080.5389610.7337660.9480520.2922080.4342890.5142030.583217
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from recommenders.netflix_recommender import NetflixRecommender\n", "\n", "netflix_recommender = NetflixRecommender(n_epochs=150)\n", "\n", "netflix_tts_results = [['NetflixRecommender'] + list(evaluate_train_test_split_implicit(\n", " netflix_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n", "\n", "netflix_tts_results = pd.DataFrame(\n", " netflix_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(netflix_tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 15, "id": "standing-tiffany", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0AmazonRecommender0.1818180.3116880.4025970.5519480.1818180.2578060.2946820.34147
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from recommenders.amazon_recommender import AmazonRecommender\n", "\n", "amazon_recommender = AmazonRecommender()\n", "\n", "amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(\n", " amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n", "\n", "amazon_tts_results = pd.DataFrame(\n", " amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(amazon_tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 16, "id": "saving-harrison", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0TFIDFRecommender0.0259740.0909090.1363640.3181820.0259740.0643930.0836850.140799
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from recommenders.tfidf_recommender import TFIDFRecommender\n", "\n", "tfidf_recommender = TFIDFRecommender()\n", "\n", "tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(\n", " tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n", "\n", "tfidf_tts_results = pd.DataFrame(\n", " tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(tfidf_tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 44, "id": "random-source", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0GMFRecommender0.2922080.4870130.6623380.8051950.2922080.4049140.4772920.523510
1NetflixRecommender0.2922080.5389610.7337660.9480520.2922080.4342890.5142030.583217
2AmazonRecommender0.1818180.3116880.4025970.5519480.1818180.2578060.2946820.341470
3TFIDFRecommender0.0259740.0909090.1363640.3181820.0259740.0643930.0836850.140799
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tts_results = pd.concat([gmf_tts_results, netflix_tts_results, amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)\n", "display(HTML(tts_results.to_html()))" ] }, { "cell_type": "markdown", "id": "continued-harassment", "metadata": {}, "source": [ "# Leave-one-out evaluation" ] }, { "cell_type": "code", "execution_count": 30, "id": "exact-stuff", "metadata": {}, "outputs": [], "source": [ "from evaluation_and_testing.testing import evaluate_leave_one_out_implicit" ] }, { "cell_type": "code", "execution_count": null, "id": "divided-resistance", "metadata": {}, "outputs": [], "source": [ "gmf_recommender = GMFRecommender(n_epochs=10)\n", "\n", "gmf_loo_results = [['NetflixRecommender'] + list(evaluate_leave_one_out_implicit(\n", " gmf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n", "\n", "gmf_loo_results = pd.DataFrame(\n", " gmf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(gmf_loo_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 31, "id": "prerequisite-lounge", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0UserBasedCosineNearestNeighborsRecommender0.0966670.1466670.1866670.3066670.0966670.1242850.1407820.178962
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "netflix_recommender = NetflixRecommender(n_epochs=10)\n", "\n", "netflix_loo_results = [['NetflixRecommender'] + list(evaluate_leave_one_out_implicit(\n", " netflix_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n", "\n", "netflix_loo_results = pd.DataFrame(\n", " netflix_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(netflix_loo_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 35, "id": "social-escape", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0AmazonRecommender0.1666670.2566670.320.4266670.1666670.2190860.2454860.279978
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from recommenders.amazon_recommender import AmazonRecommender\n", "\n", "amazon_recommender = AmazonRecommender()\n", "\n", "amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(\n", " amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n", "\n", "amazon_loo_results = pd.DataFrame(\n", " amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(amazon_loo_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 36, "id": "behind-cambodia", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0TFIDFRecommender0.0066670.0533330.1233330.2333330.0066670.0334910.0621780.096151
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tfidf_recommender = TFIDFRecommender()\n", "\n", "tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(\n", " tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n", "\n", "tfidf_loo_results = pd.DataFrame(\n", " tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(tfidf_loo_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 37, "id": "lightweight-password", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecommenderHR@1HR@3HR@5HR@10NDCG@1NDCG@3NDCG@5NDCG@10
0UserBasedCosineNearestNeighborsRecommender0.0966670.1466670.1866670.3066670.0966670.1242850.1407820.178962
1UserBasedCosineNearestNeighborsRecommender0.1000000.1500000.1800000.3133330.1000000.1271820.1395180.181748
2UserBasedCosineNearestNeighborsRecommender0.2666670.4200000.5133330.6500000.2666670.3577360.3960330.440599
3UserBasedCosineNearestNeighborsRecommender0.1733330.2800000.3366670.4200000.1733330.2345220.2577590.284723
4AmazonRecommender0.1666670.2566670.3200000.4266670.1666670.2190860.2454860.279978
5TFIDFRecommender0.0066670.0533330.1233330.2333330.0066670.0334910.0621780.096151
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "loo_results = pd.concat([gmf_loo_results, netflix_loo_results, amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)\n", "display(HTML(loo_results.to_html()))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 5 }