meh/recommender-systems-class-master/class_13_generalized_matrix_factorization.ipynb

1751 lines
87 KiB
Plaintext
Raw Normal View History

2021-07-07 20:03:54 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "verified-accommodation",
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from IPython.display import Markdown, display, HTML\n",
"from collections import defaultdict, deque\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"\n",
"# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n",
"import os\n",
"os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'\n",
"os.environ['CUDA_LAUNCH_BLOCKING'] = '1'"
]
},
{
"cell_type": "markdown",
"id": "educated-tourist",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "prepared-fraction",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>item_id</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>145</td>\n",
" <td>Bad Boys (1995)</td>\n",
" <td>Action|Comedy|Crime|Drama|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>143</th>\n",
" <td>171</td>\n",
" <td>Jeffrey (1995)</td>\n",
" <td>Comedy|Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>194</th>\n",
" <td>228</td>\n",
" <td>Destiny Turns on the Radio (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>233</td>\n",
" <td>Exotica (1994)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>230</th>\n",
" <td>267</td>\n",
" <td>Major Payne (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>313</th>\n",
" <td>355</td>\n",
" <td>Flintstones, The (1994)</td>\n",
" <td>Children|Comedy|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>379</th>\n",
" <td>435</td>\n",
" <td>Coneheads (1993)</td>\n",
" <td>Comedy|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>419</th>\n",
" <td>481</td>\n",
" <td>Kalifornia (1993)</td>\n",
" <td>Drama|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>615</th>\n",
" <td>780</td>\n",
" <td>Independence Day (a.k.a. ID4) (1996)</td>\n",
" <td>Action|Adventure|Sci-Fi|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>737</th>\n",
" <td>959</td>\n",
" <td>Of Human Bondage (1934)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of interactions left: 1170\n"
]
}
],
"source": [
"# Load the MovieLens ratings and movie metadata, renaming the id columns to user_id / item_id\n",
"ml_ratings_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"ratings.csv\")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})\n",
"ml_movies_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"movies.csv\")).rename(columns={'movieId': 'item_id'})\n",
"# Ratings joined with the movie metadata on item_id\n",
"ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')\n",
"\n",
"# Filter the data to reduce the number of movies: keep 100 movie ids drawn without replacement\n",
"# from a fixed-seed generator, so the same subset is selected on every run\n",
"seed = 6789\n",
"rng = np.random.RandomState(seed=seed)\n",
"left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)\n",
"\n",
"# Apply the same item filter to all three frames so that they stay consistent with each other\n",
"ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]\n",
"ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]\n",
"ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]\n",
"\n",
"# Preview the first ten remaining movies\n",
"display(HTML(ml_movies_df.head(10).to_html()))\n",
"\n",
"print(\"Number of interactions left: {}\".format(len(ml_ratings_df)))"
]
},
{
"cell_type": "markdown",
"id": "opponent-prediction",
"metadata": {},
"source": [
"# Generalized Matrix Factorization (GMF)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "fancy-return",
"metadata": {},
"outputs": [],
"source": [
"from livelossplot import PlotLosses\n",
"\n",
"from recommenders.recommender import Recommender\n",
"\n",
"\n",
"class GMFModel(nn.Module):\n",
"    \"\"\"\n",
"    Generalized Matrix Factorization network: the element-wise product of a user embedding and\n",
"    an item embedding is passed through a bias-free linear layer and squashed with a sigmoid.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, n_items, n_users, embedding_dim, seed):\n",
"        super().__init__()\n",
"\n",
"        # Seeding before the layers are created makes their initial weights reproducible\n",
"        self.seed = torch.manual_seed(seed)\n",
"        self.item_embedding = nn.Embedding(n_items, embedding_dim)\n",
"        self.user_embedding = nn.Embedding(n_users, embedding_dim)\n",
"        self.fc = nn.Linear(embedding_dim, 1, bias=False)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Score index pairs: column 0 of x holds user indices, column 1 holds item indices.\"\"\"\n",
"        users = self.user_embedding(x[:, 0])\n",
"        items = self.item_embedding(x[:, 1])\n",
"        logits = self.fc(users * items)\n",
"\n",
"        return logits.sigmoid()\n",
"\n",
"\n",
"class GMFRecommender(Recommender):\n",
"    \"\"\"\n",
"    Generalized Matrix Factorization recommender as described in:\n",
"    - He X., Liao L., Zhang H., Nie L., Hu X., Chua T., Neural Collaborative Filtering, WWW Conference, 2017\n",
"\n",
"    Recorded interactions are used as positive examples, randomly sampled (user, item) pairs without\n",
"    a recorded interaction as negative examples, and a GMFModel is trained on both with binary\n",
"    cross-entropy.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):\n",
"        \"\"\"\n",
"        :param int seed: Seed for the numpy and torch random number generators.\n",
"        :param int n_neg_per_pos: Number of sampled negative interactions per positive interaction.\n",
"        :param str print_type: 'live' plots losses with livelossplot, 'text' prints them, None is silent.\n",
"        :param params: Optional settings: n_epochs (default 10), lr (0.01), weight_decay (0.001),\n",
"            embedding_dim (4), batch_size (64), device (cuda:0 if available, otherwise cpu),\n",
"            should_recommend_already_bought (False), train (False; when True a part of the data is\n",
"            held out for validation), validation_set_size (0.2), should_save_model (False).\n",
"        \"\"\"\n",
"        super().__init__()\n",
"        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n",
"        self.interactions_df = None\n",
"        self.item_id_mapping = None\n",
"        self.user_id_mapping = None\n",
"        self.item_id_reverse_mapping = None\n",
"        self.user_id_reverse_mapping = None\n",
"        self.r = None\n",
"        self.most_popular_items = None\n",
"\n",
"        self.nn_model = None\n",
"        self.optimizer = None\n",
"\n",
"        self.n_neg_per_pos = n_neg_per_pos\n",
"        self.n_epochs = params.get('n_epochs', 10)  # each epoch goes through the entire training set\n",
"        self.lr = params.get('lr', 0.01)  # learning rate\n",
"        self.weight_decay = params.get('weight_decay', 0.001)  # L2 regularization\n",
"        self.embedding_dim = params.get('embedding_dim', 4)\n",
"        self.batch_size = params.get('batch_size', 64)\n",
"        self.device = params.get(\n",
"            'device', torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))\n",
"\n",
"        self.should_recommend_already_bought = params.get('should_recommend_already_bought', False)\n",
"\n",
"        self.train = params.get('train', False)\n",
"        self.validation_set_size = params.get('validation_set_size', 0.2)\n",
"\n",
"        self.seed = seed\n",
"        self.rng = np.random.RandomState(seed=seed)\n",
"        torch.manual_seed(seed)\n",
"\n",
"        # Always defined, so reading the attribute never fails when the option was not passed\n",
"        self.should_save_model = params.get('should_save_model', False)\n",
"        self.print_type = print_type\n",
"\n",
"    def _sample_negative_interactions(self, r, n_negatives):\n",
"        \"\"\"Sample n_negatives (user, item) index pairs whose entry in the interaction matrix r is 0.\"\"\"\n",
"        n_users, n_items = r.shape\n",
"        if n_negatives > 0 and r.all():\n",
"            # Without this guard the rejection sampling below would never terminate\n",
"            raise ValueError('Cannot sample negative interactions: every user interacted with every item')\n",
"\n",
"        sample_size = 1000\n",
"        user_chunks = [np.empty(0, dtype=np.int64)]\n",
"        item_chunks = [np.empty(0, dtype=np.int64)]\n",
"        n_collected = 0\n",
"        while n_collected < n_negatives:\n",
"            # Users are drawn before items in every round to keep the random stream reproducible\n",
"            user_ids = self.rng.choice(np.arange(n_users), size=sample_size)\n",
"            item_ids = self.rng.choice(np.arange(n_items), size=sample_size)\n",
"\n",
"            is_negative = r[user_ids, item_ids] == 0\n",
"            n_taken = min(int(is_negative.sum()), n_negatives - n_collected)\n",
"            user_chunks.append(user_ids[is_negative][:n_taken])\n",
"            item_chunks.append(item_ids[is_negative][:n_taken])\n",
"            n_collected += n_taken\n",
"\n",
"        negative_user_ids = np.concatenate(user_chunks)\n",
"        negative_item_ids = np.concatenate(item_chunks)\n",
"        return pd.DataFrame({\n",
"            'user_id': negative_user_ids,\n",
"            'item_id': negative_item_ids,\n",
"            'interacted': np.zeros(len(negative_user_ids), dtype=np.int64)\n",
"        })\n",
"\n",
"    def _batch_loss(self, batch_input, y_target):\n",
"        \"\"\"Summed binary cross-entropy of the network on one batch (outputs clipped to avoid log(0)).\"\"\"\n",
"        y = self.nn_model(batch_input).clip(0.000001, 0.999999)\n",
"        return -(y_target * y.log() + (1 - y_target) * (1 - y).log()).sum()\n",
"\n",
"    def fit(self, interactions_df, users_df, items_df):\n",
"        \"\"\"\n",
"        Training of the recommender.\n",
"\n",
"        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items\n",
"            defined by user_id, item_id and features of the interaction.\n",
"        :param pd.DataFrame users_df: DataFrame with users and their features defined by\n",
"            user_id and the user feature columns (unused).\n",
"        :param pd.DataFrame items_df: DataFrame with items and their features defined\n",
"            by item_id and the item feature columns (unused).\n",
"        :raises ValueError: If interactions_df is empty, or if negatives are requested although every\n",
"            user interacted with every item.\n",
"        \"\"\"\n",
"\n",
"        del users_df, items_df\n",
"\n",
"        if len(interactions_df) == 0:\n",
"            raise ValueError('interactions_df must contain at least one interaction')\n",
"\n",
"        # Shift item ids and user ids so that they are consecutive\n",
"\n",
"        unique_item_ids = interactions_df['item_id'].unique()\n",
"        self.item_id_mapping = dict(zip(unique_item_ids, range(len(unique_item_ids))))\n",
"        self.item_id_reverse_mapping = dict(zip(range(len(unique_item_ids)), unique_item_ids))\n",
"        unique_user_ids = interactions_df['user_id'].unique()\n",
"        self.user_id_mapping = dict(zip(unique_user_ids, range(len(unique_user_ids))))\n",
"        self.user_id_reverse_mapping = dict(zip(range(len(unique_user_ids)), unique_user_ids))\n",
"\n",
"        # .map performs one dictionary lookup per value, so an original id can never be confused\n",
"        # with an already mapped id\n",
"        interactions_df = interactions_df.copy()\n",
"        interactions_df['item_id'] = interactions_df['item_id'].map(self.item_id_mapping).astype(np.int64)\n",
"        interactions_df['user_id'] = interactions_df['user_id'].map(self.user_id_mapping).astype(np.int64)\n",
"\n",
"        # Get the number of items and users (mapped ids are 0..n-1)\n",
"\n",
"        self.interactions_df = interactions_df.copy()\n",
"        n_users = len(unique_user_ids)\n",
"        n_items = len(unique_item_ids)\n",
"\n",
"        # Get the user-item interaction matrix\n",
"        r = np.zeros(shape=(n_users, n_items))\n",
"        r[interactions_df['user_id'].values, interactions_df['item_id'].values] = 1\n",
"\n",
"        self.r = r\n",
"\n",
"        # Indicate positive interactions\n",
"\n",
"        interactions_df['interacted'] = 1\n",
"\n",
"        # Generate negative interactions\n",
"\n",
"        negatives_df = self._sample_negative_interactions(r, self.n_neg_per_pos * len(interactions_df))\n",
"        interactions_df = pd.concat([interactions_df, negatives_df], ignore_index=True)\n",
"\n",
"        # Extract the network inputs and targets once instead of slicing the DataFrame in every batch\n",
"\n",
"        inputs = interactions_df[['user_id', 'item_id']].values.astype(np.int64)\n",
"        targets = interactions_df[['interacted']].values.astype(np.float32)\n",
"\n",
"        # Initialize losses and loss visualization\n",
"\n",
"        if self.print_type == 'live':\n",
"            liveloss = PlotLosses()\n",
"\n",
"        last_training_avg_loss = 0.0\n",
"        last_validation_avg_loss = 0.0\n",
"\n",
"        # Initialize the network\n",
"\n",
"        self.nn_model = GMFModel(n_items, n_users, self.embedding_dim, self.seed)\n",
"        self.nn_model.to(self.device)\n",
"        self.optimizer = optim.Adam(self.nn_model.parameters(), lr=self.lr, weight_decay=self.weight_decay)\n",
"\n",
"        # Split the data\n",
"\n",
"        interaction_ids = self.rng.permutation(len(interactions_df))\n",
"        if self.train:\n",
"            train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))\n",
"            training_ids = interaction_ids[:train_validation_slice_idx]\n",
"            validation_ids = interaction_ids[train_validation_slice_idx:]\n",
"        else:\n",
"            training_ids = interaction_ids\n",
"            validation_ids = interaction_ids[:0]\n",
"        has_validation = len(validation_ids) > 0\n",
"\n",
"        # Train the model\n",
"\n",
"        for epoch in range(self.n_epochs):\n",
"\n",
"            # Train\n",
"\n",
"            self.nn_model.train()\n",
"            training_total_loss = 0.0\n",
"\n",
"            self.rng.shuffle(training_ids)\n",
"\n",
"            n_batches = int(np.ceil(len(training_ids) / self.batch_size))\n",
"\n",
"            for batch_idx in range(n_batches):\n",
"\n",
"                batch_ids = training_ids[(batch_idx * self.batch_size):((batch_idx + 1) * self.batch_size)]\n",
"\n",
"                batch_input = torch.from_numpy(inputs[batch_ids]).to(self.device)\n",
"                y_target = torch.from_numpy(targets[batch_ids]).to(self.device)\n",
"\n",
"                # Define loss and backpropagate\n",
"\n",
"                self.optimizer.zero_grad()\n",
"                loss = self._batch_loss(batch_input, y_target)\n",
"\n",
"                loss.backward()\n",
"                self.optimizer.step()\n",
"\n",
"                training_total_loss += loss.item()\n",
"\n",
"                if self.print_type == 'text':\n",
"                    print('\\rEpoch: {}\\tBatch: {}\\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}'.format(\n",
"                        epoch, batch_idx, last_training_avg_loss, last_validation_avg_loss, loss.item()), end='')\n",
"\n",
"            last_training_avg_loss = training_total_loss / max(len(training_ids), 1)\n",
"\n",
"            # Validate (skipped when no validation data was held out; gradients are not needed)\n",
"\n",
"            if has_validation:\n",
"                self.nn_model.eval()\n",
"                with torch.no_grad():\n",
"                    validation_total_loss = self._batch_loss(\n",
"                        torch.from_numpy(inputs[validation_ids]).to(self.device),\n",
"                        torch.from_numpy(targets[validation_ids]).to(self.device)).item()\n",
"                last_validation_avg_loss = validation_total_loss / len(validation_ids)\n",
"\n",
"            # Save and print epoch losses\n",
"\n",
"            if self.print_type == 'live':\n",
"                logs = {'loss': last_training_avg_loss}\n",
"                if has_validation:\n",
"                    logs['val_loss'] = last_validation_avg_loss\n",
"                liveloss.update(logs)\n",
"                liveloss.send()\n",
"\n",
"        self.nn_model.eval()\n",
"\n",
"        # Find the most popular items for the cold start problem\n",
"        # (counted on the positive interactions only - sampled negatives say nothing about popularity)\n",
"\n",
"        offers_count = self.interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()\n",
"        offers_count = offers_count.sort_values('user_id', ascending=False)\n",
"        self.most_popular_items = offers_count.index\n",
"\n",
"    def recommend(self, users_df, items_df, n_recommendations=1):\n",
"        \"\"\"\n",
"        Serving of recommendations. Scores items in items_df for each user in users_df and returns\n",
"        top n_recommendations for each user.\n",
"\n",
"        :param pd.DataFrame users_df: DataFrame with users and their features for which\n",
"            recommendations should be generated.\n",
"        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.\n",
"            Items which were not present in the training data are ignored.\n",
"        :param int n_recommendations: Number of recommendations to be returned for each user.\n",
"        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations\n",
"            for each user (fewer if fewer items are available).\n",
"        :rtype: pd.DataFrame\n",
"        \"\"\"\n",
"\n",
"        # Clean previous recommendations (iloc could be used alternatively)\n",
"        self.recommender_df = self.recommender_df[:0]\n",
"\n",
"        # Map item ids (everything in this section is identical for all users)\n",
"\n",
"        known_item_ids = items_df.loc[items_df['item_id'].isin(list(self.item_id_mapping)), 'item_id']\n",
"        ids_list = [self.item_id_mapping[item_id] for item_id in known_item_ids.drop_duplicates()]\n",
"        ids_array = np.array(ids_list, dtype=np.int64)\n",
"        item_tensor = torch.from_numpy(ids_array)\n",
"\n",
"        # Position of every trained item in ids_list; -1 marks items which are not candidates\n",
"        id_to_pos = np.full(len(self.item_id_mapping), -1, dtype=np.int64)\n",
"        id_to_pos[ids_array] = np.arange(len(ids_array))\n",
"\n",
"        candidate_ids = set(ids_list)\n",
"        popular_items = [item_id for item_id in self.most_popular_items if item_id in candidate_ids]\n",
"\n",
"        # Generate recommendations\n",
"\n",
"        recommendations = []\n",
"\n",
"        for idx, user in users_df.iterrows():\n",
"            user_id = user['user_id']\n",
"\n",
"            if user_id in self.user_id_mapping:\n",
"                if len(ids_list) == 0:\n",
"                    continue\n",
"\n",
"                mapped_user_id = self.user_id_mapping[user_id]\n",
"\n",
"                net_input = torch.stack(\n",
"                    [torch.full_like(item_tensor, mapped_user_id), item_tensor], dim=1).to(self.device)\n",
"\n",
"                with torch.no_grad():\n",
"                    scores = self.nn_model(net_input).flatten().cpu().numpy()\n",
"\n",
"                # Choose n recommendations based on highest scores\n",
"                if not self.should_recommend_already_bought:\n",
"                    seen_pos = id_to_pos[np.flatnonzero(self.r[mapped_user_id])]\n",
"                    scores[seen_pos[seen_pos >= 0]] = -np.inf\n",
"\n",
"                chosen_pos = np.argsort(-scores)[:n_recommendations]\n",
"\n",
"                for item_pos in chosen_pos:\n",
"                    recommendations.append(\n",
"                        {\n",
"                            'user_id': self.user_id_reverse_mapping[mapped_user_id],\n",
"                            'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],\n",
"                            'score': scores[item_pos]\n",
"                        }\n",
"                    )\n",
"            else:  # For new users recommend most popular items\n",
"                for item_id in popular_items[:n_recommendations]:\n",
"                    recommendations.append(\n",
"                        {\n",
"                            'user_id': user['user_id'],\n",
"                            'item_id': self.item_id_reverse_mapping[item_id],\n",
"                            'score': 1.0\n",
"                        }\n",
"                    )\n",
"\n",
"        # Build the result once instead of concatenating a DataFrame per user\n",
"        if recommendations:\n",
"            self.recommender_df = pd.DataFrame(recommendations, columns=['user_id', 'item_id', 'score'])\n",
"\n",
"        return self.recommender_df\n",
"\n",
"    def get_user_repr(self, user_id):\n",
"        \"\"\"Return the learned embedding of a user as a numpy array (KeyError for users unseen in fit).\"\"\"\n",
"        mapped_user_id = self.user_id_mapping[user_id]\n",
"        return self.nn_model.user_embedding(torch.tensor(mapped_user_id).to(self.device)).detach().cpu().numpy()\n",
"\n",
"    def get_item_repr(self, item_id):\n",
"        \"\"\"Return the learned embedding of an item as a numpy array (KeyError for items unseen in fit).\"\"\"\n",
"        mapped_item_id = self.item_id_mapping[item_id]\n",
"        return self.nn_model.item_embedding(torch.tensor(mapped_item_id).to(self.device)).detach().cpu().numpy()\n",
"\n",
" \n",
"class MLPModel(nn.Module):\n",
"    \"\"\"\n",
"    Multi-layer perceptron scorer: concatenated user and item embeddings are passed through\n",
"    two bias-free ReLU layers (32 and 16 units) and a bias-free sigmoid output unit.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, n_items, n_users, embedding_dim, seed):\n",
"        super().__init__()\n",
"\n",
"        # Seeding before the layers are created makes their initial weights reproducible\n",
"        self.seed = torch.manual_seed(seed)\n",
"        self.item_embedding = nn.Embedding(n_items, embedding_dim)\n",
"        self.user_embedding = nn.Embedding(n_users, embedding_dim)\n",
"        self.fc1 = nn.Linear(2 * embedding_dim, 32, bias=False)\n",
"        self.fc2 = nn.Linear(32, 16, bias=False)\n",
"        self.fc3 = nn.Linear(16, 1, bias=False)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Score index pairs: column 0 of x holds user indices, column 1 holds item indices.\"\"\"\n",
"        user_vectors = self.user_embedding(x[:, 0])\n",
"        item_vectors = self.item_embedding(x[:, 1])\n",
"\n",
"        hidden = torch.cat((user_vectors, item_vectors), dim=1)\n",
"        hidden = self.fc1(hidden).relu()\n",
"        hidden = self.fc2(hidden).relu()\n",
"\n",
"        return self.fc3(hidden).sigmoid()\n",
"\n",
" \n",
"class NeuMFModel(nn.Module):\n",
"    \"\"\"\n",
"    Neural Matrix Factorization network: a GMF branch (element-wise product of embeddings) and\n",
"    an MLP branch (concatenated embeddings passed through two ReLU layers) are concatenated and\n",
"    merged by a bias-free linear layer followed by a sigmoid. Both branches own their embeddings.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, n_items, n_users, gmf_embedding_dim, mlp_embedding_dim, seed):\n",
"        super().__init__()\n",
"\n",
"        # Seeding before the layers are created makes their initial weights reproducible\n",
"        self.seed = torch.manual_seed(seed)\n",
"\n",
"        # GMF\n",
"\n",
"        self.gmf_user_embedding = nn.Embedding(n_users, gmf_embedding_dim)\n",
"        self.gmf_item_embedding = nn.Embedding(n_items, gmf_embedding_dim)\n",
"\n",
"        # MLP\n",
"\n",
"        mlp_output_dim = 16\n",
"        self.mlp_user_embedding = nn.Embedding(n_users, mlp_embedding_dim)\n",
"        self.mlp_item_embedding = nn.Embedding(n_items, mlp_embedding_dim)\n",
"        self.mlp_fc1 = nn.Linear(2 * mlp_embedding_dim, 32, bias=False)\n",
"        self.mlp_fc2 = nn.Linear(32, mlp_output_dim, bias=False)\n",
"\n",
"        # Merge - the input is the GMF output concatenated with the MLP output, so its width\n",
"        # depends on gmf_embedding_dim (a fixed width of 32 only fits gmf_embedding_dim == 16)\n",
"\n",
"        self.fc = nn.Linear(gmf_embedding_dim + mlp_output_dim, 1, bias=False)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Score index pairs: column 0 of x holds user indices, column 1 holds item indices.\"\"\"\n",
"        user = x[:, 0]\n",
"        item = x[:, 1]\n",
"\n",
"        # GMF\n",
"\n",
"        gmf_user_embedding = self.gmf_user_embedding(user)\n",
"        gmf_item_embedding = self.gmf_item_embedding(item)\n",
"        gmf_x = gmf_user_embedding * gmf_item_embedding\n",
"\n",
"        # MLP\n",
"\n",
"        mlp_user_embedding = self.mlp_user_embedding(user)\n",
"        mlp_item_embedding = self.mlp_item_embedding(item)\n",
"        mlp_x = torch.cat([mlp_user_embedding, mlp_item_embedding], dim=1)\n",
"        mlp_x = torch.relu(self.mlp_fc1(mlp_x))\n",
"        mlp_x = torch.relu(self.mlp_fc2(mlp_x))\n",
"\n",
"        # Final score\n",
"\n",
"        x = torch.cat([gmf_x, mlp_x], dim=1)\n",
"        x = torch.sigmoid(self.fc(x))\n",
"\n",
"        return x"
]
},
{
"cell_type": "markdown",
"id": "expensive-offering",
"metadata": {},
"source": [
"## Quick test of the recommender (training)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "nonprofit-roads",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAbkAAAI4CAYAAAD3UJfIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABN6klEQVR4nO3deXzU1b3/8deZmez7BglZCCCEJewJBhHRqhXc96Vqq63aWq31drXtvfXW297bvdb+tNZa7eZStO7iUnetC4Qt7IusCQHCkkDInjm/P2YICSQQyCTfWd7PxyOPyZz5zsxnSODN+Z7zPcdYaxEREQlHLqcLEBER6S8KORERCVsKORERCVsKORERCVsKORERCVsKORERCVsKORERCVsKOZEBZIzZZIw5y+k6RCKFQk5ERMKWQk7EYcaYGGPMvcaYbf6ve40xMf7HMo0xLxljao0xe4wx7xtjXP7HvmuMqTLG7DfGrDHGnOnsJxEJPh6nCxARfgCUAZMACzwP/CfwX8A3gUogy39sGWCNMUXA7UCptXabMaYQcA9s2SLBTz05EeddC9xjrd1pra0BfgRc73+sFcgBhlprW62171vfgrPtQAww1hgTZa3dZK391JHqRYKYQk7EeUOAzZ3ub/a3AfwCWA+8bozZYIy5C8Baux64E/hvYKcx5kljzBBEpAuFnIjztgFDO90v8Ldhrd1vrf2mtXY4cCHwjYNjb9bax621p/qfa4GfDWzZIsFPIScy8KKMMbEHv4AngP80xmQZYzKBHwJ/BzDGnG+MOckYY4A6fKcpvcaYImPMZ/wTVJqARsDrzMcRCV4KOZGBNw9fKB38igXKgQpgGbAI+LH/2JHAG0A98BHwgLX2bXzjcT8FdgHbgUHA9wbuI4iEBqNNU0VEJFypJyciImFLISciImFLISciImFLISciImHLsWW9MjMzbWFhoVNvLyIiYWLhwoW7rLVZ3T3mWMgVFhZSXl7u1NuLiEiYMMZs7ukxna4UEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwpZATEZGwFdIh19Lm5cWl22j3WqdLERGRIBTSITdvWTVfe2Ixn/3Nu7y4dBtehZ2IiHQS0iF34cQh3P+5KbiM4WtPLGbOb9/n1eXVWKuwExGRXoacMWa2MWaNMWa9Meaubh7/jTFmif9rrTGmNuCVdsPlMpw3IYdX7zyN3149idZ2L1/5+yLO/90HvLFyh8JORCTCmWMFgTHGDawFzgYqgQXANdbalT0c/zVgsrX2i0d73ZKSElteXn5CRfekrd3Lc0u2cd+b69iyp4GJeSn8x9mjmDUqC2NMQN9LRESCgzFmobW2pLvHetOTmwast9ZusNa2AE8CFx3l+GuAJ46/zL7zuF1cPjWPN785i59eOp5d9S3c8OgCLn/wI/69fpd6diIiEaY3IZcLbO10v9LfdgRjzFBgGPBW30s7cVFuF1dPK+Dtb53O/1xcTNXeRq59+BOufuhj5m/c42RpIiIygAI98eRq4GlrbXt3DxpjbjHGlBtjymtqagL81keK9ri4vmwo73z7dO6+YCwbdh3gyj98xHUPf8LCzXv7/f1FRMRZvQm5KiC/0/08f1t3ruYopyqttQ9Za0ustSVZWVm9r7KPYqPc3DhjGO99+wx+cO4YVlXv47Lff8gNj86norJ2wOoQEZGB1ZuJJx58E0/OxBduC4DPWWtXHHbcaOBVYJjtxeBXQCaebFsC7/wUErMgcTAkDDry+5hkOGzSyYHmNv7y0SYeem8DtQ2tnD12MP9x1ijGDknuWz0iIjLgjjbxxHOsJ1tr24wxtwOvAW7gEWvtCmPMPUC5tfYF/6FXA0/2JuACpqUe6iph2yI4UA
PWe+QxntjDwi+LhMRBfDVxMDdemM7LG9p5tGI5V963kdOKh3F60WCS4zwkxUaRHBvV8X1SrIcod0hfVigiEnGO2ZPrLwG/hMDbDg174MBOqN8B9TXdfO//atjVbSA22SgaicGLwYsLL4Z2XL7vrQHjwho3xuUC/61xHbz14Ha5MG4Pbrcbt9uNy+3BuA7eenC5D35F4fJ4cLk84HKDy+P/ch+6NYe1u6MgNgXi0iAu3X+bBvHpEBUXuD9HEZEQ06eeXMhwuf29tSwYPO7ox3rboWG3P/R2+HqB9Tvx7NtOVGMDLW1teFvbaG1ro7Wtnfa2Vlrb22lva6OtvZ329jba29t9X23t2PY22r1eXLYdQxtuWnAZi5t23P649ODtuO/Gi4d23MbX7jHtuLG+Nv/xvmPbcXHs/4RYdyzEpWHiDwZgatcQPPh953BMygGXeqYiEt7CJ+SOh8sNiYN8XxR3NHuAxBN8SWstzW1e9jW2sq+plX1NbRxobqOlzUtzm5fmtnaaW3v4vs3rv99OS7u3y2MtrW20trbR1tqMbdyLu7mOVFNPKvWHbtvqSW2uJ33fAbLce0g1W0mhniS7jyjb2m293rThuMq+ApM+BzFJJ/ipRUSCW/icrowQbe1eahtb2Xughb0Nrew50EJtg+/7vQ0t/nb//fpmGhvqoWlvRyimcIAsU8slnn8zxayjxZNI28TriD/1q5A21OmPJyJy3I52ulIhFwHavZZ9ja3safAFYs3+Fj7ZuJuq5e9zXsPznOv6BJexbMw8g+hTbyN/whm+cUcRkRCgkJNuWWtZs2M/Hy6qIHnZnzmrYR6p5gCrXSNYPfQ6sqdfQ8mIwXg0q1REgphCTnpl+649bH77EfLX/JkhbVvZaVOZa2azfeQ1TB9fxKyiLBJjInMYV0SCl0JOjo/XS9OaN9j/zn1k7XifZqJ4tm0Gf7fnkj5iMmePHcxZYwaRk6JLF0TEeQo5OXE1a/B+/Hvskidwtzex0DWBB5rO5i3vZMblpnL2mGwumJjD8KwTnZcqItI3Cjnpu4Y9sOgv2Pl/xOyrojYun6fd53Hv7lLaPAk8f9upFGXrUgQRGXh93U9OxHdR+an/gfn6Urj8EVIzsrmp/kEqku7ka1EvcOtjCznQ3OZ0lSIiXSjk5Pi4o6D4MrjpDfjSG7gKyrjN+zjJu5fy/WeXaWNaEQkqCjk5cfmlcMWjEJfGvdmv8fySbTw+f4vTVYmIdFDISd/EJMEpX6Nwz7+5YehufvTCSpZX1TldlYgIoJCTQJh2C8Sl8f2EF0hPiOarjy1iX1P3a2aKiAwkhZz0XUwSTL+d6A3/4tHPuthW28h3nqrQ+JyIOE4hJ4Ex7RaITWXM2gf57uzRvLpiO4/+e5PTVYlIhFPISWDEJsP022Htq9w0oo6zxw7mf+etYtGWvU5XJiIRTCEngXOyrzdn3v0Zv7x8Ijmpsdz+2CL2HmhxujIRiVAKOQmc2BSYfhusfYWU2hXc/7kp7Kpv4Rtzl+D1anxORAaeQk4C6+Qv+8Lu3Z8xIS+V/zp/DG+vqeHB9z51ujIRiUAKOQms2BQouw3WzIPqpVxXNpTzJ+Twy9fW8PGG3U5XJyIRRiEngXewN/fOzzDG8NPLJlCYkcAdTyymZn+z09WJSARRyEngxaVC2VdhzctQXUFijIf7r51CXWMrd/5jMe0anxORAaKQk/5x8lcgxjc2BzAmJ5n/uaiYf6/fzX1vrnO4OBGJFAo56R9xqVB2K6x+CbYvA+CKkjwum5LHfW+t4/11Nc7WJyIRQSEn/afsKxCT3NGbM8bwPxePY+SgRO58cgnb65ocLlBEwp1CTvpPXJqvN7fqRdi+HID4aA8PXDuFxtZ2vvbEItravQ4XKSLhTCEn/avs1i69OYCTBiXxf5eOZ8Gmvfzy9bUOFici4U4hJ/0rLs03CWXVC7BjRUfzRZNy+dzJBTz47qe8uWqHgwWKSDhTyEn/K7sVopO69OYAfn
j+WMYNSeYbc5eydU+DQ8WJSDhTyEn/i0/3XSC+8nnYsbKjOTbKzQPXTsHrtdz++CJa2jQ+JyKBpZCTgTH9tm57c0M
"text/plain": [
"<Figure size 864x576 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loss\n",
"\ttraining \t (min: 0.130, max: 0.706, cur: 0.130)\n",
"\tvalidation \t (min: 0.224, max: 0.696, cur: 0.226)\n"
]
}
],
"source": [
"# train=True holds out part of the interactions for validation; without it there is no\n",
"# validation set and therefore no val_loss curve to plot\n",
"gmf_recommender = GMFRecommender(print_type='live', n_neg_per_pos=10, batch_size=16,\n",
"                                 embedding_dim=6, lr=0.001, weight_decay=0.0001, n_epochs=20, seed=1,\n",
"                                 train=True)\n",
"gmf_recommender.fit(ml_ratings_df, None, ml_movies_df)"
]
},
{
"cell_type": "markdown",
"id": "incorporated-messaging",
"metadata": {},
"source": [
"## Quick test of the recommender (recommending)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "accessible-value",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Recommendations\n"
]
},
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>score</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>4896</td>\n",
" <td>0.768898</td>\n",
" <td>Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>435</td>\n",
" <td>0.650600</td>\n",
" <td>Coneheads (1993)</td>\n",
" <td>Comedy|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>41566</td>\n",
" <td>0.609373</td>\n",
" <td>Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>6502</td>\n",
" <td>0.535332</td>\n",
" <td>28 Days Later (2002)</td>\n",
" <td>Action|Horror|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>145</td>\n",
" <td>0.441272</td>\n",
" <td>Bad Boys (1995)</td>\n",
" <td>Action|Comedy|Crime|Drama|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>6537</td>\n",
" <td>0.432268</td>\n",
" <td>Terminator 3: Rise of the Machines (2003)</td>\n",
" <td>Action|Adventure|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>355</td>\n",
" <td>0.421626</td>\n",
" <td>Flintstones, The (1994)</td>\n",
" <td>Children|Comedy|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>5673</td>\n",
" <td>0.242538</td>\n",
" <td>Punch-Drunk Love (2002)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>481</td>\n",
" <td>0.218651</td>\n",
" <td>Kalifornia (1993)</td>\n",
" <td>Drama|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>267</td>\n",
" <td>0.213728</td>\n",
" <td>Major Payne (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>4</td>\n",
" <td>780</td>\n",
" <td>0.858898</td>\n",
" <td>Independence Day (a.k.a. ID4) (1996)</td>\n",
" <td>Action|Adventure|Sci-Fi|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4</td>\n",
" <td>435</td>\n",
" <td>0.634766</td>\n",
" <td>Coneheads (1993)</td>\n",
" <td>Comedy|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>41566</td>\n",
" <td>0.597829</td>\n",
" <td>Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>4</td>\n",
" <td>6502</td>\n",
" <td>0.531417</td>\n",
" <td>28 Days Later (2002)</td>\n",
" <td>Action|Horror|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>4</td>\n",
" <td>145</td>\n",
" <td>0.447853</td>\n",
" <td>Bad Boys (1995)</td>\n",
" <td>Action|Comedy|Crime|Drama|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>4</td>\n",
" <td>6537</td>\n",
" <td>0.439573</td>\n",
" <td>Terminator 3: Rise of the Machines (2003)</td>\n",
" <td>Action|Adventure|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>4</td>\n",
" <td>355</td>\n",
" <td>0.430258</td>\n",
" <td>Flintstones, The (1994)</td>\n",
" <td>Children|Comedy|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>4</td>\n",
" <td>5673</td>\n",
" <td>0.266561</td>\n",
" <td>Punch-Drunk Love (2002)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>4</td>\n",
" <td>481</td>\n",
" <td>0.243838</td>\n",
" <td>Kalifornia (1993)</td>\n",
" <td>Drama|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>4</td>\n",
" <td>267</td>\n",
" <td>0.239114</td>\n",
" <td>Major Payne (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>6</td>\n",
" <td>4896</td>\n",
" <td>0.687780</td>\n",
" <td>Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>6</td>\n",
" <td>41566</td>\n",
" <td>0.572620</td>\n",
" <td>Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>6</td>\n",
" <td>1500</td>\n",
" <td>0.572483</td>\n",
" <td>Grosse Pointe Blank (1997)</td>\n",
" <td>Comedy|Crime|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>6</td>\n",
" <td>6502</td>\n",
" <td>0.523220</td>\n",
" <td>28 Days Later (2002)</td>\n",
" <td>Action|Horror|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>6</td>\n",
" <td>6537</td>\n",
" <td>0.455307</td>\n",
" <td>Terminator 3: Rise of the Machines (2003)</td>\n",
" <td>Action|Adventure|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>6</td>\n",
" <td>5673</td>\n",
" <td>0.321320</td>\n",
" <td>Punch-Drunk Love (2002)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>6</td>\n",
" <td>481</td>\n",
" <td>0.302354</td>\n",
" <td>Kalifornia (1993)</td>\n",
" <td>Drama|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>6</td>\n",
" <td>4890</td>\n",
" <td>0.270704</td>\n",
" <td>Shallow Hal (2001)</td>\n",
" <td>Comedy|Fantasy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>6</td>\n",
" <td>5954</td>\n",
" <td>0.261981</td>\n",
" <td>25th Hour (2002)</td>\n",
" <td>Crime|Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>6</td>\n",
" <td>3468</td>\n",
" <td>0.239384</td>\n",
" <td>Hustler, The (1961)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Ask for 10 recommendations for each of the users 1, 4 and 6\n",
"recommendations = gmf_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)\n",
"\n",
"# Attach titles and genres for readability; the left join keeps every recommendation row\n",
"recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')\n",
"print(\"Recommendations\")\n",
"display(HTML(recommendations.to_html()))"
]
},
{
"cell_type": "markdown",
"id": "documentary-barcelona",
"metadata": {},
"source": [
"## User and item representations"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "balanced-detective",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"User id=1\n",
"[ 8.8694301e-03 -1.1293894e-09 7.6482260e-01 6.5688614e-06\n",
" 6.1402158e-03 -3.4989858e-10 3.0581679e-05 1.6342730e-05]\n",
"\n",
"User watched\n",
"['Independence Day (a.k.a. ID4) (1996)', 'Grosse Pointe Blank (1997)', 'Ladyhawke (1985)']\n",
"\n",
"User history item representations\n",
"Item id = 780\titem title = Independence Day (a.k.a. ID4) (1996)\n",
"[-2.0800237e-01 -3.2530998e-08 -7.2467870e-01 -7.6390163e-04\n",
" 6.0946174e-02 -1.0309565e-09 -1.6934791e-03 -3.3520073e-02]\n",
"Scalar product=-0.555722\n",
"Score=0.884161\n",
"\n",
"Item id = 1500\titem title = Grosse Pointe Blank (1997)\n",
"[-4.7350328e-02 -1.4992246e-09 -1.5850608e-01 -2.9982104e-05\n",
" 6.0663655e-02 4.1064720e-08 1.5929480e-04 1.2831817e-03]\n",
"Scalar product=-0.121276\n",
"Score=0.609364\n",
"\n",
"Item id = 3479\titem title = Ladyhawke (1985)\n",
"[-2.8682781e-02 6.1106755e-09 6.3241005e-01 -3.3657509e-06\n",
" 9.6770316e-02 9.6757424e-10 -6.0637249e-05 1.5274031e-03]\n",
"Scalar product=0.484021\n",
"Score=0.145174\n",
"\n",
"===============\n",
"Item id = 145\titem title = Bad Boys (1995)\n",
"[-9.6727222e-02 1.2952676e-09 8.4303088e-02 1.5707446e-05\n",
" 9.7245917e-02 -9.5372132e-10 -9.6978983e-05 1.0601738e-02]\n",
"Scalar product=0.064216\n",
"Score=0.441272\n",
"\n",
"Item id = 171\titem title = Jeffrey (1995)\n",
"[ 7.6405336e-03 -6.6923184e-10 9.0268552e-01 -5.7306852e-06\n",
" -1.5152089e-02 -9.7515729e-10 -1.3149886e-04 4.9494698e-08]\n",
"Scalar product=0.690369\n",
"Score=0.073709\n"
]
}
],
"source": [
"user_id = 1\n",
"user_repr = gmf_recommender.get_user_repr(user_id=user_id)\n",
"user_history = ml_df.loc[ml_df['user_id'] == user_id]\n",
"\n",
"\n",
"def describe_item(item_id):\n",
"    \"\"\"Print an item's title and representation together with its match to `user_repr`.\n",
"\n",
"    Two numbers are printed for the (user_id, item_id) pair: the plain scalar product of\n",
"    the user and item representation vectors, and the score returned by the trained network.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    item_id : int\n",
"        Raw item id as used in `ml_movies_df['item_id']`.\n",
"    \"\"\"\n",
"    item_repr = gmf_recommender.get_item_repr(item_id=item_id)\n",
"    item_title = ml_movies_df.loc[ml_movies_df['item_id'] == item_id, 'title'].iloc[0]\n",
"    print(\"Item id = {}\\titem title = {}\".format(item_id, item_title))\n",
"    print(item_repr)\n",
"    print(\"Scalar product={:.6f}\".format(np.dot(user_repr, item_repr)))\n",
"    # The network is fed mapped ids (presumably embedding indices), not the raw dataset ids.\n",
"    model_input = torch.tensor([[gmf_recommender.user_id_mapping[user_id],\n",
"                                 gmf_recommender.item_id_mapping[item_id]]]).to(gmf_recommender.device)\n",
"    score = gmf_recommender.nn_model(model_input).flatten().detach().cpu().item()\n",
"    print(\"Score={:.6f}\".format(score))\n",
"\n",
"\n",
"print(\"User id={}\".format(user_id))\n",
"print(user_repr)\n",
"print()\n",
"\n",
"print(\"User watched\")\n",
"print(user_history['title'].tolist())\n",
"print()\n",
"\n",
"print('User history item representations')\n",
"for history_item_id in user_history['item_id'].tolist():\n",
"    describe_item(history_item_id)\n",
"    print()\n",
"\n",
"print(\"===============\")\n",
"\n",
"# Two further hand-picked items, shown for comparison with the history items.\n",
"describe_item(145)\n",
"print()\n",
"describe_item(171)"
]
},
{
"cell_type": "markdown",
"id": "framed-negative",
"metadata": {},
"source": [
"# Training-test split evaluation"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "amended-future",
"metadata": {},
"outputs": [],
"source": [
"from evaluation_and_testing.testing import evaluate_train_test_split_implicit"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "unsigned-video",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>GMFRecommender</td>\n",
" <td>0.292208</td>\n",
" <td>0.487013</td>\n",
" <td>0.662338</td>\n",
" <td>0.805195</td>\n",
" <td>0.292208</td>\n",
" <td>0.404914</td>\n",
" <td>0.477292</td>\n",
" <td>0.52351</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Train/test split evaluation of the GMF recommender with explicit hyperparameters.\n",
"metric_names = ['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']\n",
"interactions_df = ml_ratings_df.loc[:, ['user_id', 'item_id']]\n",
"\n",
"gmf_recommender = GMFRecommender(\n",
"    n_neg_per_pos=10, batch_size=16, embedding_dim=6, lr=0.001, weight_decay=0.0001, n_epochs=20)\n",
"gmf_tts_metrics = evaluate_train_test_split_implicit(gmf_recommender, interactions_df, ml_movies_df)\n",
"\n",
"# One row: the recommender label followed by the metric values.\n",
"gmf_tts_results = pd.DataFrame(\n",
"    [['GMFRecommender', *gmf_tts_metrics]], columns=['Recommender'] + metric_names)\n",
"\n",
"display(HTML(gmf_tts_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "romantic-music",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NetflixRecommender</td>\n",
" <td>0.292208</td>\n",
" <td>0.538961</td>\n",
" <td>0.733766</td>\n",
" <td>0.948052</td>\n",
" <td>0.292208</td>\n",
" <td>0.434289</td>\n",
" <td>0.514203</td>\n",
" <td>0.583217</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from recommenders.netflix_recommender import NetflixRecommender\n",
"\n",
"# Train/test split evaluation of the Netflix recommender (150 epochs).\n",
"metric_names = ['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']\n",
"interactions_df = ml_ratings_df.loc[:, ['user_id', 'item_id']]\n",
"\n",
"netflix_recommender = NetflixRecommender(n_epochs=150)\n",
"netflix_tts_metrics = evaluate_train_test_split_implicit(netflix_recommender, interactions_df, ml_movies_df)\n",
"\n",
"# One row: the recommender label followed by the metric values.\n",
"netflix_tts_results = pd.DataFrame(\n",
"    [['NetflixRecommender', *netflix_tts_metrics]], columns=['Recommender'] + metric_names)\n",
"\n",
"display(HTML(netflix_tts_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "standing-tiffany",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AmazonRecommender</td>\n",
" <td>0.181818</td>\n",
" <td>0.311688</td>\n",
" <td>0.402597</td>\n",
" <td>0.551948</td>\n",
" <td>0.181818</td>\n",
" <td>0.257806</td>\n",
" <td>0.294682</td>\n",
" <td>0.34147</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from recommenders.amazon_recommender import AmazonRecommender\n",
"\n",
"# Train/test split evaluation of the Amazon recommender with its default settings.\n",
"metric_names = ['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']\n",
"interactions_df = ml_ratings_df.loc[:, ['user_id', 'item_id']]\n",
"\n",
"amazon_recommender = AmazonRecommender()\n",
"amazon_tts_metrics = evaluate_train_test_split_implicit(amazon_recommender, interactions_df, ml_movies_df)\n",
"\n",
"# One row: the recommender label followed by the metric values.\n",
"amazon_tts_results = pd.DataFrame(\n",
"    [['AmazonRecommender', *amazon_tts_metrics]], columns=['Recommender'] + metric_names)\n",
"\n",
"display(HTML(amazon_tts_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "saving-harrison",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TFIDFRecommender</td>\n",
" <td>0.025974</td>\n",
" <td>0.090909</td>\n",
" <td>0.136364</td>\n",
" <td>0.318182</td>\n",
" <td>0.025974</td>\n",
" <td>0.064393</td>\n",
" <td>0.083685</td>\n",
" <td>0.140799</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from recommenders.tfidf_recommender import TFIDFRecommender\n",
"\n",
"# Train/test split evaluation of the TF-IDF recommender with its default settings.\n",
"metric_names = ['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']\n",
"interactions_df = ml_ratings_df.loc[:, ['user_id', 'item_id']]\n",
"\n",
"tfidf_recommender = TFIDFRecommender()\n",
"tfidf_tts_metrics = evaluate_train_test_split_implicit(tfidf_recommender, interactions_df, ml_movies_df)\n",
"\n",
"# One row: the recommender label followed by the metric values.\n",
"tfidf_tts_results = pd.DataFrame(\n",
"    [['TFIDFRecommender', *tfidf_tts_metrics]], columns=['Recommender'] + metric_names)\n",
"\n",
"display(HTML(tfidf_tts_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "random-source",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>GMFRecommender</td>\n",
" <td>0.292208</td>\n",
" <td>0.487013</td>\n",
" <td>0.662338</td>\n",
" <td>0.805195</td>\n",
" <td>0.292208</td>\n",
" <td>0.404914</td>\n",
" <td>0.477292</td>\n",
" <td>0.523510</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NetflixRecommender</td>\n",
" <td>0.292208</td>\n",
" <td>0.538961</td>\n",
" <td>0.733766</td>\n",
" <td>0.948052</td>\n",
" <td>0.292208</td>\n",
" <td>0.434289</td>\n",
" <td>0.514203</td>\n",
" <td>0.583217</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AmazonRecommender</td>\n",
" <td>0.181818</td>\n",
" <td>0.311688</td>\n",
" <td>0.402597</td>\n",
" <td>0.551948</td>\n",
" <td>0.181818</td>\n",
" <td>0.257806</td>\n",
" <td>0.294682</td>\n",
" <td>0.341470</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TFIDFRecommender</td>\n",
" <td>0.025974</td>\n",
" <td>0.090909</td>\n",
" <td>0.136364</td>\n",
" <td>0.318182</td>\n",
" <td>0.025974</td>\n",
" <td>0.064393</td>\n",
" <td>0.083685</td>\n",
" <td>0.140799</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Stack the per-recommender train/test results into one comparison table with a fresh 0..n-1 index.\n",
"tts_frames = [gmf_tts_results, netflix_tts_results, amazon_tts_results, tfidf_tts_results]\n",
"tts_results = pd.concat(tts_frames, ignore_index=True)\n",
"display(HTML(tts_results.to_html()))"
]
},
{
"cell_type": "markdown",
"id": "continued-harassment",
"metadata": {},
"source": [
"# Leave-one-out evaluation"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "exact-stuff",
"metadata": {},
"outputs": [],
"source": [
"from evaluation_and_testing.testing import evaluate_leave_one_out_implicit"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "divided-resistance",
"metadata": {},
"outputs": [],
"source": [
"# Leave-one-out evaluation of the GMF recommender (10 epochs), capped at 300 evaluations\n",
"# with a fixed seed, matching the settings used for the other recommenders in this section.\n",
"gmf_recommender = GMFRecommender(n_epochs=10)\n",
"\n",
"# The row label must name the model evaluated here; it previously read 'NetflixRecommender',\n",
"# which made the combined leave-one-out table show two Netflix rows and no GMF row.\n",
"gmf_loo_results = [['GMFRecommender'] + list(evaluate_leave_one_out_implicit(\n",
"    gmf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n",
"\n",
"gmf_loo_results = pd.DataFrame(\n",
"    gmf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
"\n",
"display(HTML(gmf_loo_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "prerequisite-lounge",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>UserBasedCosineNearestNeighborsRecommender</td>\n",
" <td>0.096667</td>\n",
" <td>0.146667</td>\n",
" <td>0.186667</td>\n",
" <td>0.306667</td>\n",
" <td>0.096667</td>\n",
" <td>0.124285</td>\n",
" <td>0.140782</td>\n",
" <td>0.178962</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Leave-one-out evaluation of the Netflix recommender (10 epochs, at most 300 evaluations, fixed seed).\n",
"metric_names = ['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']\n",
"interactions_df = ml_ratings_df.loc[:, ['user_id', 'item_id']]\n",
"\n",
"netflix_recommender = NetflixRecommender(n_epochs=10)\n",
"netflix_loo_metrics = evaluate_leave_one_out_implicit(\n",
"    netflix_recommender, interactions_df, ml_movies_df, max_evals=300, seed=6789)\n",
"\n",
"# One row: the recommender label followed by the metric values.\n",
"netflix_loo_results = pd.DataFrame(\n",
"    [['NetflixRecommender', *netflix_loo_metrics]], columns=['Recommender'] + metric_names)\n",
"\n",
"display(HTML(netflix_loo_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "social-escape",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AmazonRecommender</td>\n",
" <td>0.166667</td>\n",
" <td>0.256667</td>\n",
" <td>0.32</td>\n",
" <td>0.426667</td>\n",
" <td>0.166667</td>\n",
" <td>0.219086</td>\n",
" <td>0.245486</td>\n",
" <td>0.279978</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from recommenders.amazon_recommender import AmazonRecommender\n",
"\n",
"# Leave-one-out evaluation of the Amazon recommender (default settings, at most 300 evaluations, fixed seed).\n",
"metric_names = ['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']\n",
"interactions_df = ml_ratings_df.loc[:, ['user_id', 'item_id']]\n",
"\n",
"amazon_recommender = AmazonRecommender()\n",
"amazon_loo_metrics = evaluate_leave_one_out_implicit(\n",
"    amazon_recommender, interactions_df, ml_movies_df, max_evals=300, seed=6789)\n",
"\n",
"# One row: the recommender label followed by the metric values.\n",
"amazon_loo_results = pd.DataFrame(\n",
"    [['AmazonRecommender', *amazon_loo_metrics]], columns=['Recommender'] + metric_names)\n",
"\n",
"display(HTML(amazon_loo_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "behind-cambodia",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TFIDFRecommender</td>\n",
" <td>0.006667</td>\n",
" <td>0.053333</td>\n",
" <td>0.123333</td>\n",
" <td>0.233333</td>\n",
" <td>0.006667</td>\n",
" <td>0.033491</td>\n",
" <td>0.062178</td>\n",
" <td>0.096151</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Leave-one-out evaluation of the TF-IDF recommender (default settings, at most 300 evaluations, fixed seed).\n",
"metric_names = ['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']\n",
"interactions_df = ml_ratings_df.loc[:, ['user_id', 'item_id']]\n",
"\n",
"tfidf_recommender = TFIDFRecommender()\n",
"tfidf_loo_metrics = evaluate_leave_one_out_implicit(\n",
"    tfidf_recommender, interactions_df, ml_movies_df, max_evals=300, seed=6789)\n",
"\n",
"# One row: the recommender label followed by the metric values.\n",
"tfidf_loo_results = pd.DataFrame(\n",
"    [['TFIDFRecommender', *tfidf_loo_metrics]], columns=['Recommender'] + metric_names)\n",
"\n",
"display(HTML(tfidf_loo_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "lightweight-password",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>UserBasedCosineNearestNeighborsRecommender</td>\n",
" <td>0.096667</td>\n",
" <td>0.146667</td>\n",
" <td>0.186667</td>\n",
" <td>0.306667</td>\n",
" <td>0.096667</td>\n",
" <td>0.124285</td>\n",
" <td>0.140782</td>\n",
" <td>0.178962</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>UserBasedCosineNearestNeighborsRecommender</td>\n",
" <td>0.100000</td>\n",
" <td>0.150000</td>\n",
" <td>0.180000</td>\n",
" <td>0.313333</td>\n",
" <td>0.100000</td>\n",
" <td>0.127182</td>\n",
" <td>0.139518</td>\n",
" <td>0.181748</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>UserBasedCosineNearestNeighborsRecommender</td>\n",
" <td>0.266667</td>\n",
" <td>0.420000</td>\n",
" <td>0.513333</td>\n",
" <td>0.650000</td>\n",
" <td>0.266667</td>\n",
" <td>0.357736</td>\n",
" <td>0.396033</td>\n",
" <td>0.440599</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>UserBasedCosineNearestNeighborsRecommender</td>\n",
" <td>0.173333</td>\n",
" <td>0.280000</td>\n",
" <td>0.336667</td>\n",
" <td>0.420000</td>\n",
" <td>0.173333</td>\n",
" <td>0.234522</td>\n",
" <td>0.257759</td>\n",
" <td>0.284723</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AmazonRecommender</td>\n",
" <td>0.166667</td>\n",
" <td>0.256667</td>\n",
" <td>0.320000</td>\n",
" <td>0.426667</td>\n",
" <td>0.166667</td>\n",
" <td>0.219086</td>\n",
" <td>0.245486</td>\n",
" <td>0.279978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>TFIDFRecommender</td>\n",
" <td>0.006667</td>\n",
" <td>0.053333</td>\n",
" <td>0.123333</td>\n",
" <td>0.233333</td>\n",
" <td>0.006667</td>\n",
" <td>0.033491</td>\n",
" <td>0.062178</td>\n",
" <td>0.096151</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Stack the per-recommender leave-one-out results into one comparison table with a fresh 0..n-1 index.\n",
"loo_frames = [gmf_loo_results, netflix_loo_results, amazon_loo_results, tfidf_loo_results]\n",
"loo_results = pd.concat(loo_frames, ignore_index=True)\n",
"display(HTML(loo_results.to_html()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}