diff --git a/.ipynb_checkpoints/P0. Data preparation-checkpoint.ipynb b/.ipynb_checkpoints/P0. Data preparation-checkpoint.ipynb
new file mode 100644
index 0000000..e905e56
--- /dev/null
+++ b/.ipynb_checkpoints/P0. Data preparation-checkpoint.ipynb
@@ -0,0 +1,698 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Building train and test sets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# if you don't have some library installed try using pip (or pip3) to install it - you can do it from the notebook\n",
+ "# example: !pip install tqdm\n",
+ "# also on labs it's better to use python3 kernel - ipython3 notebook\n",
+ "\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import scipy.sparse as sparse\n",
+ "import time\n",
+ "import random\n",
+ "import matplotlib\n",
+ "import matplotlib.pyplot as plt\n",
+ "import os\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "import helpers\n",
+ "\n",
+ "# create the dataset directory up front; exist_ok makes the cell safe to re-run\n",
+ "os.makedirs('./Datasets/', exist_ok = True)\n",
+ "helpers.download_movielens_100k_dataset()\n",
+ "\n",
+ "# u.data is read as a tab-separated file without a header row, so column names are supplied here\n",
+ "df = pd.read_csv(\n",
+ "    './Datasets/ml-100k/u.data',\n",
+ "    delimiter='\\t',\n",
+ "    header=None,\n",
+ "    names=['user', 'item', 'rating', 'timestamp'],\n",
+ ")\n",
+ "\n",
+ "# 80/20 split; the fixed random_state keeps the split identical between runs\n",
+ "train, test = train_test_split(df, test_size=0.2, random_state=30)\n",
+ "\n",
+ "# both splits are written tab-separated, without header and without index\n",
+ "train.to_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, index=False)\n",
+ "test.to_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Interactions properties"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### What does the data look like?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user | \n",
+ " item | \n",
+ " rating | \n",
+ " timestamp | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 196 | \n",
+ " 242 | \n",
+ " 3 | \n",
+ " 881250949 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 186 | \n",
+ " 302 | \n",
+ " 3 | \n",
+ " 891717742 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 22 | \n",
+ " 377 | \n",
+ " 1 | \n",
+ " 878887116 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 244 | \n",
+ " 51 | \n",
+ " 2 | \n",
+ " 880606923 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 166 | \n",
+ " 346 | \n",
+ " 1 | \n",
+ " 886397596 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user item rating timestamp\n",
+ "0 196 242 3 881250949\n",
+ "1 186 302 3 891717742\n",
+ "2 22 377 1 878887116\n",
+ "3 244 51 2 880606923\n",
+ "4 166 346 1 886397596"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sample properties"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "We have 943 users, 1682 items and 100000 ratings.\n",
+ "\n",
+ "Average number of ratings per user is 106.0445. \n",
+ "\n",
+ "Average number of ratings per item is 59.453.\n",
+ "\n",
+ "Data sparsity (% of missing entries) is 93.6953%.\n"
+ ]
+ }
+ ],
+ "source": [
+ "users, items, ratings=df['user'].nunique(), df['item'].nunique(), len(df)\n",
+ "\n",
+ "print(f'We have {users} users, {items} items and {ratings} ratings.\\n')\n",
+ "\n",
+ "print(f'Average number of ratings per user is {round(ratings/users,4)}. \\n')\n",
+ "print(f'Average number of ratings per item is {round(ratings/items,4)}.\\n')\n",
+ "print(f'Data sparsity (% of missing entries) is {round(100*(1-ratings/(users*items)),4)}%.')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "items_per_user = df.groupby(['user']).count()['rating']\n",
+ "\n",
+ "plt.figure(figsize=(16,8))\n",
+ "plt.hist(items_per_user, bins=100)\n",
+ "\n",
+ "# one entry per dashed marker: (value, x-offset factor, relative label height, label template)\n",
+ "markers = [\n",
+ "    (items_per_user.median(), 1.1, 0.9, 'Median: {:.0f}'),\n",
+ "    (items_per_user.quantile(0.25), 1.1, 0.95, '25% quantile: {:.0f}'),\n",
+ "    (items_per_user.quantile(0.75), 1.05, 0.95, '75% quantile: {:.0f}'),\n",
+ "]\n",
+ "for t, x_factor, y_factor, label in markers:\n",
+ "    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
+ "    # the label is placed slightly right of the line, at a fraction of the current y-limit\n",
+ "    plt.text(t*x_factor, plt.ylim()[1]*y_factor, label.format(t))\n",
+ "\n",
+ "plt.title('Number of ratings per user', fontsize=30)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "users_per_item = df.groupby(['item']).count()['rating']\n",
+ "\n",
+ "plt.figure(figsize=(16,8))\n",
+ "plt.hist(users_per_item, bins=100)\n",
+ "\n",
+ "# one entry per dashed marker: (value, x-offset factor, relative label height, label template)\n",
+ "markers = [\n",
+ "    (users_per_item.median(), 1.1, 0.9, 'Median: {:.0f}'),\n",
+ "    (users_per_item.quantile(0.25), 1.1, 0.95, '25% quantile: {:.0f}'),\n",
+ "    (users_per_item.quantile(0.75), 1.05, 0.95, '75% quantile: {:.0f}'),\n",
+ "]\n",
+ "for t, x_factor, y_factor, label in markers:\n",
+ "    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
+ "    # the label is placed slightly right of the line, at a fraction of the current y-limit\n",
+ "    plt.text(t*x_factor, plt.ylim()[1]*y_factor, label.format(t))\n",
+ "\n",
+ "plt.title('Number of ratings per item', fontsize=30)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "rating\n",
+ "1 0.06110\n",
+ "2 0.11370\n",
+ "3 0.27145\n",
+ "4 0.34174\n",
+ "5 0.21201\n",
+ "Name: user, dtype: float64"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby(['rating']).count()['user']/len(df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Item attributes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "genres = pd.read_csv('./Datasets/ml-100k/u.genre', sep='|', header=None,\n",
+ " encoding='latin-1')\n",
+ "genres=dict(zip(genres[1], genres[0]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{0: 'unknown',\n",
+ " 1: 'Action',\n",
+ " 2: 'Adventure',\n",
+ " 3: 'Animation',\n",
+ " 4: \"Children's\",\n",
+ " 5: 'Comedy',\n",
+ " 6: 'Crime',\n",
+ " 7: 'Documentary',\n",
+ " 8: 'Drama',\n",
+ " 9: 'Fantasy',\n",
+ " 10: 'Film-Noir',\n",
+ " 11: 'Horror',\n",
+ " 12: 'Musical',\n",
+ " 13: 'Mystery',\n",
+ " 14: 'Romance',\n",
+ " 15: 'Sci-Fi',\n",
+ " 16: 'Thriller',\n",
+ " 17: 'War',\n",
+ " 18: 'Western'}"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "genres"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "movies = pd.read_csv('./Datasets/ml-100k/u.item', sep='|', encoding='latin-1', header=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 14 | \n",
+ " 15 | \n",
+ " 16 | \n",
+ " 17 | \n",
+ " 18 | \n",
+ " 19 | \n",
+ " 20 | \n",
+ " 21 | \n",
+ " 22 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Toy Story (1995) | \n",
+ " 01-Jan-1995 | \n",
+ " NaN | \n",
+ " http://us.imdb.com/M/title-exact?Toy%20Story%2... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " GoldenEye (1995) | \n",
+ " 01-Jan-1995 | \n",
+ " NaN | \n",
+ " http://us.imdb.com/M/title-exact?GoldenEye%20(... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " Four Rooms (1995) | \n",
+ " 01-Jan-1995 | \n",
+ " NaN | \n",
+ " http://us.imdb.com/M/title-exact?Four%20Rooms%... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
3 rows × 24 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 \\\n",
+ "0 1 Toy Story (1995) 01-Jan-1995 NaN \n",
+ "1 2 GoldenEye (1995) 01-Jan-1995 NaN \n",
+ "2 3 Four Rooms (1995) 01-Jan-1995 NaN \n",
+ "\n",
+ " 4 5 6 7 8 9 ... \\\n",
+ "0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... \n",
+ "1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... \n",
+ "2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... \n",
+ "\n",
+ " 14 15 16 17 18 19 20 21 22 23 \n",
+ "0 0 0 0 0 0 0 0 0 0 0 \n",
+ "1 0 0 0 0 0 0 0 1 0 0 \n",
+ "2 0 0 0 0 0 0 0 1 0 0 \n",
+ "\n",
+ "[3 rows x 24 columns]"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "movies[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# genre flag columns start at column 5; replace every 1 with the genre name and anything else with ''\n",
+ "for genre_idx in range(19):\n",
+ "    flag_col = genre_idx + 5\n",
+ "    # bind the genre name as a default argument so the lambda does not depend on the loop variable\n",
+ "    movies[flag_col] = movies[flag_col].map(lambda flag, name=genres[genre_idx]: name if flag == 1 else '')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "movies['genre']=movies.iloc[:, 5:].apply(lambda x: ', '.join(x[x!='']), axis = 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "movies=movies[[0,1,'genre']]\n",
+ "movies.columns=['id', 'title', 'genres']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " title | \n",
+ " genres | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Toy Story (1995) | \n",
+ " Animation, Children's, Comedy | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " GoldenEye (1995) | \n",
+ " Action, Adventure, Thriller | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " Four Rooms (1995) | \n",
+ " Thriller | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " Get Shorty (1995) | \n",
+ " Action, Comedy, Drama | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " Copycat (1995) | \n",
+ " Crime, Drama, Thriller | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id title genres\n",
+ "0 1 Toy Story (1995) Animation, Children's, Comedy\n",
+ "1 2 GoldenEye (1995) Action, Adventure, Thriller\n",
+ "2 3 Four Rooms (1995) Thriller\n",
+ "3 4 Get Shorty (1995) Action, Comedy, Drama\n",
+ "4 5 Copycat (1995) Crime, Drama, Thriller"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "movies.to_csv('./Datasets/ml-100k/movies.csv', index=False)\n",
+ "movies[:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Toy example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.makedirs('./Datasets/toy-example/', exist_ok = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "toy_train=pd.DataFrame([[0,0,3,0], [0,10,4,0], [0,40,5,0], [0,70,4,0],\n",
+ " [10,10,1,0], [10,20,2,0], [10,30,3,0],\n",
+ " [20,30,5,0], [20,50,3,0], [20,60,4,0]])\n",
+ "toy_test=pd.DataFrame([[0,60,3,0],\n",
+ " [10,40,5,0],\n",
+ " [20,0,5,0], [20,20,4,0], [20,70,2,0]])\n",
+ "\n",
+ "toy_train.to_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, index=False)\n",
+ "toy_test.to_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/.ipynb_checkpoints/P1. Baseline-checkpoint.ipynb b/.ipynb_checkpoints/P1. Baseline-checkpoint.ipynb
new file mode 100644
index 0000000..85b9494
--- /dev/null
+++ b/.ipynb_checkpoints/P1. Baseline-checkpoint.ipynb
@@ -0,0 +1,1527 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Preparing dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import scipy.sparse as sparse\n",
+ "from collections import defaultdict\n",
+ "from itertools import chain\n",
+ "import random\n",
+ "\n",
+ "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
+ "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Let's prepare dataset\n",
+ "# Let's prepare dataset: stack train and test so both share one user/item encoding\n",
+ "train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)\n",
+ "\n",
+ "# categorical views of the raw ids; .cat.codes are consecutive integers starting at 0\n",
+ "users_as_category = train_and_test['user'].astype(\"category\")\n",
+ "items_as_category = train_and_test['item'].astype(\"category\")\n",
+ "train_and_test['user_code'] = users_as_category.cat.codes\n",
+ "train_and_test['item_code'] = items_as_category.cat.codes\n",
+ "\n",
+ "# lookup tables in both directions: code -> original id and original id -> code\n",
+ "user_code_id = dict(enumerate(users_as_category.cat.categories))\n",
+ "user_id_code = {user_id: code for code, user_id in user_code_id.items()}\n",
+ "item_code_id = dict(enumerate(items_as_category.cat.categories))\n",
+ "item_id_code = {item_id: code for code, item_id in item_code_id.items()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user | \n",
+ " item | \n",
+ " rating | \n",
+ " timestamp | \n",
+ " user_code | \n",
+ " item_code | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 664 | \n",
+ " 525 | \n",
+ " 4 | \n",
+ " 876526580 | \n",
+ " 663 | \n",
+ " 524 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 49 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 888068651 | \n",
+ " 48 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 352 | \n",
+ " 273 | \n",
+ " 2 | \n",
+ " 884290328 | \n",
+ " 351 | \n",
+ " 272 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 618 | \n",
+ " 96 | \n",
+ " 3 | \n",
+ " 891307749 | \n",
+ " 617 | \n",
+ " 95 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 560 | \n",
+ " 24 | \n",
+ " 2 | \n",
+ " 879976772 | \n",
+ " 559 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user item rating timestamp user_code item_code\n",
+ "0 664 525 4 876526580 663 524\n",
+ "1 49 1 2 888068651 48 0\n",
+ "2 352 273 2 884290328 351 272\n",
+ "3 618 96 3 891307749 617 95\n",
+ "4 560 24 2 879976772 559 23"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_and_test[:5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))\n",
+ "test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Take number of users and items\n",
+ "(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)\n",
+ "\n",
+ "# Create sparse csr matrices\n",
+ "train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))\n",
+ "test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Above steps are the same for many algorithms, so I put the code in separate file:\n",
+ "import helpers\n",
+ "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n",
+ "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
+ "train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### CSR matrices - what are they?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<3x4 sparse matrix of type ''\n",
+ "\twith 8 stored elements in Compressed Sparse Row format>"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
+ "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
+ "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
+ "sample_csr=sparse.csr_matrix((data, (row, col)))\n",
+ "sample_csr"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Ratings matrix with missing entries replaced by zeros:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[4, 1, 3, 0],\n",
+ " [0, 2, 0, 1],\n",
+ " [2, 0, 5, 4]], dtype=int32)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of ratings: 8\n",
+ "Number of users: 3\n",
+ "Number of items: 4\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Ratings matrix with missing entries replaced by zeros:')\n",
+ "display(sample_csr.todense())\n",
+ "\n",
+ "print(f'Number of ratings: {sample_csr.nnz}')\n",
+ "print(f'Number of users: {sample_csr.shape[0]}')\n",
+ "print(f'Number of items: {sample_csr.shape[1]}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Ratings data: [4 1 3 2 1 2 5 4]\n",
+ "Regarding items: [0 1 2 1 3 0 2 3]\n",
+ "Where ratings from 0 to 2 belongs to user 0.\n",
+ "Where ratings from 3 to 4 belongs to user 1.\n",
+ "Where ratings from 5 to 7 belongs to user 2.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Ratings data:', sample_csr.data)\n",
+ "\n",
+ "print('Regarding items:', sample_csr.indices)\n",
+ "\n",
+ "for i in range(sample_csr.shape[0]):\n",
+ " print(f'Where ratings from {sample_csr.indptr[i]} to {sample_csr.indptr[i+1]-1} belongs to user {i}.')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Efficient way to access items rated by user:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
+ " 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "471 ns ± 15.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
+ "Inefficient way to access items rated by user:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
+ " 171, 172, 173, 194, 208, 225, 473, 495, 549, 615])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "48.3 µs ± 1.51 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
+ ]
+ }
+ ],
+ "source": [
+ "user=123\n",
+ "\n",
+ "print('Efficient way to access items rated by user:')\n",
+ "display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n",
+ "%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n",
+ "\n",
+ "print('Inefficient way to access items rated by user:')\n",
+ "display(train_ui[user].indices)\n",
+ "%timeit train_ui[user].indices"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "###### Example: subtracting row means"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Our matrix:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[4, 1, 3, 0],\n",
+ " [0, 2, 0, 1],\n",
+ " [2, 0, 5, 4]], dtype=int32)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "List of row sums:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[ 8, 3, 11]])"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print('Our matrix:')\n",
+ "display(sample_csr.todense())\n",
+ "print('List of row sums:')\n",
+ "sample_csr.sum(axis=1).ravel()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Array with row means:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([2.66666667, 1.5 , 3.66666667])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Diagonal csr matrix with inverse of row sums on diagonal:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[2.66666667, 0. , 0. ],\n",
+ " [0. , 1.5 , 0. ],\n",
+ " [0. , 0. , 3.66666667]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Let's apply them in nonzero entries:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[2.66666667, 2.66666667, 2.66666667, 0. ],\n",
+ " [0. , 1.5 , 0. , 1.5 ],\n",
+ " [3.66666667, 0. , 3.66666667, 3.66666667]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Finally after subtraction:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[ 1.33333333, -1.66666667, 0.33333333, 0. ],\n",
+ " [ 0. , 0.5 , 0. , -0.5 ],\n",
+ " [-1.66666667, 0. , 1.33333333, 0.33333333]])"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print('Array with row means:')\n",
+ "# np.diff(indptr) is the number of stored entries per row, so this is the mean over rated items only\n",
+ "row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)\n",
+ "display(row_means)\n",
+ "\n",
+ "# label fixed: the diagonal holds the row means themselves, not inverted row sums\n",
+ "print('Diagonal matrix with row means on the diagonal:')\n",
+ "display(sparse.diags(row_means).todense())\n",
+ "\n",
+ "print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n",
+ "# (sample_csr>0) marks the stored ratings; left-multiplying by the diagonal puts each row's mean there\n",
+ "to_subtract=sparse.diags(row_means)*(sample_csr>0)\n",
+ "display(to_subtract.todense())\n",
+ "\n",
+ "print(\"Finally after subtraction:\")\n",
+ "sample_csr-to_subtract.todense()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "###### Transposing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sample matrix: \n",
+ " [[4 1 3 0]\n",
+ " [0 2 0 1]\n",
+ " [2 0 5 4]]\n",
+ "\n",
+ "Indices: \n",
+ " [0 1 2 1 3 0 2 3]\n",
+ "\n",
+ "Transposed matrix: \n",
+ " [[4 0 2]\n",
+ " [1 2 0]\n",
+ " [3 0 5]\n",
+ " [0 1 4]]\n",
+ "\n",
+ "Indices of transposed matrix: \n",
+ " [0 1 2 1 3 0 2 3]\n",
+ "\n",
+ "Reason: \n",
+ "\n",
+ "After converting to csr: \n",
+ " [0 2 0 1 0 2 1 2]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "from scipy import sparse\n",
+ "\n",
+ "# small 3x4 example matrix built from (row, col, value) triplets\n",
+ "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
+ "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
+ "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
+ "sample = sparse.csr_matrix((data, (row, col)))\n",
+ "\n",
+ "print('Sample matrix: \\n', sample.toarray())\n",
+ "print('\\nIndices: \\n', sample.indices)\n",
+ "\n",
+ "transposed = sample.transpose()\n",
+ "print('\\nTransposed matrix: \\n', transposed.toarray())\n",
+ "print('\\nIndices of transposed matrix: \\n', transposed.indices)\n",
+ "\n",
+ "# the printed type shows which sparse format transpose() returned\n",
+ "print('\\nReason: ', type(transposed))\n",
+ "\n",
+ "# an explicit conversion is needed before .indices can be read as CSR column indices\n",
+ "print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Self made top popular"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# makedirs(exist_ok=True) creates missing parent directories and is safe to re-run;\n",
+ "# the previous exists() check on the parent skipped the sub-directories whenever\n",
+ "# './Recommendations generated/' was already present\n",
+ "for reco_dir in ('./Recommendations generated/ml-100k/', './Recommendations generated/toy-example/'):\n",
+ "    os.makedirs(reco_dir, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "k = 10  # number of recommendations generated per user\n",
+ "\n",
+ "# item-user view of the training matrix: row i holds the ratings of item i\n",
+ "train_iu = train_ui.transpose().tocsr()\n",
+ "item_counts = np.diff(train_iu.indptr)  # number of training ratings per item\n",
+ "# rescale counts so that the most popular item gets the highest rating value found in train_ui\n",
+ "scaling_factor = train_ui.max()/max(item_counts)\n",
+ "\n",
+ "top_pop = [(i, item_counts[i]*scaling_factor) for i in range(train_iu.shape[0])]\n",
+ "top_pop.sort(key=lambda x: x[1], reverse=True)\n",
+ "# top_pop is a list of pairs (item, rescaled_popularity) sorted descending from the most popular\n",
+ "\n",
+ "result = []\n",
+ "\n",
+ "for u in range(train_ui.shape[0]):\n",
+ "    # a set gives constant-time membership tests instead of scanning the index array every time\n",
+ "    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())\n",
+ "    rec_user = []\n",
+ "    item_pos = 0\n",
+ "    # honour k (previously hard-coded as 10) and stop at the end of the ranking,\n",
+ "    # so a user with fewer than k unrated items no longer raises IndexError\n",
+ "    while len(rec_user) < k and item_pos < len(top_pop):\n",
+ "        rec_item, rec_score = top_pop[item_pos]\n",
+ "        if rec_item not in user_rated:\n",
+ "            rec_user.append((item_code_id[rec_item], rec_score))\n",
+ "        item_pos += 1\n",
+ "    # row format: user, item1, score1, item2, score2, ...\n",
+ "    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
+ "\n",
+ "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n",
+ "\n",
+ "\n",
+ "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
+ "\n",
+ "estimations = []\n",
+ "\n",
+ "for user, item in zip(*test_ui.nonzero()):\n",
+ "    estimations.append([user_code_id[user], item_code_id[item],\n",
+ "                        item_counts[item]*scaling_factor])\n",
+ "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Self made top rated"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "k = 10  # number of recommendations generated per user\n",
+ "\n",
+ "# mean of all stored training ratings; fallback score for items without any training rating\n",
+ "global_avg = sum(train_iu.data)/train_ui.nnz\n",
+ "\n",
+ "top_rated = []\n",
+ "for i in range(train_iu.shape[0]):\n",
+ "    ratings = train_iu.data[train_iu.indptr[i]: train_iu.indptr[i+1]]\n",
+ "    avg = np.mean(ratings) if len(ratings)>0 else global_avg\n",
+ "    top_rated.append((i, avg))\n",
+ "\n",
+ "# list of pairs (item, average_rating) sorted descending from the best rated\n",
+ "top_rated.sort(key=lambda x: x[1], reverse=True)\n",
+ "\n",
+ "result = []\n",
+ "\n",
+ "for u in range(train_ui.shape[0]):\n",
+ "    # a set gives constant-time membership tests instead of scanning the index array every time\n",
+ "    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())\n",
+ "    rec_user = []\n",
+ "    item_pos = 0\n",
+ "    # honour k (previously hard-coded as 10) and stop at the end of the ranking,\n",
+ "    # so a user with fewer than k unrated items no longer raises IndexError\n",
+ "    while len(rec_user) < k and item_pos < len(top_rated):\n",
+ "        rec_item, rec_score = top_rated[item_pos]\n",
+ "        if rec_item not in user_rated:\n",
+ "            rec_user.append((item_code_id[rec_item], rec_score))\n",
+ "        item_pos += 1\n",
+ "    # row format: user, item1, score1, item2, score2, ...\n",
+ "    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
+ "\n",
+ "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)\n",
+ "\n",
+ "\n",
+ "# estimations: the predicted score of an item is simply its average training rating\n",
+ "estimations = []\n",
+ "d = dict(top_rated)\n",
+ "\n",
+ "for user, item in zip(*test_ui.nonzero()):\n",
+ "    estimations.append([user_code_id[user], item_code_id[item], d[item]])\n",
+ "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 11 | \n",
+ " 12 | \n",
+ " 13 | \n",
+ " 14 | \n",
+ " 15 | \n",
+ " 16 | \n",
+ " 17 | \n",
+ " 18 | \n",
+ " 19 | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 814 | \n",
+ " 5.0 | \n",
+ " 1122 | \n",
+ " 5.0 | \n",
+ " 1189 | \n",
+ " 5.0 | \n",
+ " 1201 | \n",
+ " 5.0 | \n",
+ " 1293 | \n",
+ " ... | \n",
+ " 1306 | \n",
+ " 5.0 | \n",
+ " 1467 | \n",
+ " 5.0 | \n",
+ " 1491 | \n",
+ " 5.0 | \n",
+ " 1500 | \n",
+ " 5.0 | \n",
+ " 1536 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 119 | \n",
+ " 5.0 | \n",
+ " 814 | \n",
+ " 5.0 | \n",
+ " 1122 | \n",
+ " 5.0 | \n",
+ " 1189 | \n",
+ " 5.0 | \n",
+ " 1201 | \n",
+ " ... | \n",
+ " 1293 | \n",
+ " 5.0 | \n",
+ " 1306 | \n",
+ " 5.0 | \n",
+ " 1467 | \n",
+ " 5.0 | \n",
+ " 1491 | \n",
+ " 5.0 | \n",
+ " 1500 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 7 8 9 ... 11 12 13 \\\n",
+ "0 1 814 5.0 1122 5.0 1189 5.0 1201 5.0 1293 ... 1306 5.0 1467 \n",
+ "1 2 119 5.0 814 5.0 1122 5.0 1189 5.0 1201 ... 1293 5.0 1306 \n",
+ "\n",
+ " 14 15 16 17 18 19 20 \n",
+ "0 5.0 1491 5.0 1500 5.0 1536 5.0 \n",
+ "1 5.0 1467 5.0 1491 5.0 1500 5.0 \n",
+ "\n",
+ "[2 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(result)[:2]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Self-made baseline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class selfBaselineUI():\n",
+ " \n",
+ " def fit(self, train_ui):\n",
+ " self.train_ui=train_ui.copy()\n",
+ " self.train_iu=train_ui.transpose().tocsr()\n",
+ " \n",
+ " result=self.train_ui.copy()\n",
+ " \n",
+ " self.row_means=np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)\n",
+ " \n",
+ "        # in csr format after addition or multiplication 0 entries \"disappear\" - so some workarounds are needed \n",
+ " # (other option is to define addition/multiplication in a desired way)\n",
+ " row_means=self.row_means.copy()\n",
+ " \n",
+ " max_row_mean=np.max(row_means)\n",
+ " row_means[row_means==0]=max_row_mean+1\n",
+ " to_subtract_rows=sparse.diags(row_means)*(result.power(0))\n",
+ " to_subtract_rows.sort_indices() # needed to have valid .data\n",
+ " \n",
+ " subtract=to_subtract_rows.data\n",
+ " subtract[subtract==max_row_mean+1]=0\n",
+ " \n",
+ " result.data=result.data-subtract\n",
+ "# we can't do result=train_ui-to_subtract_rows since then 0 entries will \"disappear\" in csr format\n",
+ " self.col_means=np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\\\n",
+ " out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr)!=0) # handling items without ratings\n",
+ " \n",
+ " # again - it is possible that some mean will be zero, so let's use the same workaround\n",
+ " col_means=self.col_means.copy()\n",
+ " \n",
+ " max_col_mean=np.max(col_means)\n",
+ " col_means[col_means==0]=max_col_mean+1\n",
+ " to_subtract_cols=result.power(0)*sparse.diags(col_means)\n",
+ " to_subtract_cols.sort_indices() # needed to have valid .data\n",
+ " \n",
+ " subtract=to_subtract_cols.data\n",
+ " subtract[subtract==max_col_mean+1]=0\n",
+ " \n",
+ " result.data=result.data-subtract\n",
+ "\n",
+ " return result\n",
+ " \n",
+ " \n",
+ " def recommend(self, user_code_id, item_code_id, topK=10):\n",
+ " estimations=np.tile(self.row_means[:,None], [1, self.train_ui.shape[1]]) +np.tile(self.col_means, [self.train_ui.shape[0], 1])\n",
+ " \n",
+ " top_k = defaultdict(list)\n",
+ " for nb_user, user in enumerate(estimations):\n",
+ " \n",
+ " user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n",
+ " for item, score in enumerate(user):\n",
+ " if item not in user_rated:\n",
+ " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n",
+ " result=[]\n",
+ " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
+ " for uid, item_scores in top_k.items():\n",
+ " item_scores.sort(key=lambda x: x[1], reverse=True)\n",
+ " result.append([uid]+list(chain(*item_scores[:topK])))\n",
+ " return result\n",
+ " \n",
+ " def estimate(self, user_code_id, item_code_id, test_ui):\n",
+ " result=[]\n",
+ " for user, item in zip(*test_ui.nonzero()):\n",
+ " result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training data:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
+ " [0, 1, 2, 3, 0, 0, 0, 0],\n",
+ " [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "After subtracting rows and columns:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
+ " [ 0. , -0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
+ " [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Recommend best unseen item:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Print estimations on unseen items:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user | \n",
+ " item | \n",
+ " est_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 60 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 10 | \n",
+ " 40 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20 | \n",
+ " 0 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 20 | \n",
+ " 20 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 20 | \n",
+ " 70 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user item est_score\n",
+ "0 0 60 4.0\n",
+ "1 10 40 3.0\n",
+ "2 20 0 3.0\n",
+ "3 20 20 4.0\n",
+ "4 20 70 4.0"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Toy example: read the tab-separated train/test interactions (the files have no header row)\n",
+ "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
+ "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
+ "\n",
+ "# presumably returns the train/test user-item csr matrices plus code<->id lookups\n",
+ "# for users and items - verify against helpers.data_to_csr\n",
+ "toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n",
+ "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
+ "\n",
+ "print('Training data:')\n",
+ "display(toy_train_ui.todense())\n",
+ "\n",
+ "# fit returns the ratings left after the row and column means have been subtracted\n",
+ "model=selfBaselineUI()\n",
+ "print('After subtracting rows and columns:')\n",
+ "display(model.fit(toy_train_ui).todense())\n",
+ "\n",
+ "print('Recommend best unseen item:')\n",
+ "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
+ "\n",
+ "print('Print estimations on unseen items:')\n",
+ "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
+ "estimations.columns=['user', 'item', 'est_score']\n",
+ "display(estimations)\n",
+ "\n",
+ "# Save the top-3 recommendations; the csv has neither header nor index\n",
+ "top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
+ "\n",
+ "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n",
+ "\n",
+ "# The estimations are computed once more (same call as above) and saved without column names\n",
+ "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
+ "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fit the baseline on the ml-100k train matrix\n",
+ "# (train_ui, test_ui, user_code_id and item_code_id are defined in an earlier cell)\n",
+ "model=selfBaselineUI()\n",
+ "model.fit(train_ui)\n",
+ "\n",
+ "# Top-10 recommendations per user, one row per user: (user, item1, score1, item2, score2, ...)\n",
+ "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
+ "\n",
+ "top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n",
+ "\n",
+ "# Estimated ratings for the (user, item) pairs stored in the test matrix\n",
+ "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
+ "estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Project task 1: implement self-made BaselineIU"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Implement a recommender system that recommends movies the user hasn't seen yet. It should work like BaselineUI, but subtract the column means first and the row means second.\n",
+ "\n",
+ "The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv' and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'.\n",
+ "\n",
+ "
\n",
+ "Additional clarification: \n",
+ "\n",
+ "Summarizing, the prediction of the rating of the user u regarding the item i should be equal to b_u + b_i.\n",
+ "The procedure to get b_u and b_i is the following:\n",
+ "- We have the original user-item ratings matrix M.\n",
+ "- For each column representing the item i, we compute the mean of ratings and denote by b_i. From each rating in matrix M we subtract the corresponding column mean (b_i) to receive new matrix M'.\n",
+ "- For each row of matrix M' representing the user u, we compute the mean of ratings and denote by b_u."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class selfBaselineIU():\n",
+ "    \"\"\"Baseline recommender: the score of a (user, item) pair is row_mean + col_mean.\n",
+ "\n",
+ "    Item (column) means are computed first; user (row) means are computed on the\n",
+ "    ratings that remain after the item means have been subtracted.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def fit(self, train_ui):\n",
+ "        \"\"\"Compute self.col_means and self.row_means and return the centred ratings.\n",
+ "\n",
+ "        Parameters\n",
+ "        ----------\n",
+ "        train_ui : scipy.sparse.csr_matrix\n",
+ "            Ratings with one row per user and one column per item. Only the stored\n",
+ "            entries count as ratings.\n",
+ "\n",
+ "        Returns\n",
+ "        -------\n",
+ "        scipy.sparse.csr_matrix\n",
+ "            Same sparsity pattern as train_ui, holding rating - col_mean - row_mean.\n",
+ "        \"\"\"\n",
+ "        self.train_ui=train_ui.copy()\n",
+ "        self.train_iu=train_ui.transpose().tocsr()\n",
+ "\n",
+ "        result=self.train_ui.copy()\n",
+ "        # the centred ratings are fractional even when the input ratings are integers\n",
+ "        result.data=result.data.astype(float)\n",
+ "\n",
+ "        # item means; items without ratings get 0\n",
+ "        ratings_per_item=np.diff(self.train_iu.indptr)\n",
+ "        col_sums=np.asarray(result.sum(axis=0)).ravel()\n",
+ "        self.col_means=np.divide(col_sums, ratings_per_item,\n",
+ "                                 out=np.zeros(result.shape[1]), where=ratings_per_item!=0)\n",
+ "\n",
+ "        # indices[k] is the column of data[k], so this picks the right mean for every rating.\n",
+ "        # We work on .data directly because sparse subtraction would make entries that\n",
+ "        # become exactly 0 \"disappear\".\n",
+ "        result.data-=self.col_means[result.indices]\n",
+ "\n",
+ "        # user means of what is left; a user without any rating gets 0 instead of the NaN produced by 0/0\n",
+ "        ratings_per_user=np.diff(result.indptr)\n",
+ "        row_sums=np.asarray(result.sum(axis=1)).ravel()\n",
+ "        self.row_means=np.divide(row_sums, ratings_per_user,\n",
+ "                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)\n",
+ "\n",
+ "        # csr stores the ratings of row r in data[indptr[r]:indptr[r+1]], so repeating every mean\n",
+ "        # once per stored rating lines it up with .data\n",
+ "        result.data-=np.repeat(self.row_means, ratings_per_user)\n",
+ "\n",
+ "        return result\n",
+ "\n",
+ "\n",
+ "    def recommend(self, user_code_id, item_code_id, topK=10):\n",
+ "        \"\"\"Return the topK best-scored items that each user has not rated in the train set.\n",
+ "\n",
+ "        Every row of the result has the format (user, item1, score1, item2, score2, ...);\n",
+ "        user and item codes are translated with user_code_id and item_code_id.\n",
+ "        Users who already rated every item are left out.\n",
+ "        \"\"\"\n",
+ "        nb_users, nb_items=self.train_ui.shape\n",
+ "        indptr, indices=self.train_ui.indptr, self.train_ui.indices\n",
+ "\n",
+ "        result=[]\n",
+ "        for nb_user in range(nb_users):\n",
+ "            # one row of estimations at a time - no dense users x items matrix is needed\n",
+ "            scores=self.row_means[nb_user]+self.col_means\n",
+ "\n",
+ "            unseen=np.ones(nb_items, dtype=bool)\n",
+ "            unseen[indices[indptr[nb_user]:indptr[nb_user+1]]]=False\n",
+ "            candidates=np.flatnonzero(unseen)\n",
+ "            if candidates.size==0:\n",
+ "                continue\n",
+ "\n",
+ "            # stable sort of the negated scores: best first, ties in ascending item order\n",
+ "            best=candidates[np.argsort(-scores[candidates], kind='stable')[:topK]]\n",
+ "\n",
+ "            row=[user_code_id[nb_user]]\n",
+ "            for item in best:\n",
+ "                row.extend((item_code_id[int(item)], scores[item]))\n",
+ "            result.append(row)\n",
+ "        return result\n",
+ "\n",
+ "    def estimate(self, user_code_id, item_code_id, test_ui):\n",
+ "        \"\"\"Return [user, item, row_mean + col_mean] for every nonzero entry of test_ui.\"\"\"\n",
+ "        return [[user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]]\n",
+ "                for user, item in zip(*test_ui.nonzero())]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training data:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
+ " [0, 1, 2, 3, 0, 0, 0, 0],\n",
+ " [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "After subtracting columns and rows:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[-0.375 , 1.125 , 0. , 0. , -0.375 ,\n",
+ " 0. , 0. , -0.375 ],\n",
+ " [ 0. , -0.66666667, 0.83333333, -0.16666667, 0. ,\n",
+ " 0. , 0. , 0. ],\n",
+ " [ 0. , 0. , 0. , 0.66666667, 0. ,\n",
+ " -0.33333333, -0.33333333, 0. ]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Recommend best unseen item:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[[0, 30, 4.375], [10, 40, 4.166666666666667], [20, 40, 5.333333333333333]]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Print estimations on unseen items:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user | \n",
+ " item | \n",
+ " est_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 60 | \n",
+ " 4.375000 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 10 | \n",
+ " 40 | \n",
+ " 4.166667 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20 | \n",
+ " 0 | \n",
+ " 3.333333 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 20 | \n",
+ " 20 | \n",
+ " 2.333333 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 20 | \n",
+ " 70 | \n",
+ " 4.333333 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user item est_score\n",
+ "0 0 60 4.375000\n",
+ "1 10 40 4.166667\n",
+ "2 20 0 3.333333\n",
+ "3 20 20 2.333333\n",
+ "4 20 70 4.333333"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Toy example: read the tab-separated train/test interactions (the files have no header row)\n",
+ "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
+ "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
+ "\n",
+ "# NOTE(review): despite the _iu suffix these matrices come from the same helpers.data_to_csr call\n",
+ "# as in the BaselineUI cell, and selfBaselineIU.fit treats rows as users - consider renaming to _ui\n",
+ "toy_train_iu, toy_test_iu, toy_user_code_id, toy_user_id_code, \\\n",
+ "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
+ "\n",
+ "print('Training data:')\n",
+ "display(toy_train_iu.todense())\n",
+ "\n",
+ "# fit returns the ratings left after the column and row means have been subtracted\n",
+ "model=selfBaselineIU()\n",
+ "print('After subtracting columns and rows:')\n",
+ "display(model.fit(toy_train_iu).todense())\n",
+ "\n",
+ "print('Recommend best unseen item:')\n",
+ "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
+ "\n",
+ "print('Print estimations on unseen items:')\n",
+ "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_iu))\n",
+ "estimations.columns=['user', 'item', 'est_score']\n",
+ "display(estimations)\n",
+ "\n",
+ "# Save the top-3 recommendations; the csv has neither header nor index\n",
+ "top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
+ "\n",
+ "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineIU_reco.csv', index=False, header=False)\n",
+ "\n",
+ "# The estimations are computed once more (same call as above) and saved without column names\n",
+ "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_iu))\n",
+ "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineIU_estimations.csv', index=False, header=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fit the column-first baseline on the ml-100k train matrix\n",
+ "# (train_ui, test_ui, user_code_id and item_code_id are defined in an earlier cell)\n",
+ "model=selfBaselineIU()\n",
+ "model.fit(train_ui)\n",
+ "\n",
+ "# Top-10 recommendations per user, one row per user: (user, item1, score1, item2, score2, ...)\n",
+ "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
+ "\n",
+ "# NOTE(review): the task description above asks for 'Recommendations generated/ml-100k/Self_BaselineIU_*.csv',\n",
+ "# while this cell writes to 'Recommendations generated/Projects/' - confirm which location the evaluation reads\n",
+ "top_n.to_csv('Recommendations generated/Projects/Project1_Self_BaselineIU_reco.csv', index=False, header=False)\n",
+ "\n",
+ "# Estimated ratings for the (user, item) pairs stored in the test matrix\n",
+ "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
+ "estimations.to_csv('Recommendations generated/Projects/Project1_Self_BaselineIU_estimations.csv', index=False, header=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Ready-made baseline - Surprise implementation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Estimating biases using als...\n"
+ ]
+ }
+ ],
+ "source": [
+ "import surprise as sp\n",
+ "import time\n",
+ "\n",
+ "# Based on surprise.readthedocs.io\n",
+ "def get_top_n(predictions, n=10):\n",
+ "    \"\"\"Build one row per user with that user's n best-scored items.\n",
+ "\n",
+ "    predictions is an iterable of 5-tuples (uid, iid, true_r, est, details);\n",
+ "    only uid, iid and est are used. Each returned row has the format\n",
+ "    (user, item1, score1, item2, score2, ...), best estimate first.\n",
+ "    \"\"\"\n",
+ "    # user -> list of (item, estimated score) pairs\n",
+ "    scored_by_user = defaultdict(list)\n",
+ "    for user_id, item_id, _true_rating, estimate, _details in predictions:\n",
+ "        scored_by_user[user_id].append((item_id, estimate))\n",
+ "\n",
+ "    rows = []\n",
+ "    for user_id, scored_items in scored_by_user.items():\n",
+ "        # highest estimate first; the sort is stable, so ties keep their input order\n",
+ "        best = sorted(scored_items, key=lambda pair: pair[1], reverse=True)[:n]\n",
+ "        rows.append([user_id, *chain.from_iterable(best)])\n",
+ "    return rows\n",
+ "\n",
+ "\n",
+ "# Read the train/test files written at the top of the notebook: tab-separated lines of user, item, rating, timestamp\n",
+ "reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n",
+ "trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)\n",
+ "trainset = trainset.build_full_trainset() # -> it is needed for using Surprise package\n",
+ "\n",
+ "# The test file is loaded the same way and then converted into a list of test ratings\n",
+ "testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)\n",
+ "testset = sp.Trainset.build_testset(testset.build_full_trainset())\n",
+ "\n",
+ "algo = sp.BaselineOnly()\n",
+ "# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n",
+ "# observe how bad the results of the above algorithm are\n",
+ "# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n",
+ "\n",
+ "algo.fit(trainset)\n",
+ "\n",
+ "antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n",
+ "predictions = algo.test(antitrainset)\n",
+ "\n",
+ "# Keep the 10 best-scored items per user and save them without header or index\n",
+ "top_n = get_top_n(predictions, n=10)\n",
+ "\n",
+ "top_n=pd.DataFrame(top_n)\n",
+ "\n",
+ "top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RMSE: 0.9495\n",
+ "MAE: 0.7525\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.7524871012820799"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Compute RMSE on the test set using Surprise's built-in functions\n",
+ "predictions = algo.test(testset)\n",
+ "sp.accuracy.rmse(predictions, verbose=True)\n",
+ "\n",
+ "# Let's also save the estimations (user, item, estimated rating) in a file\n",
+ "predictions_df=[]\n",
+ "for uid, iid, true_r, est, _ in predictions:\n",
+ " predictions_df.append([uid, iid, est])\n",
+ " \n",
+ "predictions_df=pd.DataFrame(predictions_df)\n",
+ "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n",
+ "\n",
+ "# MAE comes last, so its value is also what the cell displays as its result\n",
+ "sp.accuracy.mae(predictions, verbose=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Let's compare with random"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RMSE: 1.5165\n",
+ "MAE: 1.2172\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "1.2172144988785374"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# In Surprise, the random algorithm (NormalPredictor) predicts random values drawn from a normal distribution estimated from the train set\n",
+ "algo = sp.NormalPredictor()\n",
+ "algo.fit(trainset)\n",
+ "\n",
+ "antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n",
+ "predictions = algo.test(antitrainset)\n",
+ "\n",
+ "# Keep the 10 best-scored items per user and save them without header or index\n",
+ "top_n = get_top_n(predictions, n=10)\n",
+ "\n",
+ "top_n=pd.DataFrame(top_n)\n",
+ "\n",
+ "top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n",
+ "\n",
+ "# Compute RMSE on the test set using Surprise's built-in functions\n",
+ "predictions = algo.test(testset)\n",
+ "sp.accuracy.rmse(predictions, verbose=True)\n",
+ "\n",
+ "# Let's also save the estimations (user, item, estimated rating) in a file\n",
+ "predictions_df=[]\n",
+ "for uid, iid, true_r, est, _ in predictions:\n",
+ " predictions_df.append([uid, iid, est])\n",
+ " \n",
+ "predictions_df=pd.DataFrame(predictions_df)\n",
+ "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n",
+ "\n",
+ "# MAE comes last, so its value is also what the cell displays as its result\n",
+ "sp.accuracy.mae(predictions, verbose=True)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ },
+ "metadata": {
+ "interpreter": {
+ "hash": "2a3a95f8b675c5b7dd6a35e1675edaf697539b1f0a71c4603e9520a8bbd07d82"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/.ipynb_checkpoints/P2. Evaluation-checkpoint.ipynb b/.ipynb_checkpoints/P2. Evaluation-checkpoint.ipynb
new file mode 100644
index 0000000..d4cadb5
--- /dev/null
+++ b/.ipynb_checkpoints/P2. Evaluation-checkpoint.ipynb
@@ -0,0 +1,1678 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Prepare test set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "-"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import scipy.sparse as sparse\n",
+ "from collections import defaultdict\n",
+ "from itertools import chain\n",
+ "import random\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "# Only the test split is read here - evaluation does not need the train set\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "test.columns = [\"user\", \"item\", \"rating\", \"timestamp\"]\n",
+ "\n",
+ "# Categorical views of the raw ids: every distinct id receives an integer code\n",
+ "user_categories = test[\"user\"].astype(\"category\").cat\n",
+ "item_categories = test[\"item\"].astype(\"category\").cat\n",
+ "test[\"user_code\"] = user_categories.codes\n",
+ "test[\"item_code\"] = item_categories.codes\n",
+ "\n",
+ "# Lookup tables in both directions: code -> raw id and raw id -> code\n",
+ "user_code_id = dict(enumerate(user_categories.categories))\n",
+ "user_id_code = {raw_id: code for code, raw_id in user_code_id.items()}\n",
+ "item_code_id = dict(enumerate(item_categories.categories))\n",
+ "item_id_code = {raw_id: code for code, raw_id in item_code_id.items()}\n",
+ "\n",
+ "# Sparse matrix of test ratings indexed by (user code, item code)\n",
+ "test_ratings = test[\"rating\"]\n",
+ "test_coords = (test[\"user_code\"], test[\"item_code\"])\n",
+ "test_ui = sparse.csr_matrix((test_ratings, test_coords))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Estimations metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: 'Recommendations generated/ml-100k/Ready_Baseline_estimations.csv'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m estimations_df = pd.read_csv(\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;34m\"Recommendations generated/ml-100k/Ready_Baseline_estimations.csv\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m )\n\u001b[0;32m 4\u001b[0m \u001b[0mestimations_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m\"user\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"item\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"score\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m 608\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 609\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 610\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 611\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 612\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 460\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[1;31m# Create the parser.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 462\u001b[1;33m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 463\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 464\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 817\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"has_index_names\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"has_index_names\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 818\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 819\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 820\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 821\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[1;34m(self, engine)\u001b[0m\n\u001b[0;32m 1048\u001b[0m )\n\u001b[0;32m 1049\u001b[0m \u001b[1;31m# error: Too many arguments for \"ParserBase\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1050\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmapping\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# type: ignore[call-arg]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1051\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1052\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_failover_to_python\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 1865\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1866\u001b[0m \u001b[1;31m# open handles\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1867\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_open_handles\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1868\u001b[0m \u001b[1;32massert\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhandles\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1869\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"storage_options\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"encoding\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"memory_map\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"compression\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_open_handles\u001b[1;34m(self, src, kwds)\u001b[0m\n\u001b[0;32m 1360\u001b[0m \u001b[0mLet\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mreaders\u001b[0m \u001b[0mopen\u001b[0m \u001b[0mIOHanldes\u001b[0m \u001b[0mafter\u001b[0m \u001b[0mthey\u001b[0m \u001b[0mare\u001b[0m \u001b[0mdone\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mtheir\u001b[0m \u001b[0mpotential\u001b[0m \u001b[0mraises\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1361\u001b[0m \"\"\"\n\u001b[1;32m-> 1362\u001b[1;33m self.handles = get_handle(\n\u001b[0m\u001b[0;32m 1363\u001b[0m \u001b[0msrc\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1364\u001b[0m \u001b[1;34m\"r\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\pandas\\io\\common.py\u001b[0m in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 640\u001b[0m \u001b[0merrors\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"replace\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 641\u001b[0m \u001b[1;31m# Encoding\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 642\u001b[1;33m handle = open(\n\u001b[0m\u001b[0;32m 643\u001b[0m \u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 644\u001b[0m \u001b[0mioargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Recommendations generated/ml-100k/Ready_Baseline_estimations.csv'"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the baseline estimations; the file has no header and its columns are user, item, score\n",
+ "estimations_df = pd.read_csv(\n",
+ "    \"Recommendations generated/ml-100k/Ready_Baseline_estimations.csv\", header=None\n",
+ ")\n",
+ "estimations_df.columns = [\"user\", \"item\", \"score\"]\n",
+ "\n",
+ "# Translate raw ids into the codes used by test_ui (an id missing from the test set raises KeyError)\n",
+ "estimations_df[\"user_code\"] = list(map(user_id_code.__getitem__, estimations_df[\"user\"]))\n",
+ "estimations_df[\"item_code\"] = list(map(item_id_code.__getitem__, estimations_df[\"item\"]))\n",
+ "\n",
+ "# Sparse matrix of estimated scores with the same shape as the test matrix\n",
+ "estimation_coords = (estimations_df[\"user_code\"], estimations_df[\"item_code\"])\n",
+ "estimations = sparse.csr_matrix(\n",
+ "    (estimations_df[\"score\"], estimation_coords),\n",
+ "    shape=test_ui.shape,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def estimations_metrics(test_ui, estimations):\n",
+ "    \"\"\"Return a one-row DataFrame with the RMSE and MAE of the estimated scores.\n",
+ "\n",
+ "    The stored values (``.data``) of both sparse matrices are subtracted element by element,\n",
+ "    so the two matrices must store the same number of entries in the same order.\n",
+ "    Both errors are averaged over ``estimations.nnz``.\n",
+ "    \"\"\"\n",
+ "    errors = estimations.data - test_ui.data\n",
+ "    nb_estimations = estimations.nnz\n",
+ "\n",
+ "    scores = {\n",
+ "        \"RMSE\": (np.sum(errors ** 2) / nb_estimations) ** (1 / 2),\n",
+ "        \"MAE\": np.sum(abs(errors)) / nb_estimations,\n",
+ "    }\n",
+ "\n",
+ "    # single row, one column per metric\n",
+ "    df_result = pd.DataFrame([list(scores.values())], columns=list(scores.keys()))\n",
+ "    return df_result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'estimations' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;31m# try !pip3 install pandas=='1.0.3' (or pip if you use python 2) and restart the kernel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mestimations_metrics\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtest_ui\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mestimations\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[1;31mNameError\u001b[0m: name 'estimations' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "# If this cell raises an error (e.g. in the laboratories) you might have to switch to another version of pandas:\n",
+ "# try !pip3 install pandas=='1.0.3' (or pip if you use python 2) and restart the kernel\n",
+ "\n",
+ "# Requires test_ui and estimations built by the cells above\n",
+ "estimations_metrics(test_ui, estimations)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Ranking metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[663, 475, 62, ..., 472, 269, 503],\n",
+ " [ 48, 313, 475, ..., 591, 175, 466],\n",
+ " [351, 313, 475, ..., 591, 175, 466],\n",
+ " ...,\n",
+ " [259, 313, 475, ..., 11, 591, 175],\n",
+ " [ 33, 313, 475, ..., 11, 591, 175],\n",
+ " [ 77, 313, 475, ..., 11, 591, 175]])"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Column 0 of every row is the user id; the remaining columns alternate item id, score\n",
+ "reco = np.loadtxt(\n",
+ "    \"Recommendations generated/ml-100k/Ready_Baseline_reco.csv\", delimiter=\",\"\n",
+ ")\n",
+ "# Let's ignore scores - they are not used in evaluation:\n",
+ "users = reco[:, :1]\n",
+ "items = reco[:, 1::2]\n",
+ "# Let's use inner ids instead of real ones; ids that are absent from the test set become -1.\n",
+ "# dict.get is used instead of dict.setdefault: setdefault would insert a -1 entry into the\n",
+ "# lookup tables for every unknown id, so later lookups would silently return -1\n",
+ "users = np.vectorize(lambda x: user_id_code.get(x, -1))(users)\n",
+ "items = np.vectorize(lambda x: item_id_code.get(x, -1))(items)\n",
+ "reco = np.concatenate((users, items), axis=1)\n",
+ "reco"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def ranking_metrics(test_ui, reco, super_reactions=[], topK=10):\n",
+ "    \"\"\"Compute top-K ranking metrics averaged over the users that have test ratings.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    test_ui : CSR sparse matrix\n",
+ "        Test ratings; only ``shape``, ``indptr``, ``indices`` and ``data`` are read.\n",
+ "        Row index = user code, column index = item code.\n",
+ "    reco : 2-D integer array\n",
+ "        Column 0 holds user codes, the following columns hold recommended item codes\n",
+ "        in ranking order. A user code of -1 selects an empty row and is skipped.\n",
+ "    super_reactions : collection of rating values, optional\n",
+ "        Ratings treated as \"super\" reactions. The default list is never mutated.\n",
+ "    topK : int, optional\n",
+ "        Number of leading recommendations that are evaluated for every user.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        One row with the columns precision, recall, F_1, F_05, precision_super,\n",
+ "        recall_super, NDCG, mAP, MRR, LAUC and HR. A metric whose user group is empty\n",
+ "        (no relevant users, or no users with super items) is reported as 0.0 instead\n",
+ "        of raising ZeroDivisionError.\n",
+ "    \"\"\"\n",
+ "    nb_items = test_ui.shape[1]\n",
+ "    relevant_users = 0\n",
+ "    super_relevant_users = 0\n",
+ "    prec = rec = F_1 = F_05 = 0.0\n",
+ "    prec_super = rec_super = 0.0\n",
+ "    ndcg = mAP = MRR = LAUC = HR = 0.0\n",
+ "\n",
+ "    # np.isin needs a sequence; a set passed by the caller would be treated as one object\n",
+ "    super_values = list(super_reactions)\n",
+ "\n",
+ "    # positional discounts 1 / log2(position + 1) and their running sums\n",
+ "    cg = 1.0 / np.log2(np.arange(2, topK + 2))\n",
+ "    cg_sum = np.cumsum(cg)\n",
+ "\n",
+ "    for (nb_user, user) in tqdm(enumerate(reco[:, 0])):\n",
+ "        row_start, row_end = test_ui.indptr[user], test_ui.indptr[user + 1]\n",
+ "        u_rated_items = test_ui.indices[row_start:row_end]\n",
+ "        nb_u_rated_items = len(u_rated_items)\n",
+ "        if nb_u_rated_items == 0:\n",
+ "            # skip users with no items in test set\n",
+ "            continue\n",
+ "        relevant_users += 1\n",
+ "\n",
+ "        # the row's ratings are read straight from test_ui.data because accessing\n",
+ "        # test_ui[user, item] for every item is expensive\n",
+ "        u_super_items = u_rated_items[np.isin(test_ui.data[row_start:row_end], super_values)]\n",
+ "        if len(u_super_items) > 0:\n",
+ "            super_relevant_users += 1\n",
+ "\n",
+ "        user_successes = np.zeros(topK)\n",
+ "        nb_user_successes = 0\n",
+ "        nb_user_super_successes = 0\n",
+ "\n",
+ "        # evaluation: mark the positions of the recommended items that the user rated\n",
+ "        for (item_position, item) in enumerate(reco[nb_user, 1 : topK + 1]):\n",
+ "            if item in u_rated_items:\n",
+ "                user_successes[item_position] = 1\n",
+ "                nb_user_successes += 1\n",
+ "                if item in u_super_items:\n",
+ "                    nb_user_super_successes += 1\n",
+ "\n",
+ "        prec_u = nb_user_successes / topK\n",
+ "        prec += prec_u\n",
+ "\n",
+ "        rec_u = nb_user_successes / nb_u_rated_items\n",
+ "        rec += rec_u\n",
+ "\n",
+ "        F_1 += 2 * (prec_u * rec_u) / (prec_u + rec_u) if prec_u + rec_u > 0 else 0\n",
+ "        F_05 += (\n",
+ "            (0.5 ** 2 + 1) * (prec_u * rec_u) / (0.5 ** 2 * prec_u + rec_u)\n",
+ "            if prec_u + rec_u > 0\n",
+ "            else 0\n",
+ "        )\n",
+ "\n",
+ "        prec_super += nb_user_super_successes / topK\n",
+ "        # max(..., 1) makes the term 0 when the user has no super items\n",
+ "        rec_super += nb_user_super_successes / max(len(u_super_items), 1)\n",
+ "        ndcg += np.dot(user_successes, cg) / cg_sum[min(topK, nb_u_rated_items) - 1]\n",
+ "\n",
+ "        cumsum_successes = np.cumsum(user_successes)\n",
+ "        mAP += np.dot(\n",
+ "            cumsum_successes / np.arange(1, topK + 1), user_successes\n",
+ "        ) / min(topK, nb_u_rated_items)\n",
+ "\n",
+ "        hit_positions = user_successes.nonzero()[0]\n",
+ "        MRR += 1 / (hit_positions[0] + 1) if hit_positions.size > 0 else 0\n",
+ "\n",
+ "        LAUC += (\n",
+ "            np.dot(cumsum_successes, 1 - user_successes)\n",
+ "            + (nb_user_successes + nb_u_rated_items)\n",
+ "            / 2\n",
+ "            * ((nb_items - nb_u_rated_items) - (topK - nb_user_successes))\n",
+ "        ) / ((nb_items - nb_u_rated_items) * nb_u_rated_items)\n",
+ "\n",
+ "        HR += nb_user_successes > 0\n",
+ "\n",
+ "    def _mean(total, nb_users):\n",
+ "        # average over a user group; an empty group yields 0.0 rather than a crash\n",
+ "        return total / nb_users if nb_users > 0 else 0.0\n",
+ "\n",
+ "    result = [\n",
+ "        (\"precision\", _mean(prec, relevant_users)),\n",
+ "        (\"recall\", _mean(rec, relevant_users)),\n",
+ "        (\"F_1\", _mean(F_1, relevant_users)),\n",
+ "        (\"F_05\", _mean(F_05, relevant_users)),\n",
+ "        (\"precision_super\", _mean(prec_super, super_relevant_users)),\n",
+ "        (\"recall_super\", _mean(rec_super, super_relevant_users)),\n",
+ "        (\"NDCG\", _mean(ndcg, relevant_users)),\n",
+ "        (\"mAP\", _mean(mAP, relevant_users)),\n",
+ "        (\"MRR\", _mean(MRR, relevant_users)),\n",
+ "        (\"LAUC\", _mean(LAUC, relevant_users)),\n",
+ "        (\"HR\", _mean(HR, relevant_users)),\n",
+ "    ]\n",
+ "\n",
+ "    df_result = (pd.DataFrame(list(zip(*result))[1])).T\n",
+ "    df_result.columns = list(zip(*result))[0]\n",
+ "    return df_result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 9434.06it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.09141 | \n",
+ " 0.037652 | \n",
+ " 0.04603 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " precision recall F_1 F_05 precision_super recall_super \\\n",
+ "0 0.09141 0.037652 0.04603 0.061286 0.079614 0.056463 \n",
+ "\n",
+ " NDCG mAP MRR LAUC HR \n",
+ "0 0.095957 0.043178 0.198193 0.515501 0.437964 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Ranking metrics for the recommendations loaded above; ratings 4 and 5 are passed as super reactions\n",
+ "ranking_metrics(test_ui, reco, super_reactions=[4, 5], topK=10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Diversity metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def diversity_metrics(test_ui, reco, topK=10):\n",
+ "    \"\"\"Compute diversity statistics of the first ``topK`` recommendations of every user.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    test_ui : CSR sparse matrix\n",
+ "        Test ratings; ``indices`` gives the item codes present in the test set and\n",
+ "        ``shape[1]`` the number of items.\n",
+ "    reco : 2-D integer array\n",
+ "        Column 0 holds user codes, the following columns hold recommended item codes.\n",
+ "        The code -1 is counted as a recommendation outside the test set.\n",
+ "    topK : int, optional\n",
+ "        Only the first ``topK`` recommendation columns are counted.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        One row with the columns \"Reco in test\", \"Test coverage\", \"Shannon\" and \"Gini\".\n",
+ "    \"\"\"\n",
+ "    frequencies = defaultdict(int)\n",
+ "\n",
+ "    # let's assign 0 to all items in test set\n",
+ "    for item in set(test_ui.indices):\n",
+ "        frequencies[item] = 0\n",
+ "\n",
+ "    # counting frequencies - restricted to the first topK recommendations per user\n",
+ "    for item in reco[:, 1 : topK + 1].flat:\n",
+ "        frequencies[item] += 1\n",
+ "\n",
+ "    # recommendations coded as -1 are taken out before the distribution is built\n",
+ "    nb_reco_outside_test = frequencies.pop(-1, 0)\n",
+ "\n",
+ "    frequencies = np.array(list(frequencies.values()))\n",
+ "\n",
+ "    nb_rec_items = len(frequencies[frequencies > 0])\n",
+ "    nb_reco_inside_test = np.sum(frequencies)\n",
+ "\n",
+ "    # turn counts into a probability distribution, sorted ascending for the Gini formula\n",
+ "    frequencies = frequencies / np.sum(frequencies)\n",
+ "    frequencies = np.sort(frequencies)\n",
+ "\n",
+ "    with np.errstate(\n",
+ "        divide=\"ignore\"\n",
+ "    ):  # log(0) is replaced by 0 for items with 0 frequency; the division warning is ignored\n",
+ "        log_frequencies = np.nan_to_num(np.log(frequencies), posinf=0, neginf=0)\n",
+ "\n",
+ "    result = []\n",
+ "    result.append(\n",
+ "        (\n",
+ "            \"Reco in test\",\n",
+ "            nb_reco_inside_test / (nb_reco_inside_test + nb_reco_outside_test),\n",
+ "        )\n",
+ "    )\n",
+ "    result.append((\"Test coverage\", nb_rec_items / test_ui.shape[1]))\n",
+ "    result.append((\"Shannon\", -np.dot(frequencies, log_frequencies)))\n",
+ "    result.append(\n",
+ "        (\n",
+ "            \"Gini\",\n",
+ "            np.dot(frequencies, np.arange(1 - len(frequencies), len(frequencies), 2))\n",
+ "            / (len(frequencies) - 1),\n",
+ "        )\n",
+ "    )\n",
+ "\n",
+ "    df_result = (pd.DataFrame(list(zip(*result))[1])).T\n",
+ "    df_result.columns = list(zip(*result))[0]\n",
+ "    return df_result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1.0 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Reco in test Test coverage Shannon Gini\n",
+ "0 1.0 0.033911 2.836513 0.991139"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# In case of errors try !pip3 install numpy==1.18.4 (or pip if you use python 2) and restart the kernel\n",
+ "# (diversity_metrics passes posinf/neginf to np.nan_to_num; presumably older numpy releases reject\n",
+ "# these arguments - TODO confirm)\n",
+ "\n",
+ "x = diversity_metrics(test_ui, reco, topK=10)\n",
+ "x"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# To be used in other notebooks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 11012.47it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.949459 | \n",
+ " 0.752487 | \n",
+ " 0.09141 | \n",
+ " 0.037652 | \n",
+ " 0.04603 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ " 1.0 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " RMSE MAE precision recall F_1 F_05 \\\n",
+ "0 0.949459 0.752487 0.09141 0.037652 0.04603 0.061286 \n",
+ "\n",
+ " precision_super recall_super NDCG mAP MRR LAUC \\\n",
+ "0 0.079614 0.056463 0.095957 0.043178 0.198193 0.515501 \n",
+ "\n",
+ " HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.437964 1.0 0.033911 2.836513 0.991139 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import evaluation_measures as ev\n",
+ "\n",
+ "# Load the raw baseline estimations and recommendations again (no id translation here) and\n",
+ "# pass them, together with the unprocessed test set, to ev.evaluate\n",
+ "estimations_df = pd.read_csv(\n",
+ "    \"Recommendations generated/ml-100k/Ready_Baseline_estimations.csv\", header=None\n",
+ ")\n",
+ "reco = np.loadtxt(\n",
+ "    \"Recommendations generated/ml-100k/Ready_Baseline_reco.csv\", delimiter=\",\"\n",
+ ")\n",
+ "\n",
+ "ev.evaluate(\n",
+ "    test=pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None),\n",
+ "    estimations_df=estimations_df,\n",
+ "    reco=reco,\n",
+ "    super_reactions=[4, 5],\n",
+ ")\n",
+ "# also you can just type ev.evaluate_all(estimations_df, reco) - I put above values as default\n",
+ "# NOTE(review): the call above is ev.evaluate, so the shortcut presumably should be\n",
+ "# ev.evaluate(estimations_df, reco) - confirm against evaluation_measures"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 10346.82it/s]\n",
+ "943it [00:00, 11772.32it/s]\n",
+ "943it [00:00, 10636.62it/s]\n",
+ "943it [00:00, 10767.92it/s]\n",
+ "943it [00:00, 12019.93it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Evaluate the models whose output files are stored in dir_path against the same test set;\n",
+ "# the resulting frame has one row per model (see the tables displayed below)\n",
+ "dir_path = \"Recommendations generated/ml-100k/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "\n",
+ "df = ev.evaluate_all(test, dir_path, super_reactions)\n",
+ "# also you can just type ev.evaluate_all() - I put above values as default"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_TopPop | \n",
+ " 2.508258 | \n",
+ " 2.217909 | \n",
+ " 0.188865 | \n",
+ " 0.116919 | \n",
+ " 0.118732 | \n",
+ " 0.141584 | \n",
+ " 0.130472 | \n",
+ " 0.137473 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Baseline | \n",
+ " 0.949459 | \n",
+ " 0.752487 | \n",
+ " 0.091410 | \n",
+ " 0.037652 | \n",
+ " 0.046030 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Random | \n",
+ " 1.521845 | \n",
+ " 1.225949 | \n",
+ " 0.047190 | \n",
+ " 0.020753 | \n",
+ " 0.024810 | \n",
+ " 0.032269 | \n",
+ " 0.029506 | \n",
+ " 0.023707 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_TopRated | \n",
+ " 1.030712 | \n",
+ " 0.820904 | \n",
+ " 0.000954 | \n",
+ " 0.000188 | \n",
+ " 0.000298 | \n",
+ " 0.000481 | \n",
+ " 0.000644 | \n",
+ " 0.000223 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 0.967585 | \n",
+ " 0.762740 | \n",
+ " 0.000954 | \n",
+ " 0.000170 | \n",
+ " 0.000278 | \n",
+ " 0.000463 | \n",
+ " 0.000644 | \n",
+ " 0.000189 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model RMSE MAE precision recall F_1 \\\n",
+ "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
+ "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
+ "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n",
+ "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
+ "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
+ "\n",
+ " F_05 precision_super recall_super \n",
+ "0 0.141584 0.130472 0.137473 \n",
+ "0 0.061286 0.079614 0.056463 \n",
+ "0 0.032269 0.029506 0.023707 \n",
+ "0 0.000481 0.000644 0.000223 \n",
+ "0 0.000463 0.000644 0.000189 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# First nine columns: the model name followed by the estimation metrics and the first ranking metrics\n",
+ "df.iloc[:, :9]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_TopPop | \n",
+ " 0.214651 | \n",
+ " 0.111707 | \n",
+ " 0.400939 | \n",
+ " 0.555546 | \n",
+ " 0.765642 | \n",
+ " 1.000000 | \n",
+ " 0.038961 | \n",
+ " 3.159079 | \n",
+ " 0.987317 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Baseline | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ " 1.000000 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Random | \n",
+ " 0.050075 | \n",
+ " 0.018728 | \n",
+ " 0.121957 | \n",
+ " 0.506893 | \n",
+ " 0.329799 | \n",
+ " 0.986532 | \n",
+ " 0.184704 | \n",
+ " 5.099706 | \n",
+ " 0.907217 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_TopRated | \n",
+ " 0.001043 | \n",
+ " 0.000335 | \n",
+ " 0.003348 | \n",
+ " 0.496433 | \n",
+ " 0.009544 | \n",
+ " 0.699046 | \n",
+ " 0.005051 | \n",
+ " 1.945910 | \n",
+ " 0.995669 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 0.000752 | \n",
+ " 0.000168 | \n",
+ " 0.001677 | \n",
+ " 0.496424 | \n",
+ " 0.009544 | \n",
+ " 0.600530 | \n",
+ " 0.005051 | \n",
+ " 1.803126 | \n",
+ " 0.996380 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model NDCG mAP MRR LAUC HR \\\n",
+ "0 Self_TopPop 0.214651 0.111707 0.400939 0.555546 0.765642 \n",
+ "0 Ready_Baseline 0.095957 0.043178 0.198193 0.515501 0.437964 \n",
+ "0 Ready_Random 0.050075 0.018728 0.121957 0.506893 0.329799 \n",
+ "0 Self_TopRated 0.001043 0.000335 0.003348 0.496433 0.009544 \n",
+ "0 Self_BaselineUI 0.000752 0.000168 0.001677 0.496424 0.009544 \n",
+ "\n",
+ " Reco in test Test coverage Shannon Gini \n",
+ "0 1.000000 0.038961 3.159079 0.987317 \n",
+ "0 1.000000 0.033911 2.836513 0.991139 \n",
+ "0 0.986532 0.184704 5.099706 0.907217 \n",
+ "0 0.699046 0.005051 1.945910 0.995669 \n",
+ "0 0.600530 0.005051 1.803126 0.996380 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Model name (column 0) together with every column from position 9 onwards\n",
+ "df.iloc[:, np.append(0, np.arange(9, df.shape[1]))]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Check metrics on toy dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "3it [00:00, 5771.98it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 1.612452 | \n",
+ " 1.4 | \n",
+ " 0.444444 | \n",
+ " 0.888889 | \n",
+ " 0.555556 | \n",
+ " 0.478632 | \n",
+ " 0.333333 | \n",
+ " 0.75 | \n",
+ " 0.676907 | \n",
+ " 0.574074 | \n",
+ " 0.611111 | \n",
+ " 0.638889 | \n",
+ " 1.0 | \n",
+ " 0.888889 | \n",
+ " 0.8 | \n",
+ " 1.386294 | \n",
+ " 0.25 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model RMSE MAE precision recall F_1 F_05 \\\n",
+ "0 Self_BaselineUI 1.612452 1.4 0.444444 0.888889 0.555556 0.478632 \n",
+ "\n",
+ " precision_super recall_super NDCG mAP MRR LAUC HR \\\n",
+ "0 0.333333 0.75 0.676907 0.574074 0.611111 0.638889 1.0 \n",
+ "\n",
+ " Reco in test Test coverage Shannon Gini \n",
+ "0 0.888889 0.8 1.386294 0.25 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training data:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
+ " [0, 1, 2, 3, 0, 0, 0, 0],\n",
+ " [0, 0, 0, 5, 0, 3, 4, 0]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Test data:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "matrix([[0, 0, 0, 0, 0, 0, 3, 0],\n",
+ " [0, 0, 0, 0, 5, 0, 0, 0],\n",
+ " [5, 0, 4, 0, 0, 0, 0, 2]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Recommendations:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 30 | \n",
+ " 5.0 | \n",
+ " 20 | \n",
+ " 4.0 | \n",
+ " 60 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 10 | \n",
+ " 40 | \n",
+ " 3.0 | \n",
+ " 60 | \n",
+ " 2.0 | \n",
+ " 70 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20 | \n",
+ " 40 | \n",
+ " 5.0 | \n",
+ " 20 | \n",
+ " 4.0 | \n",
+ " 70 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6\n",
+ "0 0 30 5.0 20 4.0 60 4.0\n",
+ "1 10 40 3.0 60 2.0 70 2.0\n",
+ "2 20 40 5.0 20 4.0 70 4.0"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Estimations:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user | \n",
+ " item | \n",
+ " est_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 60 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 10 | \n",
+ " 40 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20 | \n",
+ " 0 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 20 | \n",
+ " 20 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 20 | \n",
+ " 70 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user item est_score\n",
+ "0 0 60 4.0\n",
+ "1 10 40 3.0\n",
+ "2 20 0 3.0\n",
+ "3 20 20 4.0\n",
+ "4 20 70 4.0"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import helpers\n",
+ "\n",
+ "dir_path = \"Recommendations generated/toy-example/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/toy-example/test.csv\", sep=\"\\t\", header=None)\n",
+ "\n",
+ "display(ev.evaluate_all(test, dir_path, super_reactions, topK=3))\n",
+ "# also you can just type ev.evaluate_all() - I put above values as default\n",
+ "\n",
+ "toy_train_read = pd.read_csv(\n",
+ " \"./Datasets/toy-example/train.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "toy_test_read = pd.read_csv(\n",
+ " \"./Datasets/toy-example/test.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "reco = pd.read_csv(\n",
+ " \"Recommendations generated/toy-example/Self_BaselineUI_reco.csv\", header=None\n",
+ ")\n",
+ "estimations = pd.read_csv(\n",
+ " \"Recommendations generated/toy-example/Self_BaselineUI_estimations.csv\",\n",
+ " names=[\"user\", \"item\", \"est_score\"],\n",
+ ")\n",
+ "(\n",
+ " toy_train_ui,\n",
+ " toy_test_ui,\n",
+ " toy_user_code_id,\n",
+ " toy_user_id_code,\n",
+ " toy_item_code_id,\n",
+ " toy_item_id_code,\n",
+ ") = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
+ "\n",
+ "print(\"Training data:\")\n",
+ "display(toy_train_ui.todense())\n",
+ "\n",
+ "print(\"Test data:\")\n",
+ "display(toy_test_ui.todense())\n",
+ "\n",
+ "print(\"Recommendations:\")\n",
+ "display(reco)\n",
+ "\n",
+ "print(\"Estimations:\")\n",
+ "display(estimations)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sample recommendations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Here is what user rated high:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user | \n",
+ " rating | \n",
+ " title | \n",
+ " genres | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 57482 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Emma (1996) | \n",
+ " Drama, Romance | \n",
+ "
\n",
+ " \n",
+ " 54506 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Sense and Sensibility (1995) | \n",
+ " Drama, Romance | \n",
+ "
\n",
+ " \n",
+ " 40581 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Titanic (1997) | \n",
+ " Action, Drama, Romance | \n",
+ "
\n",
+ " \n",
+ " 2949 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Star Wars (1977) | \n",
+ " Action, Adventure, Romance, Sci-Fi, War | \n",
+ "
\n",
+ " \n",
+ " 69653 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Wings of the Dove, The (1997) | \n",
+ " Drama, Romance, Thriller | \n",
+ "
\n",
+ " \n",
+ " 7906 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " As Good As It Gets (1997) | \n",
+ " Comedy, Drama | \n",
+ "
\n",
+ " \n",
+ " 69400 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Shall We Dance? (1996) | \n",
+ " Comedy | \n",
+ "
\n",
+ " \n",
+ " 14469 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Fargo (1996) | \n",
+ " Crime, Drama, Thriller | \n",
+ "
\n",
+ " \n",
+ " 46151 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " L.A. Confidential (1997) | \n",
+ " Crime, Film-Noir, Mystery, Thriller | \n",
+ "
\n",
+ " \n",
+ " 67293 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Good Will Hunting (1997) | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 20923 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Secrets & Lies (1996) | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 52921 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Kolya (1996) | \n",
+ " Comedy | \n",
+ "
\n",
+ " \n",
+ " 50103 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " Mrs. Brown (Her Majesty, Mrs. Brown) (1997) | \n",
+ " Drama, Romance | \n",
+ "
\n",
+ " \n",
+ " 51972 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " Mighty Aphrodite (1995) | \n",
+ " Comedy | \n",
+ "
\n",
+ " \n",
+ " 515 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " Heat (1995) | \n",
+ " Action, Crime, Thriller | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user rating title \\\n",
+ "57482 2 5 Emma (1996) \n",
+ "54506 2 5 Sense and Sensibility (1995) \n",
+ "40581 2 5 Titanic (1997) \n",
+ "2949 2 5 Star Wars (1977) \n",
+ "69653 2 5 Wings of the Dove, The (1997) \n",
+ "7906 2 5 As Good As It Gets (1997) \n",
+ "69400 2 5 Shall We Dance? (1996) \n",
+ "14469 2 5 Fargo (1996) \n",
+ "46151 2 5 L.A. Confidential (1997) \n",
+ "67293 2 5 Good Will Hunting (1997) \n",
+ "20923 2 5 Secrets & Lies (1996) \n",
+ "52921 2 5 Kolya (1996) \n",
+ "50103 2 4 Mrs. Brown (Her Majesty, Mrs. Brown) (1997) \n",
+ "51972 2 4 Mighty Aphrodite (1995) \n",
+ "515 2 4 Heat (1995) \n",
+ "\n",
+ " genres \n",
+ "57482 Drama, Romance \n",
+ "54506 Drama, Romance \n",
+ "40581 Action, Drama, Romance \n",
+ "2949 Action, Adventure, Romance, Sci-Fi, War \n",
+ "69653 Drama, Romance, Thriller \n",
+ "7906 Comedy, Drama \n",
+ "69400 Comedy \n",
+ "14469 Crime, Drama, Thriller \n",
+ "46151 Crime, Film-Noir, Mystery, Thriller \n",
+ "67293 Drama \n",
+ "20923 Drama \n",
+ "52921 Comedy \n",
+ "50103 Drama, Romance \n",
+ "51972 Comedy \n",
+ "515 Action, Crime, Thriller "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Here is what we recommend:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user | \n",
+ " rec_nb | \n",
+ " title | \n",
+ " genres | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " Great Day in Harlem, A (1994) | \n",
+ " Documentary | \n",
+ "
\n",
+ " \n",
+ " 943 | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " Tough and Deadly (1995) | \n",
+ " Action, Drama, Thriller | \n",
+ "
\n",
+ " \n",
+ " 1885 | \n",
+ " 2.0 | \n",
+ " 3 | \n",
+ " Aiqing wansui (1994) | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 2827 | \n",
+ " 2.0 | \n",
+ " 4 | \n",
+ " Delta of Venus (1994) | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 3769 | \n",
+ " 2.0 | \n",
+ " 5 | \n",
+ " Someone Else's America (1995) | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 4711 | \n",
+ " 2.0 | \n",
+ " 6 | \n",
+ " Saint of Fort Washington, The (1993) | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 5653 | \n",
+ " 2.0 | \n",
+ " 7 | \n",
+ " Celestial Clockwork (1994) | \n",
+ " Comedy | \n",
+ "
\n",
+ " \n",
+ " 6595 | \n",
+ " 2.0 | \n",
+ " 8 | \n",
+ " Some Mother's Son (1996) | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 8489 | \n",
+ " 2.0 | \n",
+ " 9 | \n",
+ " Maya Lin: A Strong Clear Vision (1994) | \n",
+ " Documentary | \n",
+ "
\n",
+ " \n",
+ " 7536 | \n",
+ " 2.0 | \n",
+ " 10 | \n",
+ " Prefontaine (1997) | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " user rec_nb title \\\n",
+ "1 2.0 1 Great Day in Harlem, A (1994) \n",
+ "943 2.0 2 Tough and Deadly (1995) \n",
+ "1885 2.0 3 Aiqing wansui (1994) \n",
+ "2827 2.0 4 Delta of Venus (1994) \n",
+ "3769 2.0 5 Someone Else's America (1995) \n",
+ "4711 2.0 6 Saint of Fort Washington, The (1993) \n",
+ "5653 2.0 7 Celestial Clockwork (1994) \n",
+ "6595 2.0 8 Some Mother's Son (1996) \n",
+ "8489 2.0 9 Maya Lin: A Strong Clear Vision (1994) \n",
+ "7536 2.0 10 Prefontaine (1997) \n",
+ "\n",
+ " genres \n",
+ "1 Documentary \n",
+ "943 Action, Drama, Thriller \n",
+ "1885 Drama \n",
+ "2827 Drama \n",
+ "3769 Drama \n",
+ "4711 Drama \n",
+ "5653 Comedy \n",
+ "6595 Drama \n",
+ "8489 Documentary \n",
+ "7536 Drama "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train = pd.read_csv(\n",
+ "    \"./Datasets/ml-100k/train.csv\",\n",
+ "    sep=\"\\t\",\n",
+ "    header=None,\n",
+ "    names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "# movie metadata is loaded once and reused for both merges below\n",
+ "items = pd.read_csv(\"./Datasets/ml-100k/movies.csv\")\n",
+ "\n",
+ "user = random.choice(list(set(train[\"user\"])))\n",
+ "\n",
+ "train_content = pd.merge(train, items, left_on=\"item\", right_on=\"id\")\n",
+ "\n",
+ "print(\"Here is what user rated high:\")\n",
+ "display(\n",
+ "    train_content[train_content[\"user\"] == user][\n",
+ "        [\"user\", \"rating\", \"title\", \"genres\"]\n",
+ "    ].sort_values(by=\"rating\", ascending=False)[:15]\n",
+ ")\n",
+ "\n",
+ "reco = np.loadtxt(\n",
+ "    \"Recommendations generated/ml-100k/Self_BaselineUI_reco.csv\", delimiter=\",\"\n",
+ ")\n",
+ "\n",
+ "# Let's ignore scores - they are not used in evaluation:\n",
+ "reco_users = reco[:, :1]\n",
+ "reco_items = reco[:, 1::2]\n",
+ "# Let's put them into one array. np.loadtxt returns floats, so the ids are cast back to int\n",
+ "# (otherwise users are displayed as e.g. 2.0 and the merge key below is a float)\n",
+ "reco = np.concatenate((reco_users, reco_items), axis=1).astype(int)\n",
+ "\n",
+ "# Let's rebuild it user-item dataframe\n",
+ "recommended = []\n",
+ "for row in reco:\n",
+ "    for rec_nb, entry in enumerate(row[1:]):\n",
+ "        recommended.append((row[0], rec_nb + 1, entry))\n",
+ "recommended = pd.DataFrame(recommended, columns=[\"user\", \"rec_nb\", \"item\"])\n",
+ "\n",
+ "recommended_content = pd.merge(recommended, items, left_on=\"item\", right_on=\"id\")\n",
+ "\n",
+ "print(\"Here is what we recommend:\")\n",
+ "recommended_content[recommended_content[\"user\"] == user][\n",
+ "    [\"user\", \"rec_nb\", \"title\", \"genres\"]\n",
+ "].sort_values(by=\"rec_nb\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# project task 2: implement some other evaluation measure"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# it may be your idea, modification of what we have already implemented\n",
+ "# (for example Hit2 rate which would count as a success users who received at least 2 relevant recommendations)\n",
+ "# or something well-known\n",
+ "# expected output: modification of evaluation_measures.py such that evaluate_all will also display your measure"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/.ipynb_checkpoints/P3. k-nearest neighbours-checkpoint.ipynb b/.ipynb_checkpoints/P3. k-nearest neighbours-checkpoint.ipynb
new file mode 100644
index 0000000..a15592c
--- /dev/null
+++ b/.ipynb_checkpoints/P3. k-nearest neighbours-checkpoint.ipynb
@@ -0,0 +1,1057 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Self made simplified I-KNN"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import helpers\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import scipy.sparse as sparse\n",
+ "from collections import defaultdict\n",
+ "from itertools import chain\n",
+ "import random\n",
+ "\n",
+ "train_read = pd.read_csv(\"./Datasets/ml-100k/train.csv\", sep=\"\\t\", header=None)\n",
+ "test_read = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "(\n",
+ " train_ui,\n",
+ " test_ui,\n",
+ " user_code_id,\n",
+ " user_id_code,\n",
+ " item_code_id,\n",
+ " item_id_code,\n",
+ ") = helpers.data_to_csr(train_read, test_read)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class IKNN:\n",
+ "    \"\"\"Simplified item-based kNN built on an item-item cosine similarity matrix.\"\"\"\n",
+ "\n",
+ "    def fit(self, train_ui):\n",
+ "        \"\"\"Store train_ui and precompute similarity_matrix_ii and the dense estimations array.\"\"\"\n",
+ "        self.train_ui = train_ui\n",
+ "\n",
+ "        train_iu = train_ui.transpose()\n",
+ "        # here we compute length of each item ratings vector\n",
+ "        norms = np.linalg.norm(train_iu.A, axis=1)\n",
+ "        # Clamp to at least 1 to avoid dividing by zero. np.maximum keeps the float dtype,\n",
+ "        # whereas np.vectorize(lambda x: max(x, 1)) infers its output dtype from the first\n",
+ "        # element and would truncate all norms to int if the first item had a zero norm.\n",
+ "        norms = np.maximum(norms[:, None], 1)\n",
+ "\n",
+ "        normalized_train_iu = sparse.csr_matrix(train_iu / norms)\n",
+ "\n",
+ "        self.similarity_matrix_ii = (\n",
+ "            normalized_train_iu * normalized_train_iu.transpose()\n",
+ "        )\n",
+ "\n",
+ "        # similarity-weighted sum of the user's ratings divided by the summed similarity of\n",
+ "        # the items the user rated; entries can be nan, which recommend/estimate check for\n",
+ "        self.estimations = np.array(\n",
+ "            train_ui\n",
+ "            * self.similarity_matrix_ii\n",
+ "            / ((train_ui > 0) * self.similarity_matrix_ii)\n",
+ "        )\n",
+ "\n",
+ "    def recommend(self, user_code_id, item_code_id, topK=10):\n",
+ "        \"\"\"Return rows [user, item1, score1, item2, score2, ...] with up to topK unrated items.\"\"\"\n",
+ "        top_k = defaultdict(list)\n",
+ "        for nb_user, user in enumerate(self.estimations):\n",
+ "            # a set gives constant-time membership tests instead of scanning the index array\n",
+ "            # once per candidate item\n",
+ "            user_rated = set(\n",
+ "                self.train_ui.indices[\n",
+ "                    self.train_ui.indptr[nb_user] : self.train_ui.indptr[nb_user + 1]\n",
+ "                ].tolist()\n",
+ "            )\n",
+ "            for item, score in enumerate(user):\n",
+ "                if item not in user_rated and not np.isnan(score):\n",
+ "                    top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n",
+ "        result = []\n",
+ "        # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
+ "        for uid, item_scores in top_k.items():\n",
+ "            item_scores.sort(key=lambda x: x[1], reverse=True)\n",
+ "            result.append([uid] + list(chain(*item_scores[:topK])))\n",
+ "        return result\n",
+ "\n",
+ "    def estimate(self, user_code_id, item_code_id, test_ui):\n",
+ "        \"\"\"Return [user, item, estimate] for every nonzero entry of test_ui (nan becomes 1).\"\"\"\n",
+ "        result = []\n",
+ "        for user, item in zip(*test_ui.nonzero()):\n",
+ "            result.append(\n",
+ "                [\n",
+ "                    user_code_id[user],\n",
+ "                    item_code_id[item],\n",
+ "                    self.estimations[user, item]\n",
+ "                    if not np.isnan(self.estimations[user, item])\n",
+ "                    else 1,\n",
+ "                ]\n",
+ "            )\n",
+ "        return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "toy train ui:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[3, 4, 0, 0, 5, 0, 0, 4],\n",
+ " [0, 1, 2, 3, 0, 0, 0, 0],\n",
+ " [0, 0, 0, 5, 0, 3, 4, 0]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "similarity matrix:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[1. , 0.9701425 , 0. , 0. , 1. ,\n",
+ " 0. , 0. , 1. ],\n",
+ " [0.9701425 , 1. , 0.24253563, 0.12478355, 0.9701425 ,\n",
+ " 0. , 0. , 0.9701425 ],\n",
+ " [0. , 0.24253563, 1. , 0.51449576, 0. ,\n",
+ " 0. , 0. , 0. ],\n",
+ " [0. , 0.12478355, 0.51449576, 1. , 0. ,\n",
+ " 0.85749293, 0.85749293, 0. ],\n",
+ " [1. , 0.9701425 , 0. , 0. , 1. ,\n",
+ " 0. , 0. , 1. ],\n",
+ " [0. , 0. , 0. , 0.85749293, 0. ,\n",
+ " 1. , 1. , 0. ],\n",
+ " [0. , 0. , 0. , 0.85749293, 0. ,\n",
+ " 1. , 1. , 0. ],\n",
+ " [1. , 0.9701425 , 0. , 0. , 1. ,\n",
+ " 0. , 0. , 1. ]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "estimations matrix:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[4. , 4. , 4. , 4. , 4. ,\n",
+ " nan, nan, 4. ],\n",
+ " [1. , 1.35990333, 2.15478388, 2.53390319, 1. ,\n",
+ " 3. , 3. , 1. ],\n",
+ " [ nan, 5. , 5. , 4.05248907, nan,\n",
+ " 3.95012863, 3.95012863, nan]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[[0, 20, 4.0, 30, 4.0],\n",
+ " [10, 50, 3.0, 60, 3.0, 0, 1.0, 40, 1.0, 70, 1.0],\n",
+ " [20, 10, 5.0, 20, 5.0]]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# toy example\n",
+ "toy_train_read = pd.read_csv(\n",
+ " \"./Datasets/toy-example/train.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "toy_test_read = pd.read_csv(\n",
+ " \"./Datasets/toy-example/test.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "\n",
+ "(\n",
+ " toy_train_ui,\n",
+ " toy_test_ui,\n",
+ " toy_user_code_id,\n",
+ " toy_user_id_code,\n",
+ " toy_item_code_id,\n",
+ " toy_item_id_code,\n",
+ ") = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
+ "\n",
+ "\n",
+ "model = IKNN()\n",
+ "model.fit(toy_train_ui)\n",
+ "\n",
+ "print(\"toy train ui:\")\n",
+ "display(toy_train_ui.A)\n",
+ "\n",
+ "print(\"similarity matrix:\")\n",
+ "display(model.similarity_matrix_ii.A)\n",
+ "\n",
+ "print(\"estimations matrix:\")\n",
+ "display(model.estimations)\n",
+ "\n",
+ "model.recommend(toy_user_code_id, toy_item_code_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = IKNN()\n",
+ "model.fit(train_ui)\n",
+ "\n",
+ "top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
+ "\n",
+ "top_n.to_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_IKNN_reco.csv\", index=False, header=False\n",
+ ")\n",
+ "\n",
+ "estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
+ "estimations.to_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_IKNN_estimations.csv\",\n",
+ " index=False,\n",
+ " header=False,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 9004.71it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1.018363 | \n",
+ " 0.808793 | \n",
+ " 0.000318 | \n",
+ " 0.000108 | \n",
+ " 0.00014 | \n",
+ " 0.000189 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.000214 | \n",
+ " 0.000037 | \n",
+ " 0.000368 | \n",
+ " 0.496391 | \n",
+ " 0.003181 | \n",
+ " 0.392153 | \n",
+ " 0.11544 | \n",
+ " 4.174741 | \n",
+ " 0.965327 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " RMSE MAE precision recall F_1 F_05 \\\n",
+ "0 1.018363 0.808793 0.000318 0.000108 0.00014 0.000189 \n",
+ "\n",
+ " precision_super recall_super NDCG mAP MRR LAUC \\\n",
+ "0 0.0 0.0 0.000214 0.000037 0.000368 0.496391 \n",
+ "\n",
+ " HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.003181 0.392153 0.11544 4.174741 0.965327 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import evaluation_measures as ev\n",
+ "\n",
+ "estimations_df = pd.read_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_IKNN_estimations.csv\", header=None\n",
+ ")\n",
+ "reco = np.loadtxt(\"Recommendations generated/ml-100k/Self_IKNN_reco.csv\", delimiter=\",\")\n",
+ "\n",
+ "ev.evaluate(\n",
+ " test=pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None),\n",
+ " estimations_df=estimations_df,\n",
+ " reco=reco,\n",
+ " super_reactions=[4, 5],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 8517.83it/s]\n",
+ "943it [00:00, 11438.64it/s]\n",
+ "943it [00:00, 11933.36it/s]\n",
+ "943it [00:00, 10307.81it/s]\n",
+ "943it [00:00, 12250.41it/s]\n",
+ "943it [00:00, 12064.07it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_TopPop | \n",
+ " 2.508258 | \n",
+ " 2.217909 | \n",
+ " 0.188865 | \n",
+ " 0.116919 | \n",
+ " 0.118732 | \n",
+ " 0.141584 | \n",
+ " 0.130472 | \n",
+ " 0.137473 | \n",
+ " 0.214651 | \n",
+ " 0.111707 | \n",
+ " 0.400939 | \n",
+ " 0.555546 | \n",
+ " 0.765642 | \n",
+ " 1.000000 | \n",
+ " 0.038961 | \n",
+ " 3.159079 | \n",
+ " 0.987317 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Baseline | \n",
+ " 0.949459 | \n",
+ " 0.752487 | \n",
+ " 0.091410 | \n",
+ " 0.037652 | \n",
+ " 0.046030 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ " 1.000000 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Random | \n",
+ " 1.521845 | \n",
+ " 1.225949 | \n",
+ " 0.047190 | \n",
+ " 0.020753 | \n",
+ " 0.024810 | \n",
+ " 0.032269 | \n",
+ " 0.029506 | \n",
+ " 0.023707 | \n",
+ " 0.050075 | \n",
+ " 0.018728 | \n",
+ " 0.121957 | \n",
+ " 0.506893 | \n",
+ " 0.329799 | \n",
+ " 0.986532 | \n",
+ " 0.184704 | \n",
+ " 5.099706 | \n",
+ " 0.907217 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_TopRated | \n",
+ " 1.030712 | \n",
+ " 0.820904 | \n",
+ " 0.000954 | \n",
+ " 0.000188 | \n",
+ " 0.000298 | \n",
+ " 0.000481 | \n",
+ " 0.000644 | \n",
+ " 0.000223 | \n",
+ " 0.001043 | \n",
+ " 0.000335 | \n",
+ " 0.003348 | \n",
+ " 0.496433 | \n",
+ " 0.009544 | \n",
+ " 0.699046 | \n",
+ " 0.005051 | \n",
+ " 1.945910 | \n",
+ " 0.995669 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 0.967585 | \n",
+ " 0.762740 | \n",
+ " 0.000954 | \n",
+ " 0.000170 | \n",
+ " 0.000278 | \n",
+ " 0.000463 | \n",
+ " 0.000644 | \n",
+ " 0.000189 | \n",
+ " 0.000752 | \n",
+ " 0.000168 | \n",
+ " 0.001677 | \n",
+ " 0.496424 | \n",
+ " 0.009544 | \n",
+ " 0.600530 | \n",
+ " 0.005051 | \n",
+ " 1.803126 | \n",
+ " 0.996380 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_IKNN | \n",
+ " 1.018363 | \n",
+ " 0.808793 | \n",
+ " 0.000318 | \n",
+ " 0.000108 | \n",
+ " 0.000140 | \n",
+ " 0.000189 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000214 | \n",
+ " 0.000037 | \n",
+ " 0.000368 | \n",
+ " 0.496391 | \n",
+ " 0.003181 | \n",
+ " 0.392153 | \n",
+ " 0.115440 | \n",
+ " 4.174741 | \n",
+ " 0.965327 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model RMSE MAE precision recall F_1 \\\n",
+ "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
+ "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
+ "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n",
+ "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
+ "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
+ "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n",
+ "\n",
+ " F_05 precision_super recall_super NDCG mAP MRR \\\n",
+ "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n",
+ "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n",
+ "0 0.032269 0.029506 0.023707 0.050075 0.018728 0.121957 \n",
+ "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n",
+ "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n",
+ "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n",
+ "\n",
+ " LAUC HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n",
+ "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n",
+ "0 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217 \n",
+ "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n",
+ "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 \n",
+ "0 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dir_path = \"Recommendations generated/ml-100k/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "\n",
+ "ev.evaluate_all(test, dir_path, super_reactions)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Ready-made KNNs - Surprise implementation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### I-KNN - basic"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Computing the cosine similarity matrix...\n",
+ "Done computing similarity matrix.\n",
+ "Generating predictions...\n",
+ "Generating top N recommendations...\n",
+ "Generating predictions...\n"
+ ]
+ }
+ ],
+ "source": [
+ "import helpers\n",
+ "import surprise as sp\n",
+ "\n",
+ "sim_options = {\n",
+ " \"name\": \"cosine\",\n",
+ " \"user_based\": False,\n",
+ "} # compute similarities between items\n",
+ "algo = sp.KNNBasic(sim_options=sim_options)\n",
+ "\n",
+ "helpers.ready_made(\n",
+ " algo,\n",
+ " reco_path=\"Recommendations generated/ml-100k/Ready_I-KNN_reco.csv\",\n",
+ " estimations_path=\"Recommendations generated/ml-100k/Ready_I-KNN_estimations.csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### U-KNN - basic"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Computing the cosine similarity matrix...\n",
+ "Done computing similarity matrix.\n",
+ "Generating predictions...\n",
+ "Generating top N recommendations...\n",
+ "Generating predictions...\n"
+ ]
+ }
+ ],
+ "source": [
+ "sim_options = {\n",
+ " \"name\": \"cosine\",\n",
+ " \"user_based\": True,\n",
+ "} # compute similarities between users\n",
+ "algo = sp.KNNBasic(sim_options=sim_options)\n",
+ "\n",
+ "helpers.ready_made(\n",
+ " algo,\n",
+ " reco_path=\"Recommendations generated/ml-100k/Ready_U-KNN_reco.csv\",\n",
+ " estimations_path=\"Recommendations generated/ml-100k/Ready_U-KNN_estimations.csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### I-KNN - on top baseline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Estimating biases using als...\n",
+ "Computing the msd similarity matrix...\n",
+ "Done computing similarity matrix.\n",
+ "Generating predictions...\n",
+ "Generating top N recommendations...\n",
+ "Generating predictions...\n"
+ ]
+ }
+ ],
+ "source": [
+ "sim_options = {\n",
+ "    \"name\": \"cosine\",\n",
+ "    \"user_based\": False,\n",
+ "}  # compute similarities between items\n",
+ "# sim_options must be passed explicitly: a bare sp.KNNBaseline() ignores the dict above and\n",
+ "# falls back to Surprise's default similarity settings instead of item-based cosine\n",
+ "algo = sp.KNNBaseline(sim_options=sim_options)\n",
+ "\n",
+ "helpers.ready_made(\n",
+ "    algo,\n",
+ "    reco_path=\"Recommendations generated/ml-100k/Ready_I-KNNBaseline_reco.csv\",\n",
+ "    estimations_path=\"Recommendations generated/ml-100k/Ready_I-KNNBaseline_estimations.csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 11286.27it/s]\n",
+ "943it [00:00, 10874.86it/s]\n",
+ "943it [00:00, 11509.97it/s]\n",
+ "943it [00:00, 11855.81it/s]\n",
+ "943it [00:00, 11574.00it/s]\n",
+ "943it [00:00, 11080.19it/s]\n",
+ "943it [00:00, 11550.84it/s]\n",
+ "943it [00:00, 12148.14it/s]\n",
+ "943it [00:00, 10779.39it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_TopPop | \n",
+ " 2.508258 | \n",
+ " 2.217909 | \n",
+ " 0.188865 | \n",
+ " 0.116919 | \n",
+ " 0.118732 | \n",
+ " 0.141584 | \n",
+ " 0.130472 | \n",
+ " 0.137473 | \n",
+ " 0.214651 | \n",
+ " 0.111707 | \n",
+ " 0.400939 | \n",
+ " 0.555546 | \n",
+ " 0.765642 | \n",
+ " 1.000000 | \n",
+ " 0.038961 | \n",
+ " 3.159079 | \n",
+ " 0.987317 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Baseline | \n",
+ " 0.949459 | \n",
+ " 0.752487 | \n",
+ " 0.091410 | \n",
+ " 0.037652 | \n",
+ " 0.046030 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ " 1.000000 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Random | \n",
+ " 1.521845 | \n",
+ " 1.225949 | \n",
+ " 0.047190 | \n",
+ " 0.020753 | \n",
+ " 0.024810 | \n",
+ " 0.032269 | \n",
+ " 0.029506 | \n",
+ " 0.023707 | \n",
+ " 0.050075 | \n",
+ " 0.018728 | \n",
+ " 0.121957 | \n",
+ " 0.506893 | \n",
+ " 0.329799 | \n",
+ " 0.986532 | \n",
+ " 0.184704 | \n",
+ " 5.099706 | \n",
+ " 0.907217 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_I-KNN | \n",
+ " 1.030386 | \n",
+ " 0.813067 | \n",
+ " 0.026087 | \n",
+ " 0.006908 | \n",
+ " 0.010593 | \n",
+ " 0.016046 | \n",
+ " 0.021137 | \n",
+ " 0.009522 | \n",
+ " 0.024214 | \n",
+ " 0.008958 | \n",
+ " 0.048068 | \n",
+ " 0.499885 | \n",
+ " 0.154825 | \n",
+ " 0.402333 | \n",
+ " 0.434343 | \n",
+ " 5.133650 | \n",
+ " 0.877999 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_I-KNNBaseline | \n",
+ " 0.935327 | \n",
+ " 0.737424 | \n",
+ " 0.002545 | \n",
+ " 0.000755 | \n",
+ " 0.001105 | \n",
+ " 0.001602 | \n",
+ " 0.002253 | \n",
+ " 0.000930 | \n",
+ " 0.003444 | \n",
+ " 0.001362 | \n",
+ " 0.011760 | \n",
+ " 0.496724 | \n",
+ " 0.021209 | \n",
+ " 0.482821 | \n",
+ " 0.059885 | \n",
+ " 2.232578 | \n",
+ " 0.994487 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_U-KNN | \n",
+ " 1.023495 | \n",
+ " 0.807913 | \n",
+ " 0.000742 | \n",
+ " 0.000205 | \n",
+ " 0.000305 | \n",
+ " 0.000449 | \n",
+ " 0.000536 | \n",
+ " 0.000198 | \n",
+ " 0.000845 | \n",
+ " 0.000274 | \n",
+ " 0.002744 | \n",
+ " 0.496441 | \n",
+ " 0.007423 | \n",
+ " 0.602121 | \n",
+ " 0.010823 | \n",
+ " 2.089186 | \n",
+ " 0.995706 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_TopRated | \n",
+ " 1.030712 | \n",
+ " 0.820904 | \n",
+ " 0.000954 | \n",
+ " 0.000188 | \n",
+ " 0.000298 | \n",
+ " 0.000481 | \n",
+ " 0.000644 | \n",
+ " 0.000223 | \n",
+ " 0.001043 | \n",
+ " 0.000335 | \n",
+ " 0.003348 | \n",
+ " 0.496433 | \n",
+ " 0.009544 | \n",
+ " 0.699046 | \n",
+ " 0.005051 | \n",
+ " 1.945910 | \n",
+ " 0.995669 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 0.967585 | \n",
+ " 0.762740 | \n",
+ " 0.000954 | \n",
+ " 0.000170 | \n",
+ " 0.000278 | \n",
+ " 0.000463 | \n",
+ " 0.000644 | \n",
+ " 0.000189 | \n",
+ " 0.000752 | \n",
+ " 0.000168 | \n",
+ " 0.001677 | \n",
+ " 0.496424 | \n",
+ " 0.009544 | \n",
+ " 0.600530 | \n",
+ " 0.005051 | \n",
+ " 1.803126 | \n",
+ " 0.996380 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_IKNN | \n",
+ " 1.018363 | \n",
+ " 0.808793 | \n",
+ " 0.000318 | \n",
+ " 0.000108 | \n",
+ " 0.000140 | \n",
+ " 0.000189 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000214 | \n",
+ " 0.000037 | \n",
+ " 0.000368 | \n",
+ " 0.496391 | \n",
+ " 0.003181 | \n",
+ " 0.392153 | \n",
+ " 0.115440 | \n",
+ " 4.174741 | \n",
+ " 0.965327 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model RMSE MAE precision recall F_1 \\\n",
+ "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
+ "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
+ "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n",
+ "0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 0.010593 \n",
+ "0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 0.001105 \n",
+ "0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 0.000305 \n",
+ "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
+ "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
+ "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n",
+ "\n",
+ " F_05 precision_super recall_super NDCG mAP MRR \\\n",
+ "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n",
+ "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n",
+ "0 0.032269 0.029506 0.023707 0.050075 0.018728 0.121957 \n",
+ "0 0.016046 0.021137 0.009522 0.024214 0.008958 0.048068 \n",
+ "0 0.001602 0.002253 0.000930 0.003444 0.001362 0.011760 \n",
+ "0 0.000449 0.000536 0.000198 0.000845 0.000274 0.002744 \n",
+ "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n",
+ "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n",
+ "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n",
+ "\n",
+ " LAUC HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n",
+ "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n",
+ "0 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217 \n",
+ "0 0.499885 0.154825 0.402333 0.434343 5.133650 0.877999 \n",
+ "0 0.496724 0.021209 0.482821 0.059885 2.232578 0.994487 \n",
+ "0 0.496441 0.007423 0.602121 0.010823 2.089186 0.995706 \n",
+ "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n",
+ "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 \n",
+ "0 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dir_path = \"Recommendations generated/ml-100k/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "\n",
+ "ev.evaluate_all(test, dir_path, super_reactions)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# project task 3: use a version of your choice of Surprise KNN algorithm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read the docs and try to find the best parameter configuration (let's say in terms of RMSE)\n",
+ "# https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline\n",
+ "# the solution here can be similar to examples above\n",
+ "# please save the output in 'Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv' and\n",
+ "# 'Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv'"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/P0. Data preparation.ipynb b/P0. Data preparation.ipynb
index e905e56..c40508c 100644
--- a/P0. Data preparation.ipynb
+++ b/P0. Data preparation.ipynb
@@ -9,7 +9,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -58,7 +58,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -137,7 +137,7 @@
"4 166 346 1 886397596"
]
},
- "execution_count": 17,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -155,7 +155,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -184,7 +184,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -226,7 +226,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -268,7 +268,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -283,7 +283,7 @@
"Name: user, dtype: float64"
]
},
- "execution_count": 21,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -301,7 +301,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -312,7 +312,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -339,7 +339,7 @@
" 18: 'Western'}"
]
},
- "execution_count": 23,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -350,7 +350,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -359,7 +359,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -503,7 +503,7 @@
"[3 rows x 24 columns]"
]
},
- "execution_count": 25,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -514,7 +514,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -524,7 +524,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -533,7 +533,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -543,7 +543,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -616,7 +616,7 @@
"4 5 Copycat (1995) Crime, Drama, Thriller"
]
},
- "execution_count": 29,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -635,7 +635,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -644,7 +644,7 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
diff --git a/P1. Baseline.ipynb b/P1. Baseline.ipynb
index 3dbaf3a..85b9494 100644
--- a/P1. Baseline.ipynb
+++ b/P1. Baseline.ipynb
@@ -306,7 +306,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "658 ns ± 16.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
+ "471 ns ± 15.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
"Inefficient way to access items rated by user:\n"
]
},
@@ -324,7 +324,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "67.8 µs ± 1.68 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
+ "48.3 µs ± 1.51 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
@@ -1318,7 +1318,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -1342,7 +1342,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
@@ -1446,24 +1446,24 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "RMSE: 1.5230\n",
- "MAE: 1.2226\n"
+ "RMSE: 1.5165\n",
+ "MAE: 1.2172\n"
]
},
{
"data": {
"text/plain": [
- "1.2226271020019277"
+ "1.2172144988785374"
]
},
- "execution_count": 30,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -1496,34 +1496,6 @@
"\n",
"sp.accuracy.mae(predictions, verbose=True)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/P2. Evaluation.ipynb b/P2. Evaluation.ipynb
index fdea66d..e89d78d 100644
--- a/P2. Evaluation.ipynb
+++ b/P2. Evaluation.ipynb
@@ -1684,7 +1684,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.8.8"
}
},
"nbformat": 4,
diff --git a/P3. k-nearest neighbours.ipynb b/P3. k-nearest neighbours.ipynb
index 17eecae..a15592c 100644
--- a/P3. k-nearest neighbours.ipynb
+++ b/P3. k-nearest neighbours.ipynb
@@ -1049,7 +1049,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.8.8"
}
},
"nbformat": 4,