{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import helpers\n", "import pandas as pd\n", "import numpy as np\n", "import scipy.sparse as sparse\n", "from collections import defaultdict\n", "from itertools import chain\n", "import random\n", "import time\n", "import matplotlib.pyplot as plt\n", "\n", "train_read = pd.read_csv(\"./Datasets/ml-100k/train.csv\", sep=\"\\t\", header=None)\n", "test_read = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n", "(\n", " train_ui,\n", " test_ui,\n", " user_code_id,\n", " user_id_code,\n", " item_code_id,\n", " item_id_code,\n", ") = helpers.data_to_csr(train_read, test_read)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# User and item features preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Item features" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...14151617181920212223
01Toy Story (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?Toy%20Story%2...00011...0000000000
12GoldenEye (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?GoldenEye%20(...01100...0000000100
23Four Rooms (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?Four%20Rooms%...00000...0000000100
\n", "

3 rows × 24 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 \\\n", "0 1 Toy Story (1995) 01-Jan-1995 NaN \n", "1 2 GoldenEye (1995) 01-Jan-1995 NaN \n", "2 3 Four Rooms (1995) 01-Jan-1995 NaN \n", "\n", " 4 5 6 7 8 9 ... 14 \\\n", "0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... 0 \n", "1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... 0 \n", "2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... 0 \n", "\n", " 15 16 17 18 19 20 21 22 23 \n", "0 0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 1 0 0 \n", "2 0 0 0 0 0 0 1 0 0 \n", "\n", "[3 rows x 24 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies = pd.read_csv(\n", " \"./Datasets/ml-100k/u.item\", sep=\"|\", encoding=\"latin-1\", header=None\n", ").astype(object)\n", "\n", "movies[:3]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_1id_2id_3id_4id_5id_6id_7id_8id_9id_10...date_30-Mar-1996date_30-May-1997date_30-Nov-1996date_30-Oct-1995date_30-Oct-1996date_31-Dec-1997date_31-Jan-1997date_31-Jul-1996date_31-May-1996date_4-Feb-1971
01000000000...0000000000
10100000000...0000000000
20010000000...0000000000
\n", "

3 rows × 1922 columns

\n", "
" ], "text/plain": [ " id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... \\\n", "0 1 0 0 0 0 0 0 0 0 0 ... \n", "1 0 1 0 0 0 0 0 0 0 0 ... \n", "2 0 0 1 0 0 0 0 0 0 0 ... \n", "\n", " date_30-Mar-1996 date_30-May-1997 date_30-Nov-1996 date_30-Oct-1995 \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "\n", " date_30-Oct-1996 date_31-Dec-1997 date_31-Jan-1997 date_31-Jul-1996 \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "\n", " date_31-May-1996 date_4-Feb-1971 \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "\n", "[3 rows x 1922 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id_date = pd.get_dummies(data=movies[[0, 2]], prefix=[\"id\", \"date\"])\n", "id_date[:3]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
0unknown0
1Action1
2Adventure2
\n", "
" ], "text/plain": [ " 0 1\n", "0 unknown 0\n", "1 Action 1\n", "2 Adventure 2" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genres = pd.read_csv(\n", " \"./Datasets/ml-100k/u.genre\", sep=\"|\", header=None, encoding=\"latin-1\"\n", ")\n", "genres[:3]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "item_genres = movies[np.arange(5, 24)]\n", "item_genres.columns = list(genres[0])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_1id_2id_3id_4id_5id_6id_7id_8id_9id_10...FantasyFilm-NoirHorrorMusicalMysteryRomanceSci-FiThrillerWarWestern
01000000000...0000000000
10100000000...0000000100
20010000000...0000000100
30001000000...0000000000
40000100000...0000000100
..................................................................
16770000000000...0000000000
16780000000000...0000010100
16790000000000...0000010000
16800000000000...0000000000
16810000000000...0000000000
\n", "

1682 rows × 1941 columns

\n", "
" ], "text/plain": [ " id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... \\\n", "0 1 0 0 0 0 0 0 0 0 0 ... \n", "1 0 1 0 0 0 0 0 0 0 0 ... \n", "2 0 0 1 0 0 0 0 0 0 0 ... \n", "3 0 0 0 1 0 0 0 0 0 0 ... \n", "4 0 0 0 0 1 0 0 0 0 0 ... \n", "... ... ... ... ... ... ... ... ... ... ... ... \n", "1677 0 0 0 0 0 0 0 0 0 0 ... \n", "1678 0 0 0 0 0 0 0 0 0 0 ... \n", "1679 0 0 0 0 0 0 0 0 0 0 ... \n", "1680 0 0 0 0 0 0 0 0 0 0 ... \n", "1681 0 0 0 0 0 0 0 0 0 0 ... \n", "\n", " Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller \\\n", "0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 1 \n", "2 0 0 0 0 0 0 0 1 \n", "3 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 1 \n", "... ... ... ... ... ... ... ... ... \n", "1677 0 0 0 0 0 0 0 0 \n", "1678 0 0 0 0 0 1 0 1 \n", "1679 0 0 0 0 0 1 0 0 \n", "1680 0 0 0 0 0 0 0 0 \n", "1681 0 0 0 0 0 0 0 0 \n", "\n", " War Western \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "... ... ... \n", "1677 0 0 \n", "1678 0 0 \n", "1679 0 0 \n", "1680 0 0 \n", "1681 0 0 \n", "\n", "[1682 rows x 1941 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "item_features_df = pd.concat([id_date, item_genres], axis=1).astype(int)\n", "item_features_df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<1682x1941 sparse matrix of type ''\n", "\twith 6256 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "item_features = sparse.csr_matrix(item_features_df.values)\n", "item_features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### User features" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
0124Mtechnician85711
1253Fother94043
2323Mwriter32067
\n", "
" ], "text/plain": [ " 0 1 2 3 4\n", "0 1 24 M technician 85711\n", "1 2 53 F other 94043\n", "2 3 23 M writer 32067" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users = pd.read_csv(\n", " \"./Datasets/ml-100k/u.user\", sep=\"|\", encoding=\"latin-1\", header=None\n", ")\n", "users[:3]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_1id_2id_3id_4id_5id_6id_7id_8id_9id_10...FantasyFilm-NoirHorrorMusicalMysteryRomanceSci-FiThrillerWarWestern
01000000000...0000000000
10100000000...0000000100
20010000000...0000000100
\n", "

3 rows × 1941 columns

\n", "
" ], "text/plain": [ " id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... Fantasy \\\n", "0 1 0 0 0 0 0 0 0 0 0 ... 0 \n", "1 0 1 0 0 0 0 0 0 0 0 ... 0 \n", "2 0 0 1 0 0 0 0 0 0 0 ... 0 \n", "\n", " Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War \\\n", "0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 1 0 \n", "2 0 0 0 0 0 0 1 0 \n", "\n", " Western \n", "0 0 \n", "1 0 \n", "2 0 \n", "\n", "[3 rows x 1941 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# One-hot encode every user column (id, age, sex, occupation, zip code).\n",
"# All columns are cast to object first so that numeric ones are dummy-encoded too.\n",
"users = users.astype(object)\n",
"user_features_df = pd.get_dummies(\n",
"    users, prefix=[\"id\", \"age\", \"sex\", \"profesion\", \"zip_code\"]\n",
")\n",
"# Preview the user features built in this cell.\n",
"user_features_df[:3]"
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<943x1682 sparse matrix of type ''\n", "\twith 80000 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_ui" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<943x1822 sparse matrix of type ''\n", "\twith 4715 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_features = sparse.csr_matrix(user_features_df.values)\n", "user_features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### LightFM with user and item features" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/robert.kwiecinski/opt/anaconda3/lib/python3.8/site-packages/lightfm/_lightfm_fast.py:9: UserWarning: LightFM was compiled without OpenMP support. 
Only a single thread will be used.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "logistic\n", "Train precision: 0.09\n", "Test precision: 0.03\n", "bpr\n", "Train precision: 0.57\n", "Test precision: 0.24\n", "warp\n", "Train precision: 0.63\n", "Test precision: 0.35\n" ] } ], "source": [
"from lightfm import LightFM\n",
"from lightfm.evaluation import precision_at_k\n",
"\n",
"# One model is trained per loss function. After the loop `model` is the last one\n",
"# trained (WARP); that is the model exported by the cells below.\n",
"for loss in [\"logistic\", \"bpr\", \"warp\"]:\n",
"\n",
"    # Fixed seed so that re-running the notebook starts from the same initialisation.\n",
"    model = LightFM(no_components=10, loss=loss, random_state=42)\n",
"    model.fit(\n",
"        train_ui,\n",
"        user_features=user_features,\n",
"        item_features=item_features,\n",
"        epochs=30,\n",
"        num_threads=4,\n",
"    )\n",
"\n",
"    print(loss)\n",
"    print(\n",
"        \"Train precision: %.2f\"\n",
"        % precision_at_k(\n",
"            model,\n",
"            test_interactions=train_ui,\n",
"            user_features=user_features,\n",
"            item_features=item_features,\n",
"            k=10,\n",
"            preserve_rows=True,\n",
"        ).mean()\n",
"    )\n",
"    print(\n",
"        \"Test precision: %.2f\"\n",
"        % precision_at_k(\n",
"            model,\n",
"            test_interactions=test_ui,\n",
"            train_interactions=train_ui,\n",
"            user_features=user_features,\n",
"            item_features=item_features,\n",
"            k=10,\n",
"            preserve_rows=True,\n",
"        ).mean()\n",
"    )"
] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
"def top_k_recommendations(\n",
"    model,\n",
"    user_features,\n",
"    item_features,\n",
"    user_code_id,\n",
"    item_code_id,\n",
"    topK=10,\n",
"    train_interactions=None,\n",
"):\n",
"    \"\"\"Build one row of top-K recommendations per user.\n",
"\n",
"    Each row is ``[user_id, item_id_1, score_1, ..., item_id_K, score_K]`` ordered by\n",
"    decreasing score. Items the user already has in the training matrix are given a\n",
"    score of ``-inf`` so they sort after every other item.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    model : fitted model whose ``predict(user, item_ids, user_features=...,\n",
"        item_features=...)`` returns one score per item id.\n",
"    user_features, item_features : feature matrices forwarded to ``model.predict``.\n",
"    user_code_id, item_code_id : mappings from internal codes to original ids.\n",
"    topK : number of (item, score) pairs per user.\n",
"    train_interactions : sparse matrix (users x items) exposing ``indices`` and\n",
"        ``indptr``; defaults to the notebook-level ``train_ui``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of lists, one per row of ``train_interactions``.\n",
"    \"\"\"\n",
"    if train_interactions is None:\n",
"        # Backward-compatible default: earlier calls relied on the global matrix.\n",
"        train_interactions = train_ui\n",
"\n",
"    n_users, n_items = train_interactions.shape\n",
"    all_items = np.arange(n_items)  # loop-invariant, built once\n",
"\n",
"    result = []\n",
"    for user_code in range(n_users):\n",
"        row_start = train_interactions.indptr[user_code]\n",
"        row_end = train_interactions.indptr[user_code + 1]\n",
"        user_rated = train_interactions.indices[row_start:row_end]\n",
"\n",
"        scores = model.predict(\n",
"            user_code,\n",
"            all_items,\n",
"            user_features=user_features,\n",
"            item_features=item_features,\n",
"        )\n",
"        scores[user_rated] = -np.inf  # to put rated items at the end of the list\n",
"\n",
"        # Sort once and reuse the ordering for both the item ids and their scores.\n",
"        top_codes = np.argsort(-scores)[:topK]\n",
"        top_items = [item_code_id[item] for item in top_codes]\n",
"        result.append(\n",
"            [user_code_id[user_code]] + list(chain(*zip(top_items, scores[top_codes])))\n",
"        )\n",
"    return result\n",
"\n",
"\n",
"def estimate(model, user_features, item_features, user_code_id, item_code_id, test_ui):\n",
"    \"\"\"Score every (user, item) pair stored in ``test_ui``.\n",
"\n",
"    Returns a list of ``[user_id, item_id, score]`` rows in the order produced by\n",
"    ``test_ui.nonzero()``. The scores are whatever ``model.predict`` returns.\n",
"    \"\"\"\n",
"    users, items = test_ui.nonzero()\n",
"    if len(users) == 0:\n",
"        # Nothing to score; also avoids calling predict with empty id arrays.\n",
"        return []\n",
"\n",
"    # A single vectorised call replaces one predict call per pair.\n",
"    scores = model.predict(\n",
"        users.astype(np.int32),\n",
"        items.astype(np.int32),\n",
"        user_features=user_features,\n",
"        item_features=item_features,\n",
"    )\n",
"    return [\n",
"        [user_code_id[user], item_code_id[item], score]\n",
"        for user, item, score in zip(users, items, scores)\n",
"    ]"
] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
"top_n = pd.DataFrame(\n",
"    top_k_recommendations(\n",
"        model=model,\n",
"        user_features=user_features,\n",
"        item_features=item_features,\n",
"        user_code_id=user_code_id,\n",
"        item_code_id=item_code_id,\n",
"        topK=10,\n",
"    )\n",
")\n",
"top_n.to_csv(\n",
"    \"Recommendations generated/ml-100k/Ready_LightFM_reco.csv\",\n",
"    index=False,\n",
"    header=False,\n",
")\n",
"\n",
"estimations = pd.DataFrame(\n",
"    estimate(\n",
"        model=model,\n",
"        user_features=user_features,\n",
"        item_features=item_features,\n",
"        user_code_id=user_code_id,\n",
"        item_code_id=item_code_id,\n",
"        test_ui=test_ui,\n",
"    )\n",
")\n",
"estimations.to_csv(\n",
"    \"Recommendations generated/ml-100k/Ready_LightFM_estimations.csv\",\n",
"    index=False,\n",
"    header=False,\n",
")"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Pure MF with LightFM" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
"# Keep only the one-hot id columns (prefix `id_`). A prefix test is used so that a\n",
"# feature whose name merely contains \"id_\" is not picked up by accident.\n",
"item_features_interactions = sparse.csr_matrix(\n",
"    item_features_df[\n",
"        [\n",
"            item_feature\n",
"            for item_feature in item_features_df.columns\n",
"            if item_feature.startswith(\"id_\")\n",
"        ]\n",
"    ].values\n",
")\n",
"user_features_interactions = sparse.csr_matrix(\n",
"    user_features_df[\n",
"        [\n",
"            user_feature\n",
"            for user_feature in user_features_df.columns\n",
"            if user_feature.startswith(\"id_\")\n",
"        ]\n",
"    ].values\n",
")" ] 
}, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train precision: 0.62\n", "Test precision: 0.34\n" ] } ], "source": [
"from lightfm import LightFM\n",
"from lightfm.evaluation import precision_at_k\n",
"\n",
"# Fixed seed so that re-running the notebook starts from the same initialisation.\n",
"model = LightFM(loss=\"warp\", random_state=42)\n",
"model.fit(\n",
"    train_ui,\n",
"    user_features=user_features_interactions,\n",
"    item_features=item_features_interactions,\n",
"    epochs=30,\n",
"    num_threads=4,\n",
")\n",
"\n",
"# Evaluate with the same feature matrices the model was fitted with, as the other\n",
"# model sections and the export cell below do.\n",
"print(\n",
"    \"Train precision: %.2f\"\n",
"    % precision_at_k(\n",
"        model,\n",
"        test_interactions=train_ui,\n",
"        user_features=user_features_interactions,\n",
"        item_features=item_features_interactions,\n",
"        k=10,\n",
"    ).mean()\n",
")\n",
"print(\n",
"    \"Test precision: %.2f\"\n",
"    % precision_at_k(\n",
"        model,\n",
"        test_interactions=test_ui,\n",
"        train_interactions=train_ui,\n",
"        user_features=user_features_interactions,\n",
"        item_features=item_features_interactions,\n",
"        k=10,\n",
"    ).mean()\n",
")"
] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
"top_n = pd.DataFrame(\n",
"    top_k_recommendations(\n",
"        model=model,\n",
"        user_features=user_features_interactions,\n",
"        item_features=item_features_interactions,\n",
"        user_code_id=user_code_id,\n",
"        item_code_id=item_code_id,\n",
"        topK=10,\n",
"    )\n",
")\n",
"top_n.to_csv(\n",
"    \"Recommendations generated/ml-100k/Ready_LightFMpureMF_reco.csv\",\n",
"    index=False,\n",
"    header=False,\n",
")\n",
"\n",
"estimations = pd.DataFrame(\n",
"    estimate(\n",
"        model=model,\n",
"        user_features=user_features_interactions,\n",
"        item_features=item_features_interactions,\n",
"        user_code_id=user_code_id,\n",
"        item_code_id=item_code_id,\n",
"        test_ui=test_ui,\n",
"    )\n",
")\n",
"estimations.to_csv(\n",
"    \"Recommendations generated/ml-100k/Ready_LightFMpureMF_estimations.csv\",\n",
"    index=False,\n",
"    header=False,\n",
")"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### LightFM with user/item attributes only (without treating id as a feature)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
"# Drop the one-hot id columns (prefix `id_`) and keep every other attribute. A prefix\n",
"# test is used so that a feature whose name merely contains \"id_\" is not dropped.\n",
"item_features_only = sparse.csr_matrix(\n",
"    item_features_df[\n",
"        [\n",
"            item_feature\n",
"            for item_feature in item_features_df.columns\n",
"            if not item_feature.startswith(\"id_\")\n",
"        ]\n",
"    ].values\n",
")\n",
"user_features_only = sparse.csr_matrix(\n",
"    user_features_df[\n",
"        [\n",
"            user_feature\n",
"            for user_feature in user_features_df.columns\n",
"            if not user_feature.startswith(\"id_\")\n",
"        ]\n",
"    ].values\n",
")"
] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train precision: 0.39\n", "Test precision: 0.16\n" ] } ], "source": [
"from lightfm import LightFM\n",
"from lightfm.evaluation import precision_at_k\n",
"\n",
"# Fixed seed so that re-running the notebook starts from the same initialisation.\n",
"model = LightFM(loss=\"warp\", random_state=42)\n",
"model.fit(\n",
"    train_ui,\n",
"    user_features=user_features_only,\n",
"    item_features=item_features_only,\n",
"    epochs=30,\n",
"    num_threads=4,\n",
")\n",
"\n",
"print(\n",
"    \"Train precision: %.2f\"\n",
"    % precision_at_k(\n",
"        model,\n",
"        test_interactions=train_ui,\n",
"        user_features=user_features_only,\n",
"        item_features=item_features_only,\n",
"        k=10,\n",
"    ).mean()\n",
")\n",
"print(\n",
"    \"Test precision: %.2f\"\n",
"    % precision_at_k(\n",
"        model,\n",
"        test_interactions=test_ui,\n",
"        train_interactions=train_ui,\n",
"        user_features=user_features_only,\n",
"        item_features=item_features_only,\n",
"        k=10,\n",
"    ).mean()\n",
")"
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
"top_n = pd.DataFrame(\n",
"    top_k_recommendations(\n",
"        model=model,\n",
"        user_features=user_features_only,\n",
"        item_features=item_features_only,\n",
"        user_code_id=user_code_id,\n",
"        item_code_id=item_code_id,\n",
"        topK=10,\n",
"    )\n",
")\n",
"top_n.to_csv(\n",
"    \"Recommendations generated/ml-100k/Ready_LightFMcontent_reco.csv\",\n",
"    index=False,\n",
"    header=False,\n",
")\n",
"\n",
"estimations = pd.DataFrame(\n",
"    estimate(\n",
"        model=model,\n",
"        user_features=user_features_only,\n",
"        item_features=item_features_only,\n",
"        user_code_id=user_code_id,\n",
"        item_code_id=item_code_id,\n",
"        test_ui=test_ui,\n",
"    )\n",
")\n",
"estimations.to_csv(\n",
"    \"Recommendations generated/ml-100k/Ready_LightFMcontent_estimations.csv\",\n",
"    index=False,\n",
"    header=False,\n",
")"
] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "943it [00:00, 11300.75it/s]\n", "943it [00:00, 11070.14it/s]\n", "943it [00:00, 11045.26it/s]\n", "943it [00:00, 11373.51it/s]\n", "943it [00:00, 10314.45it/s]\n", "943it [00:00, 11760.03it/s]\n", "943it [00:00, 11634.63it/s]\n", "943it [00:00, 11158.87it/s]\n", "943it [00:00, 12014.16it/s]\n", "943it [00:00, 11089.66it/s]\n", "943it [00:00, 10880.72it/s]\n", "943it [00:00, 11381.59it/s]\n", "943it [00:00, 10462.79it/s]\n", "943it [00:00, 11886.63it/s]\n", "943it [00:00, 11710.11it/s]\n", "943it [00:00, 12056.35it/s]\n", "943it [00:00, 10062.43it/s]\n", "943it [00:00, 10174.38it/s]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelRMSEMAEprecisionrecallF_1F_05precision_superrecall_super
0Ready_LightFM164.986935163.0743240.3475080.2228210.2222530.2628610.2449570.266155
0Ready_LightFMpureMF7.9845187.4878040.3359490.2154740.2163500.2551870.2356220.259289
0Self_P33.7024463.5272730.2821850.1920920.1867490.2169800.2041850.240096
0Ready_ImplicitALS3.2691563.0700030.2575820.1866400.1784450.2029740.1711370.216258
0Self_TopPop2.5082582.2179090.1888650.1169190.1187320.1415840.1304720.137473
0Ready_LightFMcontent184.450812182.3272750.1616120.1018360.1028290.1218450.1020390.110954
0Ready_SVD0.9516520.7509750.0963940.0472520.0528700.0672570.0855150.074754
0Self_SVD0.9143930.7171990.1016970.0423340.0517870.0688110.0924890.072360
0Ready_Baseline0.9494590.7524870.0914100.0376520.0460300.0612860.0796140.056463
0Ready_SVDBiased0.9404130.7395710.0860020.0354780.0431960.0575070.0757510.053460
0Ready_Random1.5279351.2253930.0493110.0204790.0249440.0329900.0321890.024725
0Ready_I-KNN1.0303860.8130670.0260870.0069080.0105930.0160460.0211370.009522
0Ready_I-KNNBaseline0.9353270.7374240.0025450.0007550.0011050.0016020.0022530.000930
0Ready_U-KNN1.0234950.8079130.0007420.0002050.0003050.0004490.0005360.000198
0Self_TopRated1.0307120.8209040.0009540.0001880.0002980.0004810.0006440.000223
0Self_BaselineIU0.9581360.7540510.0009540.0001880.0002980.0004810.0006440.000223
0Self_BaselineUI0.9675850.7627400.0009540.0001700.0002780.0004630.0006440.000189
0Self_IKNN1.0183630.8087930.0003180.0001080.0001400.0001890.0000000.000000
\n", "
" ], "text/plain": [ " Model RMSE MAE precision recall \\\n", "0 Ready_LightFM 164.986935 163.074324 0.347508 0.222821 \n", "0 Ready_LightFMpureMF 7.984518 7.487804 0.335949 0.215474 \n", "0 Self_P3 3.702446 3.527273 0.282185 0.192092 \n", "0 Ready_ImplicitALS 3.269156 3.070003 0.257582 0.186640 \n", "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 \n", "0 Ready_LightFMcontent 184.450812 182.327275 0.161612 0.101836 \n", "0 Ready_SVD 0.951652 0.750975 0.096394 0.047252 \n", "0 Self_SVD 0.914393 0.717199 0.101697 0.042334 \n", "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 \n", "0 Ready_SVDBiased 0.940413 0.739571 0.086002 0.035478 \n", "0 Ready_Random 1.527935 1.225393 0.049311 0.020479 \n", "0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 \n", "0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 \n", "0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 \n", "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 \n", "0 Self_BaselineIU 0.958136 0.754051 0.000954 0.000188 \n", "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 \n", "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 \n", "\n", " F_1 F_05 precision_super recall_super \n", "0 0.222253 0.262861 0.244957 0.266155 \n", "0 0.216350 0.255187 0.235622 0.259289 \n", "0 0.186749 0.216980 0.204185 0.240096 \n", "0 0.178445 0.202974 0.171137 0.216258 \n", "0 0.118732 0.141584 0.130472 0.137473 \n", "0 0.102829 0.121845 0.102039 0.110954 \n", "0 0.052870 0.067257 0.085515 0.074754 \n", "0 0.051787 0.068811 0.092489 0.072360 \n", "0 0.046030 0.061286 0.079614 0.056463 \n", "0 0.043196 0.057507 0.075751 0.053460 \n", "0 0.024944 0.032990 0.032189 0.024725 \n", "0 0.010593 0.016046 0.021137 0.009522 \n", "0 0.001105 0.001602 0.002253 0.000930 \n", "0 0.000305 0.000449 0.000536 0.000198 \n", "0 0.000298 0.000481 0.000644 0.000223 \n", "0 0.000298 0.000481 0.000644 0.000223 \n", "0 0.000278 0.000463 0.000644 0.000189 \n", "0 0.000140 0.000189 0.000000 0.000000 " ] }, "metadata": {}, "output_type": 
"display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelNDCGmAPMRRLAUCHRReco in testTest coverageShannonGini
0Ready_LightFM0.4128730.2761770.6485690.6091660.9077411.0000000.3600295.3649830.884435
0Ready_LightFMpureMF0.3977510.2619000.6336980.6054440.9003181.0000000.2792215.0869050.913551
0Self_P30.3391140.2049050.5721570.5935440.8759281.0000000.0772013.8758920.974947
0Ready_ImplicitALS0.3084150.1757960.5328350.5907090.8780490.9997880.5043295.7619410.820874
0Self_TopPop0.2146510.1117070.4009390.5555460.7656421.0000000.0389613.1590790.987317
0Ready_LightFMcontent0.1798400.0869000.3349370.5478740.7200420.9763520.2510824.8866640.928488
0Ready_SVD0.1095780.0515620.2355670.5203410.4962880.9955460.2085144.4557550.951624
0Self_SVD0.1048390.0489700.1961170.5178890.4803820.8673380.1471863.8525450.972694
0Ready_Baseline0.0959570.0431780.1981930.5155010.4379641.0000000.0339112.8365130.991139
0Ready_SVDBiased0.0948970.0433610.2091240.5144050.4284200.9973490.1774894.2125090.962656
0Ready_Random0.0536470.0204620.1360360.5067630.3393430.9861080.1911985.1012150.907796
0Ready_I-KNN0.0242140.0089580.0480680.4998850.1548250.4023330.4343435.1336500.877999
0Ready_I-KNNBaseline0.0034440.0013620.0117600.4967240.0212090.4828210.0598852.2325780.994487
0Ready_U-KNN0.0008450.0002740.0027440.4964410.0074230.6021210.0108232.0891860.995706
0Self_TopRated0.0010430.0003350.0033480.4964330.0095440.6990460.0050511.9459100.995669
0Self_BaselineIU0.0010430.0003350.0033480.4964330.0095440.6990460.0050511.9459100.995669
0Self_BaselineUI0.0007520.0001680.0016770.4964240.0095440.6005300.0050511.8031260.996380
0Self_IKNN0.0002140.0000370.0003680.4963910.0031810.3921530.1154404.1747410.965327
\n", "
" ], "text/plain": [ " Model NDCG mAP MRR LAUC HR \\\n", "0 Ready_LightFM 0.412873 0.276177 0.648569 0.609166 0.907741 \n", "0 Ready_LightFMpureMF 0.397751 0.261900 0.633698 0.605444 0.900318 \n", "0 Self_P3 0.339114 0.204905 0.572157 0.593544 0.875928 \n", "0 Ready_ImplicitALS 0.308415 0.175796 0.532835 0.590709 0.878049 \n", "0 Self_TopPop 0.214651 0.111707 0.400939 0.555546 0.765642 \n", "0 Ready_LightFMcontent 0.179840 0.086900 0.334937 0.547874 0.720042 \n", "0 Ready_SVD 0.109578 0.051562 0.235567 0.520341 0.496288 \n", "0 Self_SVD 0.104839 0.048970 0.196117 0.517889 0.480382 \n", "0 Ready_Baseline 0.095957 0.043178 0.198193 0.515501 0.437964 \n", "0 Ready_SVDBiased 0.094897 0.043361 0.209124 0.514405 0.428420 \n", "0 Ready_Random 0.053647 0.020462 0.136036 0.506763 0.339343 \n", "0 Ready_I-KNN 0.024214 0.008958 0.048068 0.499885 0.154825 \n", "0 Ready_I-KNNBaseline 0.003444 0.001362 0.011760 0.496724 0.021209 \n", "0 Ready_U-KNN 0.000845 0.000274 0.002744 0.496441 0.007423 \n", "0 Self_TopRated 0.001043 0.000335 0.003348 0.496433 0.009544 \n", "0 Self_BaselineIU 0.001043 0.000335 0.003348 0.496433 0.009544 \n", "0 Self_BaselineUI 0.000752 0.000168 0.001677 0.496424 0.009544 \n", "0 Self_IKNN 0.000214 0.000037 0.000368 0.496391 0.003181 \n", "\n", " Reco in test Test coverage Shannon Gini \n", "0 1.000000 0.360029 5.364983 0.884435 \n", "0 1.000000 0.279221 5.086905 0.913551 \n", "0 1.000000 0.077201 3.875892 0.974947 \n", "0 0.999788 0.504329 5.761941 0.820874 \n", "0 1.000000 0.038961 3.159079 0.987317 \n", "0 0.976352 0.251082 4.886664 0.928488 \n", "0 0.995546 0.208514 4.455755 0.951624 \n", "0 0.867338 0.147186 3.852545 0.972694 \n", "0 1.000000 0.033911 2.836513 0.991139 \n", "0 0.997349 0.177489 4.212509 0.962656 \n", "0 0.986108 0.191198 5.101215 0.907796 \n", "0 0.402333 0.434343 5.133650 0.877999 \n", "0 0.482821 0.059885 2.232578 0.994487 \n", "0 0.602121 0.010823 2.089186 0.995706 \n", "0 0.699046 0.005051 1.945910 0.995669 \n", "0 0.699046 
0.005051 1.945910 0.995669 \n", "0 0.600530 0.005051 1.803126 0.996380 \n", "0 0.392153 0.115440 4.174741 0.965327 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import evaluation_measures as ev\n", "\n", "dir_path = \"Recommendations generated/ml-100k/\"\n", "super_reactions = [4, 5]\n", "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n", "\n", "df = ev.evaluate_all(test, dir_path, super_reactions)\n", "display(df.iloc[:, :9])\n", "display(df.iloc[:, np.append(0, np.arange(9, df.shape[1]))])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }