{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Self made SVD" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import helpers\n", "import pandas as pd\n", "import numpy as np\n", "import scipy.sparse as sparse\n", "from collections import defaultdict\n", "from itertools import chain\n", "import random\n", "import matplotlib.pyplot as plt\n", "\n", "train_read = pd.read_csv(\"./Datasets/ml-100k/train.csv\", sep=\"\\t\", header=None)\n", "test_read = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n", "(\n", " train_ui,\n", " test_ui,\n", " user_code_id,\n", " user_id_code,\n", " item_code_id,\n", " item_id_code,\n", ") = helpers.data_to_csr(train_read, test_read)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Done similarly to https://github.com/albertauyeung/matrix-factorization-in-python\n", "from tqdm import tqdm\n", "\n", "\n", "class SVD:\n", " def __init__(self, train_ui, learning_rate, regularization, nb_factors, iterations):\n", " self.train_ui = train_ui\n", " self.uir = list(\n", " zip(*[train_ui.nonzero()[0], train_ui.nonzero()[1], train_ui.data])\n", " )\n", "\n", " self.learning_rate = learning_rate\n", " self.regularization = regularization\n", " self.iterations = iterations\n", " self.nb_users, self.nb_items = train_ui.shape\n", " self.nb_ratings = train_ui.nnz\n", " self.nb_factors = nb_factors\n", "\n", " self.Pu = np.random.normal(\n", " loc=0, scale=1.0 / self.nb_factors, size=(self.nb_users, self.nb_factors)\n", " )\n", " self.Qi = np.random.normal(\n", " loc=0, scale=1.0 / self.nb_factors, size=(self.nb_items, self.nb_factors)\n", " )\n", "\n", " def train(self, test_ui=None):\n", " if test_ui != None:\n", " self.test_uir = list(\n", " zip(*[test_ui.nonzero()[0], test_ui.nonzero()[1], test_ui.data])\n", " )\n", "\n", " self.learning_process = []\n", " pbar = tqdm(range(self.iterations))\n", " for i in pbar:\n", " pbar.set_description(\n", " f\"Epoch {i} RMSE: {self.learning_process[-1][1] if i>0 else 0}. Training epoch {i+1}...\"\n", " )\n", " np.random.shuffle(self.uir)\n", " self.sgd(self.uir)\n", " if test_ui == None:\n", " self.learning_process.append([i + 1, self.RMSE_total(self.uir)])\n", " else:\n", " self.learning_process.append(\n", " [i + 1, self.RMSE_total(self.uir), self.RMSE_total(self.test_uir)]\n", " )\n", "\n", " def sgd(self, uir):\n", "\n", " for u, i, score in uir:\n", " # Computer prediction and error\n", " prediction = self.get_rating(u, i)\n", " e = score - prediction\n", "\n", " # Update user and item latent feature matrices\n", " Pu_update = self.learning_rate * (\n", " e * self.Qi[i] - self.regularization * self.Pu[u]\n", " )\n", " Qi_update = self.learning_rate * (\n", " e * self.Pu[u] - self.regularization * self.Qi[i]\n", " )\n", "\n", " self.Pu[u] += Pu_update\n", " self.Qi[i] += Qi_update\n", "\n", " def get_rating(self, u, i):\n", " prediction = self.Pu[u].dot(self.Qi[i].T)\n", " return prediction\n", "\n", " def RMSE_total(self, uir):\n", " RMSE = 0\n", " for u, i, score in uir:\n", " prediction = self.get_rating(u, i)\n", " RMSE += (score - prediction) ** 2\n", " return np.sqrt(RMSE / len(uir))\n", "\n", " def estimations(self):\n", " self.estimations = np.dot(self.Pu, self.Qi.T)\n", "\n", " def recommend(self, user_code_id, item_code_id, topK=10):\n", "\n", " top_k = defaultdict(list)\n", " for nb_user, user_scores in enumerate(self.estimations):\n", "\n", " user_rated = self.train_ui.indices[\n", " self.train_ui.indptr[nb_user] : self.train_ui.indptr[nb_user + 1]\n", " ]\n", " for item, score in enumerate(user_scores):\n", " if item not in user_rated and not np.isnan(score):\n", " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n", " result = []\n", " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", " for uid, item_scores in top_k.items():\n", " item_scores.sort(key=lambda x: x[1], reverse=True)\n", " result.append([uid] + list(chain(*item_scores[:topK])))\n", " return result\n", "\n", " def estimate(self, user_code_id, item_code_id, test_ui):\n", " result = []\n", " for user, item in zip(*test_ui.nonzero()):\n", " result.append(\n", " [\n", " user_code_id[user],\n", " item_code_id[item],\n", " self.estimations[user, item]\n", " if not np.isnan(self.estimations[user, item])\n", " else 1,\n", " ]\n", " )\n", " return result" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Epoch 39 RMSE: 0.7489999966900885. Training epoch 40...: 100%|██████████| 40/40 [01:02<00:00, 1.57s/it]\n" ] } ], "source": [ "model = SVD(\n", " train_ui, learning_rate=0.005, regularization=0.02, nb_factors=100, iterations=40\n", ")\n", "model.train(test_ui)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df = pd.DataFrame(model.learning_process).iloc[:, :2]\n", "df.columns = [\"epoch\", \"train_RMSE\"]\n", "plt.plot(\"epoch\", \"train_RMSE\", data=df, color=\"blue\")\n", "plt.legend()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df = pd.DataFrame(\n", " model.learning_process[10:], columns=[\"epoch\", \"train_RMSE\", \"test_RMSE\"]\n", ")\n", "plt.plot(\"epoch\", \"train_RMSE\", data=df, color=\"blue\")\n", "plt.plot(\"epoch\", \"test_RMSE\", data=df, color=\"green\", linestyle=\"dashed\")\n", "plt.legend()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Saving and evaluating recommendations" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "model.estimations()\n", "\n", "top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n", "\n", "top_n.to_csv(\n", " \"Recommendations generated/ml-100k/Self_SVD_reco.csv\", index=False, header=False\n", ")\n", "\n", "estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n", "estimations.to_csv(\n", " \"Recommendations generated/ml-100k/Self_SVD_estimations.csv\",\n", " index=False,\n", " header=False,\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "943it [00:00, 8683.10it/s]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RMSEMAEprecisionrecallF_1F_05precision_superrecall_superNDCGmAPMRRLAUCHRReco in testTest coverageShannonGini
00.9143930.7171990.1016970.0423340.0517870.0688110.0924890.072360.1048390.048970.1961170.5178890.4803820.8673380.1471863.8525450.972694
\n", "
" ], "text/plain": [ " RMSE MAE precision recall F_1 F_05 \\\n", "0 0.914393 0.717199 0.101697 0.042334 0.051787 0.068811 \n", "\n", " precision_super recall_super NDCG mAP MRR LAUC \\\n", "0 0.092489 0.07236 0.104839 0.04897 0.196117 0.517889 \n", "\n", " HR Reco in test Test coverage Shannon Gini \n", "0 0.480382 0.867338 0.147186 3.852545 0.972694 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import evaluation_measures as ev\n", "\n", "estimations_df = pd.read_csv(\n", " \"Recommendations generated/ml-100k/Self_SVD_estimations.csv\", header=None\n", ")\n", "reco = np.loadtxt(\"Recommendations generated/ml-100k/Self_SVD_reco.csv\", delimiter=\",\")\n", "\n", "ev.evaluate(\n", " test=pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None),\n", " estimations_df=estimations_df,\n", " reco=reco,\n", " super_reactions=[4, 5],\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "943it [00:00, 8505.85it/s]\n", "943it [00:00, 9544.72it/s]\n", "943it [00:00, 9154.80it/s]\n", "943it [00:00, 8282.66it/s]\n", "943it [00:00, 8432.23it/s]\n", "943it [00:00, 9601.30it/s]\n", "943it [00:00, 9158.89it/s]\n", "943it [00:00, 12283.59it/s]\n", "943it [00:00, 9500.43it/s]\n", "943it [00:00, 10085.91it/s]\n", "943it [00:00, 10260.90it/s]\n", "943it [00:00, 9691.20it/s]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelRMSEMAEprecisionrecallF_1F_05precision_superrecall_superNDCGmAPMRRLAUCHRReco in testTest coverageShannonGini
0Self_TopPop2.5082582.2179090.1888650.1169190.1187320.1415840.1304720.1374730.2146510.1117070.4009390.5555460.7656421.0000000.0389613.1590790.987317
0Ready_SVD0.9503470.7493120.1006360.0505140.0557940.0707530.0912020.0827340.1140540.0532000.2488030.5219830.5174970.9921530.2106784.4186830.952848
0Self_SVD0.9143930.7171990.1016970.0423340.0517870.0688110.0924890.0723600.1048390.0489700.1961170.5178890.4803820.8673380.1471863.8525450.972694
0Ready_Baseline0.9494590.7524870.0914100.0376520.0460300.0612860.0796140.0564630.0959570.0431780.1981930.5155010.4379641.0000000.0339112.8365130.991139
0Ready_SVDBiased0.9394720.7398160.0858960.0360730.0435280.0576430.0770390.0574630.0977530.0455460.2198390.5147090.4316010.9974550.1688314.2175780.962577
0Ready_Random1.5218451.2259490.0471900.0207530.0248100.0322690.0295060.0237070.0500750.0187280.1219570.5068930.3297990.9865320.1847045.0997060.907217
0Ready_I-KNN1.0303860.8130670.0260870.0069080.0105930.0160460.0211370.0095220.0242140.0089580.0480680.4998850.1548250.4023330.4343435.1336500.877999
0Ready_I-KNNBaseline0.9353270.7374240.0025450.0007550.0011050.0016020.0022530.0009300.0034440.0013620.0117600.4967240.0212090.4828210.0598852.2325780.994487
0Ready_U-KNN1.0234950.8079130.0007420.0002050.0003050.0004490.0005360.0001980.0008450.0002740.0027440.4964410.0074230.6021210.0108232.0891860.995706
0Self_TopRated1.0307120.8209040.0009540.0001880.0002980.0004810.0006440.0002230.0010430.0003350.0033480.4964330.0095440.6990460.0050511.9459100.995669
0Self_BaselineUI0.9675850.7627400.0009540.0001700.0002780.0004630.0006440.0001890.0007520.0001680.0016770.4964240.0095440.6005300.0050511.8031260.996380
0Self_IKNN1.0183630.8087930.0003180.0001080.0001400.0001890.0000000.0000000.0002140.0000370.0003680.4963910.0031810.3921530.1154404.1747410.965327
\n", "
" ], "text/plain": [ " Model RMSE MAE precision recall F_1 \\\n", "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n", "0 Ready_SVD 0.950347 0.749312 0.100636 0.050514 0.055794 \n", "0 Self_SVD 0.914393 0.717199 0.101697 0.042334 0.051787 \n", "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n", "0 Ready_SVDBiased 0.939472 0.739816 0.085896 0.036073 0.043528 \n", "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n", "0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 0.010593 \n", "0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 0.001105 \n", "0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 0.000305 \n", "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n", "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n", "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n", "\n", " F_05 precision_super recall_super NDCG mAP MRR \\\n", "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n", "0 0.070753 0.091202 0.082734 0.114054 0.053200 0.248803 \n", "0 0.068811 0.092489 0.072360 0.104839 0.048970 0.196117 \n", "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n", "0 0.057643 0.077039 0.057463 0.097753 0.045546 0.219839 \n", "0 0.032269 0.029506 0.023707 0.050075 0.018728 0.121957 \n", "0 0.016046 0.021137 0.009522 0.024214 0.008958 0.048068 \n", "0 0.001602 0.002253 0.000930 0.003444 0.001362 0.011760 \n", "0 0.000449 0.000536 0.000198 0.000845 0.000274 0.002744 \n", "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n", "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n", "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n", "\n", " LAUC HR Reco in test Test coverage Shannon Gini \n", "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n", "0 0.521983 0.517497 0.992153 0.210678 4.418683 0.952848 \n", "0 0.517889 0.480382 0.867338 0.147186 3.852545 0.972694 \n", "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n", "0 0.514709 0.431601 0.997455 0.168831 4.217578 0.962577 \n", "0 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217 \n", "0 0.499885 0.154825 0.402333 0.434343 5.133650 0.877999 \n", "0 0.496724 0.021209 0.482821 0.059885 2.232578 0.994487 \n", "0 0.496441 0.007423 0.602121 0.010823 2.089186 0.995706 \n", "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n", "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 \n", "0 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dir_path = \"Recommendations generated/ml-100k/\"\n", "super_reactions = [4, 5]\n", "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n", "\n", "ev.evaluate_all(test, dir_path, super_reactions)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Embeddings" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
codescoreitem_ididtitlegenres
03211.000000322322Murder at 1600 (1997)Mystery, Thriller
19830.902748984984Shadow Conspiracy (1997)Thriller
29850.894696986986Turbulence (1997)Thriller
37780.890524779779Drop Zone (1994)Action
46860.889220687687McHale's Navy (1997)Comedy, War
53310.887596332332Kiss the Girls (1997)Crime, Drama, Thriller
69870.886547988988Beautician and the Beast, The (1997)Comedy, Romance
710390.88284510401040Two if by Sea (1996)Comedy, Romance
810220.88278210231023Fathers' Day (1997)Comedy
99290.877662930930Chain Reaction (1996)Action, Adventure, Thriller
\n", "
" ], "text/plain": [ " code score item_id id title \\\n", "0 321 1.000000 322 322 Murder at 1600 (1997) \n", "1 983 0.902748 984 984 Shadow Conspiracy (1997) \n", "2 985 0.894696 986 986 Turbulence (1997) \n", "3 778 0.890524 779 779 Drop Zone (1994) \n", "4 686 0.889220 687 687 McHale's Navy (1997) \n", "5 331 0.887596 332 332 Kiss the Girls (1997) \n", "6 987 0.886547 988 988 Beautician and the Beast, The (1997) \n", "7 1039 0.882845 1040 1040 Two if by Sea (1996) \n", "8 1022 0.882782 1023 1023 Fathers' Day (1997) \n", "9 929 0.877662 930 930 Chain Reaction (1996) \n", "\n", " genres \n", "0 Mystery, Thriller \n", "1 Thriller \n", "2 Thriller \n", "3 Action \n", "4 Comedy, War \n", "5 Crime, Drama, Thriller \n", "6 Comedy, Romance \n", "7 Comedy, Romance \n", "8 Comedy \n", "9 Action, Adventure, Thriller " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "item = random.choice(list(set(train_ui.indices)))\n", "\n", "embeddings_norm = (\n", " model.Qi / np.linalg.norm(model.Qi, axis=1)[:, None]\n", ") # we do not mean-center here\n", "# omitting normalization also makes sense, but items with a greater magnitude will be recommended more often\n", "\n", "similarity_scores = np.dot(embeddings_norm, embeddings_norm[item].T)\n", "top_similar_items = pd.DataFrame(\n", " enumerate(similarity_scores), columns=[\"code\", \"score\"]\n", ").sort_values(by=[\"score\"], ascending=[False])[:10]\n", "\n", "top_similar_items[\"item_id\"] = top_similar_items[\"code\"].apply(\n", " lambda x: item_code_id[x]\n", ")\n", "\n", "items = pd.read_csv(\"./Datasets/ml-100k/movies.csv\")\n", "\n", "result = pd.merge(top_similar_items, items, left_on=\"item_id\", right_on=\"id\")\n", "\n", "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# project task 5: implement SVD on top baseline (as it is in Surprise library)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# making changes to our implementation by considering additional parameters in the gradient descent procedure\n", "# seems to be the fastest option\n", "# please save the output in 'Recommendations generated/ml-100k/Self_SVDBaseline_reco.csv' and\n", "# 'Recommendations generated/ml-100k/Self_SVDBaseline_estimations.csv'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Ready-made SVD - Surprise implementation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SVD" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Generating predictions...\n", "Generating top N recommendations...\n", "Generating predictions...\n" ] } ], "source": [ "import helpers\n", "import surprise as sp\n", "\n", "algo = sp.SVD(biased=False) # to use unbiased version\n", "\n", "helpers.ready_made(\n", " algo,\n", " reco_path=\"Recommendations generated/ml-100k/Ready_SVD_reco.csv\",\n", " estimations_path=\"Recommendations generated/ml-100k/Ready_SVD_estimations.csv\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SVD biased - on top baseline" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Generating predictions...\n", "Generating top N recommendations...\n", "Generating predictions...\n" ] } ], "source": [ "algo = sp.SVD() # default is biased=True\n", "\n", "helpers.ready_made(\n", " algo,\n", " reco_path=\"Recommendations generated/ml-100k/Ready_SVDBiased_reco.csv\",\n", " estimations_path=\"Recommendations generated/ml-100k/Ready_SVDBiased_estimations.csv\",\n", ")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "943it [00:00, 11456.53it/s]\n", "943it [00:00, 11932.50it/s]\n", "943it [00:00, 10853.07it/s]\n", "943it [00:00, 9426.44it/s]\n", "943it [00:00, 8757.09it/s]\n", "943it [00:00, 9999.67it/s]\n", "943it [00:00, 11323.49it/s]\n", "943it [00:00, 9764.72it/s]\n", "943it [00:00, 9692.41it/s]\n", "943it [00:00, 9052.77it/s]\n", "943it [00:00, 8645.18it/s]\n", "943it [00:00, 10594.54it/s]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelRMSEMAEprecisionrecallF_1F_05precision_superrecall_superNDCGmAPMRRLAUCHRReco in testTest coverageShannonGini
0Self_TopPop2.5082582.2179090.1888650.1169190.1187320.1415840.1304720.1374730.2146510.1117070.4009390.5555460.7656421.0000000.0389613.1590790.987317
0Ready_SVD0.9516520.7509750.0963940.0472520.0528700.0672570.0855150.0747540.1095780.0515620.2355670.5203410.4962880.9955460.2085144.4557550.951624
0Self_SVD0.9143930.7171990.1016970.0423340.0517870.0688110.0924890.0723600.1048390.0489700.1961170.5178890.4803820.8673380.1471863.8525450.972694
0Ready_Baseline0.9494590.7524870.0914100.0376520.0460300.0612860.0796140.0564630.0959570.0431780.1981930.5155010.4379641.0000000.0339112.8365130.991139
0Ready_SVDBiased0.9404130.7395710.0860020.0354780.0431960.0575070.0757510.0534600.0948970.0433610.2091240.5144050.4284200.9973490.1774894.2125090.962656
0Ready_Random1.5218451.2259490.0471900.0207530.0248100.0322690.0295060.0237070.0500750.0187280.1219570.5068930.3297990.9865320.1847045.0997060.907217
0Ready_I-KNN1.0303860.8130670.0260870.0069080.0105930.0160460.0211370.0095220.0242140.0089580.0480680.4998850.1548250.4023330.4343435.1336500.877999
0Ready_I-KNNBaseline0.9353270.7374240.0025450.0007550.0011050.0016020.0022530.0009300.0034440.0013620.0117600.4967240.0212090.4828210.0598852.2325780.994487
0Ready_U-KNN1.0234950.8079130.0007420.0002050.0003050.0004490.0005360.0001980.0008450.0002740.0027440.4964410.0074230.6021210.0108232.0891860.995706
0Self_TopRated1.0307120.8209040.0009540.0001880.0002980.0004810.0006440.0002230.0010430.0003350.0033480.4964330.0095440.6990460.0050511.9459100.995669
0Self_BaselineUI0.9675850.7627400.0009540.0001700.0002780.0004630.0006440.0001890.0007520.0001680.0016770.4964240.0095440.6005300.0050511.8031260.996380
0Self_IKNN1.0183630.8087930.0003180.0001080.0001400.0001890.0000000.0000000.0002140.0000370.0003680.4963910.0031810.3921530.1154404.1747410.965327
\n", "
" ], "text/plain": [ " Model RMSE MAE precision recall F_1 \\\n", "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n", "0 Ready_SVD 0.951652 0.750975 0.096394 0.047252 0.052870 \n", "0 Self_SVD 0.914393 0.717199 0.101697 0.042334 0.051787 \n", "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n", "0 Ready_SVDBiased 0.940413 0.739571 0.086002 0.035478 0.043196 \n", "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n", "0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 0.010593 \n", "0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 0.001105 \n", "0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 0.000305 \n", "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n", "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n", "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n", "\n", " F_05 precision_super recall_super NDCG mAP MRR \\\n", "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n", "0 0.067257 0.085515 0.074754 0.109578 0.051562 0.235567 \n", "0 0.068811 0.092489 0.072360 0.104839 0.048970 0.196117 \n", "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n", "0 0.057507 0.075751 0.053460 0.094897 0.043361 0.209124 \n", "0 0.032269 0.029506 0.023707 0.050075 0.018728 0.121957 \n", "0 0.016046 0.021137 0.009522 0.024214 0.008958 0.048068 \n", "0 0.001602 0.002253 0.000930 0.003444 0.001362 0.011760 \n", "0 0.000449 0.000536 0.000198 0.000845 0.000274 0.002744 \n", "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n", "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n", "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n", "\n", " LAUC HR Reco in test Test coverage Shannon Gini \n", "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n", "0 0.520341 0.496288 0.995546 0.208514 4.455755 0.951624 \n", "0 0.517889 0.480382 0.867338 0.147186 3.852545 0.972694 \n", "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n", "0 0.514405 0.428420 0.997349 0.177489 4.212509 0.962656 \n", "0 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217 \n", "0 0.499885 0.154825 0.402333 0.434343 5.133650 0.877999 \n", "0 0.496724 0.021209 0.482821 0.059885 2.232578 0.994487 \n", "0 0.496441 0.007423 0.602121 0.010823 2.089186 0.995706 \n", "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n", "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 \n", "0 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dir_path = \"Recommendations generated/ml-100k/\"\n", "super_reactions = [4, 5]\n", "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n", "\n", "ev.evaluate_all(test, dir_path, super_reactions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }