From 769d9aa62f79503ee675905d3dad3ee2bd15a1cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Maciaszek?= Date: Sat, 13 Jun 2020 01:25:51 +0000 Subject: [PATCH] =?UTF-8?q?Prze=C5=9Blij=20pliki=20do=20''?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- P3. k-nearest neighbours.ipynb | 757 +++++++++ ...mbeddings in high demensional spaces.ipynb | 80 + P4. Matrix Factorization.ipynb | 1490 +++++++++++++++++ 3 files changed, 2327 insertions(+) create mode 100644 P3. k-nearest neighbours.ipynb create mode 100644 P4. Appendix - embeddings in high demensional spaces.ipynb create mode 100644 P4. Matrix Factorization.ipynb diff --git a/P3. k-nearest neighbours.ipynb b/P3. k-nearest neighbours.ipynb new file mode 100644 index 0000000..423a575 --- /dev/null +++ b/P3. k-nearest neighbours.ipynb @@ -0,0 +1,757 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Self made simplified I-KNN" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import helpers\n", + "import pandas as pd\n", + "import numpy as np\n", + "import scipy.sparse as sparse\n", + "from collections import defaultdict\n", + "from itertools import chain\n", + "import random\n", + "\n", + "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n", + "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n", + "train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "class IKNN():\n", + " \n", + " def fit(self, train_ui):\n", + " self.train_ui=train_ui\n", + " \n", + " train_iu=train_ui.transpose()\n", + " norms=np.linalg.norm(train_iu.A, axis=1) # here we compute length of each item ratings vector\n", + " norms=np.vectorize(lambda x: 
max(x,1))(norms[:,None]) # to avoid dividing by zero\n", + "\n", + " normalized_train_iu=sparse.csr_matrix(train_iu/norms)\n", + "\n", + " self.similarity_matrix_ii=normalized_train_iu*normalized_train_iu.transpose()\n", + " \n", + " self.estimations=np.array(train_ui*self.similarity_matrix_ii/((train_ui>0)*self.similarity_matrix_ii))\n", + " \n", + " def recommend(self, user_code_id, item_code_id, topK=10):\n", + " \n", + " top_k = defaultdict(list)\n", + " for nb_user, user in enumerate(self.estimations):\n", + " \n", + " user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n", + " for item, score in enumerate(user):\n", + " if item not in user_rated and not np.isnan(score):\n", + " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n", + " result=[]\n", + " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", + " for uid, item_scores in top_k.items():\n", + " item_scores.sort(key=lambda x: x[1], reverse=True)\n", + " result.append([uid]+list(chain(*item_scores[:topK])))\n", + " return result\n", + " \n", + " def estimate(self, user_code_id, item_code_id, test_ui):\n", + " result=[]\n", + " for user, item in zip(*test_ui.nonzero()):\n", + " result.append([user_code_id[user], item_code_id[item], \n", + " self.estimations[user,item] if not np.isnan(self.estimations[user,item]) else 1])\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "toy train ui:\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[3, 4, 0, 0, 5, 0, 0, 4],\n", + " [0, 1, 2, 3, 0, 0, 0, 0],\n", + " [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "similarity matrix:\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[1. , 0.9701425 , 0. , 0. , 1. 
,\n", + " 0. , 0. , 1. ],\n", + " [0.9701425 , 1. , 0.24253563, 0.12478355, 0.9701425 ,\n", + " 0. , 0. , 0.9701425 ],\n", + " [0. , 0.24253563, 1. , 0.51449576, 0. ,\n", + " 0. , 0. , 0. ],\n", + " [0. , 0.12478355, 0.51449576, 1. , 0. ,\n", + " 0.85749293, 0.85749293, 0. ],\n", + " [1. , 0.9701425 , 0. , 0. , 1. ,\n", + " 0. , 0. , 1. ],\n", + " [0. , 0. , 0. , 0.85749293, 0. ,\n", + " 1. , 1. , 0. ],\n", + " [0. , 0. , 0. , 0.85749293, 0. ,\n", + " 1. , 1. , 0. ],\n", + " [1. , 0.9701425 , 0. , 0. , 1. ,\n", + " 0. , 0. , 1. ]])" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "estimations matrix:\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[4. , 4. , 4. , 4. , 4. ,\n", + " nan, nan, 4. ],\n", + " [1. , 1.35990333, 2.15478388, 2.53390319, 1. ,\n", + " 3. , 3. , 1. ],\n", + " [ nan, 5. , 5. , 4.05248907, nan,\n", + " 3.95012863, 3.95012863, nan]])" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[[0, 20, 4.0, 30, 4.0],\n", + " [10, 50, 3.0, 60, 3.0, 0, 1.0, 40, 1.0, 70, 1.0],\n", + " [20, 10, 5.0, 20, 5.0]]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# toy example\n", + "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", + "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", + "\n", + "toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n", + "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n", + "\n", + "\n", + "model=IKNN()\n", + "model.fit(toy_train_ui)\n", + "\n", + "print('toy train ui:')\n", + "display(toy_train_ui.A)\n", + "\n", + "print('similarity matrix:')\n", + "display(model.similarity_matrix_ii.A)\n", + "\n", + 
"print('estimations matrix:')\n", + "display(model.estimations)\n", + "\n", + "model.recommend(toy_user_code_id, toy_item_code_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "model=IKNN()\n", + "model.fit(train_ui)\n", + "\n", + "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n", + "\n", + "top_n.to_csv('Recommendations generated/ml-100k/Self_IKNN_reco.csv', index=False, header=False)\n", + "\n", + "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n", + "estimations.to_csv('Recommendations generated/ml-100k/Self_IKNN_estimations.csv', index=False, header=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "943it [00:00, 6719.05it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RMSEMAEprecisionrecallF_1F_05precision_superrecall_superNDCGmAPMRRLAUCHRH2RReco in testTest coverageShannonGini
01.0183630.8087930.0003180.0001080.000140.0001890.00.00.0002140.0000370.0003680.4963910.0031810.00.3921530.115444.1747410.965327
\n", + "
" + ], + "text/plain": [ + " RMSE MAE precision recall F_1 F_05 \\\n", + "0 1.018363 0.808793 0.000318 0.000108 0.00014 0.000189 \n", + "\n", + " precision_super recall_super NDCG mAP MRR LAUC \\\n", + "0 0.0 0.0 0.000214 0.000037 0.000368 0.496391 \n", + "\n", + " HR H2R Reco in test Test coverage Shannon Gini \n", + "0 0.003181 0.0 0.392153 0.11544 4.174741 0.965327 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import evaluation_measures as ev\n", + "estimations_df=pd.read_csv('Recommendations generated/ml-100k/Self_IKNN_estimations.csv', header=None)\n", + "reco=np.loadtxt('Recommendations generated/ml-100k/Self_IKNN_reco.csv', delimiter=',')\n", + "\n", + "ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None),\n", + " estimations_df=estimations_df, \n", + " reco=reco,\n", + " super_reactions=[4,5])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "943it [00:00, 7023.03it/s]\n", + "943it [00:00, 6323.02it/s]\n", + "943it [00:00, 6003.69it/s]\n", + "943it [00:00, 6582.48it/s]\n", + "943it [00:00, 5623.69it/s]\n", + "943it [00:00, 6775.77it/s]\n", + "943it [00:00, 6119.28it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelRMSEMAEprecisionrecallF_1F_05precision_superrecall_superNDCGmAPMRRLAUCHRH2RReco in testTest coverageShannonGini
0Self_TopPop2.5082582.2179090.1888650.1169190.1187320.1415840.1304720.1374730.2146510.1117070.4009390.5555460.7656420.4920471.0000000.0389613.1590790.987317
0Ready_Baseline0.9494590.7524870.0914100.0376520.0460300.0612860.0796140.0564630.0959570.0431780.1981930.5155010.4379640.2396611.0000000.0339112.8365130.991139
0Self_GlobalAvg1.1257600.9435340.0611880.0259680.0313830.0413430.0405580.0321070.0676950.0274700.1711870.5095460.3849420.1421001.0000000.0259742.7117720.992003
0Ready_Random1.5249541.2233520.0455990.0211810.0245850.0315180.0278970.0219310.0481110.0173810.1190050.5070960.3308590.0911980.9881230.1818185.1007920.906866
0Self_TopRatedNaNNaN0.0320250.0126740.0157140.0211830.0284330.0185730.0227410.0053280.0316020.5027640.2375400.0657480.6970310.0144302.2208110.995173
0Self_BaselineUI0.9675850.7627400.0009540.0001700.0002780.0004630.0006440.0001890.0007520.0001680.0016770.4964240.0095440.0000000.6005300.0050511.8031260.996380
0Self_IKNN1.0183630.8087930.0003180.0001080.0001400.0001890.0000000.0000000.0002140.0000370.0003680.4963910.0031810.0000000.3921530.1154404.1747410.965327
\n", + "
" + ], + "text/plain": [ + " Model RMSE MAE precision recall F_1 \\\n", + "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n", + "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n", + "0 Self_GlobalAvg 1.125760 0.943534 0.061188 0.025968 0.031383 \n", + "0 Ready_Random 1.524954 1.223352 0.045599 0.021181 0.024585 \n", + "0 Self_TopRated NaN NaN 0.032025 0.012674 0.015714 \n", + "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n", + "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n", + "\n", + " F_05 precision_super recall_super NDCG mAP MRR \\\n", + "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n", + "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n", + "0 0.041343 0.040558 0.032107 0.067695 0.027470 0.171187 \n", + "0 0.031518 0.027897 0.021931 0.048111 0.017381 0.119005 \n", + "0 0.021183 0.028433 0.018573 0.022741 0.005328 0.031602 \n", + "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n", + "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n", + "\n", + " LAUC HR H2R Reco in test Test coverage Shannon \\\n", + "0 0.555546 0.765642 0.492047 1.000000 0.038961 3.159079 \n", + "0 0.515501 0.437964 0.239661 1.000000 0.033911 2.836513 \n", + "0 0.509546 0.384942 0.142100 1.000000 0.025974 2.711772 \n", + "0 0.507096 0.330859 0.091198 0.988123 0.181818 5.100792 \n", + "0 0.502764 0.237540 0.065748 0.697031 0.014430 2.220811 \n", + "0 0.496424 0.009544 0.000000 0.600530 0.005051 1.803126 \n", + "0 0.496391 0.003181 0.000000 0.392153 0.115440 4.174741 \n", + "\n", + " Gini \n", + "0 0.987317 \n", + "0 0.991139 \n", + "0 0.992003 \n", + "0 0.906866 \n", + "0 0.995173 \n", + "0 0.996380 \n", + "0 0.965327 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import imp\n", + "imp.reload(ev)\n", + "\n", + "import evaluation_measures as ev\n", + "dir_path=\"Recommendations generated/ml-100k/\"\n", + 
"super_reactions=[4,5]\n", + "test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n", + "\n", + "ev.evaluate_all(test, dir_path, super_reactions)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ready-made KNNs - Surprise implementation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### I-KNN - basic" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import helpers\n", + "import surprise as sp\n", + "import imp\n", + "imp.reload(helpers)\n", + "\n", + "sim_options = {'name': 'cosine',\n", + " 'user_based': False} # compute similarities between items\n", + "algo = sp.KNNBasic(sim_options=sim_options)\n", + "\n", + "helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNN_reco.csv',\n", + " estimations_path='Recommendations generated/ml-100k/Ready_I-KNN_estimations.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### U-KNN - basic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import helpers\n", + "import surprise as sp\n", + "import imp\n", + "imp.reload(helpers)\n", + "\n", + "sim_options = {'name': 'cosine',\n", + " 'user_based': True} # compute similarities between users\n", + "algo = sp.KNNBasic(sim_options=sim_options)\n", + "\n", + "helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_U-KNN_reco.csv',\n", + " estimations_path='Recommendations generated/ml-100k/Ready_U-KNN_estimations.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### I-KNN - on top baseline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import helpers\n", + "import surprise as sp\n", + "import imp\n", + "imp.reload(helpers)\n", + "\n", + "sim_options = {'name': 'cosine',\n", + " 'user_based': False} # 
compute similarities between items\n", + "algo = sp.KNNBaseline()\n", + "\n", + "helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNNBaseline_reco.csv',\n", + " estimations_path='Recommendations generated/ml-100k/Ready_I-KNNBaseline_estimations.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# project task 4: use a version of your choice of Surprise KNN algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read the docs and try to find the best parameter configuration (let's say in terms of RMSE)\n", + "# https://surprise.readthedocs.io/en/stable/knn_inspired.html##surprise.prediction_algorithms.knns.KNNBaseline\n", + "# the solution here can be similar to examples above\n", + "# please save the output in 'Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv' and\n", + "# 'Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "Generating predictions...\n", + "Generating top N recommendations...\n", + "Generating predictions...\n" + ] + } + ], + "source": [ + "#I chose KNN With Means because I thought it would be interesting if the algorithm takes into account\n", + "#the mean ratings of each user\n", + "import helpers\n", + "import surprise as sp\n", + "import imp\n", + "imp.reload(helpers)\n", + "\n", + "sim_options = {'name': 'cosine',\n", + " 'user_based': True} # compute similarities between users\n", + "algo = sp.KNNWithZScore(k=10,sim_options=sim_options)\n", + "\n", + "helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv',\n", + " estimations_path='Recommendations 
generated/ml-100k/Self_KNNSurprisetask_estimations.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/P4. Appendix - embeddings in high demensional spaces.ipynb b/P4. Appendix - embeddings in high demensional spaces.ipynb new file mode 100644 index 0000000..cd982b8 --- /dev/null +++ b/P4. Appendix - embeddings in high demensional spaces.ipynb @@ -0,0 +1,80 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['dimensions: 1, cases when observation is the nearest: 0.0%',\n", + " 'dimensions: 2, cases when observation is the nearest: 0.0%',\n", + " 'dimensions: 3, cases when observation is the nearest: 0.0%',\n", + " 'dimensions: 10, cases when observation is the nearest: 7.000000000000001%',\n", + " 'dimensions: 20, cases when observation is the nearest: 57.99999999999999%',\n", + " 'dimensions: 30, cases when observation is the nearest: 92.0%',\n", + " 'dimensions: 40, cases when observation is the nearest: 99.0%',\n", + " 'dimensions: 50, cases when observation is the nearest: 100.0%',\n", + " 'dimensions: 60, cases when observation is the nearest: 100.0%',\n", + " 'dimensions: 70, cases when observation is the nearest: 100.0%',\n", + " 'dimensions: 80, cases when observation is the nearest: 100.0%',\n", + " 'dimensions: 90, cases when observation is the nearest: 100.0%']" + ] 
+ }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import random\n", + "from numpy.linalg import norm\n", + "\n", + "dimensions=[1,2,3]+[10*i for i in range(1,10)]\n", + "nb_vectors=10000\n", + "trials=100\n", + "k=1 # by setting k=1 we want to check how often the closest vector to the average of 2 random vectors is one of these 2 vectors\n", + "\n", + "result=[]\n", + "for dimension in dimensions:\n", + " vectors=np.random.normal(0,1,size=(nb_vectors, dimension))\n", + " successes=0\n", + " for i in range(trials):\n", + " i1,i2=random.sample(range(nb_vectors),2)\n", + " target=(vectors[i1]+vectors[i2])/2\n", + "\n", + " distances=pd.DataFrame(enumerate(np.dot(target, vectors.transpose())/norm(target)/norm(vectors.transpose(), axis=0)))\n", + " distances=distances.sort_values(by=[1], ascending=False)\n", + " if (i1 in (list(distances[0][:k]))) | (i2 in (list(distances[0][:k]))):\n", + " successes+=1\n", + " result.append(successes/trials)\n", + " \n", + "[f'dimensions: {i}, cases when observation is the nearest: {100*round(j,3)}%' for i,j in zip(dimensions, result)]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/P4. Matrix Factorization.ipynb b/P4. Matrix Factorization.ipynb new file mode 100644 index 0000000..1235021 --- /dev/null +++ b/P4. 
Matrix Factorization.ipynb @@ -0,0 +1,1490 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Self made SVD" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import helpers\n", + "import pandas as pd\n", + "import numpy as np\n", + "import scipy.sparse as sparse\n", + "from collections import defaultdict\n", + "from itertools import chain\n", + "import random\n", + "\n", + "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n", + "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n", + "train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Done similarly to https://github.com/albertauyeung/matrix-factorization-in-python\n", + "from tqdm import tqdm\n", + "\n", + "class SVD():\n", + " \n", + " def __init__(self, train_ui, learning_rate, regularization, nb_factors, iterations):\n", + " self.train_ui=train_ui\n", + " self.uir=list(zip(*[train_ui.nonzero()[0],train_ui.nonzero()[1], train_ui.data]))\n", + " \n", + " self.learning_rate=learning_rate\n", + " self.regularization=regularization\n", + " self.iterations=iterations\n", + " self.nb_users, self.nb_items=train_ui.shape\n", + " self.nb_ratings=train_ui.nnz\n", + " self.nb_factors=nb_factors\n", + " \n", + " self.Pu=np.random.normal(loc=0, scale=1./self.nb_factors, size=(self.nb_users, self.nb_factors))\n", + " self.Qi=np.random.normal(loc=0, scale=1./self.nb_factors, size=(self.nb_items, self.nb_factors))\n", + "\n", + " def train(self, test_ui=None):\n", + " if test_ui!=None:\n", + " self.test_uir=list(zip(*[test_ui.nonzero()[0],test_ui.nonzero()[1], test_ui.data]))\n", + " \n", + " self.learning_process=[]\n", + " pbar = tqdm(range(self.iterations))\n", + " for i in pbar:\n", + " 
pbar.set_description(f'Epoch {i} RMSE: {self.learning_process[-1][1] if i>0 else 0}. Training epoch {i+1}...')\n", + " np.random.shuffle(self.uir)\n", + " self.sgd(self.uir)\n", + " if test_ui==None:\n", + " self.learning_process.append([i+1, self.RMSE_total(self.uir)])\n", + " else:\n", + " self.learning_process.append([i+1, self.RMSE_total(self.uir), self.RMSE_total(self.test_uir)])\n", + " \n", + " def sgd(self, uir):\n", + " \n", + " for u, i, score in uir:\n", + " # Compute prediction and error\n", + " prediction = self.get_rating(u,i)\n", + " e = (score - prediction)\n", + " \n", + " # Update user and item latent feature matrices\n", + " Pu_update=self.learning_rate * (e * self.Qi[i] - self.regularization * self.Pu[u])\n", + " Qi_update=self.learning_rate * (e * self.Pu[u] - self.regularization * self.Qi[i])\n", + " \n", + " self.Pu[u] += Pu_update\n", + " self.Qi[i] += Qi_update\n", + " \n", + " def get_rating(self, u, i):\n", + " prediction = self.Pu[u].dot(self.Qi[i].T)\n", + " return prediction\n", + " \n", + " def RMSE_total(self, uir):\n", + " RMSE=0\n", + " for u,i, score in uir:\n", + " prediction = self.get_rating(u,i)\n", + " RMSE+=(score - prediction)**2\n", + " return np.sqrt(RMSE/len(uir))\n", + " \n", + " def estimations(self):\n", + " self.estimations=\\\n", + " np.dot(self.Pu,self.Qi.T)\n", + "\n", + " def recommend(self, user_code_id, item_code_id, topK=10):\n", + " \n", + " top_k = defaultdict(list)\n", + " for nb_user, user in enumerate(self.estimations):\n", + " \n", + " user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n", + " for item, score in enumerate(user):\n", + " if item not in user_rated and not np.isnan(score):\n", + " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n", + " result=[]\n", + " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", + " for uid, item_scores in top_k.items():\n", + " item_scores.sort(key=lambda x: x[1], 
reverse=True)\n", + " result.append([uid]+list(chain(*item_scores[:topK])))\n", + " return result\n", + " \n", + " def estimate(self, user_code_id, item_code_id, test_ui):\n", + " result=[]\n", + " for user, item in zip(*test_ui.nonzero()):\n", + " result.append([user_code_id[user], item_code_id[item], \n", + " self.estimations[user,item] if not np.isnan(self.estimations[user,item]) else 1])\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epoch 39 RMSE: 0.750963575605171. Training epoch 40...: 100%|██████████| 40/40 [01:38<00:00, 2.45s/it] \n" + ] + } + ], + "source": [ + "model=SVD(train_ui, learning_rate=0.005, regularization=0.02, nb_factors=100, iterations=40)\n", + "model.train(test_ui)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "df=pd.DataFrame(model.learning_process).iloc[:,:2]\n", + "df.columns=['epoch', 'train_RMSE']\n", + "plt.plot('epoch', 'train_RMSE', data=df, color='blue')\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XucznX+//HHa5BjRaiVcUqU0xg1dGKjvsop2WoLHVQ2HYgtKtpOa7VbaWPbVKt+KrWbdNp0VlE65yyHRCiDjQ6USoX374/XNVzGMIOZ+VyH5/12u25zXZ/P55p5fbr0+nyu9+H1thACIiKSHjKiDkBEREqPkr6ISBpR0hcRSSNK+iIiaURJX0QkjSjpi4ikESV9EZE0oqQvIpJGlPRFRNJI2agDyK9GjRqhfv36UYchIpJUZs6c+VUIoWZhxxWa9M1sHNANWBtCaF7AfgP+AXQBfgQuDCHMiu3rA9wQO3RECOGRwv5e/fr1mTFjRmGHiYhIHDP7vCjHFaV552Gg0272dwYaxR79gPtiARwE3AwcA7QBbjazakUJSkRESkahST+EMA34ZjeHnA6MD+4DoKqZ1QJOBV4LIXwTQvgWeI3dXzxERKSEFUdHbm1gZdzr3Ni2XW0XEZGIFEdHrhWwLexm+86/wKwf3jRE3bp1iyEkESlNv/76K7m5uWzatCnqUFJehQoVyMzMpFy5cnv1/uJI+rlAnbjXmcDq2Pb2+ba/WdAvCCGMBcYC5OTkqMC/SJLJzc1l//33p379+vjYDikJIQS+/vprcnNzadCgwV79juJo3pkEXGDuWGBDCGEN8CpwiplVi3XgnhLbJiIpZtOmTVSvXl0Jv4SZGdWrV9+nb1RFGbL5OH7HXsPMcvEROeUAQgj3Ay/hwzWX4kM2L4rt+8bM/gJMj/2q4SGE3XUIi0gSU8IvHfv637nQpB9C6FXI/gD038W+ccC4vQttb/wAfAZkld6fFBFJIilWhuFeIBs4F0/+IiISL8WS/h+A64BngSOBy/E+ZRFJZevXr+fee+/d4/d16dKF9evX7/H7LrzwQho0aEB2djYtW7bkjTfe2Lavffv21K1bF28EcT169KBKlSoAbN26lYEDB9K8eXNatGhB69atWb58OeAVCVq0aEF2djbZ2dkMHDhwj2MrTMLV3tk31YC/AQOBEfiAoGWo/1gkteUl/SuuuGKH7Vu2bKFMmTK7fN9LL720139z5MiRnHXWWUydOpV+/fqxZMmSbfuqVq3Ku+++S9u2bVm/fj1r1qzZtu+JJ55g9erVzJs3j4yMDHJzc6lcufK2/VOnTqVGjRp7HVdhUizp56kFjAEGA3m93KvwihKDgCrRhCWSBv74R5gzp3h/Z3Y2jB696/1Dhw7ls88+Izs7m3LlylGlShVq1arFnDlzWLhwIT169GDlypVs2rSJQYMG0a9fP2B7ra+NGzfSuXNn2rZty3vvvUft2rV57rnnqFixYqGxHXfccaxatWqHbT179mTChAm0bduWZ555hjPOOIMFCxYAsGbNGmrVqkVGhje0ZGZm7uV/lb2TYs07+R0GNI09n4TXfmsI3A38HFVQIlLMbrvtNho2bMicOXMYOXIkH330EbfeeisLFy4EYNy4ccycOZMZM2Zw99138/XXX+/0O5YsWUL//v1ZsGABVatW5emnny7S337llVfo0aPHDttOPvlkpk2bxpYtW5gwYQLnnHPOtn1nn302zz//PNnZ2QwePJjZs2fv8N4OHTpsa94ZNWrUnv6nKFSK3ukX5HK8k/d6/G7/NnzQ0Z+iDEok5ezujry0tGnTZofJS3fffTfPPvssACtXrmTJkiVUr159h/fktdEDHH300axYsWK3f+Oaa67h2muvZe3atXzwwQc77CtTpgxt27bliSee4KeffiK+XHxmZiaLFy9mypQpTJkyhZNPPpknn3ySk08+GSj55p0Uv9PP7zhgCjAZaIVPLcj
zMrr7F0kN8W3kb775Jq+//jrvv/8+c+fOpVWrVgVObipfvvy252XKlGHz5s27/RsjR45k6dKljBgxgj59+uy0v2fPnlx55ZWcffbZBf6tzp07M3LkSK6//nr++9//7snp7ZM0S/rgJYE6Ai8C/y+2bT4+v6w2cBWwIJrQRGSv7L///nz//fcF7tuwYQPVqlWjUqVKfPLJJzvdle+LjIwMBg0axNatW3n11R0HjLRr145hw4bRq9eOU51mzZrF6tU+qnDr1q3MmzePevXqFVtMhUmj5p2C5F3zmuJ3/w/gHcCj8W8FDwFHRBOaiBRZ9erVOeGEE2jevDkVK1bkkEMO2bavU6dO3H///WRlZXHEEUdw7LHHFuvfNjNuuOEG7rjjDk499dQdtg8ZMmSn49euXcsll1zCzz97y0KbNm0YMGDAtv0dOnTYNuIoKyuL8ePHF2+88WNJE0FOTk6IduWsdcCjwATgDWB/4Cl8SYFTgPqRRSaSqBYtWkSTJk2iDiNtFPTf28xmhhByCntvGjbvFKYmcDXwEZ7wAf4NXAo0wO/8r0Rj/0UkGaV5805RPQN8gjcBTcbLCX2GLw4G8CDeMdwKXUdFUkf//v159913d9g2aNAgLrrooogi2ndK+kViQJPYYxA+yuer2L5v8W8BW/FJYRcCffH5ACKSzMaMGRN1CMVOt6V7pTzbV36shtf3eRQ4GrgdOBx4IprQRER2Q0m/WBwCnAc8D3wB/AU4KbbvSfzbwbxoQhMRiaOkX+xq4+UeasZeLwbuB1oCx+DDQgseTywiUtKU9EvcDXjzz2h8YbF+QNe4/VujCEpE0pSSfqmozvYmng+Am2Lbv8PXlO+Hl4fYEkl0Islub+vpA4wePZoff/xxt8fk1bnPysrixBNP5PPPP9+2z8w4//zzt73evHkzNWvWpFu3bgB8+eWXdOvWjZYtW9K0aVO6dOkCwIoVK6hYseK24mrZ2dnFPhGrIEr6pcrwJp7/i73+DugA/Ac4Gb8A/BFYEUVwIkmrpJM+eCG0efPm0b59e0aMGLFte+XKlZk/fz4//fQTAK+99hq1a9fetv+mm26iY8eOzJ07l4ULF3Lbbbdt25dXGTTvccEFF+zVOewJJf1IZQKPAWvx0T7HAPfhzUAAC4E5QGLNmhYpXPsCHnlJ+cdd7H84tv+rAvbtXnw9/WuuuYaRI0fSunVrsrKyuPnmmwH44Ycf6Nq1Ky1btqR58+Y88cQT3H333axevZoOHTrQoUOHIp1ZQfXzO3fuzIsvvgjA448/vkO9nTVr1uxQMz8rK9o1vJX0E0Il4Gx8mcd1bF8D4K/4hK9MfOz/k8CeL+0mkuri6+l37NiRJUuW8NFHHzFnzhxmzpzJtGnTeOWVVzj00EOZO3cu8+fPp1OnTgwcOJBDDz2UqVOnMnXq1CL9rYLq5+ctmrJp0ybmzZvHMcccs21f//796du3Lx06dODWW2/dVmwN2Hahynu8/fbbxfMfZDc0OSvhHBD3/C682edl4Gl8JvAR+OxggM/xJiFduyXRvLmbfZUK2V+jkP27N3nyZCZPnkyrVq0A2LhxI0uWLKFdu3YMGTKE6667jm7dutGuXbs9+r0dOnTgyy+/5OCDD96heQf87n3FihU8/vjj29rs85x66qksW7aMV155hZdffplWrVoxf/58YHvzTmlStkhoBwMXARPxr7zvAHfG9m0BjgIOBfrgBeI2RhCjSGIJITBs2LBt7eRLly6lb9++NG7cmJkzZ9KiRQuGDRvG8OHD9+j3Tp06lc8//5xmzZpx00037bS/e/fuDBkyZKdSygAHHXQQvXv35tFHH6V169ZMmzZtr89vXynpJ42ywAlAt9jrLcAovCP4BaAX8Bu2t4uKpI/4evqnnnoq48aNY+NGvwlatWoVa9euZfXq1VSqVInzzjuPIUOGMGvWrJ3eW5iKFSsyevRoxo8fzzfffLPDvosvvpibbrqJFi1a7LB9ypQp2zqKv//
+ez777DPq1q27T+e7L9S8k7T2Ay6IPbYA7wGPAEfG9s/FZwj3wZuARFJXfD39zp0707t3b4477jgAqlSpwmOPPcbSpUu55ppryMjIoFy5ctx3330A9OvXj86dO1OrVq0itevXqlWLXr16MWbMGG688cZt2zMzMxk0aNBOx8+cOZMBAwZQtmxZtm7dyh/+8Adat27NihUrtrXp57n44osZOHDgvv7n2C3V009ZdwGD2b5S2EVAD6BClEFJilI9/dKlevpSgKvx8s834h2/vfAqoXkzgH+NKC4RiZKad1LaYcCfgZuBqXgxuLzrfGOgCtAayIk9stA3AUlnxxxzzLZlDPM8+uijO7XTJzMl/bSQgQ/9zLMF7wuYjrf7PxTb3h+4B9iMdwjn4HMG9iutQCWJhRAws6jD2Ccffvhh1CEUal+b5JX001IZ/BsA+GzfL4AZQL3YtkXAJbHn5YBmeJXQK4A2pRemJI0KFSrw9ddfU7169aRP/IkshMDXX39NhQp7/41cST/tGZ7s68Vtaw4swS8Ec2KPl4FzYvtfwy8K2XGPHHzmsKSjzMxMcnNzWbduXdShpLwKFSrsUNZhTxUp6ZtZJ+Af+C3igyGE2/Ltr4dPF60JfAOcF0LIje3bAnwcO/SLEEL3vY5WSonhq38dDvSM2573tfIA4Dj8YjApbvssvGzESvyfyqGlEawkgHLlytGgQYOow5AiKDTpm1kZYAw+7i8XmG5mk0IIC+MOuxMYH0J4xMxOAv4G5NUa/SmEkI2kgLyv7ccAj8ee/4hf0z8E8jq77sD7Bhrjk8c64EWzDimtQEVkF4oyZLMNsDSEsCyE8As+3//0fMc0Bd6IPZ9awH5JWZXwi8BAtt9DXIbfBzTCy0b3xL8BxH8j+LZ0wxQRoGhJvzb+fT1PLttXBc8zFzgz9vx3wP5mVj32uoKZzTCzD8ysBwUws36xY2aoTTAVNMMnhr2At/Z9iC8ZaXji/x1eVKs1MAx4HfgpkkhF0k1Rkn5BXfH5xwwNAU40s9nAicAqfNwfQN3YLLHewGgza7jTLwthbAghJ4SQU7Nmzfy7i2z58r1+q5SYsviXxfiunH/jq4dVwL8RdMQXjwGfPDYdrSImUjKKkvRz2bF4Sya+6Os2IYTVIYQzQgitgD/Ftm3I2xf7uQyvl9pq38Pe2eefQ5MmcNpp8OmnJfEXpHgY0BafMPY23szzIt4kBN4/0Ab/JnA6cHvsOH0TECkORUn604FGZtbAzPbDG2gnxR9gZjXMLO93DcNH8mBm1cysfN4xeJnI+A7gYvOb38Dw4fDWW9C8OQweDOu13kgSqAJ0Yfu9QH28k/hMvHzEUOC3bK+vvhgvNZ1bmkGKpIxCk34IYTMwAHgVn7UzMYSwwMyGm1ned/b2wGIz+xQfonFrbHsTYIaZzcU7eG/LN+qn2JQvD9de63f5F1wAo0ZB48bwr3/BFrUUJJED8fuKB/EEvxa/xzghtv8pfL5AHaBu7Ni72b7EpIjsTspW2Zw1C/74R3j7bcjKgtGjoYhLYEpC+xUfN/Be3OMrYAM+e/hefNzB8fhcghrRhClSytK+yuZRR3lTz8SJsGEDnHQSnHEGLFsWdWSyb8rhs38H4qOHv8CXjSwX2z8L+DvecVwTnyswOO79iXWTI1LaUjbpA5jB738PixbBiBEwebJ39g4dCkVcKEeSQvyIrwfxu/638U7gZsB3cftb4d8ALgf+BXwA/FA6YYokgJRt3inI6tUwbBiMHw8HHQQXXwyXXw6HHVYif04Szmb8rn8uXkJiQ2z7pfg8gi34bOKWscehFDxiWSTxpH3zTkEOPRQeeQQ++sjb90eNgsMPh27d4OWXYevWwn+HJLOyeAmpN/GhoiuA/wJ/iO1fDlwPdMVHJlfD+wZejO3/KfYe/UOR5JVWST9P69bw1FOwYgXccAPMmAFduvhon7//HfKtdywpKa+66Ol4HwF4gbm8pqF/AucC5dleXuJ9oAGwP3A
0Xl7qb+w4YV0ksaVV886u/PILPPMMjBkD77wDFSpA797Qv793CIu41fhd/0J89PJCPOF/hJeU+A++TsER+AL1eT9z8IuHSMkpavOOkn4+8+bBvffCo4/Cjz/CscfCFVd4h/A+rFsgKet7vJxEOXydgbH4pLJPgV9ix6wGagGPAe/g1UizYj+rlnK8kqrUpr+XsrLg/vu90/cf/4Bvv/XJXnXq+Kgf1feRHe3P9uGiHYEn8VISP+IL078I/Ca2fwXwBD7X8bd4n0FDtvcRzIy9N+9iIVL8dKdfiBBgyhS/+3/uOe/s7drV7/5PPRUydNmUPRLweoQfA/PwDuW8NYk64J3M5fBmoZZ4/cI/7PRbRPJT804JyM2FsWP98eWXPtTzsst86Gf16oW/X2T3PgFm4xeDefjQ0hb4UpXgF4CK+PKUecNKG6NVTwWU9EvUL7/As8/63f+0aV73p2dPH/Pfpo1PChMpHj/jncABv+OfBSzAy1EQ2/YAPgfhAnxuQe24R2N2nLwmqUpJv5R8/DHcd593/G7c6H0C/frBuedCVfXRSYn4Ff9WMBevStoW+BpfwWwVsCnu2FvxuQergR54s1H843Bgv1KKW0qSkn4p++47+M9/vOln9myoWBHOPtsvAMcdp7t/KS0B7ydYFXs0xJetXA70wy8W8WWpHwb6AEuA/8eOF4PqaEZy8lDSj9DMmZ78//Mfv/tv2tST//nne/kHkWh9jw8pXQS0wyepTQLOYnuzEcAB+FKWrYH5eJ2ihrFHJhr8l1iU9BPAxo0wYQI88ICXfihfHs46Cy65BH77W939S6LZjH8j+AQfbvoZvhDeb/CaRNfFHVsen538FnAw8C4+WS0T70vIK2Ohf+SlRUk/wcyZ48n/sce8KahpUxgwwO/+q1SJOjqRwmzBZx8vxS8GeT8fxy8AA/HSFfEqAevxIajjY8c3wjuXG+EXBSkuSvoJ6ocf4Ikn4J57vO3/gAPgwgu95EPjxlFHJ7K3fgXW4P0Fq2I/vwWGx/b3w8tex+ebJmxfPfUZ/MLSCF8R7UCgTIlHnUqU9BNcCPDBB578n3wSfv3VJ3sNGACdO0MZ/XuXlPMzsAzvNP4UT/J5TUat8HLX8ToCk2PPL8S/NVSNe2TjI5LAq6UG/FtHhdjjN0Be3fRvSPXmJiX9JPK//3nH7/33w5o1PunriivgoovU8SvpYiPeZPQpPrx0PT7noF9s/+/xi8W3sX3fAWcAT8f2V8cTe7wLgEdiz8vjCT9+DkMPfL3lgHdSZ+IXinIkIyX9JPTrrz7p6557fG3fihW92uell0JOjjp+Rbbbgn9zqBR7vRhf7+BnfJ7Cz8Ah+KxlgLvZselpFT5U9UZ8jkPeWsqGd0wfClwNnIePdnocL5p3aOxxMNubn7bm+9t5j0x8BNR6vD/jALzZ6gC2X4SKj5J+kpszx5P/4497tc/sbE/+vXt7P4CIFJefgClsn9uwOva4GDgTnwSXne89GfiF4Gx8BFP7An7vc/hazS8C3fLtK4c3XbXHh8WOwC8GE9h+IdszSvopYsMGH+//r3/B3LlQqRL06uXj/lu31t2/SMnbgndS510M1uAXh+5AG/ybw3/wfoT4PoW2eDPSl8CHeJPUhrifl+LDXl/DZ05vAGawtx3YSvopJgSYPt3b/uPv/vv187v/Aw+MOkIRiZLq6acYMy/m9uCD3tl7771+IbjiCl/7t29fHw2UYNdwEUkwSvpJ6IADvKLn7Nk+07d3b5/5e9xx0KIFjB4NX30VdZQikoiU9JOYmbfrP/CA3/2PHQuVK8NVV0Ht2l7u+fXXfeEXERFQ0k8ZBxzgNX0+/NA7fC+7DCZPho4doWFDGDHCF4ERkfSmpJ+CsrJ8fd/Vq33kT8OGcOONUK8edOsGkybBli1RRykiUVDST2EVKvjwztdfh88+g2HDvB/g9NPhiCPg7rvh+++jjlJESpOSfpo47DBv4vn8c6/1c8ghMGgQZGbC4MG
wfHnUEYpIaShS0jezTma22MyWmtnQAvbXM7M3zGyemb1pZplx+/qY2ZLYo09xBi97rmxZr+n/7rve/t+1q9/xH344nHkmvPOOhn2KpLJCk76ZlQHGAJ2BpkAvM2ua77A7gfEhhCy8lurfYu89CLgZX7yzDXCzmamIdoJo08bb/Jcvh2uvhalToV07HxH02GO+ALyIpJai3Om3AZaGEJaFEH7Bi0Ocnu+YpsAbsedT4/afCrwWQvgmhPAtPt+4076HLcUpMxP+9jdYudIXed+40Rd3qV8frr8eFi+OOkIRKS5FSfq18SVz8uTGtsWbi1cmAvgdsL+ZVS/iezGzfmY2w8xmrFu3rqixSzGrXNmHei5cCC+/7GUebr8djjwSjj3WLwjf5K9eKyJJpShJv6CSXvlbfYcAJ5rZbOBEvBrR5iK+lxDC2BBCTgghp2bNmkUISUpSRgZ06gQvveRj+0eO9BW/rrgCatXyPoHnn/dS0CKSXIqS9HOBOnGvM/FSc9uEEFaHEM4IIbTCV1ImhLChKO+VxFarFgwZAvPmwaxZXv5h2jTo3t2bha66ystAi0hyKErSnw40MrMGZrYf0BOYFH+AmdUws7zfNQwYF3v+KnCKmVWLdeCeEtsmScYMWrXyuj6rVvkEr3btvPBbq1Zw9NFe/XPz5qgjFZHdKTTphxA2AwPwZL0ImBhCWGBmw82se+yw9sBiM/sUX67m1th7vwH+gl84pgPDY9skiZUrB6edBk895TV/xozxUs+9e0OjRr74y48/Rh2liBRE9fSlWGzdCi+84B2/770H1avDlVdC//5Qo0bh7xeRfaN6+lKqMjK8nf/dd32C1/HHwy23eL2fgQNhxYqoIxQRUNKXEnDCCd7mv2ABnH023H+/z/g991yvACoi0VHSlxLTtCk89BAsWwZ//KNfCLKzoW1bGDdOxd5EoqCkLyUuMxPuvBO++ALuuMNX9erb14eDXnyx6v2IlCYlfSk11arBNdfAokXe9t+zp1f8bNfOZ/3efruPBhKRkqOkL6XOzDt68xZ5f+ghL/U8dCjUqePDQZ99VgXfREqCkr5EqkoVuPBCn+X76adw3XU+8/eMM3zkz9/+Bt9+G3WUIqlDSV8SRqNGcOutvtDLiy9Cy5Ze5bNOHe8I/vzzqCMUSX5K+pJwypaFLl3glVd8iOcZZ/is34YNffnHWbOijlAkeSnpS0LLyoLx432hl6uu8m8ARx8NJ5/s5Z816kdkzyjpS1LIzPQSzytX+s/Fi/3bQFYWPPywOn1FikpJX5LKgQd6qedly+CRR3wk0EUXeafvrbf6HAAR2TUlfUlK++0HF1zgbf6vvuozfW+4wTt9L73U5wKIyM6U9CWpmcEpp3j7/oIFvrbv+PFeAqJLF3jtNbX7i8RT0peU0bQpjB3r5R6GD/dRPqec4u3+48bBpk1RRygSPSV9STk1a8KNN/q4/ocfhjJlvNZPvXp+MdBkL0lnSvqSssqXhz59YPZseOMNaNMGbr7Zk/+wYbB2bdQRipQ+JX1JeWZw0knw/PO+wHvXrl7crX59n+m7alXUEYqUHiV9SSstWvgC7osWwTnn+Hq+hx0Gl13mE8BEUp2SvqSlI47w6p5LlnhN/4ce8to/ffrAJ59EHZ1IyVHSl7TWoAHcd59P9rrySq/v37SpfwvQ0o6SipT0RYDatWHUKF/AfehQH/efne3t/+++G3V0IsVHSV8kzsEHw1//6sM9//IX+OgjX9P3xBN95q8mekmyU9IXKUC1al7WYcUKGD0aPvsMOnXyCp9PPQVbtkQdocjeUdIX2Y3KlWHQIG/zf/BB2LgRfv97b/d/6CFV95Tko6QvUgT77eezehctgieegIoVfdTP4YfD3XfDDz9EHaFI0Sjpi+yBMmXg7LN9lu9LL/ns3kGD/Octt6i0syQ+JX2RvWAGnTvD22/DO+/A8cfDn//syX/gQK3nK4lLSV9kH51wAkyaBPPne3v/fff5er7nnw8
ffxx1dCI7UtIXKSbNmnlVz2XL/G7/2We9rHPXrjBtmoZ7SmIoUtI3s05mttjMlprZ0AL21zWzqWY228zmmVmX2Pb6ZvaTmc2JPe4v7hMQSTR16sBdd3ld/7/8BaZP93H+J5wAr78edXSS7gpN+mZWBhgDdAaaAr3MrGm+w24AJoYQWgE9gXvj9n0WQsiOPS4rprhFEt5BB20f6z9mjFfz7NjRx/urxINEpSh3+m2ApSGEZSGEX4AJwOn5jgnAAbHnBwKriy9EkeRWqRJccQUsXgx//7vP8m3VCi68EFaujDo6STdFSfq1gfh/mrmxbfFuAc4zs1zgJeDKuH0NYs0+b5lZu30JViSZVagAV1/ts3uHDIEJE7yy53XXwfr1UUcn6aIoSd8K2Ja/S6oX8HAIIRPoAjxqZhnAGqBurNnnauA/ZnZAvvdiZv3MbIaZzVi3bt2enYFIkqlWDe64Az791Kt5jhzpo31GjYKff446Okl1RUn6uUCduNeZ7Nx80xeYCBBCeB+oANQIIfwcQvg6tn0m8BnQOP8fCCGMDSHkhBByatasuednIZKE6taFRx7xBdxzcvxbwJFH+iIvW7dGHZ2kqqIk/elAIzNrYGb74R21k/Id8wVwMoCZNcGT/jozqxnrCMbMDgMaAcuKK3iRVJCd7RU8J0+GqlWhd29fz3fq1Kgjk1RUaNIPIWwGBgCvAovwUToLzGy4mXWPHTYYuMTM5gKPAxeGEALwW2BebPtTwGUhhG9K4kREkl3HjjBzJowfD+vW+bq+3brBwoVRRyapxEKCzRjJyckJM2bMiDoMkUht2uSF3P76V/j+ey/29uc/Q61aUUcmicrMZoYQcgo7TjNyRRJQhQpw7bWwdKkv4/jwwz7S55ZbvLyzyN5S0hdJYDVq+CIuixZBly5+t9+oEYwdC5s3Rx2dJCMlfZEk0LAhTJwI77/vzy+91Ov6PP+8avrInlHSF0kixx7r5Zyfecbv9Lt397o+WrxdikpJXyTJmMHvfgcLFsC998KSJb54e/fuKuUshVPSF0lS5crB5Zd7Z++tt8Jbb0HLltCnjxd5EymIkr5IkqtcGa6/3uv4Dxnibf+NG/syjmvXRh2dJBolfZFb2cyLAAANu0lEQVQUUb261/RZssQreI4ZA4cdBjffDN99F3V0kiiU9EVSTGamD+lcsMCHeQ4f7sn/n/+ELVuijk6ipqQvkqKOOMKbeqZP9/o+Awf66l3z50cdmURJSV8kxeXkwGuvwb//7bX8jzrKm3xUxjk9KemLpAEzr965cCGcfbY3+Rx1lE/2kvSipC+SRmrWhMcegxdf9EJuJ5zgzT6q55M+lPRF0lCXLt7R278/3HMPNGsGr7wSdVRSGpT0RdLU/vv7iJ533vHF2zt3hvPPh6++ijoyKUlK+iJp7vjjYc4cuPFGX6y9aVMv5awlG1OTkr6IUL68d+7OmuVVPC+6yOv5zJ4ddWRS3JT0RWSbFi28Yue4cV7TJyfH2/2/0SKnKUNJX0R2kJHhd/qffuoJ//77faLXgw+qyScVKOmLSIGqVvV1emfNgiOPhEsu8Xr+06dHHZnsCyV9Edmtli1h2jR49FFYuRKOOQb69dMon2SlpC8ihTKD886DxYvhqqu8zb9xY2/6UZNPclHSF5EiO+AA+PvfYe5c/wZw+eXbh3xKclDSF5E91qwZTJniJR2WL4ejj4arr/bSDpLYlPRFZK+YwbnnwiefeCfvqFHQpIkv2h5C1NHJrijpi8g+qVbN2/bfe89X7zrzTDjtNK3Tm6iU9EWkWBx3HMyc6W3+b77p5Rxuuw1++SXqyCSekr6IFJuyZb1tf9Ei6NQJhg3zuv1vvx11ZJJHSV9Eil2dOt62P2mSd+7+9rdw2WVaoD0RKOmLSIk57TRfrevqq+GBB6B5c9Xtj1qRkr6ZdTKzxWa21MyGFrC/rplNNbPZZjbPzLrE7RsWe99iMzu1OIMXkcRXubK387/7LlSp4nX7L7o
Ivv026sjSU6FJ38zKAGOAzkBToJeZNc132A3AxBBCK6AncG/svU1jr5sBnYB7Y79PRNLMscd6qeY//clLOjRtCs89F3VU6acod/ptgKUhhGUhhF+ACcDp+Y4JwAGx5wcCq2PPTwcmhBB+DiEsB5bGfp+IpKHy5WHECC/adsgh0KMH9OoF69ZFHVn6KErSrw2sjHudG9sW7xbgPDPLBV4CrtyD94pImmnVyhP/8OHw9NM+w3fiRE3qKg1FSfpWwLb8H00v4OEQQibQBXjUzDKK+F7MrJ+ZzTCzGet0yRdJC+XK+RKNs2ZB/fpwzjk+set//4s6stRWlKSfC9SJe53J9uabPH2BiQAhhPeBCkCNIr6XEMLYEEJOCCGnZs2aRY9eRJJe8+Y+m/eOO+Cll7yt/7HHdNdfUoqS9KcDjcysgZnth3fMTsp3zBfAyQBm1gRP+utix/U0s/Jm1gBoBHxUXMGLSGooWxauucardzZpAuefD6efDqt3ukWUfVVo0g8hbAYGAK8Ci/BROgvMbLiZdY8dNhi4xMzmAo8DFwa3AP8GsBB4BegfQthSEiciIsnviCN8wZZRo+D1172t/5FHdNdfnCwk2H/NnJycMGPGjKjDEJGILVkCF18M77wDXbrA2LFQW8NAdsnMZoYQcgo7TjNyRSQhNWoEb70F//iHF3Br1gweekh3/ftKSV9EElZGBgwcCPPmQXa23/l37uxr9creUdIXkYTXsKGv1HXPPd7c06yZ1/LRXf+eU9IXkaSQkQH9+/tdf04O9OsHXbvCmjVRR5ZclPRFJKkcdpiP7PnnP72tv0ULn9UrRaOkLyJJJyMDBgzw2bwNGsBZZ0GfPrBhQ9SRJT4lfRFJWkce6bN5b7oJ/v1vyMryu3/ZNSV9EUlq5crBn//s9frLl4eTToIhQ2DTpqgjS0xK+iKSEo45xuv1X3aZL9rSurWXdZAdKemLSMqoXBnuvdcLt331lSf+22+HLSr+so2SvoiknM6d4eOPoXt3GDoU2reH5cujjioxKOmLSEqqUQOefNILts2b5528KuOgpC8iKcwMLrjAk/5RR3kZhzPPTO/lGZX0RSTl1avnZRxGjoQXX/QJXS++GHVU0VDSF5G0UKaMD+WcPh0OPhi6dYPLL4cffog6stKlpC8iaSUrCz76yC8A//qXV+/88MOooyo9SvoiknYqVPCmnilT4Jdf4IQT4Oab4ddfo46s5Cnpi0jaat/eO3l794bhw+H442Hx4qijKllK+iKS1g48EMaP9+Gdy5b5KJ9UrtWvpC8iglfqnDcPjjvOa/WfcYbP6k01SvoiIjG1a8PkyXDnnT6kMyvLX6cSJX0RkTgZGTB4sI/wqVoVTj0Vrr46dap2KumLiBQgOxtmzPAlGkeN8iqeCxZEHdW+U9IXEdmFSpV8MfYXXoD//Q+OPtqXaUzmTl4lfRGRQnTt6p28J58MAwf66//9L+qo9o6SvohIERxyiN/x33MPTJ3qnbwvvBB1VHtOSV9EpIjMvI1/xgw49FA47TS44gr48ceoIys6JX0RkT3UrJnX6xk8GO67z9v6Z8+OOqqiUdIXEdkL5cv7eP7XXoMNG3x0z513wtatUUe2e0r6IiL74P/+z5dm7NYNrrkGOnaE3Nyoo9q1IiV9M+tkZovNbKmZDS1g/ygzmxN7fGpm6+P2bYnbN6k4gxcRSQTVq8PTT8ODD8IHH3gn79NPRx1VwQpN+mZWBhgDdAaaAr3MrGn8MSGEq0II2SGEbOCfwDNxu3/K2xdC6F6MsYuIJAwz6NvX2/YbNvRaPn37wsaNUUe2o6Lc6bcBloYQloUQfgEmAKfv5vhewOPFEZyISLJp3Bjeew/+9CdfiD1vZm+iKErSrw2sjHudG9u2EzOrBzQApsRtrmBmM8zsAzPrsdeRiogkiXLlYMQIeOut7Yu03HNPYszkLUrStwK27Sr0nsBTIYQtcdvqhhBygN7AaDNruNMfMOsXuzDMWJfOy9S
LSEpp186bezp2hCuvhN//3kf6RKkoST8XqBP3OhNYvYtje5KvaSeEsDr2cxnwJtAq/5tCCGNDCDkhhJyaNWsWISQRkeRQvTpMmuTLM/73v75Iy8yZ0cVTlKQ/HWhkZg3MbD88se80CsfMjgCqAe/HbatmZuVjz2sAJwALiyNwEZFkkZHhC7FPm+br8B5/fHTNPYUm/RDCZmAA8CqwCJgYQlhgZsPNLH40Ti9gQgg7nEYTYIaZzQWmAreFEJT0RSQtHX989M09FhKhZyFOTk5OmJFIXd0iIsVs61a46y4YOhTq1YOJE72Uw74ws5mx/tPd0oxcEZFSFmVzj5K+iEhE8jf3nHNOydfuKVuyv15ERHYnb3TPXXd5+35GCd+KK+mLiEQsr7mnVP5W6fwZERFJBEr6IiJpRElfRCSNKOmLiKQRJX0RkTSipC8ikkaU9EVE0oiSvohIGkm4gmtmtg74PN/mGsBXEYRTklLtnFLtfCD1zinVzgdS75z25XzqhRAKXZAk4ZJ+QcxsRlGqxyWTVDunVDsfSL1zSrXzgdQ7p9I4HzXviIikESV9EZE0kixJf2zUAZSAVDunVDsfSL1zSrXzgdQ7pxI/n6Ro0xcRkeKRLHf6IiJSDBIu6ZvZODNba2bz47YdZGavmdmS2M9qUca4J3ZxPreY2SozmxN7dIkyxj1lZnXMbKqZLTKzBWY2KLY9KT+n3ZxP0n5OZlbBzD4ys7mxc/pzbHsDM/sw9hk9YWb7RR1rUezmfB42s+Vxn1F21LHuCTMrY2azzeyF2OsS/3wSLukDDwOd8m0bCrwRQmgEvBF7nSweZufzARgVQsiOPV4q5Zj21WZgcAihCXAs0N/MmpK8n9OuzgeS93P6GTgphNASyAY6mdmxwO34OTUCvgX6RhjjntjV+QBcE/cZzYkuxL0yCFgU97rEP5+ES/ohhGnAN/k2nw48Env+CNCjVIPaB7s4n6QWQlgTQpgVe/49/o+2Nkn6Oe3mfJJWcBtjL8vFHgE4CXgqtj2ZPqNdnU/SMrNMoCvwYOy1UQqfT8Il/V04JISwBvx/UODgiOMpDgPMbF6s+ScpmkEKYmb1gVbAh6TA55TvfCCJP6dY08EcYC3wGvAZsD6EsDl2SC5JdHHLfz4hhLzP6NbYZzTKzMpHGOKeGg1cC+QthV6dUvh8kiXpp5r7gIb419Q1wN+jDWfvmFkV4GngjyGE76KOZ18VcD5J/TmFELaEELKBTKAN0KSgw0o3qr2X/3zMrDkwDDgSaA0cBFwXYYhFZmbdgLUhhJnxmws4tNg/n2RJ+l+aWS2A2M+1EcezT0IIX8b+AW8FHsD/h0wqZlYOT5D/DiE8E9uctJ9TQeeTCp8TQAhhPfAm3l9R1czKxnZlAqujimtvxZ1Pp1jTXAgh/Aw8RPJ8RicA3c1sBTABb9YZTSl8PsmS9CcBfWLP+wDPRRjLPstLjDG/A+bv6thEFGt7/H/AohDCXXG7kvJz2tX5JPPnZGY1zaxq7HlF4P/wvoqpwFmxw5LpMyrofD6Ju8kwvP07KT6jEMKwEEJmCKE+0BOYEkI4l1L4fBJucpaZPQ60x6vNfQncDPwXmAjUBb4Afh9CSIrO0V2cT3u8ySAAK4BL89rCk4GZtQXeBj5me3vk9Xg7eNJ9Trs5n14k6edkZll4R2AZ/OZuYghhuJkdht9ZHgTMBs6L3SUntN2czxSgJt40Mge4LK7DNymYWXtgSAihW2l8PgmX9EVEpOQkS/OOiIgUAyV9EZE0oqQvIpJGlPRFRNKIkr6ISBpR0hcRSSNK+iIiaURJX0Qkjfx/8h01x0+zQTsAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "df=pd.DataFrame(model.learning_process[10:], columns=['epoch', 'train_RMSE', 'test_RMSE'])\n", + "plt.plot('epoch', 'train_RMSE', data=df, color='blue')\n", + "plt.plot('epoch', 'test_RMSE', data=df, color='yellow', linestyle='dashed')\n", + "plt.legend()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Saving and evaluating recommendations" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "model.estimations()\n", + "\n", + "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n", + "\n", + "top_n.to_csv('Recommendations generated/ml-100k/Self_SVD_reco.csv', index=False, header=False)\n", + "\n", + "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n", + "estimations.to_csv('Recommendations generated/ml-100k/Self_SVD_estimations.csv', index=False, header=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "943it [00:00, 7303.87it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RMSEMAEprecisionrecallF_1F_05precision_superrecall_superNDCGmAPMRRLAUCHRH2RReco in testTest coverageShannonGini
00.915730.7189210.1022270.0431370.0519810.0688720.0935620.0780570.1048280.0494480.1912430.5182860.4729590.2587490.8592790.139253.831520.973234
\n", + "
" + ], + "text/plain": [ + " RMSE MAE precision recall F_1 F_05 \\\n", + "0 0.91573 0.718921 0.102227 0.043137 0.051981 0.068872 \n", + "\n", + " precision_super recall_super NDCG mAP MRR LAUC \\\n", + "0 0.093562 0.078057 0.104828 0.049448 0.191243 0.518286 \n", + "\n", + " HR H2R Reco in test Test coverage Shannon Gini \n", + "0 0.472959 0.258749 0.859279 0.13925 3.83152 0.973234 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import evaluation_measures as ev\n", + "\n", + "estimations_df=pd.read_csv('Recommendations generated/ml-100k/Self_SVD_estimations.csv', header=None)\n", + "reco=np.loadtxt('Recommendations generated/ml-100k/Self_SVD_reco.csv', delimiter=',')\n", + "\n", + "ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None),\n", + " estimations_df=estimations_df, \n", + " reco=reco,\n", + " super_reactions=[4,5])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "943it [00:00, 6614.64it/s]\n", + "943it [00:00, 6657.91it/s]\n", + "943it [00:00, 6616.31it/s]\n", + "943it [00:00, 7049.97it/s]\n", + "943it [00:00, 7105.27it/s]\n", + "943it [00:00, 7296.68it/s]\n", + "943it [00:00, 6993.15it/s]\n", + "943it [00:00, 7255.64it/s]\n", + "943it [00:00, 6724.45it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + "
ModelRMSEMAEprecisionrecallF_1F_05precision_superrecall_superNDCGmAPMRRLAUCHRH2RReco in testTest coverageShannonGini
0Self_TopPop2.5082582.2179090.1888650.1169190.1187320.1415840.1304720.1374730.2146510.1117070.4009390.5555460.7656420.4920471.0000000.0389613.1590790.987317
0Self_SVD0.9157300.7189210.1022270.0431370.0519810.0688720.0935620.0780570.1048280.0494480.1912430.5182860.4729590.2587490.8592790.1392503.8315200.973234
0Ready_Baseline0.9494590.7524870.0914100.0376520.0460300.0612860.0796140.0564630.0959570.0431780.1981930.5155010.4379640.2396611.0000000.0339112.8365130.991139
0Self_GlobalAvg1.1257600.9435340.0611880.0259680.0313830.0413430.0405580.0321070.0676950.0274700.1711870.5095460.3849420.1421001.0000000.0259742.7117720.992003
0Ready_Random1.5249541.2233520.0455990.0211810.0245850.0315180.0278970.0219310.0481110.0173810.1190050.5070960.3308590.0911980.9881230.1818185.1007920.906866
0Self_TopRatedNaNNaN0.0320250.0126740.0157140.0211830.0284330.0185730.0227410.0053280.0316020.5027640.2375400.0657480.6970310.0144302.2208110.995173
0Self_KNNSurprisetask0.9971060.7841630.0056200.0029210.0034940.0043250.0049360.0034610.0071030.0028330.0214310.4978190.0424180.0095440.4532340.1370852.8663470.982811
0Self_BaselineUI0.9675850.7627400.0009540.0001700.0002780.0004630.0006440.0001890.0007520.0001680.0016770.4964240.0095440.0000000.6005300.0050511.8031260.996380
0Self_IKNN1.0183630.8087930.0003180.0001080.0001400.0001890.0000000.0000000.0002140.0000370.0003680.4963910.0031810.0000000.3921530.1154404.1747410.965327
\n", + "
" + ], + "text/plain": [ + " Model RMSE MAE precision recall F_1 \\\n", + "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n", + "0 Self_SVD 0.915730 0.718921 0.102227 0.043137 0.051981 \n", + "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n", + "0 Self_GlobalAvg 1.125760 0.943534 0.061188 0.025968 0.031383 \n", + "0 Ready_Random 1.524954 1.223352 0.045599 0.021181 0.024585 \n", + "0 Self_TopRated NaN NaN 0.032025 0.012674 0.015714 \n", + "0 Self_KNNSurprisetask 0.997106 0.784163 0.005620 0.002921 0.003494 \n", + "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n", + "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n", + "\n", + " F_05 precision_super recall_super NDCG mAP MRR \\\n", + "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n", + "0 0.068872 0.093562 0.078057 0.104828 0.049448 0.191243 \n", + "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n", + "0 0.041343 0.040558 0.032107 0.067695 0.027470 0.171187 \n", + "0 0.031518 0.027897 0.021931 0.048111 0.017381 0.119005 \n", + "0 0.021183 0.028433 0.018573 0.022741 0.005328 0.031602 \n", + "0 0.004325 0.004936 0.003461 0.007103 0.002833 0.021431 \n", + "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n", + "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n", + "\n", + " LAUC HR H2R Reco in test Test coverage Shannon \\\n", + "0 0.555546 0.765642 0.492047 1.000000 0.038961 3.159079 \n", + "0 0.518286 0.472959 0.258749 0.859279 0.139250 3.831520 \n", + "0 0.515501 0.437964 0.239661 1.000000 0.033911 2.836513 \n", + "0 0.509546 0.384942 0.142100 1.000000 0.025974 2.711772 \n", + "0 0.507096 0.330859 0.091198 0.988123 0.181818 5.100792 \n", + "0 0.502764 0.237540 0.065748 0.697031 0.014430 2.220811 \n", + "0 0.497819 0.042418 0.009544 0.453234 0.137085 2.866347 \n", + "0 0.496424 0.009544 0.000000 0.600530 0.005051 1.803126 \n", + "0 0.496391 0.003181 0.000000 0.392153 0.115440 4.174741 \n", + "\n", + " Gini \n", + "0 
0.987317 \n", + "0 0.973234 \n", + "0 0.991139 \n", + "0 0.992003 \n", + "0 0.906866 \n", + "0 0.995173 \n", + "0 0.982811 \n", + "0 0.996380 \n", + "0 0.965327 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import evaluation_measures as ev\n", + "\n", + "dir_path=\"Recommendations generated/ml-100k/\"\n", + "super_reactions=[4,5]\n", + "test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n", + "\n", + "ev.evaluate_all(test, dir_path, super_reactions)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 2],\n", + " [3, 4]])" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "array([[0.4472136 , 0.89442719],\n", + " [0.6 , 0.8 ]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x=np.array([[1,2],[3,4]])\n", + "display(x)\n", + "x/np.linalg.norm(x, axis=1)[:,None]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
codescoreitem_ididtitlegenres
014551.00000014561456Beat the Devil (1954)Comedy, Drama
115230.99308315241524Kaspar Hauser (1993)Drama
213660.99219513671367Faust (1994)Animation
311680.99213111691169Fresh (1994)Drama
413680.99118313691369Forbidden Christ, The (Cristo proibito, Il) (1...Drama
514500.99074314511451Foreign Correspondent (1940)Thriller
69260.990661927927Flower of My Secret, The (Flor de mi secreto, ...Drama
710670.99004810681068Star Maker, The (Uomo delle stelle, L') (1995)Drama
813990.98984214001400Picture Bride (1995)Drama, Romance
912040.98962512051205Secret Agent, The (1996)Drama
\n", + "
" + ], + "text/plain": [ + " code score item_id id \\\n", + "0 1455 1.000000 1456 1456 \n", + "1 1523 0.993083 1524 1524 \n", + "2 1366 0.992195 1367 1367 \n", + "3 1168 0.992131 1169 1169 \n", + "4 1368 0.991183 1369 1369 \n", + "5 1450 0.990743 1451 1451 \n", + "6 926 0.990661 927 927 \n", + "7 1067 0.990048 1068 1068 \n", + "8 1399 0.989842 1400 1400 \n", + "9 1204 0.989625 1205 1205 \n", + "\n", + " title genres \n", + "0 Beat the Devil (1954) Comedy, Drama \n", + "1 Kaspar Hauser (1993) Drama \n", + "2 Faust (1994) Animation \n", + "3 Fresh (1994) Drama \n", + "4 Forbidden Christ, The (Cristo proibito, Il) (1... Drama \n", + "5 Foreign Correspondent (1940) Thriller \n", + "6 Flower of My Secret, The (Flor de mi secreto, ... Drama \n", + "7 Star Maker, The (Uomo delle stelle, L') (1995) Drama \n", + "8 Picture Bride (1995) Drama, Romance \n", + "9 Secret Agent, The (1996) Drama " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item=random.choice(list(set(train_ui.indices)))\n", + "\n", + "embeddings_norm=model.Qi/np.linalg.norm(model.Qi, axis=1)[:,None] # we do not mean-center here\n", + "# omitting normalization also makes sense, but items with a greater magnitude will be recommended more often\n", + "\n", + "similarity_scores=np.dot(embeddings_norm,embeddings_norm[item].T)\n", + "top_similar_items=pd.DataFrame(enumerate(similarity_scores), columns=['code', 'score'])\\\n", + ".sort_values(by=['score'], ascending=[False])[:10]\n", + "\n", + "top_similar_items['item_id']=top_similar_items['code'].apply(lambda x: item_code_id[x])\n", + "\n", + "items=pd.read_csv('./Datasets/ml-100k/movies.csv')\n", + "\n", + "result=pd.merge(top_similar_items, items, left_on='item_id', right_on='id')\n", + "\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# project task 5: implement SVD on top baseline (as it is in Surprise library)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# making changes to our implementation by considering additional parameters in the gradient descent procedure \n", + "# seems to be the fastest option\n", + "# please save the output in 'Recommendations generated/ml-100k/Self_SVDBaseline_reco.csv' and\n", + "# 'Recommendations generated/ml-100k/Self_SVDBaseline_estimations.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "class SVD_bias():\n", + " \n", + " def __init__(self, train_ui, learning_rate, regularization, nb_factors, iterations):\n", + " self.train_ui=train_ui\n", + " self.uir=list(zip(*[train_ui.nonzero()[0],train_ui.nonzero()[1], train_ui.data]))\n", + " \n", + " self.learning_rate=learning_rate\n", + " self.regularization=regularization\n", + " self.iterations=iterations\n", + " self.nb_users, self.nb_items=train_ui.shape\n", + " self.nb_ratings=train_ui.nnz\n", + " self.nb_factors=nb_factors\n", + " \n", + " self.Pu=np.random.normal(loc=0, scale=1./self.nb_factors, size=(self.nb_users, self.nb_factors))\n", + " self.Qi=np.random.normal(loc=0, scale=1./self.nb_factors, size=(self.nb_items, self.nb_factors))\n", + " self.bias_u = np.zeros(self.nb_users)\n", + " self.bias_i = np.zeros(self.nb_items)\n", + "\n", + " def train(self, test_ui=None):\n", + " if test_ui!=None:\n", + " self.test_uir=list(zip(*[test_ui.nonzero()[0],test_ui.nonzero()[1], test_ui.data]))\n", + " \n", + " self.learning_process=[]\n", + " pbar = tqdm(range(self.iterations))\n", + " for i in pbar:\n", + " pbar.set_description(f'Epoch {i} RMSE: {self.learning_process[-1][1] if i>0 else 0}. 
Training epoch {i+1}...')\n", + " np.random.shuffle(self.uir)\n", + " self.sgd(self.uir)\n", + " if test_ui==None:\n", + " self.learning_process.append([i+1, self.RMSE_total(self.uir)])\n", + " else:\n", + " self.learning_process.append([i+1, self.RMSE_total(self.uir), self.RMSE_total(self.test_uir)])\n", + " \n", + " def sgd(self, uir):\n", + " \n", + " for u, i, score in uir:\n", + " # Computer prediction and error\n", + " prediction = self.get_rating(u,i)\n", + " e = (score - prediction)\n", + " \n", + " # Update user and item latent feature matrices\n", + " Pu_update=self.learning_rate * (e * self.Qi[i] - self.regularization * self.Pu[u])\n", + " Qi_update=self.learning_rate * (e * self.Pu[u] - self.regularization * self.Qi[i])\n", + " bias_u_update=self.learning_rate * (e - self.regularization * self.bias_u[u])\n", + " bias_i_update=self.learning_rate * (e - self.regularization * self.bias_i[i])\n", + " \n", + " self.Pu[u] += Pu_update\n", + " self.Qi[i] += Qi_update\n", + " self.bias_u[u] += bias_u_update\n", + " self.bias_i[i] += bias_i_update\n", + " \n", + " def get_rating(self, u, i):\n", + " prediction = self.bias_u[u] + self.bias_i[i] + self.Pu[u].dot(self.Qi[i].T)\n", + " return prediction\n", + " \n", + " def RMSE_total(self, uir):\n", + " RMSE=0\n", + " for u,i, score in uir:\n", + " prediction = self.get_rating(u,i)\n", + " RMSE+=(score - prediction)**2\n", + " return np.sqrt(RMSE/len(uir))\n", + " \n", + " def estimations(self):\n", + " self.estimations=\\\n", + " self.bias_u[:,np.newaxis] + self.bias_i[np.newaxis:,] + np.dot(self.Pu,self.Qi.T)\n", + "\n", + " def recommend(self, user_code_id, item_code_id, topK=10):\n", + " \n", + " top_k = defaultdict(list)\n", + " for nb_user, user in enumerate(self.estimations):\n", + " \n", + " user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n", + " for item, score in enumerate(user):\n", + " if item not in user_rated and not np.isnan(score):\n", + " 
top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n", + " result=[]\n", + " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", + " for uid, item_scores in top_k.items():\n", + " item_scores.sort(key=lambda x: x[1], reverse=True)\n", + " result.append([uid]+list(chain(*item_scores[:topK])))\n", + " return result\n", + " \n", + " def estimate(self, user_code_id, item_code_id, test_ui):\n", + " result=[]\n", + " for user, item in zip(*test_ui.nonzero()):\n", + " result.append([user_code_id[user], item_code_id[item], \n", + " self.estimations[user,item] if not np.isnan(self.estimations[user,item]) else 1])\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ready-made SVD - Surprise implementation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SVD" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating predictions...\n", + "Generating top N recommendations...\n", + "Generating predictions...\n" + ] + } + ], + "source": [ + "import helpers\n", + "import surprise as sp\n", + "import imp\n", + "imp.reload(helpers)\n", + "\n", + "algo = sp.SVD(biased=False) # to use unbiased version\n", + "\n", + "helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_SVD_reco.csv',\n", + " estimations_path='Recommendations generated/ml-100k/Ready_SVD_estimations.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SVD biased - on top baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating predictions...\n", + "Generating top N recommendations...\n", + "Generating predictions...\n" + ] + } + ], + "source": [ + "import helpers\n", + "import surprise as sp\n", + "import 
imp\n", + "imp.reload(helpers)\n", + "\n", + "algo = sp.SVD() # default is biased=True\n", + "\n", + "helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_SVDBiased_reco.csv',\n", + " estimations_path='Recommendations generated/ml-100k/Ready_SVDBiased_estimations.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "943it [00:00, 5926.84it/s]\n", + "943it [00:00, 6314.27it/s]\n", + "943it [00:00, 5917.48it/s]\n", + "943it [00:00, 6138.94it/s]\n", + "943it [00:00, 6278.83it/s]\n", + "943it [00:00, 6319.68it/s]\n", + "943it [00:00, 4892.96it/s]\n", + "943it [00:00, 6955.58it/s]\n", + "943it [00:00, 4946.53it/s]\n", + "943it [00:00, 6823.16it/s]\n", + "943it [00:00, 6276.95it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelRMSEMAEprecisionrecallF_1F_05precision_superrecall_superNDCGmAPMRRLAUCHRH2RReco in testTest coverageShannonGini
0Self_TopPop2.5082582.2179090.1888650.1169190.1187320.1415840.1304720.1374730.2146510.1117070.4009390.5555460.7656420.4920471.0000000.0389613.1590790.987317
0Ready_SVD0.9522470.7511850.0941680.0441670.0509190.0653910.0830470.0693300.1042660.0476290.2277190.5187830.4931070.2386000.9950160.2121214.4529470.951495
0Self_SVD0.9157300.7189210.1022270.0431370.0519810.0688720.0935620.0780570.1048280.0494480.1912430.5182860.4729590.2587490.8592790.1392503.8315200.973234
0Ready_Baseline0.9494590.7524870.0914100.0376520.0460300.0612860.0796140.0564630.0959570.0431780.1981930.5155010.4379640.2396611.0000000.0339112.8365130.991139
0Ready_SVDBiased0.9390530.7408400.0838810.0340330.0418620.0558080.0743560.0517530.0921230.0422240.1991650.5136790.4347830.2036060.9965010.1702744.1907390.963349
0Self_GlobalAvg1.1257600.9435340.0611880.0259680.0313830.0413430.0405580.0321070.0676950.0274700.1711870.5095460.3849420.1421001.0000000.0259742.7117720.992003
0Ready_Random1.5249541.2233520.0455990.0211810.0245850.0315180.0278970.0219310.0481110.0173810.1190050.5070960.3308590.0911980.9881230.1818185.1007920.906866
0Self_TopRatedNaNNaN0.0320250.0126740.0157140.0211830.0284330.0185730.0227410.0053280.0316020.5027640.2375400.0657480.6970310.0144302.2208110.995173
0Self_KNNSurprisetask0.9971060.7841630.0056200.0029210.0034940.0043250.0049360.0034610.0071030.0028330.0214310.4978190.0424180.0095440.4532340.1370852.8663470.982811
0Self_BaselineUI0.9675850.7627400.0009540.0001700.0002780.0004630.0006440.0001890.0007520.0001680.0016770.4964240.0095440.0000000.6005300.0050511.8031260.996380
0Self_IKNN1.0183630.8087930.0003180.0001080.0001400.0001890.0000000.0000000.0002140.0000370.0003680.4963910.0031810.0000000.3921530.1154404.1747410.965327
\n", + "
" + ], + "text/plain": [ + " Model RMSE MAE precision recall F_1 \\\n", + "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n", + "0 Ready_SVD 0.952247 0.751185 0.094168 0.044167 0.050919 \n", + "0 Self_SVD 0.915730 0.718921 0.102227 0.043137 0.051981 \n", + "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n", + "0 Ready_SVDBiased 0.939053 0.740840 0.083881 0.034033 0.041862 \n", + "0 Self_GlobalAvg 1.125760 0.943534 0.061188 0.025968 0.031383 \n", + "0 Ready_Random 1.524954 1.223352 0.045599 0.021181 0.024585 \n", + "0 Self_TopRated NaN NaN 0.032025 0.012674 0.015714 \n", + "0 Self_KNNSurprisetask 0.997106 0.784163 0.005620 0.002921 0.003494 \n", + "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n", + "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n", + "\n", + " F_05 precision_super recall_super NDCG mAP MRR \\\n", + "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n", + "0 0.065391 0.083047 0.069330 0.104266 0.047629 0.227719 \n", + "0 0.068872 0.093562 0.078057 0.104828 0.049448 0.191243 \n", + "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n", + "0 0.055808 0.074356 0.051753 0.092123 0.042224 0.199165 \n", + "0 0.041343 0.040558 0.032107 0.067695 0.027470 0.171187 \n", + "0 0.031518 0.027897 0.021931 0.048111 0.017381 0.119005 \n", + "0 0.021183 0.028433 0.018573 0.022741 0.005328 0.031602 \n", + "0 0.004325 0.004936 0.003461 0.007103 0.002833 0.021431 \n", + "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n", + "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n", + "\n", + " LAUC HR H2R Reco in test Test coverage Shannon \\\n", + "0 0.555546 0.765642 0.492047 1.000000 0.038961 3.159079 \n", + "0 0.518783 0.493107 0.238600 0.995016 0.212121 4.452947 \n", + "0 0.518286 0.472959 0.258749 0.859279 0.139250 3.831520 \n", + "0 0.515501 0.437964 0.239661 1.000000 0.033911 2.836513 \n", + "0 0.513679 0.434783 0.203606 0.996501 0.170274 4.190739 \n", + "0 0.509546 0.384942 
0.142100 1.000000 0.025974 2.711772 \n", + "0 0.507096 0.330859 0.091198 0.988123 0.181818 5.100792 \n", + "0 0.502764 0.237540 0.065748 0.697031 0.014430 2.220811 \n", + "0 0.497819 0.042418 0.009544 0.453234 0.137085 2.866347 \n", + "0 0.496424 0.009544 0.000000 0.600530 0.005051 1.803126 \n", + "0 0.496391 0.003181 0.000000 0.392153 0.115440 4.174741 \n", + "\n", + " Gini \n", + "0 0.987317 \n", + "0 0.951495 \n", + "0 0.973234 \n", + "0 0.991139 \n", + "0 0.963349 \n", + "0 0.992003 \n", + "0 0.906866 \n", + "0 0.995173 \n", + "0 0.982811 \n", + "0 0.996380 \n", + "0 0.965327 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import importlib\n", + "import evaluation_measures as ev\n", + "\n", + "importlib.reload(ev)  # reload after the import so `ev` is bound even on a fresh kernel\n", + "dir_path=\"Recommendations generated/ml-100k/\"\n", + "super_reactions=[4,5]\n", + "test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n", + "\n", + "ev.evaluate_all(test, dir_path, super_reactions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}