diff --git a/P2. Evaluation.ipynb b/P2. Evaluation.ipynb
index 1bc216d..fdea66d 100644
--- a/P2. Evaluation.ipynb
+++ b/P2. Evaluation.ipynb
@@ -26,18 +26,18 @@
"from tqdm import tqdm\n",
"\n",
"# In evaluation we do not load train set - it is not needed\n",
- "test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
- "test.columns=['user', 'item', 'rating', 'timestamp']\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "test.columns = [\"user\", \"item\", \"rating\", \"timestamp\"]\n",
"\n",
- "test['user_code'] = test['user'].astype(\"category\").cat.codes\n",
- "test['item_code'] = test['item'].astype(\"category\").cat.codes\n",
+ "test[\"user_code\"] = test[\"user\"].astype(\"category\").cat.codes\n",
+ "test[\"item_code\"] = test[\"item\"].astype(\"category\").cat.codes\n",
"\n",
- "user_code_id = dict(enumerate(test['user'].astype(\"category\").cat.categories))\n",
+ "user_code_id = dict(enumerate(test[\"user\"].astype(\"category\").cat.categories))\n",
"user_id_code = dict((v, k) for k, v in user_code_id.items())\n",
- "item_code_id = dict(enumerate(test['item'].astype(\"category\").cat.categories))\n",
+ "item_code_id = dict(enumerate(test[\"item\"].astype(\"category\").cat.categories))\n",
"item_id_code = dict((v, k) for k, v in item_code_id.items())\n",
"\n",
- "test_ui = sparse.csr_matrix((test['rating'], (test['user_code'], test['item_code'])))"
+ "test_ui = sparse.csr_matrix((test[\"rating\"], (test[\"user_code\"], test[\"item_code\"])))"
]
},
{
@@ -53,12 +53,20 @@
"metadata": {},
"outputs": [],
"source": [
- "estimations_df=pd.read_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', header=None)\n",
- "estimations_df.columns=['user', 'item' ,'score']\n",
+ "estimations_df = pd.read_csv(\n",
+ " \"Recommendations generated/ml-100k/Ready_Baseline_estimations.csv\", header=None\n",
+ ")\n",
+ "estimations_df.columns = [\"user\", \"item\", \"score\"]\n",
"\n",
- "estimations_df['user_code']=[user_id_code[user] for user in estimations_df['user']]\n",
- "estimations_df['item_code']=[item_id_code[item] for item in estimations_df['item']]\n",
- "estimations=sparse.csr_matrix((estimations_df['score'], (estimations_df['user_code'], estimations_df['item_code'])), shape=test_ui.shape)"
+ "estimations_df[\"user_code\"] = [user_id_code[user] for user in estimations_df[\"user\"]]\n",
+ "estimations_df[\"item_code\"] = [item_id_code[item] for item in estimations_df[\"item\"]]\n",
+ "estimations = sparse.csr_matrix(\n",
+ " (\n",
+ " estimations_df[\"score\"],\n",
+ " (estimations_df[\"user_code\"], estimations_df[\"item_code\"]),\n",
+ " ),\n",
+ " shape=test_ui.shape,\n",
+ ")"
]
},
{
@@ -68,16 +76,16 @@
"outputs": [],
"source": [
"def estimations_metrics(test_ui, estimations):\n",
- " result=[]\n",
+ " result = []\n",
"\n",
- " RMSE=(np.sum((estimations.data-test_ui.data)**2)/estimations.nnz)**(1/2)\n",
- " result.append(['RMSE', RMSE])\n",
+ " RMSE = (np.sum((estimations.data - test_ui.data) ** 2) / estimations.nnz) ** (1 / 2)\n",
+ " result.append([\"RMSE\", RMSE])\n",
"\n",
- " MAE=np.sum(abs(estimations.data-test_ui.data))/estimations.nnz\n",
- " result.append(['MAE', MAE])\n",
- " \n",
- " df_result=(pd.DataFrame(list(zip(*result))[1])).T\n",
- " df_result.columns=list(zip(*result))[0]\n",
+ " MAE = np.sum(abs(estimations.data - test_ui.data)) / estimations.nnz\n",
+ " result.append([\"MAE\", MAE])\n",
+ "\n",
+ " df_result = (pd.DataFrame(list(zip(*result))[1])).T\n",
+ " df_result.columns = list(zip(*result))[0]\n",
" return df_result"
]
},
@@ -169,15 +177,17 @@
],
"source": [
"import numpy as np\n",
- "reco = np.loadtxt('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', delimiter=',')\n",
- "# Let's ignore scores - they are not used in evaluation: \n",
- "users=reco[:,:1]\n",
- "items=reco[:,1::2]\n",
+ "\n",
+ "reco = np.loadtxt(\n",
+ " \"Recommendations generated/ml-100k/Ready_Baseline_reco.csv\", delimiter=\",\"\n",
+ ")\n",
+ "# Let's ignore scores - they are not used in evaluation:\n",
+ "users = reco[:, :1]\n",
+ "items = reco[:, 1::2]\n",
"# Let's use inner ids instead of real ones\n",
- "users=np.vectorize(lambda x: user_id_code.setdefault(x, -1))(users)\n",
- "items=np.vectorize(lambda x: item_id_code.setdefault(x, -1))(items) # maybe items we recommend are not in test set\n",
- "# Let's put them into one array\n",
- "reco=np.concatenate((users, items), axis=1)\n",
+ "users = np.vectorize(lambda x: user_id_code.setdefault(x, -1))(users)\n",
+ "items = np.vectorize(lambda x: item_id_code.setdefault(x, -1))(items)\n",
+ "reco = np.concatenate((users, items), axis=1)\n",
"reco"
]
},
@@ -188,79 +198,111 @@
"outputs": [],
"source": [
"def ranking_metrics(test_ui, reco, super_reactions=[], topK=10):\n",
- " \n",
- " nb_items=test_ui.shape[1]\n",
- " relevant_users, super_relevant_users, prec, rec, F_1, F_05, prec_super, rec_super, ndcg, mAP, MRR, LAUC, HR=\\\n",
- " 0,0,0,0,0,0,0,0,0,0,0,0,0\n",
- " \n",
- " cg = (1.0 / np.log2(np.arange(2, topK + 2)))\n",
+ "\n",
+ " nb_items = test_ui.shape[1]\n",
+ " (\n",
+ " relevant_users,\n",
+ " super_relevant_users,\n",
+ " prec,\n",
+ " rec,\n",
+ " F_1,\n",
+ " F_05,\n",
+ " prec_super,\n",
+ " rec_super,\n",
+ " ndcg,\n",
+ " mAP,\n",
+ " MRR,\n",
+ " LAUC,\n",
+ " HR,\n",
+ " ) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)\n",
+ "\n",
+ " cg = 1.0 / np.log2(np.arange(2, topK + 2))\n",
" cg_sum = np.cumsum(cg)\n",
- " \n",
- " for (nb_user, user) in tqdm(enumerate(reco[:,0])):\n",
- " u_rated_items=test_ui.indices[test_ui.indptr[user]:test_ui.indptr[user+1]]\n",
- " nb_u_rated_items=len(u_rated_items)\n",
- " if nb_u_rated_items>0: # skip users with no items in test set (still possible that there will be no super items)\n",
- " relevant_users+=1\n",
- " \n",
- " u_super_items=u_rated_items[np.vectorize(lambda x: x in super_reactions)\\\n",
- " (test_ui.data[test_ui.indptr[user]:test_ui.indptr[user+1]])]\n",
+ "\n",
+ " for (nb_user, user) in tqdm(enumerate(reco[:, 0])):\n",
+ " u_rated_items = test_ui.indices[test_ui.indptr[user] : test_ui.indptr[user + 1]]\n",
+ " nb_u_rated_items = len(u_rated_items)\n",
+ " if (\n",
+ " nb_u_rated_items > 0\n",
+ " ): # skip users with no items in test set (still possible that there will be no super items)\n",
+ " relevant_users += 1\n",
+ "\n",
+ " u_super_items = u_rated_items[\n",
+ " np.vectorize(lambda x: x in super_reactions)(\n",
+ " test_ui.data[test_ui.indptr[user] : test_ui.indptr[user + 1]]\n",
+ " )\n",
+ " ]\n",
" # more natural seems u_super_items=[item for item in u_rated_items if test_ui[user,item] in super_reactions]\n",
" # but accesing test_ui[user,item] is expensive -we should avoid doing it\n",
- " if len(u_super_items)>0:\n",
- " super_relevant_users+=1\n",
- " \n",
- " user_successes=np.zeros(topK)\n",
- " nb_user_successes=0\n",
- " user_super_successes=np.zeros(topK)\n",
- " nb_user_super_successes=0\n",
- " \n",
- " # evaluation\n",
- " for (item_position,item) in enumerate(reco[nb_user,1:topK+1]):\n",
- " if item in u_rated_items:\n",
- " user_successes[item_position]=1\n",
- " nb_user_successes+=1\n",
- " if item in u_super_items:\n",
- " user_super_successes[item_position]=1\n",
- " nb_user_super_successes+=1\n",
- " \n",
- " prec_u=nb_user_successes/topK \n",
- " prec+=prec_u\n",
- " \n",
- " rec_u=nb_user_successes/nb_u_rated_items\n",
- " rec+=rec_u\n",
- " \n",
- " F_1+=2*(prec_u*rec_u)/(prec_u+rec_u) if prec_u+rec_u>0 else 0\n",
- " F_05+=(0.5**2+1)*(prec_u*rec_u)/(0.5**2*prec_u+rec_u) if prec_u+rec_u>0 else 0\n",
- " \n",
- " prec_super+=nb_user_super_successes/topK\n",
- " rec_super+=nb_user_super_successes/max(len(u_super_items),1) # to set 0 if no super items\n",
- " ndcg+=np.dot(user_successes,cg)/cg_sum[min(topK, nb_u_rated_items)-1]\n",
- " \n",
- " cumsum_successes=np.cumsum(user_successes)\n",
- " mAP+=np.dot(cumsum_successes/np.arange(1,topK+1), user_successes)/min(topK, nb_u_rated_items)\n",
- " MRR+=1/(user_successes.nonzero()[0][0]+1) if user_successes.nonzero()[0].size>0 else 0\n",
- " LAUC+=(np.dot(cumsum_successes, 1-user_successes)+\\\n",
- " (nb_user_successes+nb_u_rated_items)/2*((nb_items-nb_u_rated_items)-(topK-nb_user_successes)))/\\\n",
- " ((nb_items-nb_u_rated_items)*nb_u_rated_items)\n",
- " \n",
- " HR+=nb_user_successes>0\n",
- " \n",
- " \n",
- " result=[]\n",
- " result.append(('precision', prec/relevant_users))\n",
- " result.append(('recall', rec/relevant_users))\n",
- " result.append(('F_1', F_1/relevant_users))\n",
- " result.append(('F_05', F_05/relevant_users))\n",
- " result.append(('precision_super', prec_super/super_relevant_users))\n",
- " result.append(('recall_super', rec_super/super_relevant_users))\n",
- " result.append(('NDCG', ndcg/relevant_users))\n",
- " result.append(('mAP', mAP/relevant_users))\n",
- " result.append(('MRR', MRR/relevant_users))\n",
- " result.append(('LAUC', LAUC/relevant_users))\n",
- " result.append(('HR', HR/relevant_users))\n",
+ " if len(u_super_items) > 0:\n",
+ " super_relevant_users += 1\n",
"\n",
- " df_result=(pd.DataFrame(list(zip(*result))[1])).T\n",
- " df_result.columns=list(zip(*result))[0]\n",
+ " user_successes = np.zeros(topK)\n",
+ " nb_user_successes = 0\n",
+ " user_super_successes = np.zeros(topK)\n",
+ " nb_user_super_successes = 0\n",
+ "\n",
+ " # evaluation\n",
+ " for (item_position, item) in enumerate(reco[nb_user, 1 : topK + 1]):\n",
+ " if item in u_rated_items:\n",
+ " user_successes[item_position] = 1\n",
+ " nb_user_successes += 1\n",
+ " if item in u_super_items:\n",
+ " user_super_successes[item_position] = 1\n",
+ " nb_user_super_successes += 1\n",
+ "\n",
+ " prec_u = nb_user_successes / topK\n",
+ " prec += prec_u\n",
+ "\n",
+ " rec_u = nb_user_successes / nb_u_rated_items\n",
+ " rec += rec_u\n",
+ "\n",
+ " F_1 += 2 * (prec_u * rec_u) / (prec_u + rec_u) if prec_u + rec_u > 0 else 0\n",
+ " F_05 += (\n",
+ " (0.5 ** 2 + 1) * (prec_u * rec_u) / (0.5 ** 2 * prec_u + rec_u)\n",
+ " if prec_u + rec_u > 0\n",
+ " else 0\n",
+ " )\n",
+ "\n",
+ " prec_super += nb_user_super_successes / topK\n",
+ " rec_super += nb_user_super_successes / max(\n",
+ " len(u_super_items), 1\n",
+ " ) # to set 0 if no super items\n",
+ " ndcg += np.dot(user_successes, cg) / cg_sum[min(topK, nb_u_rated_items) - 1]\n",
+ "\n",
+ " cumsum_successes = np.cumsum(user_successes)\n",
+ " mAP += np.dot(\n",
+ " cumsum_successes / np.arange(1, topK + 1), user_successes\n",
+ " ) / min(topK, nb_u_rated_items)\n",
+ " MRR += (\n",
+ " 1 / (user_successes.nonzero()[0][0] + 1)\n",
+ " if user_successes.nonzero()[0].size > 0\n",
+ " else 0\n",
+ " )\n",
+ " LAUC += (\n",
+ " np.dot(cumsum_successes, 1 - user_successes)\n",
+ " + (nb_user_successes + nb_u_rated_items)\n",
+ " / 2\n",
+ " * ((nb_items - nb_u_rated_items) - (topK - nb_user_successes))\n",
+ " ) / ((nb_items - nb_u_rated_items) * nb_u_rated_items)\n",
+ "\n",
+ " HR += nb_user_successes > 0\n",
+ "\n",
+ " result = []\n",
+ " result.append((\"precision\", prec / relevant_users))\n",
+ " result.append((\"recall\", rec / relevant_users))\n",
+ " result.append((\"F_1\", F_1 / relevant_users))\n",
+ " result.append((\"F_05\", F_05 / relevant_users))\n",
+ " result.append((\"precision_super\", prec_super / super_relevant_users))\n",
+ " result.append((\"recall_super\", rec_super / super_relevant_users))\n",
+ " result.append((\"NDCG\", ndcg / relevant_users))\n",
+ " result.append((\"mAP\", mAP / relevant_users))\n",
+ " result.append((\"MRR\", MRR / relevant_users))\n",
+ " result.append((\"LAUC\", LAUC / relevant_users))\n",
+ " result.append((\"HR\", HR / relevant_users))\n",
+ "\n",
+ " df_result = (pd.DataFrame(list(zip(*result))[1])).T\n",
+ " df_result.columns = list(zip(*result))[0]\n",
" return df_result"
]
},
@@ -273,7 +315,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "943it [00:00, 7783.14it/s]\n"
+ "943it [00:00, 9434.06it/s]\n"
]
},
{
@@ -343,7 +385,7 @@
}
],
"source": [
- "ranking_metrics(test_ui, reco, super_reactions=[4,5], topK=10)"
+ "ranking_metrics(test_ui, reco, super_reactions=[4, 5], topK=10)"
]
},
{
@@ -360,39 +402,52 @@
"outputs": [],
"source": [
"def diversity_metrics(test_ui, reco, topK=10):\n",
- " \n",
- " frequencies=defaultdict(int)\n",
- " \n",
+ "\n",
+ " frequencies = defaultdict(int)\n",
+ "\n",
" # let's assign 0 to all items in test set\n",
" for item in list(set(test_ui.indices)):\n",
- " frequencies[item]=0\n",
- " \n",
+ " frequencies[item] = 0\n",
+ "\n",
" # counting frequencies\n",
- " for item in reco[:,1:].flat:\n",
- " frequencies[item]+=1\n",
- " \n",
- " nb_reco_outside_test=frequencies[-1]\n",
+ " for item in reco[:, 1:].flat:\n",
+ " frequencies[item] += 1\n",
+ "\n",
+ " nb_reco_outside_test = frequencies[-1]\n",
" del frequencies[-1]\n",
- " \n",
- " frequencies=np.array(list(frequencies.values()))\n",
- " \n",
- " nb_rec_items=len(frequencies[frequencies>0])\n",
- " nb_reco_inside_test=np.sum(frequencies)\n",
- " \n",
- " frequencies=frequencies/np.sum(frequencies)\n",
- " frequencies=np.sort(frequencies)\n",
- " \n",
- " with np.errstate(divide='ignore'): # let's put zeros put items with 0 frequency and ignore division warning\n",
- " log_frequencies=np.nan_to_num(np.log(frequencies), posinf=0, neginf=0)\n",
- " \n",
- " result=[]\n",
- " result.append(('Reco in test', nb_reco_inside_test/(nb_reco_inside_test+nb_reco_outside_test)))\n",
- " result.append(('Test coverage', nb_rec_items/test_ui.shape[1]))\n",
- " result.append(('Shannon', -np.dot(frequencies, log_frequencies)))\n",
- " result.append(('Gini', np.dot(frequencies, np.arange(1-len(frequencies), len(frequencies), 2))/(len(frequencies)-1)))\n",
- " \n",
- " df_result=(pd.DataFrame(list(zip(*result))[1])).T\n",
- " df_result.columns=list(zip(*result))[0]\n",
+ "\n",
+ " frequencies = np.array(list(frequencies.values()))\n",
+ "\n",
+ " nb_rec_items = len(frequencies[frequencies > 0])\n",
+ " nb_reco_inside_test = np.sum(frequencies)\n",
+ "\n",
+ " frequencies = frequencies / np.sum(frequencies)\n",
+ " frequencies = np.sort(frequencies)\n",
+ "\n",
+ " with np.errstate(\n",
+ " divide=\"ignore\"\n",
+ " ): # let's put zeros for items with 0 frequency and ignore the division warning\n",
+ " log_frequencies = np.nan_to_num(np.log(frequencies), posinf=0, neginf=0)\n",
+ "\n",
+ " result = []\n",
+ " result.append(\n",
+ " (\n",
+ " \"Reco in test\",\n",
+ " nb_reco_inside_test / (nb_reco_inside_test + nb_reco_outside_test),\n",
+ " )\n",
+ " )\n",
+ " result.append((\"Test coverage\", nb_rec_items / test_ui.shape[1]))\n",
+ " result.append((\"Shannon\", -np.dot(frequencies, log_frequencies)))\n",
+ " result.append(\n",
+ " (\n",
+ " \"Gini\",\n",
+ " np.dot(frequencies, np.arange(1 - len(frequencies), len(frequencies), 2))\n",
+ " / (len(frequencies) - 1),\n",
+ " )\n",
+ " )\n",
+ "\n",
+ " df_result = (pd.DataFrame(list(zip(*result))[1])).T\n",
+ " df_result.columns = list(zip(*result))[0]\n",
" return df_result"
]
},
@@ -453,11 +508,7 @@
"source": [
"# in case of errors try !pip3 install numpy==1.18.4 (or pip if you use python 2) and restart the kernel\n",
"\n",
- "import evaluation_measures as ev\n",
- "import imp\n",
- "imp.reload(ev)\n",
- "\n",
- "x=diversity_metrics(test_ui, reco, topK=10)\n",
+ "x = diversity_metrics(test_ui, reco, topK=10)\n",
"x"
]
},
@@ -477,7 +528,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "943it [00:00, 7347.78it/s]\n"
+ "943it [00:00, 11012.47it/s]\n"
]
},
{
@@ -563,17 +614,21 @@
],
"source": [
"import evaluation_measures as ev\n",
- "import imp\n",
- "imp.reload(ev)\n",
"\n",
- "estimations_df=pd.read_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', header=None)\n",
- "reco=np.loadtxt('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', delimiter=',')\n",
+ "estimations_df = pd.read_csv(\n",
+ " \"Recommendations generated/ml-100k/Ready_Baseline_estimations.csv\", header=None\n",
+ ")\n",
+ "reco = np.loadtxt(\n",
+ " \"Recommendations generated/ml-100k/Ready_Baseline_reco.csv\", delimiter=\",\"\n",
+ ")\n",
"\n",
- "ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None),\n",
- " estimations_df=estimations_df, \n",
- " reco=reco,\n",
- " super_reactions=[4,5])\n",
- "#also you can just type ev.evaluate_all(estimations_df, reco) - I put above values as default"
+ "ev.evaluate(\n",
+ " test=pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None),\n",
+ " estimations_df=estimations_df,\n",
+ " reco=reco,\n",
+ " super_reactions=[4, 5],\n",
+ ")\n",
+ "# you can also just call ev.evaluate_all(estimations_df, reco) - the values above are used as defaults"
]
},
{
@@ -585,25 +640,21 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "943it [00:00, 4894.39it/s]\n",
- "943it [00:00, 4357.39it/s]\n",
- "943it [00:00, 5045.11it/s]\n",
- "943it [00:00, 4855.03it/s]\n",
- "943it [00:00, 5359.75it/s]\n"
+ "943it [00:00, 10346.82it/s]\n",
+ "943it [00:00, 11772.32it/s]\n",
+ "943it [00:00, 10636.62it/s]\n",
+ "943it [00:00, 10767.92it/s]\n",
+ "943it [00:00, 12019.93it/s]\n"
]
}
],
"source": [
- "import evaluation_measures as ev\n",
- "import imp\n",
- "imp.reload(ev)\n",
+ "dir_path = \"Recommendations generated/ml-100k/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
"\n",
- "dir_path=\"Recommendations generated/ml-100k/\"\n",
- "super_reactions=[4,5]\n",
- "test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
- "\n",
- "df=ev.evaluate_all(test, dir_path, super_reactions)\n",
- "#also you can just type ev.evaluate_all() - I put above values as default"
+ "df = ev.evaluate_all(test, dir_path, super_reactions)\n",
+ "# you can also just call ev.evaluate_all() - the values above are used as defaults"
]
},
{
@@ -671,14 +722,14 @@
"
\n",
" 0 | \n",
" Ready_Random | \n",
- " 1.523899 | \n",
- " 1.226799 | \n",
- " 0.046872 | \n",
- " 0.022367 | \n",
- " 0.025297 | \n",
+ " 1.521845 | \n",
+ " 1.225949 | \n",
+ " 0.047190 | \n",
+ " 0.020753 | \n",
+ " 0.024810 | \n",
" 0.032269 | \n",
- " 0.031116 | \n",
- " 0.027843 | \n",
+ " 0.029506 | \n",
+ " 0.023707 | \n",
"
\n",
" \n",
" 0 | \n",
@@ -712,14 +763,14 @@
" Model RMSE MAE precision recall F_1 \\\n",
"0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
"0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
- "0 Ready_Random 1.523899 1.226799 0.046872 0.022367 0.025297 \n",
+ "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n",
"0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
"0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
"\n",
" F_05 precision_super recall_super \n",
"0 0.141584 0.130472 0.137473 \n",
"0 0.061286 0.079614 0.056463 \n",
- "0 0.032269 0.031116 0.027843 \n",
+ "0 0.032269 0.029506 0.023707 \n",
"0 0.000481 0.000644 0.000223 \n",
"0 0.000463 0.000644 0.000189 "
]
@@ -730,7 +781,7 @@
}
],
"source": [
- "df.iloc[:,:9]"
+ "df.iloc[:, :9]"
]
},
{
@@ -801,15 +852,15 @@
"
\n",
" 0 | \n",
" Ready_Random | \n",
- " 0.051414 | \n",
- " 0.019769 | \n",
- " 0.127558 | \n",
- " 0.507696 | \n",
- " 0.332980 | \n",
- " 0.987593 | \n",
+ " 0.050075 | \n",
+ " 0.018728 | \n",
+ " 0.121957 | \n",
+ " 0.506893 | \n",
+ " 0.329799 | \n",
+ " 0.986532 | \n",
" 0.184704 | \n",
- " 5.104710 | \n",
- " 0.906035 | \n",
+ " 5.099706 | \n",
+ " 0.907217 | \n",
"
\n",
" \n",
" 0 | \n",
@@ -845,14 +896,14 @@
" Model NDCG mAP MRR LAUC HR \\\n",
"0 Self_TopPop 0.214651 0.111707 0.400939 0.555546 0.765642 \n",
"0 Ready_Baseline 0.095957 0.043178 0.198193 0.515501 0.437964 \n",
- "0 Ready_Random 0.051414 0.019769 0.127558 0.507696 0.332980 \n",
+ "0 Ready_Random 0.050075 0.018728 0.121957 0.506893 0.329799 \n",
"0 Self_TopRated 0.001043 0.000335 0.003348 0.496433 0.009544 \n",
"0 Self_BaselineUI 0.000752 0.000168 0.001677 0.496424 0.009544 \n",
"\n",
" Reco in test Test coverage Shannon Gini \n",
"0 1.000000 0.038961 3.159079 0.987317 \n",
"0 1.000000 0.033911 2.836513 0.991139 \n",
- "0 0.987593 0.184704 5.104710 0.906035 \n",
+ "0 0.986532 0.184704 5.099706 0.907217 \n",
"0 0.699046 0.005051 1.945910 0.995669 \n",
"0 0.600530 0.005051 1.803126 0.996380 "
]
@@ -863,7 +914,7 @@
}
],
"source": [
- "df.iloc[:,np.append(0,np.arange(9, df.shape[1]))]"
+ "df.iloc[:, np.append(0, np.arange(9, df.shape[1]))]"
]
},
{
@@ -882,7 +933,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "3it [00:00, 4226.71it/s]\n"
+ "3it [00:00, 5771.98it/s]\n"
]
},
{
@@ -1166,35 +1217,53 @@
}
],
"source": [
- "import evaluation_measures as ev\n",
- "import imp\n",
"import helpers\n",
- "imp.reload(ev)\n",
"\n",
- "dir_path=\"Recommendations generated/toy-example/\"\n",
- "super_reactions=[4,5]\n",
- "test=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None)\n",
+ "dir_path = \"Recommendations generated/toy-example/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/toy-example/test.csv\", sep=\"\\t\", header=None)\n",
"\n",
"display(ev.evaluate_all(test, dir_path, super_reactions, topK=3))\n",
- "#also you can just type ev.evaluate_all() - I put above values as default\n",
+ "# you can also just call ev.evaluate_all() - the values above are used as defaults\n",
"\n",
- "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
- "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
- "reco=pd.read_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', header=None)\n",
- "estimations=pd.read_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', names=['user', 'item', 'est_score'])\n",
- "toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n",
- "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
+ "toy_train_read = pd.read_csv(\n",
+ " \"./Datasets/toy-example/train.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "toy_test_read = pd.read_csv(\n",
+ " \"./Datasets/toy-example/test.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "reco = pd.read_csv(\n",
+ " \"Recommendations generated/toy-example/Self_BaselineUI_reco.csv\", header=None\n",
+ ")\n",
+ "estimations = pd.read_csv(\n",
+ " \"Recommendations generated/toy-example/Self_BaselineUI_estimations.csv\",\n",
+ " names=[\"user\", \"item\", \"est_score\"],\n",
+ ")\n",
+ "(\n",
+ " toy_train_ui,\n",
+ " toy_test_ui,\n",
+ " toy_user_code_id,\n",
+ " toy_user_id_code,\n",
+ " toy_item_code_id,\n",
+ " toy_item_id_code,\n",
+ ") = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
"\n",
- "print('Training data:')\n",
+ "print(\"Training data:\")\n",
"display(toy_train_ui.todense())\n",
"\n",
- "print('Test data:')\n",
+ "print(\"Test data:\")\n",
"display(toy_test_ui.todense())\n",
"\n",
- "print('Recommendations:')\n",
+ "print(\"Recommendations:\")\n",
"display(reco)\n",
"\n",
- "print('Estimations:')\n",
+ "print(\"Estimations:\")\n",
"display(estimations)"
]
},
@@ -1246,148 +1315,148 @@
" \n",
"
\n",
" \n",
- " 54092 | \n",
- " 365 | \n",
+ " 57482 | \n",
+ " 2 | \n",
" 5 | \n",
- " Boogie Nights (1997) | \n",
- " Drama | \n",
+ " Emma (1996) | \n",
+ " Drama, Romance | \n",
"
\n",
" \n",
- " 55243 | \n",
- " 365 | \n",
+ " 54506 | \n",
+ " 2 | \n",
" 5 | \n",
- " Celluloid Closet, The (1995) | \n",
- " Documentary | \n",
+ " Sense and Sensibility (1995) | \n",
+ " Drama, Romance | \n",
"
\n",
" \n",
- " 21637 | \n",
- " 365 | \n",
+ " 40581 | \n",
+ " 2 | \n",
" 5 | \n",
- " In & Out (1997) | \n",
- " Comedy | \n",
+ " Titanic (1997) | \n",
+ " Action, Drama, Romance | \n",
"
\n",
" \n",
- " 36508 | \n",
- " 365 | \n",
+ " 2949 | \n",
+ " 2 | \n",
" 5 | \n",
- " Swingers (1996) | \n",
+ " Star Wars (1977) | \n",
+ " Action, Adventure, Romance, Sci-Fi, War | \n",
+ "
\n",
+ " \n",
+ " 69653 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Wings of the Dove, The (1997) | \n",
+ " Drama, Romance, Thriller | \n",
+ "
\n",
+ " \n",
+ " 7906 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " As Good As It Gets (1997) | \n",
" Comedy, Drama | \n",
"
\n",
" \n",
- " 19398 | \n",
- " 365 | \n",
+ " 69400 | \n",
+ " 2 | \n",
" 5 | \n",
- " Scream (1996) | \n",
- " Horror, Thriller | \n",
+ " Shall We Dance? (1996) | \n",
+ " Comedy | \n",
"
\n",
" \n",
- " 14343 | \n",
- " 365 | \n",
+ " 14469 | \n",
+ " 2 | \n",
" 5 | \n",
" Fargo (1996) | \n",
" Crime, Drama, Thriller | \n",
"
\n",
" \n",
- " 23738 | \n",
- " 365 | \n",
+ " 46151 | \n",
+ " 2 | \n",
" 5 | \n",
- " Chasing Amy (1997) | \n",
- " Drama, Romance | \n",
+ " L.A. Confidential (1997) | \n",
+ " Crime, Film-Noir, Mystery, Thriller | \n",
"
\n",
" \n",
- " 69960 | \n",
- " 365 | \n",
+ " 67293 | \n",
+ " 2 | \n",
" 5 | \n",
- " Beautiful Thing (1996) | \n",
- " Drama, Romance | \n",
+ " Good Will Hunting (1997) | \n",
+ " Drama | \n",
"
\n",
" \n",
- " 54753 | \n",
- " 365 | \n",
- " 4 | \n",
- " Scream 2 (1997) | \n",
- " Horror, Thriller | \n",
+ " 20923 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Secrets & Lies (1996) | \n",
+ " Drama | \n",
"
\n",
" \n",
- " 54552 | \n",
- " 365 | \n",
- " 4 | \n",
- " Sense and Sensibility (1995) | \n",
- " Drama, Romance | \n",
- "
\n",
- " \n",
- " 30051 | \n",
- " 365 | \n",
- " 4 | \n",
- " Star Trek: First Contact (1996) | \n",
- " Action, Adventure, Sci-Fi | \n",
- "
\n",
- " \n",
- " 47086 | \n",
- " 365 | \n",
- " 4 | \n",
- " Primal Fear (1996) | \n",
- " Drama, Thriller | \n",
- "
\n",
- " \n",
- " 62931 | \n",
- " 365 | \n",
- " 4 | \n",
- " James and the Giant Peach (1996) | \n",
- " Animation, Children's, Musical | \n",
- "
\n",
- " \n",
- " 38939 | \n",
- " 365 | \n",
- " 4 | \n",
- " Full Monty, The (1997) | \n",
+ " 52921 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " Kolya (1996) | \n",
" Comedy | \n",
"
\n",
" \n",
- " 38764 | \n",
- " 365 | \n",
+ " 50103 | \n",
+ " 2 | \n",
" 4 | \n",
- " First Wives Club, The (1996) | \n",
+ " Mrs. Brown (Her Majesty, Mrs. Brown) (1997) | \n",
+ " Drama, Romance | \n",
+ "
\n",
+ " \n",
+ " 51972 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " Mighty Aphrodite (1995) | \n",
" Comedy | \n",
"
\n",
+ " \n",
+ " 515 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " Heat (1995) | \n",
+ " Action, Crime, Thriller | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " user rating title \\\n",
- "54092 365 5 Boogie Nights (1997) \n",
- "55243 365 5 Celluloid Closet, The (1995) \n",
- "21637 365 5 In & Out (1997) \n",
- "36508 365 5 Swingers (1996) \n",
- "19398 365 5 Scream (1996) \n",
- "14343 365 5 Fargo (1996) \n",
- "23738 365 5 Chasing Amy (1997) \n",
- "69960 365 5 Beautiful Thing (1996) \n",
- "54753 365 4 Scream 2 (1997) \n",
- "54552 365 4 Sense and Sensibility (1995) \n",
- "30051 365 4 Star Trek: First Contact (1996) \n",
- "47086 365 4 Primal Fear (1996) \n",
- "62931 365 4 James and the Giant Peach (1996) \n",
- "38939 365 4 Full Monty, The (1997) \n",
- "38764 365 4 First Wives Club, The (1996) \n",
+ " user rating title \\\n",
+ "57482 2 5 Emma (1996) \n",
+ "54506 2 5 Sense and Sensibility (1995) \n",
+ "40581 2 5 Titanic (1997) \n",
+ "2949 2 5 Star Wars (1977) \n",
+ "69653 2 5 Wings of the Dove, The (1997) \n",
+ "7906 2 5 As Good As It Gets (1997) \n",
+ "69400 2 5 Shall We Dance? (1996) \n",
+ "14469 2 5 Fargo (1996) \n",
+ "46151 2 5 L.A. Confidential (1997) \n",
+ "67293 2 5 Good Will Hunting (1997) \n",
+ "20923 2 5 Secrets & Lies (1996) \n",
+ "52921 2 5 Kolya (1996) \n",
+ "50103 2 4 Mrs. Brown (Her Majesty, Mrs. Brown) (1997) \n",
+ "51972 2 4 Mighty Aphrodite (1995) \n",
+ "515 2 4 Heat (1995) \n",
"\n",
- " genres \n",
- "54092 Drama \n",
- "55243 Documentary \n",
- "21637 Comedy \n",
- "36508 Comedy, Drama \n",
- "19398 Horror, Thriller \n",
- "14343 Crime, Drama, Thriller \n",
- "23738 Drama, Romance \n",
- "69960 Drama, Romance \n",
- "54753 Horror, Thriller \n",
- "54552 Drama, Romance \n",
- "30051 Action, Adventure, Sci-Fi \n",
- "47086 Drama, Thriller \n",
- "62931 Animation, Children's, Musical \n",
- "38939 Comedy \n",
- "38764 Comedy "
+ " genres \n",
+ "57482 Drama, Romance \n",
+ "54506 Drama, Romance \n",
+ "40581 Action, Drama, Romance \n",
+ "2949 Action, Adventure, Romance, Sci-Fi, War \n",
+ "69653 Drama, Romance, Thriller \n",
+ "7906 Comedy, Drama \n",
+ "69400 Comedy \n",
+ "14469 Crime, Drama, Thriller \n",
+ "46151 Crime, Film-Noir, Mystery, Thriller \n",
+ "67293 Drama \n",
+ "20923 Drama \n",
+ "52921 Comedy \n",
+ "50103 Drama, Romance \n",
+ "51972 Comedy \n",
+ "515 Action, Crime, Thriller "
]
},
"metadata": {},
@@ -1429,71 +1498,71 @@
" \n",
" \n",
" \n",
- " 363 | \n",
- " 365.0 | \n",
+ " 1 | \n",
+ " 2.0 | \n",
" 1 | \n",
" Great Day in Harlem, A (1994) | \n",
" Documentary | \n",
"
\n",
" \n",
- " 1305 | \n",
- " 365.0 | \n",
+ " 943 | \n",
+ " 2.0 | \n",
" 2 | \n",
" Tough and Deadly (1995) | \n",
" Action, Drama, Thriller | \n",
"
\n",
" \n",
- " 2248 | \n",
- " 365.0 | \n",
+ " 1885 | \n",
+ " 2.0 | \n",
" 3 | \n",
" Aiqing wansui (1994) | \n",
" Drama | \n",
"
\n",
" \n",
- " 3189 | \n",
- " 365.0 | \n",
+ " 2827 | \n",
+ " 2.0 | \n",
" 4 | \n",
" Delta of Venus (1994) | \n",
" Drama | \n",
"
\n",
" \n",
- " 4132 | \n",
- " 365.0 | \n",
+ " 3769 | \n",
+ " 2.0 | \n",
" 5 | \n",
" Someone Else's America (1995) | \n",
" Drama | \n",
"
\n",
" \n",
- " 5073 | \n",
- " 365.0 | \n",
+ " 4711 | \n",
+ " 2.0 | \n",
" 6 | \n",
" Saint of Fort Washington, The (1993) | \n",
" Drama | \n",
"
\n",
" \n",
- " 6015 | \n",
- " 365.0 | \n",
+ " 5653 | \n",
+ " 2.0 | \n",
" 7 | \n",
" Celestial Clockwork (1994) | \n",
" Comedy | \n",
"
\n",
" \n",
- " 6958 | \n",
- " 365.0 | \n",
+ " 6595 | \n",
+ " 2.0 | \n",
" 8 | \n",
" Some Mother's Son (1996) | \n",
" Drama | \n",
"
\n",
" \n",
- " 8852 | \n",
- " 365.0 | \n",
+ " 8489 | \n",
+ " 2.0 | \n",
" 9 | \n",
" Maya Lin: A Strong Clear Vision (1994) | \n",
" Documentary | \n",
"
\n",
" \n",
- " 7898 | \n",
- " 365.0 | \n",
+ " 7536 | \n",
+ " 2.0 | \n",
" 10 | \n",
" Prefontaine (1997) | \n",
" Drama | \n",
@@ -1503,29 +1572,29 @@
""
],
"text/plain": [
- " user rec_nb title \\\n",
- "363 365.0 1 Great Day in Harlem, A (1994) \n",
- "1305 365.0 2 Tough and Deadly (1995) \n",
- "2248 365.0 3 Aiqing wansui (1994) \n",
- "3189 365.0 4 Delta of Venus (1994) \n",
- "4132 365.0 5 Someone Else's America (1995) \n",
- "5073 365.0 6 Saint of Fort Washington, The (1993) \n",
- "6015 365.0 7 Celestial Clockwork (1994) \n",
- "6958 365.0 8 Some Mother's Son (1996) \n",
- "8852 365.0 9 Maya Lin: A Strong Clear Vision (1994) \n",
- "7898 365.0 10 Prefontaine (1997) \n",
+ " user rec_nb title \\\n",
+ "1 2.0 1 Great Day in Harlem, A (1994) \n",
+ "943 2.0 2 Tough and Deadly (1995) \n",
+ "1885 2.0 3 Aiqing wansui (1994) \n",
+ "2827 2.0 4 Delta of Venus (1994) \n",
+ "3769 2.0 5 Someone Else's America (1995) \n",
+ "4711 2.0 6 Saint of Fort Washington, The (1993) \n",
+ "5653 2.0 7 Celestial Clockwork (1994) \n",
+ "6595 2.0 8 Some Mother's Son (1996) \n",
+ "8489 2.0 9 Maya Lin: A Strong Clear Vision (1994) \n",
+ "7536 2.0 10 Prefontaine (1997) \n",
"\n",
" genres \n",
- "363 Documentary \n",
- "1305 Action, Drama, Thriller \n",
- "2248 Drama \n",
- "3189 Drama \n",
- "4132 Drama \n",
- "5073 Drama \n",
- "6015 Comedy \n",
- "6958 Drama \n",
- "8852 Documentary \n",
- "7898 Drama "
+ "1 Documentary \n",
+ "943 Action, Drama, Thriller \n",
+ "1885 Drama \n",
+ "2827 Drama \n",
+ "3769 Drama \n",
+ "4711 Drama \n",
+ "5653 Comedy \n",
+ "6595 Drama \n",
+ "8489 Documentary \n",
+ "7536 Drama "
]
},
"execution_count": 15,
@@ -1534,37 +1603,49 @@
}
],
"source": [
- "train=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
- "items=pd.read_csv('./Datasets/ml-100k/movies.csv')\n",
+ "train = pd.read_csv(\n",
+ " \"./Datasets/ml-100k/train.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "items = pd.read_csv(\"./Datasets/ml-100k/movies.csv\")\n",
"\n",
- "user=random.choice(list(set(train['user'])))\n",
+ "user = random.choice(list(set(train[\"user\"])))\n",
"\n",
- "train_content=pd.merge(train, items, left_on='item', right_on='id')\n",
+ "train_content = pd.merge(train, items, left_on=\"item\", right_on=\"id\")\n",
"\n",
- "print('Here is what user rated high:')\n",
- "display(train_content[train_content['user']==user][['user', 'rating', 'title', 'genres']]\\\n",
- " .sort_values(by='rating', ascending=False)[:15])\n",
+ "print(\"Here is what user rated high:\")\n",
+ "display(\n",
+ " train_content[train_content[\"user\"] == user][\n",
+ " [\"user\", \"rating\", \"title\", \"genres\"]\n",
+ " ].sort_values(by=\"rating\", ascending=False)[:15]\n",
+ ")\n",
"\n",
- "reco = np.loadtxt('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', delimiter=',')\n",
- "items=pd.read_csv('./Datasets/ml-100k/movies.csv')\n",
+ "reco = np.loadtxt(\n",
+ " \"Recommendations generated/ml-100k/Self_BaselineUI_reco.csv\", delimiter=\",\"\n",
+ ")\n",
+ "items = pd.read_csv(\"./Datasets/ml-100k/movies.csv\")\n",
"\n",
- "# Let's ignore scores - they are not used in evaluation: \n",
- "reco_users=reco[:,:1]\n",
- "reco_items=reco[:,1::2]\n",
+ "# Let's ignore scores - they are not used in evaluation:\n",
+ "reco_users = reco[:, :1]\n",
+ "reco_items = reco[:, 1::2]\n",
"# Let's put them into one array\n",
- "reco=np.concatenate((reco_users, reco_items), axis=1)\n",
+ "reco = np.concatenate((reco_users, reco_items), axis=1)\n",
"\n",
"# Let's rebuild it user-item dataframe\n",
- "recommended=[]\n",
+ "recommended = []\n",
"for row in reco:\n",
" for rec_nb, entry in enumerate(row[1:]):\n",
- " recommended.append((row[0], rec_nb+1, entry))\n",
- "recommended=pd.DataFrame(recommended, columns=['user','rec_nb', 'item'])\n",
+ " recommended.append((row[0], rec_nb + 1, entry))\n",
+ "recommended = pd.DataFrame(recommended, columns=[\"user\", \"rec_nb\", \"item\"])\n",
"\n",
- "recommended_content=pd.merge(recommended, items, left_on='item', right_on='id')\n",
+ "recommended_content = pd.merge(recommended, items, left_on=\"item\", right_on=\"id\")\n",
"\n",
- "print('Here is what we recommend:')\n",
- "recommended_content[recommended_content['user']==user][['user', 'rec_nb', 'title', 'genres']].sort_values(by='rec_nb')"
+ "print(\"Here is what we recommend:\")\n",
+ "recommended_content[recommended_content[\"user\"] == user][\n",
+ " [\"user\", \"rec_nb\", \"title\", \"genres\"]\n",
+ "].sort_values(by=\"rec_nb\")"
]
},
{
@@ -1580,214 +1661,11 @@
"metadata": {},
"outputs": [],
"source": [
- "# it may be your idea, modification of what we have already implemented \n",
- "# (for example Hit2 rate which would count as a success users whoreceived at least 2 relevant recommendations) \n",
+ "# it may be your idea, modification of what we have already implemented\n",
+ "# (for example Hit2 rate which would count as a success users who received at least 2 relevant recommendations)\n",
"# or something well-known\n",
"# expected output: modification of evaluation_measures.py such that evaluate_all will also display your measure"
]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "943it [00:00, 4859.65it/s]\n",
- "943it [00:00, 4809.91it/s]\n",
- "943it [00:00, 4678.68it/s]\n",
- "943it [00:00, 3240.04it/s]\n",
- "943it [00:00, 4796.98it/s]\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Model | \n",
- " RMSE | \n",
- " MAE | \n",
- " precision | \n",
- " recall | \n",
- " F_1 | \n",
- " F_05 | \n",
- " precision_super | \n",
- " recall_super | \n",
- " NDCG | \n",
- " mAP | \n",
- " MRR | \n",
- " LAUC | \n",
- " HR | \n",
- " Reco in test | \n",
- " Test coverage | \n",
- " Shannon | \n",
- " Gini | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Self_TopPop | \n",
- " 2.508258 | \n",
- " 2.217909 | \n",
- " 0.188865 | \n",
- " 0.116919 | \n",
- " 0.118732 | \n",
- " 0.141584 | \n",
- " 0.130472 | \n",
- " 0.137473 | \n",
- " 0.214651 | \n",
- " 0.111707 | \n",
- " 0.400939 | \n",
- " 0.555546 | \n",
- " 0.765642 | \n",
- " 1.000000 | \n",
- " 0.038961 | \n",
- " 3.159079 | \n",
- " 0.987317 | \n",
- "
\n",
- " \n",
- " 0 | \n",
- " Ready_Baseline | \n",
- " 0.949459 | \n",
- " 0.752487 | \n",
- " 0.091410 | \n",
- " 0.037652 | \n",
- " 0.046030 | \n",
- " 0.061286 | \n",
- " 0.079614 | \n",
- " 0.056463 | \n",
- " 0.095957 | \n",
- " 0.043178 | \n",
- " 0.198193 | \n",
- " 0.515501 | \n",
- " 0.437964 | \n",
- " 1.000000 | \n",
- " 0.033911 | \n",
- " 2.836513 | \n",
- " 0.991139 | \n",
- "
\n",
- " \n",
- " 0 | \n",
- " Ready_Random | \n",
- " 1.523899 | \n",
- " 1.226799 | \n",
- " 0.046872 | \n",
- " 0.022367 | \n",
- " 0.025297 | \n",
- " 0.032269 | \n",
- " 0.031116 | \n",
- " 0.027843 | \n",
- " 0.051414 | \n",
- " 0.019769 | \n",
- " 0.127558 | \n",
- " 0.507696 | \n",
- " 0.332980 | \n",
- " 0.987593 | \n",
- " 0.184704 | \n",
- " 5.104710 | \n",
- " 0.906035 | \n",
- "
\n",
- " \n",
- " 0 | \n",
- " Self_TopRated | \n",
- " 1.030712 | \n",
- " 0.820904 | \n",
- " 0.000954 | \n",
- " 0.000188 | \n",
- " 0.000298 | \n",
- " 0.000481 | \n",
- " 0.000644 | \n",
- " 0.000223 | \n",
- " 0.001043 | \n",
- " 0.000335 | \n",
- " 0.003348 | \n",
- " 0.496433 | \n",
- " 0.009544 | \n",
- " 0.699046 | \n",
- " 0.005051 | \n",
- " 1.945910 | \n",
- " 0.995669 | \n",
- "
\n",
- " \n",
- " 0 | \n",
- " Self_BaselineUI | \n",
- " 0.967585 | \n",
- " 0.762740 | \n",
- " 0.000954 | \n",
- " 0.000170 | \n",
- " 0.000278 | \n",
- " 0.000463 | \n",
- " 0.000644 | \n",
- " 0.000189 | \n",
- " 0.000752 | \n",
- " 0.000168 | \n",
- " 0.001677 | \n",
- " 0.496424 | \n",
- " 0.009544 | \n",
- " 0.600530 | \n",
- " 0.005051 | \n",
- " 1.803126 | \n",
- " 0.996380 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Model RMSE MAE precision recall F_1 \\\n",
- "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
- "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
- "0 Ready_Random 1.523899 1.226799 0.046872 0.022367 0.025297 \n",
- "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
- "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
- "\n",
- " F_05 precision_super recall_super NDCG mAP MRR \\\n",
- "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n",
- "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n",
- "0 0.032269 0.031116 0.027843 0.051414 0.019769 0.127558 \n",
- "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n",
- "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n",
- "\n",
- " LAUC HR Reco in test Test coverage Shannon Gini \n",
- "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n",
- "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n",
- "0 0.507696 0.332980 0.987593 0.184704 5.104710 0.906035 \n",
- "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n",
- "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 "
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dir_path=\"Recommendations generated/ml-100k/\"\n",
- "super_reactions=[4,5]\n",
- "test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
- "\n",
- "ev.evaluate_all(test, dir_path, super_reactions)"
- ]
}
],
"metadata": {
@@ -1806,7 +1684,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.9"
+ "version": "3.8.5"
}
},
"nbformat": 4,
diff --git a/P3. k-nearest neighbours.ipynb b/P3. k-nearest neighbours.ipynb
new file mode 100644
index 0000000..17eecae
--- /dev/null
+++ b/P3. k-nearest neighbours.ipynb
@@ -0,0 +1,1057 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Self made simplified I-KNN"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import helpers\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import scipy.sparse as sparse\n",
+ "from collections import defaultdict\n",
+ "from itertools import chain\n",
+ "import random\n",
+ "\n",
+ "train_read = pd.read_csv(\"./Datasets/ml-100k/train.csv\", sep=\"\\t\", header=None)\n",
+ "test_read = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "(\n",
+ " train_ui,\n",
+ " test_ui,\n",
+ " user_code_id,\n",
+ " user_id_code,\n",
+ " item_code_id,\n",
+ " item_id_code,\n",
+ ") = helpers.data_to_csr(train_read, test_read)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class IKNN:\n",
+ " def fit(self, train_ui):\n",
+ " self.train_ui = train_ui\n",
+ "\n",
+ " train_iu = train_ui.transpose()\n",
+ " norms = np.linalg.norm(\n",
+ " train_iu.A, axis=1\n",
+ " ) # here we compute length of each item ratings vector\n",
+ " norms = np.vectorize(lambda x: max(x, 1))(\n",
+ " norms[:, None]\n",
+ " ) # to avoid dividing by zero\n",
+ "\n",
+ " normalized_train_iu = sparse.csr_matrix(train_iu / norms)\n",
+ "\n",
+ " self.similarity_matrix_ii = (\n",
+ " normalized_train_iu * normalized_train_iu.transpose()\n",
+ " )\n",
+ "\n",
+ " self.estimations = np.array(\n",
+ " train_ui\n",
+ " * self.similarity_matrix_ii\n",
+ " / ((train_ui > 0) * self.similarity_matrix_ii)\n",
+ " )\n",
+ "\n",
+ " def recommend(self, user_code_id, item_code_id, topK=10):\n",
+ "\n",
+ " top_k = defaultdict(list)\n",
+ " for nb_user, user in enumerate(self.estimations):\n",
+ "\n",
+ " user_rated = self.train_ui.indices[\n",
+ " self.train_ui.indptr[nb_user] : self.train_ui.indptr[nb_user + 1]\n",
+ " ]\n",
+ " for item, score in enumerate(user):\n",
+ " if item not in user_rated and not np.isnan(score):\n",
+ " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n",
+ " result = []\n",
+ " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
+ " for uid, item_scores in top_k.items():\n",
+ " item_scores.sort(key=lambda x: x[1], reverse=True)\n",
+ " result.append([uid] + list(chain(*item_scores[:topK])))\n",
+ " return result\n",
+ "\n",
+ " def estimate(self, user_code_id, item_code_id, test_ui):\n",
+ " result = []\n",
+ " for user, item in zip(*test_ui.nonzero()):\n",
+ " result.append(\n",
+ " [\n",
+ " user_code_id[user],\n",
+ " item_code_id[item],\n",
+ " self.estimations[user, item]\n",
+ " if not np.isnan(self.estimations[user, item])\n",
+ " else 1,\n",
+ " ]\n",
+ " )\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "toy train ui:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[3, 4, 0, 0, 5, 0, 0, 4],\n",
+ " [0, 1, 2, 3, 0, 0, 0, 0],\n",
+ " [0, 0, 0, 5, 0, 3, 4, 0]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "similarity matrix:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[1. , 0.9701425 , 0. , 0. , 1. ,\n",
+ " 0. , 0. , 1. ],\n",
+ " [0.9701425 , 1. , 0.24253563, 0.12478355, 0.9701425 ,\n",
+ " 0. , 0. , 0.9701425 ],\n",
+ " [0. , 0.24253563, 1. , 0.51449576, 0. ,\n",
+ " 0. , 0. , 0. ],\n",
+ " [0. , 0.12478355, 0.51449576, 1. , 0. ,\n",
+ " 0.85749293, 0.85749293, 0. ],\n",
+ " [1. , 0.9701425 , 0. , 0. , 1. ,\n",
+ " 0. , 0. , 1. ],\n",
+ " [0. , 0. , 0. , 0.85749293, 0. ,\n",
+ " 1. , 1. , 0. ],\n",
+ " [0. , 0. , 0. , 0.85749293, 0. ,\n",
+ " 1. , 1. , 0. ],\n",
+ " [1. , 0.9701425 , 0. , 0. , 1. ,\n",
+ " 0. , 0. , 1. ]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "estimations matrix:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[4. , 4. , 4. , 4. , 4. ,\n",
+ " nan, nan, 4. ],\n",
+ " [1. , 1.35990333, 2.15478388, 2.53390319, 1. ,\n",
+ " 3. , 3. , 1. ],\n",
+ " [ nan, 5. , 5. , 4.05248907, nan,\n",
+ " 3.95012863, 3.95012863, nan]])"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[[0, 20, 4.0, 30, 4.0],\n",
+ " [10, 50, 3.0, 60, 3.0, 0, 1.0, 40, 1.0, 70, 1.0],\n",
+ " [20, 10, 5.0, 20, 5.0]]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# toy example\n",
+ "toy_train_read = pd.read_csv(\n",
+ " \"./Datasets/toy-example/train.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "toy_test_read = pd.read_csv(\n",
+ " \"./Datasets/toy-example/test.csv\",\n",
+ " sep=\"\\t\",\n",
+ " header=None,\n",
+ " names=[\"user\", \"item\", \"rating\", \"timestamp\"],\n",
+ ")\n",
+ "\n",
+ "(\n",
+ " toy_train_ui,\n",
+ " toy_test_ui,\n",
+ " toy_user_code_id,\n",
+ " toy_user_id_code,\n",
+ " toy_item_code_id,\n",
+ " toy_item_id_code,\n",
+ ") = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
+ "\n",
+ "\n",
+ "model = IKNN()\n",
+ "model.fit(toy_train_ui)\n",
+ "\n",
+ "print(\"toy train ui:\")\n",
+ "display(toy_train_ui.A)\n",
+ "\n",
+ "print(\"similarity matrix:\")\n",
+ "display(model.similarity_matrix_ii.A)\n",
+ "\n",
+ "print(\"estimations matrix:\")\n",
+ "display(model.estimations)\n",
+ "\n",
+ "model.recommend(toy_user_code_id, toy_item_code_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = IKNN()\n",
+ "model.fit(train_ui)\n",
+ "\n",
+ "top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
+ "\n",
+ "top_n.to_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_IKNN_reco.csv\", index=False, header=False\n",
+ ")\n",
+ "\n",
+ "estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
+ "estimations.to_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_IKNN_estimations.csv\",\n",
+ " index=False,\n",
+ " header=False,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 9004.71it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1.018363 | \n",
+ " 0.808793 | \n",
+ " 0.000318 | \n",
+ " 0.000108 | \n",
+ " 0.00014 | \n",
+ " 0.000189 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.000214 | \n",
+ " 0.000037 | \n",
+ " 0.000368 | \n",
+ " 0.496391 | \n",
+ " 0.003181 | \n",
+ " 0.392153 | \n",
+ " 0.11544 | \n",
+ " 4.174741 | \n",
+ " 0.965327 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " RMSE MAE precision recall F_1 F_05 \\\n",
+ "0 1.018363 0.808793 0.000318 0.000108 0.00014 0.000189 \n",
+ "\n",
+ " precision_super recall_super NDCG mAP MRR LAUC \\\n",
+ "0 0.0 0.0 0.000214 0.000037 0.000368 0.496391 \n",
+ "\n",
+ " HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.003181 0.392153 0.11544 4.174741 0.965327 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import evaluation_measures as ev\n",
+ "\n",
+ "estimations_df = pd.read_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_IKNN_estimations.csv\", header=None\n",
+ ")\n",
+ "reco = np.loadtxt(\"Recommendations generated/ml-100k/Self_IKNN_reco.csv\", delimiter=\",\")\n",
+ "\n",
+ "ev.evaluate(\n",
+ " test=pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None),\n",
+ " estimations_df=estimations_df,\n",
+ " reco=reco,\n",
+ " super_reactions=[4, 5],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 8517.83it/s]\n",
+ "943it [00:00, 11438.64it/s]\n",
+ "943it [00:00, 11933.36it/s]\n",
+ "943it [00:00, 10307.81it/s]\n",
+ "943it [00:00, 12250.41it/s]\n",
+ "943it [00:00, 12064.07it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_TopPop | \n",
+ " 2.508258 | \n",
+ " 2.217909 | \n",
+ " 0.188865 | \n",
+ " 0.116919 | \n",
+ " 0.118732 | \n",
+ " 0.141584 | \n",
+ " 0.130472 | \n",
+ " 0.137473 | \n",
+ " 0.214651 | \n",
+ " 0.111707 | \n",
+ " 0.400939 | \n",
+ " 0.555546 | \n",
+ " 0.765642 | \n",
+ " 1.000000 | \n",
+ " 0.038961 | \n",
+ " 3.159079 | \n",
+ " 0.987317 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Baseline | \n",
+ " 0.949459 | \n",
+ " 0.752487 | \n",
+ " 0.091410 | \n",
+ " 0.037652 | \n",
+ " 0.046030 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ " 1.000000 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Random | \n",
+ " 1.521845 | \n",
+ " 1.225949 | \n",
+ " 0.047190 | \n",
+ " 0.020753 | \n",
+ " 0.024810 | \n",
+ " 0.032269 | \n",
+ " 0.029506 | \n",
+ " 0.023707 | \n",
+ " 0.050075 | \n",
+ " 0.018728 | \n",
+ " 0.121957 | \n",
+ " 0.506893 | \n",
+ " 0.329799 | \n",
+ " 0.986532 | \n",
+ " 0.184704 | \n",
+ " 5.099706 | \n",
+ " 0.907217 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_TopRated | \n",
+ " 1.030712 | \n",
+ " 0.820904 | \n",
+ " 0.000954 | \n",
+ " 0.000188 | \n",
+ " 0.000298 | \n",
+ " 0.000481 | \n",
+ " 0.000644 | \n",
+ " 0.000223 | \n",
+ " 0.001043 | \n",
+ " 0.000335 | \n",
+ " 0.003348 | \n",
+ " 0.496433 | \n",
+ " 0.009544 | \n",
+ " 0.699046 | \n",
+ " 0.005051 | \n",
+ " 1.945910 | \n",
+ " 0.995669 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 0.967585 | \n",
+ " 0.762740 | \n",
+ " 0.000954 | \n",
+ " 0.000170 | \n",
+ " 0.000278 | \n",
+ " 0.000463 | \n",
+ " 0.000644 | \n",
+ " 0.000189 | \n",
+ " 0.000752 | \n",
+ " 0.000168 | \n",
+ " 0.001677 | \n",
+ " 0.496424 | \n",
+ " 0.009544 | \n",
+ " 0.600530 | \n",
+ " 0.005051 | \n",
+ " 1.803126 | \n",
+ " 0.996380 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_IKNN | \n",
+ " 1.018363 | \n",
+ " 0.808793 | \n",
+ " 0.000318 | \n",
+ " 0.000108 | \n",
+ " 0.000140 | \n",
+ " 0.000189 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000214 | \n",
+ " 0.000037 | \n",
+ " 0.000368 | \n",
+ " 0.496391 | \n",
+ " 0.003181 | \n",
+ " 0.392153 | \n",
+ " 0.115440 | \n",
+ " 4.174741 | \n",
+ " 0.965327 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model RMSE MAE precision recall F_1 \\\n",
+ "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
+ "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
+ "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n",
+ "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
+ "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
+ "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n",
+ "\n",
+ " F_05 precision_super recall_super NDCG mAP MRR \\\n",
+ "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n",
+ "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n",
+ "0 0.032269 0.029506 0.023707 0.050075 0.018728 0.121957 \n",
+ "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n",
+ "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n",
+ "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n",
+ "\n",
+ " LAUC HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n",
+ "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n",
+ "0 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217 \n",
+ "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n",
+ "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 \n",
+ "0 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dir_path = \"Recommendations generated/ml-100k/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "\n",
+ "ev.evaluate_all(test, dir_path, super_reactions)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Ready-made KNNs - Surprise implementation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### I-KNN - basic"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Computing the cosine similarity matrix...\n",
+ "Done computing similarity matrix.\n",
+ "Generating predictions...\n",
+ "Generating top N recommendations...\n",
+ "Generating predictions...\n"
+ ]
+ }
+ ],
+ "source": [
+ "import helpers\n",
+ "import surprise as sp\n",
+ "\n",
+ "sim_options = {\n",
+ " \"name\": \"cosine\",\n",
+ " \"user_based\": False,\n",
+ "} # compute similarities between items\n",
+ "algo = sp.KNNBasic(sim_options=sim_options)\n",
+ "\n",
+ "helpers.ready_made(\n",
+ " algo,\n",
+ " reco_path=\"Recommendations generated/ml-100k/Ready_I-KNN_reco.csv\",\n",
+ " estimations_path=\"Recommendations generated/ml-100k/Ready_I-KNN_estimations.csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### U-KNN - basic"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Computing the cosine similarity matrix...\n",
+ "Done computing similarity matrix.\n",
+ "Generating predictions...\n",
+ "Generating top N recommendations...\n",
+ "Generating predictions...\n"
+ ]
+ }
+ ],
+ "source": [
+ "sim_options = {\n",
+ " \"name\": \"cosine\",\n",
+ " \"user_based\": True,\n",
+ "} # compute similarities between users\n",
+ "algo = sp.KNNBasic(sim_options=sim_options)\n",
+ "\n",
+ "helpers.ready_made(\n",
+ " algo,\n",
+ " reco_path=\"Recommendations generated/ml-100k/Ready_U-KNN_reco.csv\",\n",
+ " estimations_path=\"Recommendations generated/ml-100k/Ready_U-KNN_estimations.csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### I-KNN - on top baseline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Estimating biases using als...\n",
+ "Computing the msd similarity matrix...\n",
+ "Done computing similarity matrix.\n",
+ "Generating predictions...\n",
+ "Generating top N recommendations...\n",
+ "Generating predictions...\n"
+ ]
+ }
+ ],
+ "source": [
+ "sim_options = {\n",
+ " \"name\": \"cosine\",\n",
+ " \"user_based\": False,\n",
+ "} # compute similarities between items\n",
+ "algo = sp.KNNBaseline()\n",
+ "\n",
+ "helpers.ready_made(\n",
+ " algo,\n",
+ " reco_path=\"Recommendations generated/ml-100k/Ready_I-KNNBaseline_reco.csv\",\n",
+ " estimations_path=\"Recommendations generated/ml-100k/Ready_I-KNNBaseline_estimations.csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 11286.27it/s]\n",
+ "943it [00:00, 10874.86it/s]\n",
+ "943it [00:00, 11509.97it/s]\n",
+ "943it [00:00, 11855.81it/s]\n",
+ "943it [00:00, 11574.00it/s]\n",
+ "943it [00:00, 11080.19it/s]\n",
+ "943it [00:00, 11550.84it/s]\n",
+ "943it [00:00, 12148.14it/s]\n",
+ "943it [00:00, 10779.39it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_TopPop | \n",
+ " 2.508258 | \n",
+ " 2.217909 | \n",
+ " 0.188865 | \n",
+ " 0.116919 | \n",
+ " 0.118732 | \n",
+ " 0.141584 | \n",
+ " 0.130472 | \n",
+ " 0.137473 | \n",
+ " 0.214651 | \n",
+ " 0.111707 | \n",
+ " 0.400939 | \n",
+ " 0.555546 | \n",
+ " 0.765642 | \n",
+ " 1.000000 | \n",
+ " 0.038961 | \n",
+ " 3.159079 | \n",
+ " 0.987317 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Baseline | \n",
+ " 0.949459 | \n",
+ " 0.752487 | \n",
+ " 0.091410 | \n",
+ " 0.037652 | \n",
+ " 0.046030 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ " 1.000000 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Random | \n",
+ " 1.521845 | \n",
+ " 1.225949 | \n",
+ " 0.047190 | \n",
+ " 0.020753 | \n",
+ " 0.024810 | \n",
+ " 0.032269 | \n",
+ " 0.029506 | \n",
+ " 0.023707 | \n",
+ " 0.050075 | \n",
+ " 0.018728 | \n",
+ " 0.121957 | \n",
+ " 0.506893 | \n",
+ " 0.329799 | \n",
+ " 0.986532 | \n",
+ " 0.184704 | \n",
+ " 5.099706 | \n",
+ " 0.907217 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_I-KNN | \n",
+ " 1.030386 | \n",
+ " 0.813067 | \n",
+ " 0.026087 | \n",
+ " 0.006908 | \n",
+ " 0.010593 | \n",
+ " 0.016046 | \n",
+ " 0.021137 | \n",
+ " 0.009522 | \n",
+ " 0.024214 | \n",
+ " 0.008958 | \n",
+ " 0.048068 | \n",
+ " 0.499885 | \n",
+ " 0.154825 | \n",
+ " 0.402333 | \n",
+ " 0.434343 | \n",
+ " 5.133650 | \n",
+ " 0.877999 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_I-KNNBaseline | \n",
+ " 0.935327 | \n",
+ " 0.737424 | \n",
+ " 0.002545 | \n",
+ " 0.000755 | \n",
+ " 0.001105 | \n",
+ " 0.001602 | \n",
+ " 0.002253 | \n",
+ " 0.000930 | \n",
+ " 0.003444 | \n",
+ " 0.001362 | \n",
+ " 0.011760 | \n",
+ " 0.496724 | \n",
+ " 0.021209 | \n",
+ " 0.482821 | \n",
+ " 0.059885 | \n",
+ " 2.232578 | \n",
+ " 0.994487 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_U-KNN | \n",
+ " 1.023495 | \n",
+ " 0.807913 | \n",
+ " 0.000742 | \n",
+ " 0.000205 | \n",
+ " 0.000305 | \n",
+ " 0.000449 | \n",
+ " 0.000536 | \n",
+ " 0.000198 | \n",
+ " 0.000845 | \n",
+ " 0.000274 | \n",
+ " 0.002744 | \n",
+ " 0.496441 | \n",
+ " 0.007423 | \n",
+ " 0.602121 | \n",
+ " 0.010823 | \n",
+ " 2.089186 | \n",
+ " 0.995706 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_TopRated | \n",
+ " 1.030712 | \n",
+ " 0.820904 | \n",
+ " 0.000954 | \n",
+ " 0.000188 | \n",
+ " 0.000298 | \n",
+ " 0.000481 | \n",
+ " 0.000644 | \n",
+ " 0.000223 | \n",
+ " 0.001043 | \n",
+ " 0.000335 | \n",
+ " 0.003348 | \n",
+ " 0.496433 | \n",
+ " 0.009544 | \n",
+ " 0.699046 | \n",
+ " 0.005051 | \n",
+ " 1.945910 | \n",
+ " 0.995669 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 0.967585 | \n",
+ " 0.762740 | \n",
+ " 0.000954 | \n",
+ " 0.000170 | \n",
+ " 0.000278 | \n",
+ " 0.000463 | \n",
+ " 0.000644 | \n",
+ " 0.000189 | \n",
+ " 0.000752 | \n",
+ " 0.000168 | \n",
+ " 0.001677 | \n",
+ " 0.496424 | \n",
+ " 0.009544 | \n",
+ " 0.600530 | \n",
+ " 0.005051 | \n",
+ " 1.803126 | \n",
+ " 0.996380 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_IKNN | \n",
+ " 1.018363 | \n",
+ " 0.808793 | \n",
+ " 0.000318 | \n",
+ " 0.000108 | \n",
+ " 0.000140 | \n",
+ " 0.000189 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000214 | \n",
+ " 0.000037 | \n",
+ " 0.000368 | \n",
+ " 0.496391 | \n",
+ " 0.003181 | \n",
+ " 0.392153 | \n",
+ " 0.115440 | \n",
+ " 4.174741 | \n",
+ " 0.965327 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model RMSE MAE precision recall F_1 \\\n",
+ "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
+ "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
+ "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n",
+ "0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 0.010593 \n",
+ "0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 0.001105 \n",
+ "0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 0.000305 \n",
+ "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
+ "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
+ "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n",
+ "\n",
+ " F_05 precision_super recall_super NDCG mAP MRR \\\n",
+ "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n",
+ "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n",
+ "0 0.032269 0.029506 0.023707 0.050075 0.018728 0.121957 \n",
+ "0 0.016046 0.021137 0.009522 0.024214 0.008958 0.048068 \n",
+ "0 0.001602 0.002253 0.000930 0.003444 0.001362 0.011760 \n",
+ "0 0.000449 0.000536 0.000198 0.000845 0.000274 0.002744 \n",
+ "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n",
+ "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n",
+ "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n",
+ "\n",
+ " LAUC HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n",
+ "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n",
+ "0 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217 \n",
+ "0 0.499885 0.154825 0.402333 0.434343 5.133650 0.877999 \n",
+ "0 0.496724 0.021209 0.482821 0.059885 2.232578 0.994487 \n",
+ "0 0.496441 0.007423 0.602121 0.010823 2.089186 0.995706 \n",
+ "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n",
+ "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 \n",
+ "0 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dir_path = \"Recommendations generated/ml-100k/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "\n",
+ "ev.evaluate_all(test, dir_path, super_reactions)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# project task 3: use a version of your choice of Surprise KNN algorithm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read the docs and try to find the best parameter configuration (let's say in terms of RMSE)\n",
+ "# https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline\n",
+ "# the solution here can be similar to examples above\n",
+ "# please save the output in 'Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv' and\n",
+ "# 'Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv'"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/P3. k-nearest neighbours.pdf b/P3. k-nearest neighbours.pdf
new file mode 100644
index 0000000..cbf957a
Binary files /dev/null and b/P3. k-nearest neighbours.pdf differ
diff --git a/P4. Appendix - embeddings in high demensional spaces.ipynb b/P4. Appendix - embeddings in high demensional spaces.ipynb
new file mode 100644
index 0000000..c594641
--- /dev/null
+++ b/P4. Appendix - embeddings in high demensional spaces.ipynb
@@ -0,0 +1,96 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['dimensions: 1, cases when observation is the nearest: 0.0%',\n",
+ " 'dimensions: 2, cases when observation is the nearest: 0.0%',\n",
+ " 'dimensions: 3, cases when observation is the nearest: 0.0%',\n",
+ " 'dimensions: 10, cases when observation is the nearest: 13.0%',\n",
+ " 'dimensions: 20, cases when observation is the nearest: 61.0%',\n",
+ " 'dimensions: 30, cases when observation is the nearest: 96.0%',\n",
+ " 'dimensions: 40, cases when observation is the nearest: 98.0%',\n",
+ " 'dimensions: 50, cases when observation is the nearest: 100.0%',\n",
+ " 'dimensions: 60, cases when observation is the nearest: 100.0%',\n",
+ " 'dimensions: 70, cases when observation is the nearest: 100.0%',\n",
+ " 'dimensions: 80, cases when observation is the nearest: 100.0%',\n",
+ " 'dimensions: 90, cases when observation is the nearest: 100.0%']"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import random\n",
+ "from numpy.linalg import norm\n",
+ "\n",
+ "dimensions = [1, 2, 3] + [10 * i for i in range(1, 10)]\n",
+ "nb_vectors = 10000\n",
+ "trials = 100\n",
+ "k = 1 # by setting k=1 we want to check how often the closest vector to the average of 2 random vectors is one of these 2 vectors\n",
+ "\n",
+ "result = []\n",
+ "for dimension in dimensions:\n",
+ " vectors = np.random.normal(0, 1, size=(nb_vectors, dimension))\n",
+ " successes = 0\n",
+ " for i in range(trials):\n",
+ " i1, i2 = random.sample(range(nb_vectors), 2)\n",
+ " target = (vectors[i1] + vectors[i2]) / 2\n",
+ "\n",
+ " distances = pd.DataFrame(\n",
+ " enumerate(\n",
+ " np.dot(target, vectors.transpose())\n",
+ " / norm(target)\n",
+ " / norm(vectors.transpose(), axis=0)\n",
+ " )\n",
+ " )\n",
+ " distances = distances.sort_values(by=[1], ascending=False)\n",
+ " if (i1 in (list(distances[0][:k]))) | (i2 in (list(distances[0][:k]))):\n",
+ " successes += 1\n",
+ " result.append(successes / trials)\n",
+ "\n",
+ "[\n",
+ " f\"dimensions: {i}, cases when observation is the nearest: {100*round(j,3)}%\"\n",
+ " for i, j in zip(dimensions, result)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/P4. Matrix Factorization.ipynb b/P4. Matrix Factorization.ipynb
new file mode 100644
index 0000000..6ab0c6c
--- /dev/null
+++ b/P4. Matrix Factorization.ipynb
@@ -0,0 +1,1403 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Self made SVD"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import helpers\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import scipy.sparse as sparse\n",
+ "from collections import defaultdict\n",
+ "from itertools import chain\n",
+ "import random\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "train_read = pd.read_csv(\"./Datasets/ml-100k/train.csv\", sep=\"\\t\", header=None)\n",
+ "test_read = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "(\n",
+ " train_ui,\n",
+ " test_ui,\n",
+ " user_code_id,\n",
+ " user_id_code,\n",
+ " item_code_id,\n",
+ " item_id_code,\n",
+ ") = helpers.data_to_csr(train_read, test_read)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Done similarly to https://github.com/albertauyeung/matrix-factorization-in-python\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "\n",
+ "class SVD:\n",
+ " def __init__(self, train_ui, learning_rate, regularization, nb_factors, iterations):\n",
+ " self.train_ui = train_ui\n",
+ " self.uir = list(\n",
+ " zip(*[train_ui.nonzero()[0], train_ui.nonzero()[1], train_ui.data])\n",
+ " )\n",
+ "\n",
+ " self.learning_rate = learning_rate\n",
+ " self.regularization = regularization\n",
+ " self.iterations = iterations\n",
+ " self.nb_users, self.nb_items = train_ui.shape\n",
+ " self.nb_ratings = train_ui.nnz\n",
+ " self.nb_factors = nb_factors\n",
+ "\n",
+ " self.Pu = np.random.normal(\n",
+ " loc=0, scale=1.0 / self.nb_factors, size=(self.nb_users, self.nb_factors)\n",
+ " )\n",
+ " self.Qi = np.random.normal(\n",
+ " loc=0, scale=1.0 / self.nb_factors, size=(self.nb_items, self.nb_factors)\n",
+ " )\n",
+ "\n",
+ " def train(self, test_ui=None):\n",
+ " if test_ui != None:\n",
+ " self.test_uir = list(\n",
+ " zip(*[test_ui.nonzero()[0], test_ui.nonzero()[1], test_ui.data])\n",
+ " )\n",
+ "\n",
+ " self.learning_process = []\n",
+ " pbar = tqdm(range(self.iterations))\n",
+ " for i in pbar:\n",
+ " pbar.set_description(\n",
+ " f\"Epoch {i} RMSE: {self.learning_process[-1][1] if i>0 else 0}. Training epoch {i+1}...\"\n",
+ " )\n",
+ " np.random.shuffle(self.uir)\n",
+ " self.sgd(self.uir)\n",
+ " if test_ui == None:\n",
+ " self.learning_process.append([i + 1, self.RMSE_total(self.uir)])\n",
+ " else:\n",
+ " self.learning_process.append(\n",
+ " [i + 1, self.RMSE_total(self.uir), self.RMSE_total(self.test_uir)]\n",
+ " )\n",
+ "\n",
+ " def sgd(self, uir):\n",
+ "\n",
+ " for u, i, score in uir:\n",
+ " # Compute prediction and error\n",
+ " prediction = self.get_rating(u, i)\n",
+ " e = score - prediction\n",
+ "\n",
+ " # Update user and item latent feature matrices\n",
+ " Pu_update = self.learning_rate * (\n",
+ " e * self.Qi[i] - self.regularization * self.Pu[u]\n",
+ " )\n",
+ " Qi_update = self.learning_rate * (\n",
+ " e * self.Pu[u] - self.regularization * self.Qi[i]\n",
+ " )\n",
+ "\n",
+ " self.Pu[u] += Pu_update\n",
+ " self.Qi[i] += Qi_update\n",
+ "\n",
+ " def get_rating(self, u, i):\n",
+ " prediction = self.Pu[u].dot(self.Qi[i].T)\n",
+ " return prediction\n",
+ "\n",
+ " def RMSE_total(self, uir):\n",
+ " RMSE = 0\n",
+ " for u, i, score in uir:\n",
+ " prediction = self.get_rating(u, i)\n",
+ " RMSE += (score - prediction) ** 2\n",
+ " return np.sqrt(RMSE / len(uir))\n",
+ "\n",
+ " def estimations(self):\n",
+ " self.estimations = np.dot(self.Pu, self.Qi.T)\n",
+ "\n",
+ " def recommend(self, user_code_id, item_code_id, topK=10):\n",
+ "\n",
+ " top_k = defaultdict(list)\n",
+ " for nb_user, user in enumerate(self.estimations):\n",
+ "\n",
+ " user_rated = self.train_ui.indices[\n",
+ " self.train_ui.indptr[nb_user] : self.train_ui.indptr[nb_user + 1]\n",
+ " ]\n",
+ " for item, score in enumerate(user):\n",
+ " if item not in user_rated and not np.isnan(score):\n",
+ " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n",
+ " result = []\n",
+ " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
+ " for uid, item_scores in top_k.items():\n",
+ " item_scores.sort(key=lambda x: x[1], reverse=True)\n",
+ " result.append([uid] + list(chain(*item_scores[:topK])))\n",
+ " return result\n",
+ "\n",
+ " def estimate(self, user_code_id, item_code_id, test_ui):\n",
+ " result = []\n",
+ " for user, item in zip(*test_ui.nonzero()):\n",
+ " result.append(\n",
+ " [\n",
+ " user_code_id[user],\n",
+ " item_code_id[item],\n",
+ " self.estimations[user, item]\n",
+ " if not np.isnan(self.estimations[user, item])\n",
+ " else 1,\n",
+ " ]\n",
+ " )\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Epoch 39 RMSE: 0.7477090330529405. Training epoch 40...: 100%|██████████| 40/40 [01:03<00:00, 1.59s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = SVD(\n",
+ " train_ui, learning_rate=0.005, regularization=0.02, nb_factors=100, iterations=40\n",
+ ")\n",
+ "model.train(test_ui)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df = pd.DataFrame(model.learning_process).iloc[:, :2]\n",
+ "df.columns = [\"epoch\", \"train_RMSE\"]\n",
+ "plt.plot(\"epoch\", \"train_RMSE\", data=df, color=\"blue\")\n",
+ "plt.legend()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df = pd.DataFrame(\n",
+ " model.learning_process[10:], columns=[\"epoch\", \"train_RMSE\", \"test_RMSE\"]\n",
+ ")\n",
+ "plt.plot(\"epoch\", \"train_RMSE\", data=df, color=\"blue\")\n",
+ "plt.plot(\"epoch\", \"test_RMSE\", data=df, color=\"green\", linestyle=\"dashed\")\n",
+ "plt.legend()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Saving and evaluating recommendations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.estimations()\n",
+ "\n",
+ "top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
+ "\n",
+ "top_n.to_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_SVD_reco.csv\", index=False, header=False\n",
+ ")\n",
+ "\n",
+ "estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
+ "estimations.to_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_SVD_estimations.csv\",\n",
+ " index=False,\n",
+ " header=False,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 11138.92it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.914143 | \n",
+ " 0.717131 | \n",
+ " 0.101803 | \n",
+ " 0.042134 | \n",
+ " 0.05161 | \n",
+ " 0.068543 | \n",
+ " 0.091953 | \n",
+ " 0.071255 | \n",
+ " 0.104015 | \n",
+ " 0.048817 | \n",
+ " 0.193027 | \n",
+ " 0.517784 | \n",
+ " 0.471898 | \n",
+ " 0.867232 | \n",
+ " 0.147908 | \n",
+ " 3.871296 | \n",
+ " 0.97182 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " RMSE MAE precision recall F_1 F_05 \\\n",
+ "0 0.914143 0.717131 0.101803 0.042134 0.05161 0.068543 \n",
+ "\n",
+ " precision_super recall_super NDCG mAP MRR LAUC \\\n",
+ "0 0.091953 0.071255 0.104015 0.048817 0.193027 0.517784 \n",
+ "\n",
+ " HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.471898 0.867232 0.147908 3.871296 0.97182 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import evaluation_measures as ev\n",
+ "\n",
+ "estimations_df = pd.read_csv(\n",
+ " \"Recommendations generated/ml-100k/Self_SVD_estimations.csv\", header=None\n",
+ ")\n",
+ "reco = np.loadtxt(\"Recommendations generated/ml-100k/Self_SVD_reco.csv\", delimiter=\",\")\n",
+ "\n",
+ "ev.evaluate(\n",
+ " test=pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None),\n",
+ " estimations_df=estimations_df,\n",
+ " reco=reco,\n",
+ " super_reactions=[4, 5],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 10694.55it/s]\n",
+ "943it [00:00, 11600.99it/s]\n",
+ "943it [00:00, 11461.54it/s]\n",
+ "943it [00:00, 11660.39it/s]\n",
+ "943it [00:00, 9872.18it/s]\n",
+ "943it [00:00, 11443.77it/s]\n",
+ "943it [00:00, 11990.88it/s]\n",
+ "943it [00:00, 11615.02it/s]\n",
+ "943it [00:00, 11874.78it/s]\n",
+ "943it [00:00, 12387.19it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_TopPop | \n",
+ " 2.508258 | \n",
+ " 2.217909 | \n",
+ " 0.188865 | \n",
+ " 0.116919 | \n",
+ " 0.118732 | \n",
+ " 0.141584 | \n",
+ " 0.130472 | \n",
+ " 0.137473 | \n",
+ " 0.214651 | \n",
+ " 0.111707 | \n",
+ " 0.400939 | \n",
+ " 0.555546 | \n",
+ " 0.765642 | \n",
+ " 1.000000 | \n",
+ " 0.038961 | \n",
+ " 3.159079 | \n",
+ " 0.987317 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_SVD | \n",
+ " 0.914143 | \n",
+ " 0.717131 | \n",
+ " 0.101803 | \n",
+ " 0.042134 | \n",
+ " 0.051610 | \n",
+ " 0.068543 | \n",
+ " 0.091953 | \n",
+ " 0.071255 | \n",
+ " 0.104015 | \n",
+ " 0.048817 | \n",
+ " 0.193027 | \n",
+ " 0.517784 | \n",
+ " 0.471898 | \n",
+ " 0.867232 | \n",
+ " 0.147908 | \n",
+ " 3.871296 | \n",
+ " 0.971820 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Baseline | \n",
+ " 0.949459 | \n",
+ " 0.752487 | \n",
+ " 0.091410 | \n",
+ " 0.037652 | \n",
+ " 0.046030 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ " 1.000000 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Random | \n",
+ " 1.521845 | \n",
+ " 1.225949 | \n",
+ " 0.047190 | \n",
+ " 0.020753 | \n",
+ " 0.024810 | \n",
+ " 0.032269 | \n",
+ " 0.029506 | \n",
+ " 0.023707 | \n",
+ " 0.050075 | \n",
+ " 0.018728 | \n",
+ " 0.121957 | \n",
+ " 0.506893 | \n",
+ " 0.329799 | \n",
+ " 0.986532 | \n",
+ " 0.184704 | \n",
+ " 5.099706 | \n",
+ " 0.907217 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_I-KNN | \n",
+ " 1.030386 | \n",
+ " 0.813067 | \n",
+ " 0.026087 | \n",
+ " 0.006908 | \n",
+ " 0.010593 | \n",
+ " 0.016046 | \n",
+ " 0.021137 | \n",
+ " 0.009522 | \n",
+ " 0.024214 | \n",
+ " 0.008958 | \n",
+ " 0.048068 | \n",
+ " 0.499885 | \n",
+ " 0.154825 | \n",
+ " 0.402333 | \n",
+ " 0.434343 | \n",
+ " 5.133650 | \n",
+ " 0.877999 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_I-KNNBaseline | \n",
+ " 0.935327 | \n",
+ " 0.737424 | \n",
+ " 0.002545 | \n",
+ " 0.000755 | \n",
+ " 0.001105 | \n",
+ " 0.001602 | \n",
+ " 0.002253 | \n",
+ " 0.000930 | \n",
+ " 0.003444 | \n",
+ " 0.001362 | \n",
+ " 0.011760 | \n",
+ " 0.496724 | \n",
+ " 0.021209 | \n",
+ " 0.482821 | \n",
+ " 0.059885 | \n",
+ " 2.232578 | \n",
+ " 0.994487 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_U-KNN | \n",
+ " 1.023495 | \n",
+ " 0.807913 | \n",
+ " 0.000742 | \n",
+ " 0.000205 | \n",
+ " 0.000305 | \n",
+ " 0.000449 | \n",
+ " 0.000536 | \n",
+ " 0.000198 | \n",
+ " 0.000845 | \n",
+ " 0.000274 | \n",
+ " 0.002744 | \n",
+ " 0.496441 | \n",
+ " 0.007423 | \n",
+ " 0.602121 | \n",
+ " 0.010823 | \n",
+ " 2.089186 | \n",
+ " 0.995706 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_TopRated | \n",
+ " 1.030712 | \n",
+ " 0.820904 | \n",
+ " 0.000954 | \n",
+ " 0.000188 | \n",
+ " 0.000298 | \n",
+ " 0.000481 | \n",
+ " 0.000644 | \n",
+ " 0.000223 | \n",
+ " 0.001043 | \n",
+ " 0.000335 | \n",
+ " 0.003348 | \n",
+ " 0.496433 | \n",
+ " 0.009544 | \n",
+ " 0.699046 | \n",
+ " 0.005051 | \n",
+ " 1.945910 | \n",
+ " 0.995669 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 0.967585 | \n",
+ " 0.762740 | \n",
+ " 0.000954 | \n",
+ " 0.000170 | \n",
+ " 0.000278 | \n",
+ " 0.000463 | \n",
+ " 0.000644 | \n",
+ " 0.000189 | \n",
+ " 0.000752 | \n",
+ " 0.000168 | \n",
+ " 0.001677 | \n",
+ " 0.496424 | \n",
+ " 0.009544 | \n",
+ " 0.600530 | \n",
+ " 0.005051 | \n",
+ " 1.803126 | \n",
+ " 0.996380 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_IKNN | \n",
+ " 1.018363 | \n",
+ " 0.808793 | \n",
+ " 0.000318 | \n",
+ " 0.000108 | \n",
+ " 0.000140 | \n",
+ " 0.000189 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000214 | \n",
+ " 0.000037 | \n",
+ " 0.000368 | \n",
+ " 0.496391 | \n",
+ " 0.003181 | \n",
+ " 0.392153 | \n",
+ " 0.115440 | \n",
+ " 4.174741 | \n",
+ " 0.965327 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model RMSE MAE precision recall F_1 \\\n",
+ "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
+ "0 Self_SVD 0.914143 0.717131 0.101803 0.042134 0.051610 \n",
+ "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
+ "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n",
+ "0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 0.010593 \n",
+ "0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 0.001105 \n",
+ "0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 0.000305 \n",
+ "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
+ "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
+ "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n",
+ "\n",
+ " F_05 precision_super recall_super NDCG mAP MRR \\\n",
+ "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n",
+ "0 0.068543 0.091953 0.071255 0.104015 0.048817 0.193027 \n",
+ "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n",
+ "0 0.032269 0.029506 0.023707 0.050075 0.018728 0.121957 \n",
+ "0 0.016046 0.021137 0.009522 0.024214 0.008958 0.048068 \n",
+ "0 0.001602 0.002253 0.000930 0.003444 0.001362 0.011760 \n",
+ "0 0.000449 0.000536 0.000198 0.000845 0.000274 0.002744 \n",
+ "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n",
+ "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n",
+ "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n",
+ "\n",
+ " LAUC HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n",
+ "0 0.517784 0.471898 0.867232 0.147908 3.871296 0.971820 \n",
+ "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n",
+ "0 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217 \n",
+ "0 0.499885 0.154825 0.402333 0.434343 5.133650 0.877999 \n",
+ "0 0.496724 0.021209 0.482821 0.059885 2.232578 0.994487 \n",
+ "0 0.496441 0.007423 0.602121 0.010823 2.089186 0.995706 \n",
+ "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n",
+ "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 \n",
+ "0 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dir_path = \"Recommendations generated/ml-100k/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "\n",
+ "ev.evaluate_all(test, dir_path, super_reactions)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " code | \n",
+ " score | \n",
+ " item_id | \n",
+ " id | \n",
+ " title | \n",
+ " genres | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 405 | \n",
+ " 1.000000 | \n",
+ " 406 | \n",
+ " 406 | \n",
+ " Thinner (1996) | \n",
+ " Horror, Thriller | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 827 | \n",
+ " 0.968354 | \n",
+ " 828 | \n",
+ " 828 | \n",
+ " Alaska (1996) | \n",
+ " Adventure, Children's | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 818 | \n",
+ " 0.967103 | \n",
+ " 819 | \n",
+ " 819 | \n",
+ " Eddie (1996) | \n",
+ " Comedy | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 904 | \n",
+ " 0.963944 | \n",
+ " 905 | \n",
+ " 905 | \n",
+ " Great Expectations (1998) | \n",
+ " Drama, Romance | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1293 | \n",
+ " 0.962779 | \n",
+ " 1294 | \n",
+ " 1294 | \n",
+ " Ayn Rand: A Sense of Life (1997) | \n",
+ " Documentary | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 347 | \n",
+ " 0.961946 | \n",
+ " 348 | \n",
+ " 348 | \n",
+ " Desperate Measures (1998) | \n",
+ " Crime, Drama, Thriller | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 807 | \n",
+ " 0.960952 | \n",
+ " 808 | \n",
+ " 808 | \n",
+ " Program, The (1993) | \n",
+ " Action, Drama | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 560 | \n",
+ " 0.960885 | \n",
+ " 561 | \n",
+ " 561 | \n",
+ " Mary Shelley's Frankenstein (1994) | \n",
+ " Drama, Horror | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 1392 | \n",
+ " 0.958724 | \n",
+ " 1393 | \n",
+ " 1393 | \n",
+ " Stag (1997) | \n",
+ " Action, Thriller | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 787 | \n",
+ " 0.957891 | \n",
+ " 788 | \n",
+ " 788 | \n",
+ " Relative Fear (1994) | \n",
+ " Horror, Thriller | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " code score item_id id title \\\n",
+ "0 405 1.000000 406 406 Thinner (1996) \n",
+ "1 827 0.968354 828 828 Alaska (1996) \n",
+ "2 818 0.967103 819 819 Eddie (1996) \n",
+ "3 904 0.963944 905 905 Great Expectations (1998) \n",
+ "4 1293 0.962779 1294 1294 Ayn Rand: A Sense of Life (1997) \n",
+ "5 347 0.961946 348 348 Desperate Measures (1998) \n",
+ "6 807 0.960952 808 808 Program, The (1993) \n",
+ "7 560 0.960885 561 561 Mary Shelley's Frankenstein (1994) \n",
+ "8 1392 0.958724 1393 1393 Stag (1997) \n",
+ "9 787 0.957891 788 788 Relative Fear (1994) \n",
+ "\n",
+ " genres \n",
+ "0 Horror, Thriller \n",
+ "1 Adventure, Children's \n",
+ "2 Comedy \n",
+ "3 Drama, Romance \n",
+ "4 Documentary \n",
+ "5 Crime, Drama, Thriller \n",
+ "6 Action, Drama \n",
+ "7 Drama, Horror \n",
+ "8 Action, Thriller \n",
+ "9 Horror, Thriller "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "item = random.choice(list(set(train_ui.indices)))\n",
+ "\n",
+ "embeddings_norm = (\n",
+ " model.Qi / np.linalg.norm(model.Qi, axis=1)[:, None]\n",
+ ") # we do not mean-center here\n",
+ "# omitting normalization also makes sense, but items with a greater magnitude will be recommended more often\n",
+ "\n",
+ "similarity_scores = np.dot(embeddings_norm, embeddings_norm[item].T)\n",
+ "top_similar_items = pd.DataFrame(\n",
+ " enumerate(similarity_scores), columns=[\"code\", \"score\"]\n",
+ ").sort_values(by=[\"score\"], ascending=[False])[:10]\n",
+ "\n",
+ "top_similar_items[\"item_id\"] = top_similar_items[\"code\"].apply(\n",
+ " lambda x: item_code_id[x]\n",
+ ")\n",
+ "\n",
+ "items = pd.read_csv(\"./Datasets/ml-100k/movies.csv\")\n",
+ "\n",
+ "result = pd.merge(top_similar_items, items, left_on=\"item_id\", right_on=\"id\")\n",
+ "\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# project task 5: implement SVD on top of the baseline (as it is in the Surprise library)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# making changes to our implementation by considering additional parameters in the gradient descent procedure\n",
+ "# seems to be the fastest option\n",
+ "# please save the output in 'Recommendations generated/ml-100k/Self_SVDBaseline_reco.csv' and\n",
+ "# 'Recommendations generated/ml-100k/Self_SVDBaseline_estimations.csv'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Ready-made SVD - Surprise implementation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### SVD"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Generating predictions...\n",
+ "Generating top N recommendations...\n",
+ "Generating predictions...\n"
+ ]
+ }
+ ],
+ "source": [
+ "import helpers\n",
+ "import surprise as sp\n",
+ "\n",
+ "algo = sp.SVD(biased=False) # to use unbiased version\n",
+ "\n",
+ "helpers.ready_made(\n",
+ " algo,\n",
+ " reco_path=\"Recommendations generated/ml-100k/Ready_SVD_reco.csv\",\n",
+ " estimations_path=\"Recommendations generated/ml-100k/Ready_SVD_estimations.csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### SVD biased - on top of the baseline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Generating predictions...\n",
+ "Generating top N recommendations...\n",
+ "Generating predictions...\n"
+ ]
+ }
+ ],
+ "source": [
+ "algo = sp.SVD() # default is biased=True\n",
+ "\n",
+ "helpers.ready_made(\n",
+ " algo,\n",
+ " reco_path=\"Recommendations generated/ml-100k/Ready_SVDBiased_reco.csv\",\n",
+ " estimations_path=\"Recommendations generated/ml-100k/Ready_SVDBiased_estimations.csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "943it [00:00, 11249.52it/s]\n",
+ "943it [00:00, 10927.13it/s]\n",
+ "943it [00:00, 11816.00it/s]\n",
+ "943it [00:00, 11204.84it/s]\n",
+ "943it [00:00, 11803.13it/s]\n",
+ "943it [00:00, 10580.63it/s]\n",
+ "943it [00:00, 11843.28it/s]\n",
+ "943it [00:00, 12313.76it/s]\n",
+ "943it [00:00, 10678.21it/s]\n",
+ "943it [00:00, 9772.22it/s]\n",
+ "943it [00:00, 10699.52it/s]\n",
+ "943it [00:00, 11789.55it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model | \n",
+ " RMSE | \n",
+ " MAE | \n",
+ " precision | \n",
+ " recall | \n",
+ " F_1 | \n",
+ " F_05 | \n",
+ " precision_super | \n",
+ " recall_super | \n",
+ " NDCG | \n",
+ " mAP | \n",
+ " MRR | \n",
+ " LAUC | \n",
+ " HR | \n",
+ " Reco in test | \n",
+ " Test coverage | \n",
+ " Shannon | \n",
+ " Gini | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Self_TopPop | \n",
+ " 2.508258 | \n",
+ " 2.217909 | \n",
+ " 0.188865 | \n",
+ " 0.116919 | \n",
+ " 0.118732 | \n",
+ " 0.141584 | \n",
+ " 0.130472 | \n",
+ " 0.137473 | \n",
+ " 0.214651 | \n",
+ " 0.111707 | \n",
+ " 0.400939 | \n",
+ " 0.555546 | \n",
+ " 0.765642 | \n",
+ " 1.000000 | \n",
+ " 0.038961 | \n",
+ " 3.159079 | \n",
+ " 0.987317 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_SVD | \n",
+ " 0.950347 | \n",
+ " 0.749312 | \n",
+ " 0.100636 | \n",
+ " 0.050514 | \n",
+ " 0.055794 | \n",
+ " 0.070753 | \n",
+ " 0.091202 | \n",
+ " 0.082734 | \n",
+ " 0.114054 | \n",
+ " 0.053200 | \n",
+ " 0.248803 | \n",
+ " 0.521983 | \n",
+ " 0.517497 | \n",
+ " 0.992153 | \n",
+ " 0.210678 | \n",
+ " 4.418683 | \n",
+ " 0.952848 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_SVD | \n",
+ " 0.914143 | \n",
+ " 0.717131 | \n",
+ " 0.101803 | \n",
+ " 0.042134 | \n",
+ " 0.051610 | \n",
+ " 0.068543 | \n",
+ " 0.091953 | \n",
+ " 0.071255 | \n",
+ " 0.104015 | \n",
+ " 0.048817 | \n",
+ " 0.193027 | \n",
+ " 0.517784 | \n",
+ " 0.471898 | \n",
+ " 0.867232 | \n",
+ " 0.147908 | \n",
+ " 3.871296 | \n",
+ " 0.971820 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Baseline | \n",
+ " 0.949459 | \n",
+ " 0.752487 | \n",
+ " 0.091410 | \n",
+ " 0.037652 | \n",
+ " 0.046030 | \n",
+ " 0.061286 | \n",
+ " 0.079614 | \n",
+ " 0.056463 | \n",
+ " 0.095957 | \n",
+ " 0.043178 | \n",
+ " 0.198193 | \n",
+ " 0.515501 | \n",
+ " 0.437964 | \n",
+ " 1.000000 | \n",
+ " 0.033911 | \n",
+ " 2.836513 | \n",
+ " 0.991139 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_SVDBiased | \n",
+ " 0.939472 | \n",
+ " 0.739816 | \n",
+ " 0.085896 | \n",
+ " 0.036073 | \n",
+ " 0.043528 | \n",
+ " 0.057643 | \n",
+ " 0.077039 | \n",
+ " 0.057463 | \n",
+ " 0.097753 | \n",
+ " 0.045546 | \n",
+ " 0.219839 | \n",
+ " 0.514709 | \n",
+ " 0.431601 | \n",
+ " 0.997455 | \n",
+ " 0.168831 | \n",
+ " 4.217578 | \n",
+ " 0.962577 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_Random | \n",
+ " 1.521845 | \n",
+ " 1.225949 | \n",
+ " 0.047190 | \n",
+ " 0.020753 | \n",
+ " 0.024810 | \n",
+ " 0.032269 | \n",
+ " 0.029506 | \n",
+ " 0.023707 | \n",
+ " 0.050075 | \n",
+ " 0.018728 | \n",
+ " 0.121957 | \n",
+ " 0.506893 | \n",
+ " 0.329799 | \n",
+ " 0.986532 | \n",
+ " 0.184704 | \n",
+ " 5.099706 | \n",
+ " 0.907217 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_I-KNN | \n",
+ " 1.030386 | \n",
+ " 0.813067 | \n",
+ " 0.026087 | \n",
+ " 0.006908 | \n",
+ " 0.010593 | \n",
+ " 0.016046 | \n",
+ " 0.021137 | \n",
+ " 0.009522 | \n",
+ " 0.024214 | \n",
+ " 0.008958 | \n",
+ " 0.048068 | \n",
+ " 0.499885 | \n",
+ " 0.154825 | \n",
+ " 0.402333 | \n",
+ " 0.434343 | \n",
+ " 5.133650 | \n",
+ " 0.877999 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_I-KNNBaseline | \n",
+ " 0.935327 | \n",
+ " 0.737424 | \n",
+ " 0.002545 | \n",
+ " 0.000755 | \n",
+ " 0.001105 | \n",
+ " 0.001602 | \n",
+ " 0.002253 | \n",
+ " 0.000930 | \n",
+ " 0.003444 | \n",
+ " 0.001362 | \n",
+ " 0.011760 | \n",
+ " 0.496724 | \n",
+ " 0.021209 | \n",
+ " 0.482821 | \n",
+ " 0.059885 | \n",
+ " 2.232578 | \n",
+ " 0.994487 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Ready_U-KNN | \n",
+ " 1.023495 | \n",
+ " 0.807913 | \n",
+ " 0.000742 | \n",
+ " 0.000205 | \n",
+ " 0.000305 | \n",
+ " 0.000449 | \n",
+ " 0.000536 | \n",
+ " 0.000198 | \n",
+ " 0.000845 | \n",
+ " 0.000274 | \n",
+ " 0.002744 | \n",
+ " 0.496441 | \n",
+ " 0.007423 | \n",
+ " 0.602121 | \n",
+ " 0.010823 | \n",
+ " 2.089186 | \n",
+ " 0.995706 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_TopRated | \n",
+ " 1.030712 | \n",
+ " 0.820904 | \n",
+ " 0.000954 | \n",
+ " 0.000188 | \n",
+ " 0.000298 | \n",
+ " 0.000481 | \n",
+ " 0.000644 | \n",
+ " 0.000223 | \n",
+ " 0.001043 | \n",
+ " 0.000335 | \n",
+ " 0.003348 | \n",
+ " 0.496433 | \n",
+ " 0.009544 | \n",
+ " 0.699046 | \n",
+ " 0.005051 | \n",
+ " 1.945910 | \n",
+ " 0.995669 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_BaselineUI | \n",
+ " 0.967585 | \n",
+ " 0.762740 | \n",
+ " 0.000954 | \n",
+ " 0.000170 | \n",
+ " 0.000278 | \n",
+ " 0.000463 | \n",
+ " 0.000644 | \n",
+ " 0.000189 | \n",
+ " 0.000752 | \n",
+ " 0.000168 | \n",
+ " 0.001677 | \n",
+ " 0.496424 | \n",
+ " 0.009544 | \n",
+ " 0.600530 | \n",
+ " 0.005051 | \n",
+ " 1.803126 | \n",
+ " 0.996380 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Self_IKNN | \n",
+ " 1.018363 | \n",
+ " 0.808793 | \n",
+ " 0.000318 | \n",
+ " 0.000108 | \n",
+ " 0.000140 | \n",
+ " 0.000189 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000214 | \n",
+ " 0.000037 | \n",
+ " 0.000368 | \n",
+ " 0.496391 | \n",
+ " 0.003181 | \n",
+ " 0.392153 | \n",
+ " 0.115440 | \n",
+ " 4.174741 | \n",
+ " 0.965327 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model RMSE MAE precision recall F_1 \\\n",
+ "0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 \n",
+ "0 Ready_SVD 0.950347 0.749312 0.100636 0.050514 0.055794 \n",
+ "0 Self_SVD 0.914143 0.717131 0.101803 0.042134 0.051610 \n",
+ "0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 \n",
+ "0 Ready_SVDBiased 0.939472 0.739816 0.085896 0.036073 0.043528 \n",
+ "0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 \n",
+ "0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 0.010593 \n",
+ "0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 0.001105 \n",
+ "0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 0.000305 \n",
+ "0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 \n",
+ "0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 \n",
+ "0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 \n",
+ "\n",
+ " F_05 precision_super recall_super NDCG mAP MRR \\\n",
+ "0 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 \n",
+ "0 0.070753 0.091202 0.082734 0.114054 0.053200 0.248803 \n",
+ "0 0.068543 0.091953 0.071255 0.104015 0.048817 0.193027 \n",
+ "0 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 \n",
+ "0 0.057643 0.077039 0.057463 0.097753 0.045546 0.219839 \n",
+ "0 0.032269 0.029506 0.023707 0.050075 0.018728 0.121957 \n",
+ "0 0.016046 0.021137 0.009522 0.024214 0.008958 0.048068 \n",
+ "0 0.001602 0.002253 0.000930 0.003444 0.001362 0.011760 \n",
+ "0 0.000449 0.000536 0.000198 0.000845 0.000274 0.002744 \n",
+ "0 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 \n",
+ "0 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 \n",
+ "0 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 \n",
+ "\n",
+ " LAUC HR Reco in test Test coverage Shannon Gini \n",
+ "0 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317 \n",
+ "0 0.521983 0.517497 0.992153 0.210678 4.418683 0.952848 \n",
+ "0 0.517784 0.471898 0.867232 0.147908 3.871296 0.971820 \n",
+ "0 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139 \n",
+ "0 0.514709 0.431601 0.997455 0.168831 4.217578 0.962577 \n",
+ "0 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217 \n",
+ "0 0.499885 0.154825 0.402333 0.434343 5.133650 0.877999 \n",
+ "0 0.496724 0.021209 0.482821 0.059885 2.232578 0.994487 \n",
+ "0 0.496441 0.007423 0.602121 0.010823 2.089186 0.995706 \n",
+ "0 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669 \n",
+ "0 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380 \n",
+ "0 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dir_path = \"Recommendations generated/ml-100k/\"\n",
+ "super_reactions = [4, 5]\n",
+ "test = pd.read_csv(\"./Datasets/ml-100k/test.csv\", sep=\"\\t\", header=None)\n",
+ "\n",
+ "ev.evaluate_all(test, dir_path, super_reactions)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/P4. Matrix Factorization.pdf b/P4. Matrix Factorization.pdf
new file mode 100644
index 0000000..548d412
Binary files /dev/null and b/P4. Matrix Factorization.pdf differ