Zadanie-1
This commit is contained in:
parent
388c96f32a
commit
4499231c79
File diff suppressed because one or more lines are too long
@ -195,7 +195,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"<3x4 sparse matrix of type '<class 'numpy.int64'>'\n",
|
"<3x4 sparse matrix of type '<class 'numpy.intc'>'\n",
|
||||||
"\twith 8 stored elements in Compressed Sparse Row format>"
|
"\twith 8 stored elements in Compressed Sparse Row format>"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -229,7 +229,7 @@
|
|||||||
"text/plain": [
|
"text/plain": [
|
||||||
"matrix([[4, 1, 3, 0],\n",
|
"matrix([[4, 1, 3, 0],\n",
|
||||||
" [0, 2, 0, 1],\n",
|
" [0, 2, 0, 1],\n",
|
||||||
" [2, 0, 5, 4]])"
|
" [2, 0, 5, 4]], dtype=int32)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -306,7 +306,7 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"1.13 µs ± 79.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
|
"658 ns ± 16.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
|
||||||
"Inefficient way to access items rated by user:\n"
|
"Inefficient way to access items rated by user:\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -314,7 +314,7 @@
|
|||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
|
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
|
||||||
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
|
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -324,7 +324,7 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"149 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
|
"67.8 µs ± 1.68 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -364,7 +364,7 @@
|
|||||||
"text/plain": [
|
"text/plain": [
|
||||||
"matrix([[4, 1, 3, 0],\n",
|
"matrix([[4, 1, 3, 0],\n",
|
||||||
" [0, 2, 0, 1],\n",
|
" [0, 2, 0, 1],\n",
|
||||||
" [2, 0, 5, 4]])"
|
" [2, 0, 5, 4]], dtype=int32)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -877,7 +877,7 @@
|
|||||||
"text/plain": [
|
"text/plain": [
|
||||||
"matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
|
"matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
|
||||||
" [0, 1, 2, 3, 0, 0, 0, 0],\n",
|
" [0, 1, 2, 3, 0, 0, 0, 0],\n",
|
||||||
" [0, 0, 0, 5, 0, 3, 4, 0]])"
|
" [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -1070,6 +1070,269 @@
|
|||||||
"- For each row of matrix M' representing the user u, we compute the mean of ratings and denote by b_u."
|
"- For each row of matrix M' representing the user u, we compute the mean of ratings and denote by b_u."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
class selfBaselineIU():
    """Baseline rating predictor: the estimate for (user, item) is b_u + b_i.

    b_i is the mean rating of item i. b_u is the mean of user u's ratings
    after the item means have been subtracted from them.

    Attributes set by ``fit``:
        train_ui:  copy of the training matrix (users x items, CSR).
        train_iu:  its transpose in CSR format (items x users).
        col_means: 1-D float array with b_i for every item (0 if unrated).
        row_means: 1-D float array with b_u for every user (0 if no ratings).
    """

    def fit(self, train_ui):
        """Compute item and user biases from a sparse user-item matrix.

        Args:
            train_ui: scipy sparse matrix of ratings, users in rows and items
                in columns. Every *stored* entry counts as a rating.

        Returns:
            A CSR matrix with the same sparsity pattern as ``train_ui`` whose
            stored values are ``rating - b_i - b_u``.
        """
        self.train_ui = train_ui.tocsr().copy()
        self.train_iu = self.train_ui.transpose().tocsr()

        result = self.train_ui.copy()
        # Work in floats so the in-place subtractions below are exact.
        result.data = result.data.astype(np.float64)

        # Item means; items without any rating keep a mean of 0.
        item_counts = np.diff(self.train_iu.indptr)
        item_sums = np.asarray(result.sum(axis=0)).ravel()
        self.col_means = np.divide(item_sums, item_counts,
                                   out=np.zeros(self.train_iu.shape[0]),
                                   where=item_counts != 0)

        # Subtract b_i from every stored entry by indexing with the column ids.
        # Sparse arithmetic (result - something) would drop entries that become
        # zero; touching .data directly keeps the sparsity pattern intact and
        # avoids the power(0) + sentinel-value workaround.
        result.data -= self.col_means[result.indices]

        # User means of the item-centred ratings; users without ratings get 0
        # instead of the NaN a plain 0/0 division would produce.
        user_counts = np.diff(result.indptr)
        user_sums = np.asarray(result.sum(axis=1)).ravel()
        self.row_means = np.divide(user_sums, user_counts,
                                   out=np.zeros(result.shape[0]),
                                   where=user_counts != 0)

        # Each row mean is repeated once per stored entry of that row.
        result.data -= np.repeat(self.row_means, user_counts)

        return result

    def recommend(self, user_code_id, item_code_id, topK=10):
        """Return the ``topK`` best unseen items for every user.

        Args:
            user_code_id: mapping from row index to external user id.
            item_code_id: mapping from column index to external item id.
            topK: number of items to keep per user.

        Returns:
            A list with one entry per user that has at least one unseen item,
            in the format ``[user, item1, score1, item2, score2, ...]``.
            Items are ordered by decreasing score; ties keep ascending item
            index order.
        """
        n_users, n_items = self.train_ui.shape
        indptr, indices = self.train_ui.indptr, self.train_ui.indices

        result = []
        for nb_user in range(n_users):
            # Mask out the items the user has already rated.
            unseen = np.ones(n_items, dtype=bool)
            unseen[indices[indptr[nb_user]:indptr[nb_user + 1]]] = False
            candidates = np.flatnonzero(unseen)
            if candidates.size == 0:
                continue  # nothing left to recommend for this user

            scores = self.row_means[nb_user] + self.col_means[candidates]
            # Stable sort on the negated scores == stable descending sort.
            best = np.argsort(-scores, kind='stable')[:topK]

            item_scores = [(item_code_id[item], score)
                           for item, score in zip(candidates[best].tolist(), scores[best])]
            result.append([user_code_id[nb_user]] + list(chain(*item_scores)))
        return result

    def estimate(self, user_code_id, item_code_id, test_ui):
        """Estimate the rating of every (user, item) pair stored in ``test_ui``.

        Args:
            user_code_id: mapping from row index to external user id.
            item_code_id: mapping from column index to external item id.
            test_ui: sparse user-item matrix; its non-zero positions are the
                pairs to score.

        Returns:
            A list of ``[user, item, b_u + b_i]`` triples.
        """
        return [
            [user_code_id[user], item_code_id[item], self.row_means[user] + self.col_means[item]]
            for user, item in zip(*test_ui.nonzero())
        ]
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Training data:\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
|
||||||
|
" [0, 1, 2, 3, 0, 0, 0, 0],\n",
|
||||||
|
" [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"After subtracting columns and rows:\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"matrix([[-0.375 , 1.125 , 0. , 0. , -0.375 ,\n",
|
||||||
|
" 0. , 0. , -0.375 ],\n",
|
||||||
|
" [ 0. , -0.66666667, 0.83333333, -0.16666667, 0. ,\n",
|
||||||
|
" 0. , 0. , 0. ],\n",
|
||||||
|
" [ 0. , 0. , 0. , 0.66666667, 0. ,\n",
|
||||||
|
" -0.33333333, -0.33333333, 0. ]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Recommend best unseen item:\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[[0, 30, 4.375], [10, 40, 4.166666666666667], [20, 40, 5.333333333333333]]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Print estimations on unseen items:\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>user</th>\n",
|
||||||
|
" <th>item</th>\n",
|
||||||
|
" <th>est_score</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>60</td>\n",
|
||||||
|
" <td>4.375000</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>10</td>\n",
|
||||||
|
" <td>40</td>\n",
|
||||||
|
" <td>4.166667</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>20</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>3.333333</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>20</td>\n",
|
||||||
|
" <td>20</td>\n",
|
||||||
|
" <td>2.333333</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>20</td>\n",
|
||||||
|
" <td>70</td>\n",
|
||||||
|
" <td>4.333333</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" user item est_score\n",
|
||||||
|
"0 0 60 4.375000\n",
|
||||||
|
"1 10 40 4.166667\n",
|
||||||
|
"2 20 0 3.333333\n",
|
||||||
|
"3 20 20 2.333333\n",
|
||||||
|
"4 20 70 4.333333"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
# Toy example: read the tab-separated ratings, fit the baseline on them,
# show the intermediate results and export recommendations and estimations.
toy_train_read = pd.read_csv(
    './Datasets/toy-example/train.csv',
    sep='\t',
    header=None,
    names=['user', 'item', 'rating', 'timestamp'],
)
toy_test_read = pd.read_csv(
    './Datasets/toy-example/test.csv',
    sep='\t',
    header=None,
    names=['user', 'item', 'rating', 'timestamp'],
)

(
    toy_train_iu, toy_test_iu,
    toy_user_code_id, toy_user_id_code,
    toy_item_code_id, toy_item_id_code,
) = helpers.data_to_csr(toy_train_read, toy_test_read)

print('Training data:')
display(toy_train_iu.todense())

model = selfBaselineIU()
print('After subtracting columns and rows:')
display(model.fit(toy_train_iu).todense())

print('Recommend best unseen item:')
display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))

print('Print estimations on unseen items:')
estimations = pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_iu))
estimations.columns = ['user', 'item', 'est_score']
display(estimations)

# Persist the top-3 recommendations per user (no header, no index column).
top_n = pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))
top_n.to_csv(
    'Recommendations generated/toy-example/Self_BaselineIU_reco.csv',
    index=False,
    header=False,
)

# Persist the estimations for the test pairs in the same header-less layout.
estimations = pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_iu))
estimations.to_csv(
    'Recommendations generated/toy-example/Self_BaselineIU_estimations.csv',
    index=False,
    header=False,
)
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Fit the baseline on the full training matrix and export its output for the
# project: top-10 recommendations per user and estimations for the test pairs.
model = selfBaselineIU()
model.fit(train_ui)

top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))
top_n.to_csv(
    'Recommendations generated/Projects/Project1_Self_BaselineIU_reco.csv',
    index=False,
    header=False,
)

estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv(
    'Recommendations generated/Projects/Project1_Self_BaselineIU_estimations.csv',
    index=False,
    header=False,
)
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -1079,7 +1342,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": 28,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -1136,7 +1399,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": 29,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -1153,7 +1416,7 @@
|
|||||||
"0.7524871012820799"
|
"0.7524871012820799"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 23,
|
"execution_count": 29,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -1183,24 +1446,24 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 24,
|
"execution_count": 30,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"RMSE: 1.5239\n",
|
"RMSE: 1.5230\n",
|
||||||
"MAE: 1.2268\n"
|
"MAE: 1.2226\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"1.2267993503843746"
|
"1.2226271020019277"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 24,
|
"execution_count": 30,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -1233,6 +1496,34 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"sp.accuracy.mae(predictions, verbose=True)"
|
"sp.accuracy.mae(predictions, verbose=True)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -1251,7 +1542,12 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.5"
|
"version": "3.8.8"
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "2a3a95f8b675c5b7dd6a35e1675edaf697539b1f0a71c4603e9520a8bbd07d82"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
Loading…
Reference in New Issue
Block a user