Zadanie-1

2021-06-07 19:19:39 +02:00 · 2021-06-07 19:19:39 +02:00 · 4499231c79
commit 4499231c79
parent 388c96f32a
2 changed files with 350 additions and 40 deletions
--- a/preparation.ipynb
+++ b/preparation.ipynb
--- a/Baseline.ipynb
+++ b/Baseline.ipynb
@ -195,7 +195,7 @@
    {
     "data": {
      "text/plain": [
-       "<3x4 sparse matrix of type '<class 'numpy.int64'>'\n",
+       "<3x4 sparse matrix of type '<class 'numpy.intc'>'\n",
       "\twith 8 stored elements in Compressed Sparse Row format>"
      ]
     },
@ -229,7 +229,7 @@
      "text/plain": [
       "matrix([[4, 1, 3, 0],\n",
       "        [0, 2, 0, 1],\n",
-       "        [2, 0, 5, 4]])"
+       "        [2, 0, 5, 4]], dtype=int32)"
      ]
     },
     "metadata": {},
@ -306,7 +306,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "1.13 µs ± 79.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
+      "658 ns ± 16.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
      "Inefficient way to access items rated by user:\n"
     ]
    },
@ -314,7 +314,7 @@
     "data": {
      "text/plain": [
       "array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,\n",
-       "       171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
+       "       171, 172, 173, 194, 208, 225, 473, 495, 549, 615])"
      ]
     },
     "metadata": {},
@ -324,7 +324,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "149 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
+      "67.8 µs ± 1.68 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
@ -364,7 +364,7 @@
      "text/plain": [
       "matrix([[4, 1, 3, 0],\n",
       "        [0, 2, 0, 1],\n",
-       "        [2, 0, 5, 4]])"
+       "        [2, 0, 5, 4]], dtype=int32)"
      ]
     },
     "metadata": {},
@ -877,7 +877,7 @@
      "text/plain": [
       "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
       "        [0, 1, 2, 3, 0, 0, 0, 0],\n",
-       "        [0, 0, 0, 5, 0, 3, 4, 0]])"
+       "        [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
      ]
     },
     "metadata": {},
@ -1070,6 +1070,269 @@
    "- For each row of matrix M' representing the user u, we compute the mean of ratings and denote it by b_u."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "class selfBaselineIU():\n",
    "    \"\"\"Baseline rating predictor: estimate(u, i) = row_means[u] + col_means[i].\n",
    "\n",
    "    col_means holds the mean stored rating of every item; row_means holds,\n",
    "    for every user, the mean of that user's ratings after the item means\n",
    "    have been subtracted.\n",
    "    \"\"\"\n",
    "\n",
    "    def fit(self, train_ui):\n",
    "        \"\"\"Learn item and user means from a sparse rating matrix.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        train_ui : scipy.sparse CSR matrix\n",
    "            Rows are users, columns are items; only stored entries are ratings.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        CSR matrix\n",
    "            Copy of train_ui with the item mean and then the user mean\n",
    "            subtracted from every stored rating.\n",
    "        \"\"\"\n",
    "        self.train_ui=train_ui.copy()\n",
    "        self.train_iu=train_ui.transpose().tocsr()\n",
    "\n",
    "        result=self.train_ui.copy()\n",
    "\n",
    "        # Stored ratings per item / per user, read off the CSR row pointers.\n",
    "        ratings_per_item=np.diff(self.train_iu.indptr)\n",
    "        ratings_per_user=np.diff(result.indptr)\n",
    "\n",
    "        # Items without ratings keep a mean of 0 instead of producing 0/0.\n",
    "        self.col_means=np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,\n",
    "                                 out=np.zeros(result.shape[1]), where=ratings_per_item!=0)\n",
    "\n",
    "        # Subtract through .data so that only stored ratings change; unrated cells stay empty.\n",
    "        # result.indices gives the column of each stored entry, hence its item mean.\n",
    "        result.data=result.data-self.col_means[result.indices]\n",
    "\n",
    "        # Users without ratings get a mean of 0 as well (plain division would give NaN).\n",
    "        self.row_means=np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,\n",
    "                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)\n",
    "\n",
    "        # Row u owns ratings_per_user[u] consecutive entries of .data,\n",
    "        # so repeating each mean that many times lines it up with .data.\n",
    "        result.data=result.data-np.repeat(self.row_means, ratings_per_user)\n",
    "\n",
    "        return result\n",
    "\n",
    "\n",
    "    def recommend(self, user_code_id, item_code_id, topK=10):\n",
    "        \"\"\"Return, for each user, the topK unrated items with the highest estimate.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        user_code_id, item_code_id : indexable\n",
    "            Translate a row / column position of train_ui into the id to emit.\n",
    "        topK : int\n",
    "            Maximum number of items listed per user.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        list of list\n",
    "            One entry per user that has at least one unrated item, in the form\n",
    "            [user, item1, score1, item2, score2, ...] with the best score first.\n",
    "        \"\"\"\n",
    "        indptr=self.train_ui.indptr\n",
    "        indices=self.train_ui.indices\n",
    "\n",
    "        result=[]\n",
    "        for nb_user in range(self.train_ui.shape[0]):\n",
    "            # A set makes the 'already rated' test O(1) per item.\n",
    "            user_rated=set(indices[indptr[nb_user]:indptr[nb_user+1]].tolist())\n",
    "\n",
    "            # Scores of this user for all items; no dense users x items matrix is needed.\n",
    "            user_scores=self.row_means[nb_user]+self.col_means\n",
    "\n",
    "            item_scores=[(item_code_id[item], score)\n",
    "                         for item, score in enumerate(user_scores)\n",
    "                         if item not in user_rated]\n",
    "            if not item_scores:\n",
    "                continue # user rated every item - nothing to recommend\n",
    "\n",
    "            # Stable sort: items with equal scores stay in column order.\n",
    "            item_scores.sort(key=lambda pair: pair[1], reverse=True)\n",
    "            result.append([user_code_id[nb_user]]+list(chain(*item_scores[:topK])))\n",
    "        return result\n",
    "\n",
    "    def estimate(self, user_code_id, item_code_id, test_ui):\n",
    "        \"\"\"Estimate the rating of every (user, item) pair returned by test_ui.nonzero().\n",
    "\n",
    "        Returns a list of [user, item, estimated_score] rows.\n",
    "        \"\"\"\n",
    "        return [[user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]]\n",
    "                for user, item in zip(*test_ui.nonzero())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
       "        [0, 1, 2, 3, 0, 0, 0, 0],\n",
       "        [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After subtracting columns and rows:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[-0.375     ,  1.125     ,  0.        ,  0.        , -0.375     ,\n",
       "          0.        ,  0.        , -0.375     ],\n",
       "        [ 0.        , -0.66666667,  0.83333333, -0.16666667,  0.        ,\n",
       "          0.        ,  0.        ,  0.        ],\n",
       "        [ 0.        ,  0.        ,  0.        ,  0.66666667,  0.        ,\n",
       "         -0.33333333, -0.33333333,  0.        ]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recommend best unseen item:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[[0, 30, 4.375], [10, 40, 4.166666666666667], [20, 40, 5.333333333333333]]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Print estimations on unseen items:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>item</th>\n",
       "      <th>est_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>4.375000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>40</td>\n",
       "      <td>4.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>3.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "      <td>2.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20</td>\n",
       "      <td>70</td>\n",
       "      <td>4.333333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user  item  est_score\n",
       "0     0    60   4.375000\n",
       "1    10    40   4.166667\n",
       "2    20     0   3.333333\n",
       "3    20    20   2.333333\n",
       "4    20    70   4.333333"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Toy ratings: tab-separated files without a header row.\n",
    "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "\n",
    "# NOTE: despite the _iu suffix these matrices are passed to fit() as train_ui (rows = users).\n",
    "toy_train_iu, toy_test_iu, toy_user_code_id, toy_user_id_code, \\\n",
    "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
    "\n",
    "print('Training data:')\n",
    "display(toy_train_iu.todense())\n",
    "\n",
    "model=selfBaselineIU()\n",
    "print('After subtracting columns and rows:')\n",
    "display(model.fit(toy_train_iu).todense())\n",
    "\n",
    "print('Recommend best unseen item:')\n",
    "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
    "\n",
    "print('Print estimations on unseen items:')\n",
    "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_iu))\n",
    "estimations.columns=['user', 'item', 'est_score']\n",
    "display(estimations)\n",
    "\n",
    "top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
    "\n",
    "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineIU_reco.csv', index=False, header=False)\n",
    "\n",
    "# The estimations frame built above is reused: header=False leaves the column names out of the file.\n",
    "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineIU_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the baseline on the full training matrix; the learned means stay on `model`.\n",
    "model=selfBaselineIU()\n",
    "model.fit(train_ui)\n",
    "\n",
    "# Up to ten recommended items per user, written as one headerless CSV row per user.\n",
    "top_n=pd.DataFrame(\n",
    "    model.recommend(user_code_id, item_code_id, topK=10)\n",
    ")\n",
    "top_n.to_csv(\n",
    "    'Recommendations generated/Projects/Project1_Self_BaselineIU_reco.csv',\n",
    "    index=False,\n",
    "    header=False,\n",
    ")\n",
    "\n",
    "# Estimated score for every (user, item) pair present in the test matrix.\n",
    "estimations=pd.DataFrame(\n",
    "    model.estimate(user_code_id, item_code_id, test_ui)\n",
    ")\n",
    "estimations.to_csv(\n",
    "    'Recommendations generated/Projects/Project1_Self_BaselineIU_estimations.csv',\n",
    "    index=False,\n",
    "    header=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -1079,7 +1342,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
@ -1136,7 +1399,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
@ -1153,7 +1416,7 @@
       "0.7524871012820799"
      ]
     },
-     "execution_count": 23,
+     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -1183,24 +1446,24 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "RMSE: 1.5239\n",
+      "RMSE: 1.5230\n",
-      "MAE:  1.2268\n"
+      "MAE:  1.2226\n"
     ]
    },
    {
     "data": {
      "text/plain": [
-       "1.2267993503843746"
+       "1.2226271020019277"
      ]
     },
-     "execution_count": 24,
+     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -1233,6 +1496,34 @@
    "\n",
    "sp.accuracy.mae(predictions, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
@ -1251,7 +1542,12 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.8.8"
  },
  "metadata": {
   "interpreter": {
    "hash": "2a3a95f8b675c5b7dd6a35e1675edaf697539b1f0a71c4603e9520a8bbd07d82"
   }
  }
 },
 "nbformat": 4,