workshops_recommender_systems/P1. Baseline.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preparing dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy.sparse as sparse\n",
    "from collections import defaultdict\n",
    "from itertools import chain\n",
    "import random\n",
    "\n",
    "# The rating files are tab-separated and have no header row, so column names are given explicitly.\n",
    "train_read = pd.read_csv(\n",
    "    './Datasets/ml-100k/train.csv',\n",
    "    sep='\\t',\n",
    "    header=None,\n",
    "    names=['user', 'item', 'rating', 'timestamp'],\n",
    ")\n",
    "test_read = pd.read_csv(\n",
    "    './Datasets/ml-100k/test.csv',\n",
    "    sep='\\t',\n",
    "    header=None,\n",
    "    names=['user', 'item', 'rating', 'timestamp'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode raw user/item ids as integer codes shared by the train and test rows.\n",
    "train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)\n",
    "\n",
    "user_categorical = train_and_test['user'].astype(\"category\")\n",
    "item_categorical = train_and_test['item'].astype(\"category\")\n",
    "train_and_test['user_code'] = user_categorical.cat.codes\n",
    "train_and_test['item_code'] = item_categorical.cat.codes\n",
    "\n",
    "# Lookup tables in both directions: code -> original id and original id -> code.\n",
    "user_code_id = dict(enumerate(user_categorical.cat.categories))\n",
    "user_id_code = {original_id: code for code, original_id in user_code_id.items()}\n",
    "item_code_id = dict(enumerate(item_categorical.cat.categories))\n",
    "item_id_code = {original_id: code for code, original_id in item_code_id.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>item</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>user_code</th>\n",
       "      <th>item_code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>664</td>\n",
       "      <td>525</td>\n",
       "      <td>4</td>\n",
       "      <td>876526580</td>\n",
       "      <td>663</td>\n",
       "      <td>524</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>49</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>888068651</td>\n",
       "      <td>48</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>352</td>\n",
       "      <td>273</td>\n",
       "      <td>2</td>\n",
       "      <td>884290328</td>\n",
       "      <td>351</td>\n",
       "      <td>272</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>618</td>\n",
       "      <td>96</td>\n",
       "      <td>3</td>\n",
       "      <td>891307749</td>\n",
       "      <td>617</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>560</td>\n",
       "      <td>24</td>\n",
       "      <td>2</td>\n",
       "      <td>879976772</td>\n",
       "      <td>559</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user  item  rating  timestamp  user_code  item_code\n",
       "0   664   525       4  876526580        663        524\n",
       "1    49     1       2  888068651         48          0\n",
       "2   352   273       2  884290328        351        272\n",
       "3   618    96       3  891307749        617         95\n",
       "4   560    24       2  879976772        559         23"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_and_test.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_and_test is concat([train_read, test_read]) with a fresh index, so its first\n",
    "# len(train_read) rows are the train rows and the remaining rows are the test rows.\n",
    "# Splitting by position (instead of merging on every column) cannot duplicate a row that\n",
    "# occurs more than once, which would otherwise be summed when the CSR matrices are built.\n",
    "n_train = len(train_read)\n",
    "assert len(train_and_test) == n_train + len(test_read)\n",
    "train_df = train_and_test.iloc[:n_train].reset_index(drop=True)\n",
    "test_df = train_and_test.iloc[n_train:].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of users and items: codes start at 0, so the count is the largest code plus one\n",
    "U = train_and_test['user_code'].max() + 1\n",
    "I = train_and_test['item_code'].max() + 1\n",
    "\n",
    "# User x item rating matrices in CSR format; train and test share the same shape\n",
    "train_ui = sparse.csr_matrix(\n",
    "    (train_df['rating'], (train_df['user_code'], train_df['item_code'])),\n",
    "    shape=(U, I),\n",
    ")\n",
    "test_ui = sparse.csr_matrix(\n",
    "    (test_df['rating'], (test_df['user_code'], test_df['item_code'])),\n",
    "    shape=(U, I),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# The steps above are the same for many algorithms, so the code was put in a separate file (helpers):\n",
    "import helpers\n",
    "# NOTE: the files are re-read here without column names, replacing the named train_read/test_read frames.\n",
    "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n",
    "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
    "# NOTE(review): data_to_csr presumably repeats the id encoding and CSR construction shown above - confirm in helpers.py.\n",
    "train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CSR matrices - what are they?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<3x4 sparse matrix of type '<class 'numpy.longlong'>'\n",
       "\twith 8 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Triplet form: data[k] is the rating stored at position (row[k], col[k])\n",
    "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
    "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
    "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
    "# No shape is passed, so it is inferred from the largest row and column index\n",
    "sample_csr=sparse.csr_matrix((data, (row, col)))\n",
    "sample_csr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ratings matrix with missing entries replaced by zeros:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[4, 1, 3, 0],\n",
       "        [0, 2, 0, 1],\n",
       "        [2, 0, 5, 4]], dtype=int64)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Number of ratings: 8 \n",
      "Number of users: 3 \n",
      "Number of items: 4 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "print('Ratings matrix with missing entries replaced by zeros:')\n",
    "display(sample_csr.todense())\n",
    "\n",
    "# nnz is the number of stored entries; shape unpacks to (number of rows, number of columns)\n",
    "print(\n",
    "    '\\nNumber of ratings: {} \\nNumber of users: {} \\nNumber of items: {} \\n'.format(\n",
    "        sample_csr.nnz, *sample_csr.shape\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ratings data: [4 1 3 2 1 2 5 4]\n",
      "Regarding items: [0 1 2 1 3 0 2 3]\n",
      "Where ratings from 0 to 2 belongs to user 0.\n",
      "Where ratings from 3 to 4 belongs to user 1.\n",
      "Where ratings from 5 to 7 belongs to user 2.\n"
     ]
    }
   ],
   "source": [
    "# data holds the stored ratings row by row; indices holds the column (item) of each stored rating\n",
    "print('Ratings data:', sample_csr.data)\n",
    "print('Regarding items:', sample_csr.indices)\n",
    "\n",
    "# Consecutive indptr values delimit the positions in data/indices that belong to one row (user)\n",
    "for i, (start, stop) in enumerate(zip(sample_csr.indptr[:-1], sample_csr.indptr[1:])):\n",
    "    print('Where ratings from {} to {} belongs to user {}.'.format(start, stop - 1, i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Efficient way to access items rated by user:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,\n",
       "       171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "586 ns ± 31.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
      "Inefficient way to access items rated by user:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,\n",
       "       171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "64.4 µs ± 1.75 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "# Row (user code) used for the comparison below\n",
    "user=123\n",
    "\n",
    "# Slicing the raw indices array with indptr reads the item codes of the row directly\n",
    "print('Efficient way to access items rated by user:')\n",
    "display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n",
    "%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n",
    "\n",
    "# Indexing train_ui[user] goes through sparse row extraction first; compare the timings printed below\n",
    "print('Inefficient way to access items rated by user:')\n",
    "display(train_ui[user].indices)\n",
    "%timeit train_ui[user].indices"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Example: subtracting row means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Our matrix:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[4, 1, 3, 0],\n",
       "        [0, 2, 0, 1],\n",
       "        [2, 0, 5, 4]], dtype=int64)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List of row sums:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[ 8,  3, 11]])"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print('Our matrix:')\n",
    "display(sample_csr.todense())\n",
    "print('List of row sums:')\n",
    "# sum(axis=1) gives one sum per row; ravel() lays the sums out in a single row for display\n",
    "sample_csr.sum(axis=1).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Array with row means:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([2.66666667, 1.5       , 3.66666667])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Diagonal csr matrix with inverse of row sums on diagonal:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[2.66666667, 0.        , 0.        ],\n",
       "        [0.        , 1.5       , 0.        ],\n",
       "        [0.        , 0.        , 3.66666667]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Let's apply them in nonzero entries:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[2.66666667, 2.66666667, 2.66666667, 0.        ],\n",
       "        [0.        , 1.5       , 0.        , 1.5       ],\n",
       "        [3.66666667, 0.        , 3.66666667, 3.66666667]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finally after subtraction:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[ 1.33333333, -1.66666667,  0.33333333,  0.        ],\n",
       "        [ 0.        ,  0.5       ,  0.        , -0.5       ],\n",
       "        [-1.66666667,  0.        ,  1.33333333,  0.33333333]])"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print('Array with row means:')\n",
    "# Mean over the stored (rated) entries only: row sum divided by the number of stored entries in the row\n",
    "row_means=np.asarray(sample_csr.sum(axis=1)).ravel()/np.diff(sample_csr.indptr)\n",
    "display(row_means)\n",
    "\n",
    "print('Diagonal csr matrix with row means on diagonal:')\n",
    "display(sparse.diags(row_means).todense())\n",
    "\n",
    "print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n",
    "# Indicator matrix: 1 at every stored entry, built from the same indices/indptr as sample_csr.\n",
    "# It stands in for sample_csr.power(0), which recent SciPy releases refuse with NotImplementedError.\n",
    "rated_mask=sparse.csr_matrix(\n",
    "    (np.ones_like(sample_csr.data), sample_csr.indices, sample_csr.indptr),\n",
    "    shape=sample_csr.shape,\n",
    ")\n",
    "# Multiplying by the diagonal matrix from the left scales each row of the mask by that row's mean\n",
    "to_subtract=sparse.diags(row_means)*rated_mask\n",
    "display(to_subtract.todense())\n",
    "\n",
    "print(\"Finally after subtraction:\")\n",
    "sample_csr-to_subtract.todense()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Transposing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample matrix: \n",
      " [[4 1 3 0]\n",
      " [0 2 0 1]\n",
      " [2 0 5 4]]\n",
      "\n",
      "Indices: \n",
      " [0 1 2 1 3 0 2 3]\n",
      "\n",
      "Transposed matrix: \n",
      " [[4 0 2]\n",
      " [1 2 0]\n",
      " [3 0 5]\n",
      " [0 1 4]]\n",
      "\n",
      "Indices of transposed matrix: \n",
      " [0 1 2 1 3 0 2 3]\n",
      "\n",
      "Reason:  <class 'scipy.sparse.csc.csc_matrix'>\n",
      "\n",
      "After converting to csr: \n",
      " [0 2 0 1 0 2 1 2]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from scipy import sparse\n",
    "\n",
    "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
    "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
    "data = np.array([4, 1, 3, 2, 1, 5, 2, 4])\n",
    "sample = sparse.csr_matrix((data, (row, col)))\n",
    "print('Sample matrix: \\n', sample.toarray())\n",
    "print('\\nIndices: \\n', sample.indices)\n",
    "\n",
    "# The transposed object is not CSR any more (its type is printed below),\n",
    "# which is why its indices look the same as those of the original matrix\n",
    "transposed = sample.transpose()\n",
    "print('\\nTransposed matrix: \\n', transposed.toarray())\n",
    "print('\\nIndices of transposed matrix: \\n', transposed.indices)\n",
    "\n",
    "print('\\nReason: ', type(transposed))\n",
    "\n",
    "# Converting back to CSR regroups the entries by row of the transposed matrix\n",
    "print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Self made top popular"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# makedirs creates missing parent directories and, with exist_ok=True, accepts directories that\n",
    "# already exist, so each sub-directory is created even when the top-level directory is present.\n",
    "for output_dir in ('./Recommendations generated/ml-100k/', './Recommendations generated/toy-example/'):\n",
    "    os.makedirs(output_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Popularity of an item = number of training ratings it received, rescaled so that the most\n",
    "# rated item gets the largest rating value present in the training matrix.\n",
    "train_iu=train_ui.transpose().tocsr()\n",
    "item_counts=np.diff(train_iu.indptr)\n",
    "scaling_factor=train_ui.max()/max(item_counts)\n",
    "\n",
    "TopPop=[(i, item_counts[i]*scaling_factor) for i in range(train_iu.shape[0])]\n",
    "TopPop.sort(key=lambda x: x[1], reverse=True)\n",
    "#TopPop is a list of pairs (item, rescaled_popularity) sorted descending from the most popular\n",
    "\n",
    "k=10  # number of recommendations per user\n",
    "result=[]\n",
    "\n",
    "for u in range(train_ui.shape[0]):\n",
    "    # A set makes the membership test below constant-time\n",
    "    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]])\n",
    "    rec_user=[]\n",
    "    # Walk the ranking and stop after k unseen items; a user with fewer than k unseen items\n",
    "    # gets a shorter list instead of running past the end of TopPop\n",
    "    for item, score in TopPop:\n",
    "        if len(rec_user)>=k:\n",
    "            break\n",
    "        if item not in user_rated:\n",
    "            rec_user.append((item_code_id[item], score))\n",
    "    # One output row: user id followed by alternating item id and score\n",
    "    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
    "\n",
    "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n",
    "\n",
    "\n",
    "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
    "\n",
    "estimations=[]\n",
    "\n",
    "for user, item in zip(*test_ui.nonzero()):\n",
    "    estimations.append([user_code_id[user], item_code_id[item], item_counts[item]*scaling_factor])\n",
    "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "      <th>16</th>\n",
       "      <th>17</th>\n",
       "      <th>18</th>\n",
       "      <th>19</th>\n",
       "      <th>20</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>50</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>286</td>\n",
       "      <td>4.117021</td>\n",
       "      <td>288</td>\n",
       "      <td>4.053191</td>\n",
       "      <td>294</td>\n",
       "      <td>3.851064</td>\n",
       "      <td>300</td>\n",
       "      <td>...</td>\n",
       "      <td>174</td>\n",
       "      <td>3.531915</td>\n",
       "      <td>98</td>\n",
       "      <td>3.351064</td>\n",
       "      <td>313</td>\n",
       "      <td>3.063830</td>\n",
       "      <td>405</td>\n",
       "      <td>2.904255</td>\n",
       "      <td>79</td>\n",
       "      <td>2.851064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>181</td>\n",
       "      <td>4.446809</td>\n",
       "      <td>1</td>\n",
       "      <td>3.914894</td>\n",
       "      <td>121</td>\n",
       "      <td>3.595745</td>\n",
       "      <td>127</td>\n",
       "      <td>3.563830</td>\n",
       "      <td>174</td>\n",
       "      <td>...</td>\n",
       "      <td>7</td>\n",
       "      <td>3.446809</td>\n",
       "      <td>98</td>\n",
       "      <td>3.351064</td>\n",
       "      <td>56</td>\n",
       "      <td>3.308511</td>\n",
       "      <td>237</td>\n",
       "      <td>3.287234</td>\n",
       "      <td>117</td>\n",
       "      <td>3.159574</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0    1         2    3         4    5         6    7         8    9   ...  \\\n",
       "0   1   50  5.000000  286  4.117021  288  4.053191  294  3.851064  300  ...   \n",
       "1   2  181  4.446809    1  3.914894  121  3.595745  127  3.563830  174  ...   \n",
       "\n",
       "    11        12  13        14   15        16   17        18   19        20  \n",
       "0  174  3.531915  98  3.351064  313  3.063830  405  2.904255   79  2.851064  \n",
       "1    7  3.446809  98  3.351064   56  3.308511  237  3.287234  117  3.159574  \n",
       "\n",
       "[2 rows x 21 columns]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(result).head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Self made global average"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of all stored training ratings; every item gets this same score\n",
    "avg=train_ui.sum()/train_ui.nnz\n",
    "\n",
    "# The number of items is the number of columns of train_ui; the list stays in item-code order\n",
    "GlobalAvg=[(i, avg) for i in range(train_ui.shape[1])]\n",
    "\n",
    "k=10  # number of recommendations per user\n",
    "result=[]\n",
    "\n",
    "for u in range(train_ui.shape[0]):\n",
    "    # A set makes the membership test below constant-time\n",
    "    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]])\n",
    "    rec_user=[]\n",
    "    # Stop after k unseen items; a user with fewer than k unseen items gets a shorter list\n",
    "    # instead of running past the end of GlobalAvg\n",
    "    for item, score in GlobalAvg:\n",
    "        if len(rec_user)>=k:\n",
    "            break\n",
    "        if item not in user_rated:\n",
    "            rec_user.append((item_code_id[item], score))\n",
    "    # One output row: user id followed by alternating item id and score\n",
    "    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
    "\n",
    "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_reco.csv', index=False, header=False)\n",
    "\n",
    "\n",
    "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
    "\n",
    "estimations=[]\n",
    "\n",
    "for user, item in zip(*test_ui.nonzero()):\n",
    "    estimations.append([user_code_id[user], item_code_id[item], avg])\n",
    "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "      <th>16</th>\n",
       "      <th>17</th>\n",
       "      <th>18</th>\n",
       "      <th>19</th>\n",
       "      <th>20</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>10</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>25</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>32</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>33</td>\n",
       "      <td>...</td>\n",
       "      <td>44</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>46</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>50</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>52</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>55</td>\n",
       "      <td>3.529975</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>2</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>3</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>4</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>5</td>\n",
       "      <td>...</td>\n",
       "      <td>6</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>7</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>8</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>9</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>11</td>\n",
       "      <td>3.529975</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0   1         2   3         4   5         6   7         8   9   ...  11  \\\n",
       "0   1   5  3.529975  10  3.529975  25  3.529975  32  3.529975  33  ...  44   \n",
       "1   2   1  3.529975   2  3.529975   3  3.529975   4  3.529975   5  ...   6   \n",
       "\n",
       "         12  13        14  15        16  17        18  19        20  \n",
       "0  3.529975  46  3.529975  50  3.529975  52  3.529975  55  3.529975  \n",
       "1  3.529975   7  3.529975   8  3.529975   9  3.529975  11  3.529975  \n",
       "\n",
       "[2 rows x 21 columns]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(result).head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Project task 1 - self made top rated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# project task 1:  implement TopRated\n",
    "# Implement a recommender system that recommends the movies with the highest average rating which the user hasn't seen\n",
    "# The output should be saved in 'Recommendations generated/ml-100k/Self_TopRated_reco.csv'\n",
    "# and 'Recommendations generated/ml-100k/Self_TopRated_estimations.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_iu=train_ui.transpose().tocsr()\n",
    "\n",
    "# Rating count and rating sum of every item, computed once from the item x user matrix\n",
    "item_counts=np.diff(train_iu.indptr)\n",
    "item_sums=np.asarray(train_iu.sum(axis=1)).ravel()\n",
    "\n",
    "# Average training rating per item; items without any training rating keep the score 0.0,\n",
    "# which places them at the end of the ranking\n",
    "unrated_rank_score=0.\n",
    "item_means=np.divide(item_sums, item_counts,\n",
    "                     out=np.full(len(item_counts), unrated_rank_score), where=item_counts>0)\n",
    "\n",
    "TopRated=[(i, item_means[i]) for i in range(train_iu.shape[0])]\n",
    "TopRated.sort(key=lambda x: x[1], reverse=True)\n",
    "#TopRated is a list of pairs (item, average_rating) sorted descending from the best rated\n",
    "\n",
    "k=10  # number of recommendations per user\n",
    "result=[]\n",
    "\n",
    "for u in range(train_ui.shape[0]):\n",
    "    # A set makes the membership test below constant-time\n",
    "    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]])\n",
    "    rec_user=[]\n",
    "    # Stop after k unseen items; a user with fewer than k unseen items gets a shorter list\n",
    "    # instead of running past the end of TopRated\n",
    "    for item, score in TopRated:\n",
    "        if len(rec_user)>=k:\n",
    "            break\n",
    "        if item not in user_rated:\n",
    "            rec_user.append((item_code_id[item], score))\n",
    "    # One output row: user id followed by alternating item id and score\n",
    "    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
    "\n",
    "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)\n",
    "\n",
    "\n",
    "# estimations - the item's average training rating; items without training ratings get a fixed fallback\n",
    "unrated_estimate=2.5\n",
    "estimations=[]\n",
    "\n",
    "for user, item in zip(*test_ui.nonzero()):\n",
    "    if item_counts[item] == 0:\n",
    "        estimations.append([user_code_id[user], item_code_id[item], unrated_estimate])\n",
    "    else:\n",
    "        estimations.append([user_code_id[user], item_code_id[item], item_means[item]])\n",
    "\n",
    "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "      <th>16</th>\n",
       "      <th>17</th>\n",
       "      <th>18</th>\n",
       "      <th>19</th>\n",
       "      <th>20</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>814</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1122</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1189</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1201</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1293</td>\n",
       "      <td>...</td>\n",
       "      <td>1306</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1467</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1491</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1500</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1536</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>119</td>\n",
       "      <td>5.0</td>\n",
       "      <td>814</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1122</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1189</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1201</td>\n",
       "      <td>...</td>\n",
       "      <td>1293</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1306</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1467</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1491</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1500</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>119</td>\n",
       "      <td>5.0</td>\n",
       "      <td>814</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1122</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1189</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1201</td>\n",
       "      <td>...</td>\n",
       "      <td>1293</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1306</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1467</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1491</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1500</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>119</td>\n",
       "      <td>5.0</td>\n",
       "      <td>814</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1122</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1189</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1201</td>\n",
       "      <td>...</td>\n",
       "      <td>1293</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1306</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1467</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1491</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1500</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>119</td>\n",
       "      <td>5.0</td>\n",
       "      <td>814</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1122</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1189</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1201</td>\n",
       "      <td>...</td>\n",
       "      <td>1293</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1306</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1467</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1491</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1500</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0    1    2     3    4     5    6     7    8     9   ...    11   12    13  \\\n",
       "0   1  814  5.0  1122  5.0  1189  5.0  1201  5.0  1293  ...  1306  5.0  1467   \n",
       "1   2  119  5.0   814  5.0  1122  5.0  1189  5.0  1201  ...  1293  5.0  1306   \n",
       "2   3  119  5.0   814  5.0  1122  5.0  1189  5.0  1201  ...  1293  5.0  1306   \n",
       "3   4  119  5.0   814  5.0  1122  5.0  1189  5.0  1201  ...  1293  5.0  1306   \n",
       "4   5  119  5.0   814  5.0  1122  5.0  1189  5.0  1201  ...  1293  5.0  1306   \n",
       "\n",
       "    14    15   16    17   18    19   20  \n",
       "0  5.0  1491  5.0  1500  5.0  1536  5.0  \n",
       "1  5.0  1467  5.0  1491  5.0  1500  5.0  \n",
       "2  5.0  1467  5.0  1491  5.0  1500  5.0  \n",
       "3  5.0  1467  5.0  1491  5.0  1500  5.0  \n",
       "4  5.0  1467  5.0  1491  5.0  1500  5.0  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(result)[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>3.516903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>3.901217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>3.479851</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>32</td>\n",
       "      <td>3.833570</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>33</td>\n",
       "      <td>3.488115</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0   1         2\n",
       "0  1   5  3.516903\n",
       "1  1  10  3.901217\n",
       "2  1  25  3.479851\n",
       "3  1  32  3.833570\n",
       "4  1  33  3.488115"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(estimations)[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Self-made baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "class selfBaselineUI():\n",
    "    \n",
    "    def fit(self, train_ui):\n",
    "        self.train_ui=train_ui.copy()\n",
    "        self.train_iu=train_ui.transpose().tocsr()\n",
    "        \n",
    "        result=self.train_ui.copy()\n",
    "        \n",
    "        self.row_means=np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)\n",
    "        \n",
    "        # in csr format after addition or multiplication 0 entries \"disappear\" - so some workaraunds are needed \n",
    "        # (other option is to define addition/multiplication in a desired way)\n",
    "        row_means=self.row_means.copy()\n",
    "        \n",
    "        max_row_mean=np.max(row_means)\n",
    "        row_means[row_means==0]=max_row_mean+1\n",
    "        to_subtract_rows=sparse.diags(row_means)*result.power(0)\n",
    "        to_subtract_rows.sort_indices() # needed to have valid .data\n",
    "        \n",
    "        subtract=to_subtract_rows.data\n",
    "        subtract[subtract==max_row_mean+1]=0\n",
    "        \n",
    "        result.data=result.data-subtract\n",
    "#        we can't do result=train_ui-to_subtract_rows since then 0 entries will \"disappear\" in csr format\n",
    "        self.col_means=np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\\\n",
    "                            out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr)!=0) # handling items without ratings\n",
    "        \n",
    "        # again - it is possible that some mean will be zero, so let's use the same workaround\n",
    "        col_means=self.col_means.copy()\n",
    "        \n",
    "        max_col_mean=np.max(col_means)\n",
    "        col_means[col_means==0]=max_col_mean+1\n",
    "        to_subtract_cols=result.power(0)*sparse.diags(col_means)\n",
    "        to_subtract_cols.sort_indices() # needed to have valid .data\n",
    "        \n",
    "        subtract=to_subtract_cols.data\n",
    "        subtract[subtract==max_col_mean+1]=0\n",
    "        \n",
    "        result.data=result.data-subtract\n",
    "\n",
    "        return result\n",
    "    \n",
    "    \n",
    "    def recommend(self, user_code_id, item_code_id, topK=10):\n",
    "        estimations=np.tile(self.row_means[:,None], [1, self.train_ui.shape[1]]) +np.tile(self.col_means, [self.train_ui.shape[0], 1])\n",
    "        \n",
    "        top_k = defaultdict(list)\n",
    "        for nb_user, user in enumerate(estimations):\n",
    "            \n",
    "            user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n",
    "            for item, score in enumerate(user):\n",
    "                if item not in user_rated:\n",
    "                    top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n",
    "        result=[]\n",
    "        # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
    "        for uid, item_scores in top_k.items():\n",
    "            item_scores.sort(key=lambda x: x[1], reverse=True)\n",
    "            result.append([uid]+list(chain(*item_scores[:topK])))\n",
    "        return result\n",
    "    \n",
    "    def estimate(self, user_code_id, item_code_id, test_ui):\n",
    "        result=[]\n",
    "        for user, item in zip(*test_ui.nonzero()):\n",
    "            result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])\n",
    "        return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
       "        [0, 1, 2, 3, 0, 0, 0, 0],\n",
       "        [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After subtracting rows and columns:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[ 0. ,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],\n",
       "        [ 0. , -0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],\n",
       "        [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recommend best unseen item:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Print estimations on unseen items:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>item</th>\n",
       "      <th>est_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>40</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20</td>\n",
       "      <td>70</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user  item  est_score\n",
       "0     0    60        4.0\n",
       "1    10    40        3.0\n",
       "2    20     0        3.0\n",
       "3    20    20        4.0\n",
       "4    20    70        4.0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "\n",
    "toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n",
    "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
    "\n",
    "print('Training data:')\n",
    "display(toy_train_ui.todense())\n",
    "\n",
    "model=selfBaselineUI()\n",
    "print('After subtracting rows and columns:')\n",
    "display(model.fit(toy_train_ui).todense())\n",
    "\n",
    "print('Recommend best unseen item:')\n",
    "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
    "\n",
    "print('Print estimations on unseen items:')\n",
    "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
    "estimations.columns=['user', 'item', 'est_score']\n",
    "display(estimations)\n",
    "\n",
    "top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
    "\n",
    "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n",
    "\n",
    "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
    "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "model=selfBaselineUI()\n",
    "model.fit(train_ui)\n",
    "\n",
    "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
    "\n",
    "top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n",
    "\n",
    "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
    "estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# project task 2:  implement self-made BaselineIU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Implement recommender system which will recommend movies (which user hasn't seen) which is similar to BaselineUI\n",
    "# but first subtract col means then row means\n",
    "# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'\n",
    "# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "class selfBaselineIU():\n",
    "    \n",
    "    def fit(self, train_ui):\n",
    "        self.train_ui=train_ui.copy()\n",
    "        self.train_iu=train_ui.transpose().tocsr()\n",
    "        \n",
    "        result=self.train_ui.copy()\n",
    "        \n",
    "        self.col_means=np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\\\n",
    "                    out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr)!=0) # handling items without ratings\n",
    "        \n",
    "        col_means=self.col_means.copy()\n",
    "        \n",
    "        max_col_mean=np.max(col_means)\n",
    "        col_means[col_means==0]=max_col_mean+1\n",
    "        to_subtract_cols=result.power(0)*sparse.diags(col_means)\n",
    "        to_subtract_cols.sort_indices()\n",
    "        \n",
    "        subtract=to_subtract_cols.data\n",
    "        subtract[subtract==max_col_mean+1]=0\n",
    "        \n",
    "        result.data=result.data-subtract\n",
    "\n",
    "        self.row_means=np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)\n",
    "        \n",
    "        row_means=self.row_means.copy()\n",
    "        \n",
    "        max_row_mean=np.max(row_means)\n",
    "        row_means[row_means==0]=max_row_mean+1\n",
    "        to_subtract_rows=sparse.diags(row_means)*result.power(0)\n",
    "        to_subtract_rows.sort_indices()\n",
    "        \n",
    "        subtract=to_subtract_rows.data\n",
    "        subtract[subtract==max_row_mean+1]=0\n",
    "        \n",
    "        result.data=result.data-subtract\n",
    "\n",
    "        return result\n",
    "    \n",
    "    \n",
    "    def recommend(self, user_code_id, item_code_id, topK=10):\n",
    "        estimations=np.tile(self.row_means[:,None], [1, self.train_ui.shape[1]]) +np.tile(self.col_means, [self.train_ui.shape[0], 1])\n",
    "        \n",
    "        top_k = defaultdict(list)\n",
    "        for nb_user, user in enumerate(estimations):\n",
    "            \n",
    "            user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n",
    "            for item, score in enumerate(user):\n",
    "                if item not in user_rated:\n",
    "                    top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n",
    "        result=[]\n",
    "\n",
    "        for uid, item_scores in top_k.items():\n",
    "            item_scores.sort(key=lambda x: x[1], reverse=True)\n",
    "            result.append([uid]+list(chain(*item_scores[:topK])))\n",
    "        return result\n",
    "    \n",
    "    def estimate(self, user_code_id, item_code_id, test_ui):\n",
    "        result=[]\n",
    "        for user, item in zip(*test_ui.nonzero()):\n",
    "            result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])\n",
    "        return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
       "        [0, 1, 2, 3, 0, 0, 0, 0],\n",
       "        [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After subtracting rows and columns:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[-0.375     ,  1.125     ,  0.        ,  0.        , -0.375     ,\n",
       "          0.        ,  0.        , -0.375     ],\n",
       "        [ 0.        , -0.66666667,  0.83333333, -0.16666667,  0.        ,\n",
       "          0.        ,  0.        ,  0.        ],\n",
       "        [ 0.        ,  0.        ,  0.        ,  0.66666667,  0.        ,\n",
       "         -0.33333333, -0.33333333,  0.        ]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recommend best unseen item:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[[0, 30, 4.375], [10, 40, 4.166666666666667], [20, 40, 5.333333333333333]]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Print estimations on unseen items:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>item</th>\n",
       "      <th>est_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>4.375000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>40</td>\n",
       "      <td>4.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>3.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "      <td>2.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20</td>\n",
       "      <td>70</td>\n",
       "      <td>4.333333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user  item  est_score\n",
       "0     0    60   4.375000\n",
       "1    10    40   4.166667\n",
       "2    20     0   3.333333\n",
       "3    20    20   2.333333\n",
       "4    20    70   4.333333"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "\n",
    "toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n",
    "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
    "\n",
    "print('Training data:')\n",
    "display(toy_train_ui.todense())\n",
    "\n",
    "model=selfBaselineIU()\n",
    "print('After subtracting rows and columns:')\n",
    "display(model.fit(toy_train_ui).todense())\n",
    "\n",
    "print('Recommend best unseen item:')\n",
    "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
    "\n",
    "print('Print estimations on unseen items:')\n",
    "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
    "estimations.columns=['user', 'item', 'est_score']\n",
    "display(estimations)\n",
    "\n",
    "top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
    "\n",
    "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineIU_reco.csv', index=False, header=False)\n",
    "\n",
    "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
    "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineIU_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "model=selfBaselineIU()\n",
    "model.fit(train_ui)\n",
    "\n",
    "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
    "\n",
    "top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineIU_reco.csv', index=False, header=False)\n",
    "\n",
    "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
    "estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ready-made baseline - Surprise implementation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Estimating biases using als...\n"
     ]
    }
   ],
   "source": [
    "import surprise as sp\n",
    "import time\n",
    "\n",
    "# Based on surprise.readthedocs.io\n",
    "def get_top_n(predictions, n=10):\n",
    "    \n",
    "    # Here we create a dictionary which items are lists of pairs (item, score)\n",
    "    top_n = defaultdict(list)\n",
    "    for uid, iid, true_r, est, _ in predictions:\n",
    "        top_n[uid].append((iid, est))\n",
    "        \n",
    "    result=[]\n",
    "    # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
    "    for uid, user_ratings in top_n.items():\n",
    "        user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
    "        result.append([uid]+list(chain(*user_ratings[:n]))) \n",
    "    return result\n",
    "\n",
    "\n",
    "reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n",
    "trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)\n",
    "trainset = trainset.build_full_trainset() # <class 'surprise.trainset.Trainset'> -> it is needed for using Surprise package\n",
    "\n",
    "testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)\n",
    "testset = sp.Trainset.build_testset(testset.build_full_trainset())\n",
    "\n",
    "algo = sp.BaselineOnly()\n",
    "# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n",
    "# observe how bad results gives above algorithm\n",
    "# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n",
    "\n",
    "algo.fit(trainset)\n",
    "\n",
    "antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n",
    "predictions = algo.test(antitrainset)\n",
    "\n",
    "top_n = get_top_n(predictions, n=10)\n",
    "\n",
    "top_n=pd.DataFrame(top_n)\n",
    "\n",
    "top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 0.9495\n",
      "MAE:  0.7525\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.7524871012820799"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compute RMSE on testset using buildin functions\n",
    "predictions = algo.test(testset)\n",
    "sp.accuracy.rmse(predictions, verbose=True)\n",
    "\n",
    "# Let's also save the results in file\n",
    "predictions_df=[]\n",
    "for uid, iid, true_r, est, _ in predictions:\n",
    "    predictions_df.append([uid, iid, est])\n",
    "    \n",
    "predictions_df=pd.DataFrame(predictions_df)\n",
    "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n",
    "\n",
    "sp.accuracy.mae(predictions, verbose=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Let's compare with random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 1.5151\n",
      "MAE:  1.2192\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1.2192187389503517"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# in surprise random is an algorithm predicting random value regarding to normal distribution estimated from train set\n",
    "algo = sp.NormalPredictor()\n",
    "algo.fit(trainset)\n",
    "\n",
    "antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n",
    "predictions = algo.test(antitrainset)\n",
    "\n",
    "top_n = get_top_n(predictions, n=10)\n",
    "\n",
    "top_n=pd.DataFrame(top_n)\n",
    "\n",
    "top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n",
    "\n",
    "# Compute RMSE on testset using buildin functions\n",
    "predictions = algo.test(testset)\n",
    "sp.accuracy.rmse(predictions, verbose=True)\n",
    "\n",
    "# Let's also save the results in file\n",
    "predictions_df=[]\n",
    "for uid, iid, true_r, est, _ in predictions:\n",
    "    predictions_df.append([uid, iid, est])\n",
    "    \n",
    "predictions_df=pd.DataFrame(predictions_df)\n",
    "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n",
    "\n",
    "sp.accuracy.mae(predictions, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}