workshops_recommender_systems/P1. Baseline.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preparing dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install surprise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy.sparse as sparse\n",
    "from collections import defaultdict\n",
    "from itertools import chain\n",
    "import random\n",
    "\n",
    "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's prepare dataset\n",
    "train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)\n",
    "train_and_test['user_code'] = train_and_test['user'].astype(\"category\").cat.codes\n",
    "train_and_test['item_code'] = train_and_test['item'].astype(\"category\").cat.codes\n",
    "\n",
    "user_code_id = dict(enumerate(train_and_test['user'].astype(\"category\").cat.categories))\n",
    "user_id_code = dict((v, k) for k, v in user_code_id.items())\n",
    "item_code_id = dict(enumerate(train_and_test['item'].astype(\"category\").cat.categories))\n",
    "item_id_code = dict((v, k) for k, v in item_code_id.items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>item</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>user_code</th>\n",
       "      <th>item_code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>664</td>\n",
       "      <td>525</td>\n",
       "      <td>4</td>\n",
       "      <td>876526580</td>\n",
       "      <td>663</td>\n",
       "      <td>524</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>49</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>888068651</td>\n",
       "      <td>48</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>352</td>\n",
       "      <td>273</td>\n",
       "      <td>2</td>\n",
       "      <td>884290328</td>\n",
       "      <td>351</td>\n",
       "      <td>272</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>618</td>\n",
       "      <td>96</td>\n",
       "      <td>3</td>\n",
       "      <td>891307749</td>\n",
       "      <td>617</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>560</td>\n",
       "      <td>24</td>\n",
       "      <td>2</td>\n",
       "      <td>879976772</td>\n",
       "      <td>559</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user  item  rating  timestamp  user_code  item_code\n",
       "0   664   525       4  876526580        663        524\n",
       "1    49     1       2  888068651         48          0\n",
       "2   352   273       2  884290328        351        272\n",
       "3   618    96       3  891307749        617         95\n",
       "4   560    24       2  879976772        559         23"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_and_test[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))\n",
    "test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take number of users and items\n",
    "(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)\n",
    "\n",
    "# Create sparse csr matrices\n",
    "train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))\n",
    "test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Above steps are the same for many algorithms, so I put the code in separate file:\n",
    "import helpers\n",
    "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n",
    "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
    "train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CSR matrices - what is it?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<3x4 sparse matrix of type '<class 'numpy.intc'>'\n",
       "\twith 8 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
    "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
    "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
    "sample_csr=sparse.csr_matrix((data, (row, col)))\n",
    "sample_csr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ratings matrix with missing entries replaced by zeros:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[4, 1, 3, 0],\n",
       "        [0, 2, 0, 1],\n",
       "        [2, 0, 5, 4]], dtype=int32)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Number of ratings: 8 \n",
      "Number of users: 3 \n",
      "Number of items: 4 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "print('Ratings matrix with missing entries replaced by zeros:')\n",
    "display(sample_csr.todense())\n",
    "\n",
    "print('\\nNumber of ratings: {} \\nNumber of users: {} \\nNumber of items: {} \\n'\n",
    "      .format(sample_csr.nnz, sample_csr.shape[0], sample_csr.shape[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ratings data: [4 1 3 2 1 2 5 4]\n",
      "Regarding items: [0 1 2 1 3 0 2 3]\n",
      "Where ratings from 0 to 2 belongs to user 0.\n",
      "Where ratings from 3 to 4 belongs to user 1.\n",
      "Where ratings from 5 to 7 belongs to user 2.\n"
     ]
    }
   ],
   "source": [
    "print('Ratings data:', sample_csr.data)\n",
    "\n",
    "print('Regarding items:', sample_csr.indices)\n",
    "\n",
    "for i in range(sample_csr.shape[0]):\n",
    "    print('Where ratings from {} to {} belongs to user {}.'.format(sample_csr.indptr[i], sample_csr.indptr[i+1]-1, i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Efficient way to access items rated by user:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,\n",
       "       171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "580 ns ± 15.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
      "Inefficient way to access items rated by user:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,\n",
       "       171, 172, 173, 194, 208, 225, 473, 495, 549, 615])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "45.2 µs ± 728 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "user=123\n",
    "\n",
    "print('Efficient way to access items rated by user:')\n",
    "display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n",
    "%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n",
    "\n",
    "print('Inefficient way to access items rated by user:')\n",
    "display(train_ui[user].indices)\n",
    "%timeit train_ui[user].indices"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Example: subtracting row means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Our matrix:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[4, 1, 3, 0],\n",
       "        [0, 2, 0, 1],\n",
       "        [2, 0, 5, 4]], dtype=int32)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List of row sums:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[ 8,  3, 11]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print('Our matrix:')\n",
    "display(sample_csr.todense())\n",
    "print('List of row sums:')\n",
    "sample_csr.sum(axis=1).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Array with row means:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([2.66666667, 1.5       , 3.66666667])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Diagonal csr matrix with inverse of row sums on diagonal:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[2.66666667, 0.        , 0.        ],\n",
       "        [0.        , 1.5       , 0.        ],\n",
       "        [0.        , 0.        , 3.66666667]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Let's apply them in nonzero entries:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[2.66666667, 2.66666667, 2.66666667, 0.        ],\n",
       "        [0.        , 1.5       , 0.        , 1.5       ],\n",
       "        [3.66666667, 0.        , 3.66666667, 3.66666667]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finally after subtraction:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[ 1.33333333, -1.66666667,  0.33333333,  0.        ],\n",
       "        [ 0.        ,  0.5       ,  0.        , -0.5       ],\n",
       "        [-1.66666667,  0.        ,  1.33333333,  0.33333333]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print('Array with row means:')\n",
    "row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)\n",
    "display(row_means)\n",
    "\n",
    "print('Diagonal csr matrix with inverse of row sums on diagonal:')\n",
    "display(sparse.diags(row_means).todense())\n",
    "\n",
    "print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n",
    "to_subtract=sparse.diags(row_means)*sample_csr.power(0)\n",
    "display(to_subtract.todense())\n",
    "\n",
    "print(\"Finally after subtraction:\")\n",
    "sample_csr-to_subtract.todense()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Transposing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample matrix: \n",
      " [[4 1 3 0]\n",
      " [0 2 0 1]\n",
      " [2 0 5 4]]\n",
      "\n",
      "Indices: \n",
      " [0 1 2 1 3 0 2 3]\n",
      "\n",
      "Transposed matrix: \n",
      " [[4 0 2]\n",
      " [1 2 0]\n",
      " [3 0 5]\n",
      " [0 1 4]]\n",
      "\n",
      "Indices of transposed matrix: \n",
      " [0 1 2 1 3 0 2 3]\n",
      "\n",
      "Reason:  <class 'scipy.sparse.csc.csc_matrix'>\n",
      "\n",
      "After converting to csr: \n",
      " [0 2 0 1 0 2 1 2]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from scipy import sparse\n",
    "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
    "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
    "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
    "sample=sparse.csr_matrix((data, (row, col)))\n",
    "print('Sample matrix: \\n', sample.A)\n",
    "print('\\nIndices: \\n', sample.indices)\n",
    "transposed=sample.transpose()\n",
    "print('\\nTransposed matrix: \\n', transposed.A)\n",
    "print('\\nIndices of transposed matrix: \\n', transposed.indices)\n",
    "\n",
    "print('\\nReason: ', type(transposed))\n",
    "\n",
    "print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Self made top popular"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "if not os.path.exists('./Recommendations generated/'):\n",
    "    os.mkdir('./Recommendations generated/')\n",
    "    os.mkdir('./Recommendations generated/ml-100k/')\n",
    "    os.mkdir('./Recommendations generated/toy-example/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "TopPop=[]\n",
    "train_iu=train_ui.transpose().tocsr()\n",
    "scaling_factor=train_ui.max()/max(np.diff(train_iu.indptr))\n",
    "\n",
    "for i in range(train_iu.shape[0]):\n",
    "    TopPop.append((i, (train_iu.indptr[i+1]-train_iu.indptr[i])*scaling_factor))\n",
    "    \n",
    "TopPop.sort(key=lambda x: x[1], reverse=True)\n",
    "#TopPop is an array of pairs (item, rescaled_popularity) sorted descending from the most popular\n",
    "k=10\n",
    "result=[]\n",
    "\n",
    "for u in range(train_ui.shape[0]):\n",
    "    user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n",
    "    rec_user=[]\n",
    "    item_pos=0\n",
    "    while len(rec_user)<10:\n",
    "        if TopPop[item_pos][0] not in user_rated:\n",
    "            rec_user.append((item_code_id[TopPop[item_pos][0]], TopPop[item_pos][1]))\n",
    "        item_pos+=1\n",
    "    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
    "\n",
    "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n",
    "\n",
    "\n",
    "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
    "\n",
    "estimations=[]\n",
    "\n",
    "for user, item in zip(*test_ui.nonzero()):\n",
    "    estimations.append([user_code_id[user], item_code_id[item],\n",
    "        (train_iu.indptr[item+1]-train_iu.indptr[item])*scaling_factor])\n",
    "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Self made global average"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "GlobalAvg=[]\n",
    "avg=np.sum(train_ui)/train_ui.nnz\n",
    "\n",
    "for i in range(train_iu.shape[0]):\n",
    "    GlobalAvg.append((i, avg))\n",
    "    \n",
    "k=10\n",
    "result=[]\n",
    "\n",
    "for u in range(train_ui.shape[0]):\n",
    "    user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n",
    "    rec_user=[]\n",
    "    item_pos=0\n",
    "    while len(rec_user)<10:\n",
    "        if GlobalAvg[item_pos][0] not in user_rated:\n",
    "            rec_user.append((item_code_id[GlobalAvg[item_pos][0]], GlobalAvg[item_pos][1]))\n",
    "        item_pos+=1\n",
    "    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
    "\n",
    "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_reco.csv', index=False, header=False)\n",
    "\n",
    "\n",
    "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
    "\n",
    "estimations=[]\n",
    "\n",
    "for user, item in zip(*test_ui.nonzero()):\n",
    "    estimations.append([user_code_id[user], item_code_id[item], avg])\n",
    "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "      <th>16</th>\n",
       "      <th>17</th>\n",
       "      <th>18</th>\n",
       "      <th>19</th>\n",
       "      <th>20</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>10</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>25</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>32</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>33</td>\n",
       "      <td>...</td>\n",
       "      <td>44</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>46</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>50</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>52</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>55</td>\n",
       "      <td>3.529975</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>2</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>3</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>4</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>5</td>\n",
       "      <td>...</td>\n",
       "      <td>6</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>7</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>8</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>9</td>\n",
       "      <td>3.529975</td>\n",
       "      <td>11</td>\n",
       "      <td>3.529975</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0   1         2   3         4   5         6   7         8   9   ...  11  \\\n",
       "0   1   5  3.529975  10  3.529975  25  3.529975  32  3.529975  33  ...  44   \n",
       "1   2   1  3.529975   2  3.529975   3  3.529975   4  3.529975   5  ...   6   \n",
       "\n",
       "         12  13        14  15        16  17        18  19        20  \n",
       "0  3.529975  46  3.529975  50  3.529975  52  3.529975  55  3.529975  \n",
       "1  3.529975   7  3.529975   8  3.529975   9  3.529975  11  3.529975  \n",
       "\n",
       "[2 rows x 21 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(result)[:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Project task 1 - self made top rated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# project task 1:  implement TopRated\n",
    "# Implement recommender system which will recommend movies (which user hasn't seen) with the highest average rating\n",
    "# The output should be saved in 'Recommendations generated/ml-100k/Self_TopRated_reco.csv'\n",
    "# and 'Recommendations generated/ml-100k/Self_TopRated_estimations.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "Item_AvgRating=[]\n",
    "\n",
    "for i in range(train_iu.shape[0]):\n",
    "    item_rating_sum = train_iu[i].sum()\n",
    "    number_of_ratings_for_this_item = train_iu[i].nnz\n",
    "    if number_of_ratings_for_this_item == 0:\n",
    "        avg = 0\n",
    "    else:\n",
    "        avg = item_rating_sum / number_of_ratings_for_this_item\n",
    "    Item_AvgRating.append((i, avg))\n",
    "    \n",
    "    \n",
    "k=10\n",
    "result=[]\n",
    "\n",
    "for u in range(train_ui.shape[0]):\n",
    "    user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n",
    "    rec_user=[]\n",
    "    item_pos=0\n",
    "    while len(rec_user)<10:\n",
    "        if Item_AvgRating[item_pos][0] not in user_rated:\n",
    "            rec_user.append((item_code_id[Item_AvgRating[item_pos][0]], Item_AvgRating[item_pos][1]))\n",
    "        item_pos+=1\n",
    "    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
    "\n",
    "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)\n",
    "\n",
    "\n",
    "\n",
    "estimations=[]\n",
    "\n",
    "for user, item in zip(*test_ui.nonzero()):\n",
    "    estimations.append([user_code_id[user], item_code_id[item], avg])\n",
    "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Self-made baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "class selfBaselineUI():\n",
    "    \n",
    "    def fit(self, train_ui):\n",
    "        self.train_ui=train_ui.copy()\n",
    "        self.train_iu=train_ui.transpose().tocsr()\n",
    "        \n",
    "        result=self.train_ui.copy()\n",
    "        \n",
    "        self.row_means=np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)\n",
    "        \n",
    "        # in csr format after addition or multiplication 0 entries \"disappear\" - so some workaraunds are needed \n",
    "        # (other option is to define addition/multiplication in a desired way)\n",
    "        row_means=self.row_means.copy()\n",
    "        \n",
    "        max_row_mean=np.max(row_means)\n",
    "        row_means[row_means==0]=max_row_mean+1\n",
    "        to_subtract_rows=sparse.diags(row_means)*result.power(0)\n",
    "        to_subtract_rows.sort_indices() # needed to have valid .data\n",
    "        \n",
    "        subtract=to_subtract_rows.data\n",
    "        subtract[subtract==max_row_mean+1]=0\n",
    "        \n",
    "        result.data=result.data-subtract\n",
    "#        we can't do result=train_ui-to_subtract_rows since then 0 entries will \"disappear\" in csr format\n",
    "        self.col_means=np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\\\n",
    "                            out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr)!=0) # handling items without ratings\n",
    "        \n",
    "        # again - it is possible that some mean will be zero, so let's use the same workaround\n",
    "        col_means=self.col_means.copy()\n",
    "        \n",
    "        max_col_mean=np.max(col_means)\n",
    "        col_means[col_means==0]=max_col_mean+1\n",
    "        to_subtract_cols=result.power(0)*sparse.diags(col_means)\n",
    "        to_subtract_cols.sort_indices() # needed to have valid .data\n",
    "        \n",
    "        subtract=to_subtract_cols.data\n",
    "        subtract[subtract==max_col_mean+1]=0\n",
    "        \n",
    "        result.data=result.data-subtract\n",
    "\n",
    "        return result\n",
    "    \n",
    "    \n",
    "    def recommend(self, user_code_id, item_code_id, topK=10):\n",
    "        estimations=np.tile(self.row_means[:,None], [1, self.train_ui.shape[1]]) +np.tile(self.col_means, [self.train_ui.shape[0], 1])\n",
    "        \n",
    "        top_k = defaultdict(list)\n",
    "        for nb_user, user in enumerate(estimations):\n",
    "            \n",
    "            user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n",
    "            for item, score in enumerate(user):\n",
    "                if item not in user_rated:\n",
    "                    top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n",
    "        result=[]\n",
    "        # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
    "        for uid, item_scores in top_k.items():\n",
    "            item_scores.sort(key=lambda x: x[1], reverse=True)\n",
    "            result.append([uid]+list(chain(*item_scores[:topK])))\n",
    "        return result\n",
    "    \n",
    "    def estimate(self, user_code_id, item_code_id, test_ui):\n",
    "        result=[]\n",
    "        for user, item in zip(*test_ui.nonzero()):\n",
    "            result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])\n",
    "        return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
       "        [0, 1, 2, 3, 0, 0, 0, 0],\n",
       "        [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After subtracting rows and columns:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[ 0. ,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],\n",
       "        [ 0. , -0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],\n",
       "        [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ]])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recommend best unseen item:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Print estimations on unseen items:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>item</th>\n",
       "      <th>est_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>40</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20</td>\n",
       "      <td>20</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20</td>\n",
       "      <td>70</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user  item  est_score\n",
       "0     0    60        4.0\n",
       "1    10    40        3.0\n",
       "2    20     0        3.0\n",
       "3    20    20        4.0\n",
       "4    20    70        4.0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Load the toy ratings (tab-separated files without a header row).\n",
    "toy_train_read = pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "toy_test_read = pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
    "\n",
    "(toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code,\n",
    " toy_item_code_id, toy_item_id_code) = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
    "\n",
    "print('Training data:')\n",
    "display(toy_train_ui.todense())\n",
    "\n",
    "model = selfBaselineUI()\n",
    "print('After subtracting rows and columns:')\n",
    "display(model.fit(toy_train_ui).todense())\n",
    "\n",
    "print('Recommend best unseen item:')\n",
    "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
    "\n",
    "print('Print estimations on unseen items:')\n",
    "estimations = pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
    "estimations.columns = ['user', 'item', 'est_score']\n",
    "display(estimations)\n",
    "\n",
    "top_n = pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
    "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n",
    "\n",
    "# The frame shown above is written as-is: header=False keeps the display-only\n",
    "# column names out of the file, so the estimates do not need to be recomputed.\n",
    "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the self-made BaselineUI model on the ml-100k training matrix.\n",
    "model = selfBaselineUI()\n",
    "model.fit(train_ui)\n",
    "\n",
    "# Save the 10 best recommendations per user.\n",
    "top_n = pd.DataFrame(\n",
    "    model.recommend(user_code_id, item_code_id, topK=10)\n",
    ")\n",
    "top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n",
    "\n",
    "# Save the estimates computed for the entries of the test matrix.\n",
    "estimations = pd.DataFrame(\n",
    "    model.estimate(user_code_id, item_code_id, test_ui)\n",
    ")\n",
    "estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Project task 2: implement a self-made BaselineIU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Implement a recommender system, analogous to BaselineUI, that recommends movies the user hasn't seen yet,\n",
    "# but subtract the column means first and the row means second.\n",
    "# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'\n",
    "# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "class selfBaselineIU():\n",
    "    \"\"\"Baseline recommender that removes item (column) means first and user (row) means second.\n",
    "\n",
    "    A rating is estimated as ``row_means[user] + col_means[item]``, where\n",
    "    ``col_means`` are the per-item means of the stored training ratings and\n",
    "    ``row_means`` are the per-user means of what is left once the item means\n",
    "    have been subtracted.\n",
    "    \"\"\"\n",
    "\n",
    "    def fit(self, train_ui):\n",
    "        \"\"\"Learn the item and user offsets from a user x item CSR matrix.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        train_ui : scipy.sparse CSR matrix\n",
    "            Training ratings; every stored entry is treated as a rating.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        scipy.sparse CSR matrix\n",
    "            Copy of ``train_ui`` with the same stored entries, holding the residuals\n",
    "            left after subtracting the column means and then the row means.\n",
    "        \"\"\"\n",
    "        self.train_ui = train_ui.copy()\n",
    "        self.train_iu = train_ui.transpose().tocsr()\n",
    "\n",
    "        result = self.train_ui.copy()\n",
    "\n",
    "        # Mean over the stored ratings of each item; items without ratings get 0.\n",
    "        ratings_per_item = np.diff(self.train_iu.indptr)\n",
    "        self.col_means = np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,\n",
    "                                   out=np.zeros(result.shape[1]), where=ratings_per_item != 0)\n",
    "\n",
    "        # result.indices holds the column of every stored entry, so indexing the means\n",
    "        # with it lines each mean up with its rating. Only .data is touched, therefore\n",
    "        # residuals that are exactly 0 stay stored instead of \"disappearing\".\n",
    "        result.data = result.data - self.col_means[result.indices]\n",
    "\n",
    "        # Mean residual of each user; users without ratings get 0 instead of 0/0 = NaN.\n",
    "        ratings_per_user = np.diff(result.indptr)\n",
    "        self.row_means = np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,\n",
    "                                   out=np.zeros(result.shape[0]), where=ratings_per_user != 0)\n",
    "\n",
    "        # CSR stores the entries row by row, so repeating every row mean by the length\n",
    "        # of its row yields one value per stored entry, in .data order.\n",
    "        result.data = result.data - np.repeat(self.row_means, ratings_per_user)\n",
    "\n",
    "        return result\n",
    "\n",
    "    def recommend(self, user_code_id, item_code_id, topK=10):\n",
    "        \"\"\"Return the topK best-scored items each user has no stored training rating for.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        user_code_id, item_code_id : mapping\n",
    "            Translate row / column codes of the training matrix into the ids to report.\n",
    "        topK : int\n",
    "            Maximum number of items per user.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        list of list\n",
    "            One row per user in the format (user, item1, score1, item2, score2, ...),\n",
    "            in ascending user-code order. Users with a stored rating for every item\n",
    "            are omitted.\n",
    "        \"\"\"\n",
    "        n_users, n_items = self.train_ui.shape\n",
    "        indptr, indices = self.train_ui.indptr, self.train_ui.indices\n",
    "\n",
    "        result = []\n",
    "        for nb_user in range(n_users):\n",
    "            unseen = np.ones(n_items, dtype=bool)\n",
    "            unseen[indices[indptr[nb_user]:indptr[nb_user + 1]]] = False\n",
    "            candidates = np.flatnonzero(unseen)\n",
    "            if candidates.size == 0:\n",
    "                continue\n",
    "\n",
    "            scores = self.row_means[nb_user] + self.col_means[candidates]\n",
    "            # A stable sort of the negated scores orders them from best to worst and\n",
    "            # keeps equally scored items in ascending item-code order.\n",
    "            best = np.argsort(-scores, kind='stable')[:topK]\n",
    "\n",
    "            row = [user_code_id[nb_user]]\n",
    "            for item, score in zip(candidates[best], scores[best]):\n",
    "                row.extend((item_code_id[int(item)], score))\n",
    "            result.append(row)\n",
    "        return result\n",
    "\n",
    "    def estimate(self, user_code_id, item_code_id, test_ui):\n",
    "        \"\"\"Estimate a rating for every non-zero entry of ``test_ui``.\n",
    "\n",
    "        Returns a list of ``[user_id, item_id, estimate]`` rows, where the estimate is\n",
    "        ``row_means[user] + col_means[item]``.\n",
    "        \"\"\"\n",
    "        return [[user_code_id[user], item_code_id[item], self.row_means[user] + self.col_means[item]]\n",
    "                for user, item in zip(*test_ui.nonzero())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the self-made BaselineIU model on the ml-100k training matrix.\n",
    "model = selfBaselineIU()\n",
    "model.fit(train_ui)\n",
    "\n",
    "# Save the 10 best unseen items per user, one row per user: (user, item1, score1, ...).\n",
    "top_n = pd.DataFrame(\n",
    "    model.recommend(user_code_id, item_code_id, topK=10)\n",
    ")\n",
    "top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineIU_reco.csv', index=False, header=False)\n",
    "\n",
    "# Save the estimates for the non-zero entries of the test matrix as (user, item, estimate) rows.\n",
    "estimations = pd.DataFrame(\n",
    "    model.estimate(user_code_id, item_code_id, test_ui)\n",
    ")\n",
    "estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ready-made baseline - Surprise implementation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Estimating biases using als...\n"
     ]
    }
   ],
   "source": [
    "import surprise as sp\n",
    "import time\n",
    "\n",
    "# Based on surprise.readthedocs.io\n",
    "def get_top_n(predictions, n=10):\n",
    "    \"\"\"Return the n best-scored items of every user as rows (user, item1, score1, item2, score2, ...).\n",
    "\n",
    "    ``predictions`` is an iterable of 5-tuples (uid, iid, true_r, est, details); only\n",
    "    uid, iid and est are used. Users appear in the order in which they are first seen.\n",
    "    \"\"\"\n",
    "    # Collect the (item, score) pairs of every user.\n",
    "    scored_by_user = defaultdict(list)\n",
    "    for uid, iid, _true_r, est, _details in predictions:\n",
    "        scored_by_user[uid].append((iid, est))\n",
    "\n",
    "    rows = []\n",
    "    for uid, scored in scored_by_user.items():\n",
    "        # Highest score first; equally scored items keep their input order.\n",
    "        best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:n]\n",
    "        rows.append([uid, *chain.from_iterable(best)])\n",
    "    return rows\n",
    "\n",
    "\n",
    "reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n",
    "\n",
    "# Surprise algorithms are fitted on a surprise.trainset.Trainset, so build one from the train file.\n",
    "trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader).build_full_trainset()\n",
    "\n",
    "# The test file goes through the same loader and is then turned into a testset.\n",
    "testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader).build_full_trainset().build_testset()\n",
    "\n",
    "algo = sp.BaselineOnly()\n",
    "# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n",
    "# Observe how bad the results of the commented-out configuration above are.\n",
    "# More details: http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n",
    "\n",
    "algo.fit(trainset)\n",
    "\n",
    "# We want to predict ratings of the (user, item) pairs which are not in the train set.\n",
    "antitrainset = trainset.build_anti_testset()\n",
    "predictions = algo.test(antitrainset)\n",
    "\n",
    "top_n = pd.DataFrame(get_top_n(predictions, n=10))\n",
    "top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 0.9495\n",
      "MAE:  0.7525\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.7524871012820799"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compute RMSE on the test set using Surprise's built-in functions.\n",
    "predictions = algo.test(testset)\n",
    "sp.accuracy.rmse(predictions, verbose=True)\n",
    "\n",
    "# Also save the (user, item, estimate) triples to a file; the remaining prediction fields are dropped.\n",
    "predictions_df = pd.DataFrame([[uid, iid, est] for uid, iid, _true_r, est, _details in predictions])\n",
    "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n",
    "\n",
    "sp.accuracy.mae(predictions, verbose=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Let's compare with random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 1.5190\n",
      "MAE:  1.2167\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1.2166585677245956"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# In Surprise, the random baseline (NormalPredictor) predicts a random value drawn from\n",
    "# a normal distribution estimated from the train set.\n",
    "# Seed both random generators so that the saved files and the metrics are the same on every run.\n",
    "RANDOM_SEED = 0\n",
    "random.seed(RANDOM_SEED)\n",
    "np.random.seed(RANDOM_SEED)\n",
    "\n",
    "algo = sp.NormalPredictor()\n",
    "algo.fit(trainset)\n",
    "\n",
    "# We want to predict ratings of the (user, item) pairs which are not in the train set.\n",
    "antitrainset = trainset.build_anti_testset()\n",
    "predictions = algo.test(antitrainset)\n",
    "\n",
    "top_n = pd.DataFrame(get_top_n(predictions, n=10))\n",
    "top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n",
    "\n",
    "# Compute RMSE on the test set using Surprise's built-in functions.\n",
    "predictions = algo.test(testset)\n",
    "sp.accuracy.rmse(predictions, verbose=True)\n",
    "\n",
    "# Also save the (user, item, estimate) triples to a file.\n",
    "predictions_df = pd.DataFrame([[uid, iid, est] for uid, iid, _true_r, est, _details in predictions])\n",
    "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n",
    "\n",
    "sp.accuracy.mae(predictions, verbose=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}