warsztaty2/P1. Baseline.ipynb
2020-06-16 19:40:37 +02:00

1377 lines
42 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preparing dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sparse\n",
"from collections import defaultdict\n",
"from itertools import chain\n",
"import random\n",
"\n",
"# Both files are read as tab-separated with header=None, so the column names are supplied explicitly\n",
"ratings_columns=['user', 'item', 'rating', 'timestamp']\n",
"train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=ratings_columns)\n",
"test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, names=ratings_columns)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Stack both splits so that user/item codes are assigned consistently across train and test\n",
"train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)\n",
"user_categorical=train_and_test['user'].astype(\"category\")\n",
"item_categorical=train_and_test['item'].astype(\"category\")\n",
"train_and_test['user_code'] = user_categorical.cat.codes\n",
"train_and_test['item_code'] = item_categorical.cat.codes\n",
"\n",
"# Lookup tables in both directions: code -> original id and original id -> code\n",
"user_code_id = dict(enumerate(user_categorical.cat.categories))\n",
"user_id_code = {original_id: code for code, original_id in user_code_id.items()}\n",
"item_code_id = dict(enumerate(item_categorical.cat.categories))\n",
"item_id_code = {original_id: code for code, original_id in item_code_id.items()}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>user_code</th>\n",
" <th>item_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>664</td>\n",
" <td>525</td>\n",
" <td>4</td>\n",
" <td>876526580</td>\n",
" <td>663</td>\n",
" <td>524</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>888068651</td>\n",
" <td>48</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>352</td>\n",
" <td>273</td>\n",
" <td>2</td>\n",
" <td>884290328</td>\n",
" <td>351</td>\n",
" <td>272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>618</td>\n",
" <td>96</td>\n",
" <td>3</td>\n",
" <td>891307749</td>\n",
" <td>617</td>\n",
" <td>95</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>560</td>\n",
" <td>24</td>\n",
" <td>2</td>\n",
" <td>879976772</td>\n",
" <td>559</td>\n",
" <td>23</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp user_code item_code\n",
"0 664 525 4 876526580 663 524\n",
"1 49 1 2 888068651 48 0\n",
"2 352 273 2 884290328 351 272\n",
"3 618 96 3 891307749 617 95\n",
"4 560 24 2 879976772 559 23"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_and_test.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Attach the integer codes to each split by joining on the original rating columns\n",
"rating_columns=list(train_read.columns)\n",
"train_df=train_read.merge(train_and_test, on=rating_columns)\n",
"test_df=test_read.merge(train_and_test, on=rating_columns)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Number of users and items: one more than the largest code\n",
"U=train_and_test['user_code'].max()+1\n",
"I=train_and_test['item_code'].max()+1\n",
"\n",
"# Sparse user x item rating matrices in CSR format; both splits share the shape (U, I)\n",
"train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))\n",
"test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# The steps above are the same for many algorithms, so the code was moved to a separate module (helpers):\n",
"import helpers\n",
"# NOTE(review): the files are re-read here without column names; helpers.data_to_csr presumably names the columns itself - confirm\n",
"train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n",
"test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
"# Returns the two CSR matrices plus the four code <-> id lookup tables, replacing the variables built by hand above\n",
"train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### CSR matrices - what are they?"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<3x4 sparse matrix of type '<class 'numpy.intc'>'\n",
"\twith 8 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Coordinate triplets: data[k] is the value stored at position (row[k], col[k])\n",
"row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
"col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
"data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
"# No shape is passed, so it is inferred from the index arrays (3x4 here)\n",
"sample_csr=sparse.csr_matrix((data, (row, col)))\n",
"sample_csr"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ratings matrix with missing entries replaced by zeros:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[4, 1, 3, 0],\n",
" [0, 2, 0, 1],\n",
" [2, 0, 5, 4]], dtype=int32)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Number of ratings: 8 \n",
"Number of users: 3 \n",
"Number of items: 4 \n",
"\n"
]
}
],
"source": [
"print('Ratings matrix with missing entries replaced by zeros:')\n",
"display(sample_csr.todense())\n",
"\n",
"# nnz counts the stored entries; rows are users and columns are items\n",
"n_users, n_items = sample_csr.shape\n",
"summary='\\nNumber of ratings: {} \\nNumber of users: {} \\nNumber of items: {} \\n'\n",
"print(summary.format(sample_csr.nnz, n_users, n_items))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ratings data: [4 1 3 2 1 2 5 4]\n",
"Regarding items: [0 1 2 1 3 0 2 3]\n",
"Where ratings from 0 to 2 belongs to user 0.\n",
"Where ratings from 3 to 4 belongs to user 1.\n",
"Where ratings from 5 to 7 belongs to user 2.\n"
]
}
],
"source": [
"# data holds the stored ratings row by row, indices holds the matching column (item) numbers\n",
"print('Ratings data:', sample_csr.data)\n",
"\n",
"print('Regarding items:', sample_csr.indices)\n",
"\n",
"# positions indptr[u] .. indptr[u+1]-1 of data/indices belong to row (user) u\n",
"for user_idx, (start, stop) in enumerate(zip(sample_csr.indptr[:-1], sample_csr.indptr[1:])):\n",
"    print('Where ratings from {} to {} belongs to user {}.'.format(start, stop-1, user_idx))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Efficient way to access items rated by user:\n"
]
},
{
"data": {
"text/plain": [
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"679 ns ± 11.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
"Inefficient way to access items rated by user:\n"
]
},
{
"data": {
"text/plain": [
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"57.1 µs ± 894 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"user=123\n",
"\n",
"# Slice the shared indices array directly with the row pointers of this user\n",
"print('Efficient way to access items rated by user:')\n",
"display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n",
"%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n",
"\n",
"# Row indexing first builds a new one-row sparse matrix and only then exposes its indices\n",
"print('Inefficient way to access items rated by user:')\n",
"display(train_ui[user].indices)\n",
"%timeit train_ui[user].indices"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Example: subtracting row means"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Our matrix:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[4, 1, 3, 0],\n",
" [0, 2, 0, 1],\n",
" [2, 0, 5, 4]], dtype=int32)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"List of row sums:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 8, 3, 11]])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Our matrix:')\n",
"display(sample_csr.todense())\n",
"print('List of row sums:')\n",
"# sum(axis=1) yields one sum per row as a column; ravel() lays them out in a single row for display\n",
"sample_csr.sum(axis=1).ravel()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Array with row means:\n"
]
},
{
"data": {
"text/plain": [
"array([2.66666667, 1.5 , 3.66666667])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Diagonal csr matrix with inverse of row sums on diagonal:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[2.66666667, 0. , 0. ],\n",
" [0. , 1.5 , 0. ],\n",
" [0. , 0. , 3.66666667]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Let's apply them in nonzero entries:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[2.66666667, 2.66666667, 2.66666667, 0. ],\n",
" [0. , 1.5 , 0. , 1.5 ],\n",
" [3.66666667, 0. , 3.66666667, 3.66666667]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finally after subtraction:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 1.33333333, -1.66666667, 0.33333333, 0. ],\n",
" [ 0. , 0.5 , 0. , -0.5 ],\n",
" [-1.66666667, 0. , 1.33333333, 0.33333333]])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Array with row means:')\n",
"# np.diff(indptr) is the number of stored ratings per row, so empty cells do not dilute the mean\n",
"row_means=np.asarray(sample_csr.sum(axis=1)).ravel()/np.diff(sample_csr.indptr)\n",
"display(row_means)\n",
"\n",
"# The diagonal holds the row means themselves (not inverses of row sums)\n",
"print('Diagonal matrix with row means on diagonal:')\n",
"display(sparse.diags(row_means).todense())\n",
"\n",
"print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n",
"# Indicator matrix with 1 in every stored entry; built explicitly because\n",
"# recent SciPy releases raise NotImplementedError for .power(0)\n",
"rated_mask=sample_csr.copy()\n",
"rated_mask.data=np.ones_like(rated_mask.data)\n",
"to_subtract=sparse.diags(row_means)*rated_mask\n",
"display(to_subtract.todense())\n",
"\n",
"print(\"Finally after subtraction:\")\n",
"sample_csr-to_subtract.todense()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Transposing"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sample matrix: \n",
" [[4 1 3 0]\n",
" [0 2 0 1]\n",
" [2 0 5 4]]\n",
"\n",
"Indices: \n",
" [0 1 2 1 3 0 2 3]\n",
"\n",
"Transposed matrix: \n",
" [[4 0 2]\n",
" [1 2 0]\n",
" [3 0 5]\n",
" [0 1 4]]\n",
"\n",
"Indices of transposed matrix: \n",
" [0 1 2 1 3 0 2 3]\n",
"\n",
"Reason: <class 'scipy.sparse.csc.csc_matrix'>\n",
"\n",
"After converting to csr: \n",
" [0 2 0 1 0 2 1 2]\n"
]
}
],
"source": [
"import numpy as np\n",
"from scipy import sparse\n",
"row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
"col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
"data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
"sample=sparse.csr_matrix((data, (row, col)))\n",
"# .toarray() is used instead of the .A shortcut, which newer SciPy releases deprecate\n",
"print('Sample matrix: \\n', sample.toarray())\n",
"print('\\nIndices: \\n', sample.indices)\n",
"transposed=sample.transpose()\n",
"print('\\nTransposed matrix: \\n', transposed.toarray())\n",
"print('\\nIndices of transposed matrix: \\n', transposed.indices)\n",
"\n",
"# The indices are unchanged because transpose() returns a CSC matrix, as the printed type shows\n",
"print('\\nReason: ', type(transposed))\n",
"\n",
"# Converting back to CSR is what actually rearranges the indices\n",
"print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Self made top popular"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# makedirs(..., exist_ok=True) creates missing parent folders and does nothing for folders that already exist,\n",
"# so a partially created output tree is completed instead of being skipped\n",
"for output_dir in ('./Recommendations generated/ml-100k/', './Recommendations generated/toy-example/'):\n",
"    os.makedirs(output_dir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Popularity of an item = number of training ratings it received,\n",
"# i.e. the number of stored entries in its row of the item x user matrix\n",
"train_iu=train_ui.transpose().tocsr()\n",
"item_popularity=np.diff(train_iu.indptr)\n",
"# Rescale so that the most popular item scores as high as the largest rating in the training data\n",
"scaling_factor=train_ui.max()/item_popularity.max()\n",
"\n",
"# TopPop is a list of pairs (item, rescaled_popularity) sorted descending from the most popular\n",
"TopPop=[(i, item_popularity[i]*scaling_factor) for i in range(train_iu.shape[0])]\n",
"TopPop.sort(key=lambda x: x[1], reverse=True)\n",
"\n",
"k=10 # number of recommendations per user\n",
"result=[]\n",
"\n",
"for u in range(train_ui.shape[0]):\n",
"    # a set makes the membership test below constant-time\n",
"    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())\n",
"    rec_user=[]\n",
"    # iterating (instead of indexing by position) cannot run past the end of TopPop\n",
"    # when a user has fewer than k unseen items\n",
"    for item, score in TopPop:\n",
"        if len(rec_user)>=k:\n",
"            break\n",
"        if item not in user_rated:\n",
"            rec_user.append((item_code_id[item], score))\n",
"    # row format: user, item1, score1, item2, score2, ...\n",
"    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
"\n",
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n",
"\n",
"\n",
"# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
"\n",
"estimations=[]\n",
"\n",
"for user, item in zip(*test_ui.nonzero()):\n",
"    estimations.append([user_code_id[user], item_code_id[item], item_popularity[item]*scaling_factor])\n",
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Self made global average"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Every item gets the same score: the mean of all training ratings\n",
"avg=train_ui.sum()/train_ui.nnz\n",
"\n",
"# train_ui.shape[1] is the number of items, so this cell does not rely on train_iu from an earlier cell\n",
"GlobalAvg=[(i, avg) for i in range(train_ui.shape[1])]\n",
"\n",
"k=10 # number of recommendations per user\n",
"result=[]\n",
"\n",
"for u in range(train_ui.shape[0]):\n",
"    # a set makes the membership test below constant-time\n",
"    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())\n",
"    rec_user=[]\n",
"    # all scores are equal, so this picks the first k items (in code order) the user has not rated;\n",
"    # iterating cannot run past the end of the list when fewer than k unseen items exist\n",
"    for item, score in GlobalAvg:\n",
"        if len(rec_user)>=k:\n",
"            break\n",
"        if item not in user_rated:\n",
"            rec_user.append((item_code_id[item], score))\n",
"    # row format: user, item1, score1, item2, score2, ...\n",
"    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
"\n",
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_reco.csv', index=False, header=False)\n",
"\n",
"\n",
"# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
"\n",
"estimations=[]\n",
"\n",
"for user, item in zip(*test_ui.nonzero()):\n",
"    estimations.append([user_code_id[user], item_code_id[item], avg])\n",
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>13</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>3.529975</td>\n",
" <td>10</td>\n",
" <td>3.529975</td>\n",
" <td>25</td>\n",
" <td>3.529975</td>\n",
" <td>32</td>\n",
" <td>3.529975</td>\n",
" <td>33</td>\n",
" <td>...</td>\n",
" <td>44</td>\n",
" <td>3.529975</td>\n",
" <td>46</td>\n",
" <td>3.529975</td>\n",
" <td>50</td>\n",
" <td>3.529975</td>\n",
" <td>52</td>\n",
" <td>3.529975</td>\n",
" <td>55</td>\n",
" <td>3.529975</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3.529975</td>\n",
" <td>2</td>\n",
" <td>3.529975</td>\n",
" <td>3</td>\n",
" <td>3.529975</td>\n",
" <td>4</td>\n",
" <td>3.529975</td>\n",
" <td>5</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>3.529975</td>\n",
" <td>7</td>\n",
" <td>3.529975</td>\n",
" <td>8</td>\n",
" <td>3.529975</td>\n",
" <td>9</td>\n",
" <td>3.529975</td>\n",
" <td>11</td>\n",
" <td>3.529975</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9 ... 11 \\\n",
"0 1 5 3.529975 10 3.529975 25 3.529975 32 3.529975 33 ... 44 \n",
"1 2 1 3.529975 2 3.529975 3 3.529975 4 3.529975 5 ... 6 \n",
"\n",
" 12 13 14 15 16 17 18 19 20 \n",
"0 3.529975 46 3.529975 50 3.529975 52 3.529975 55 3.529975 \n",
"1 3.529975 7 3.529975 8 3.529975 9 3.529975 11 3.529975 \n",
"\n",
"[2 rows x 21 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(result).head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Project task 1 - self made top rated"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# project task 1: implement TopRated\n",
"# Implement a recommender system that recommends the movies with the highest average rating which the user hasn't seen\n",
"# The output should be saved in 'Recommendations generated/ml-100k/Self_TopRated_reco.csv'\n",
"# and 'Recommendations generated/ml-100k/Self_TopRated_estimations.csv'"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Average training rating per item; the counts are the stored entries in each row of the item x user matrix\n",
"train_iu=train_ui.transpose().tocsr()\n",
"ratings_per_item=np.diff(train_iu.indptr)\n",
"rating_sums=np.asarray(train_iu.sum(axis=1)).ravel()\n",
"# Items without any training rating have no average of their own: fall back to the global mean rating\n",
"global_mean=train_ui.sum()/train_ui.nnz\n",
"item_avg=np.divide(rating_sums, ratings_per_item,\n",
"                   out=np.full(train_iu.shape[0], global_mean, dtype=float), where=ratings_per_item!=0)\n",
"\n",
"# TopRated is a list of pairs (item, average_rating) sorted descending; unrated items are never recommended\n",
"TopRated=[(i, item_avg[i]) for i in range(train_iu.shape[0]) if ratings_per_item[i]!=0]\n",
"TopRated.sort(key=lambda x: x[1], reverse=True)\n",
"\n",
"k=10 # number of recommendations per user\n",
"result=[]\n",
"for u in range(train_ui.shape[0]):\n",
"    # a set makes the membership test below constant-time\n",
"    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())\n",
"    rec_user=[]\n",
"    # iterating cannot run past the end of TopRated when a user has fewer than k unseen items\n",
"    for item, score in TopRated:\n",
"        if len(rec_user)>=k:\n",
"            break\n",
"        if item not in user_rated:\n",
"            rec_user.append((item_code_id[item], score))\n",
"    # row format: user, item1, score1, item2, score2, ...\n",
"    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)\n",
"\n",
"# The estimated rating is the item's average training rating\n",
"# (previously the rescaled popularity copied from the TopPop cell was written here)\n",
"estimations=[]\n",
"for user, item in zip(*test_ui.nonzero()):\n",
"    estimations.append([user_code_id[user], item_code_id[item], item_avg[item]])\n",
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Self-made baseline"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"class selfBaselineUI():\n",
"    \"\"\"Baseline rating predictor: estimate(user, item) = row_means[user] + col_means[item].\n",
"\n",
"    Row (user) means are removed from the ratings first; column (item) means are then\n",
"    computed from what is left.\n",
"    \"\"\"\n",
"\n",
"    def fit(self, train_ui):\n",
"        \"\"\"Learn row and column means from a user x item CSR rating matrix.\n",
"\n",
"        Sets self.row_means (one value per user) and self.col_means (one value per item).\n",
"        Returns a CSR matrix with the same stored entries as train_ui, holding the ratings\n",
"        after both means were subtracted.\n",
"        \"\"\"\n",
"        self.train_ui=train_ui.copy()\n",
"        self.train_iu=train_ui.transpose().tocsr()\n",
"\n",
"        result=self.train_ui.copy()\n",
"\n",
"        # number of stored ratings per user (row) and per item (column)\n",
"        ratings_per_user=np.diff(result.indptr)\n",
"        ratings_per_item=np.diff(self.train_iu.indptr)\n",
"\n",
"        # users without ratings get mean 0 instead of nan from 0/0\n",
"        self.row_means=np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,\n",
"                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)\n",
"\n",
"        # Subtract directly on .data so that only stored entries change and entries that become 0 stay stored.\n",
"        # np.repeat lays out each row mean once per stored entry of that row, matching the CSR layout of .data\n",
"        # (this replaces the .power(0) indicator matrix, which recent SciPy releases reject)\n",
"        result.data=result.data-np.repeat(self.row_means, ratings_per_user)\n",
"\n",
"        # items without ratings get mean 0\n",
"        self.col_means=np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,\n",
"                                 out=np.zeros(result.shape[1]), where=ratings_per_item!=0)\n",
"\n",
"        # result.indices holds the column of every stored entry, so this picks the matching column mean\n",
"        result.data=result.data-self.col_means[result.indices]\n",
"\n",
"        return result\n",
"\n",
"\n",
"    def recommend(self, user_code_id, item_code_id, topK=10):\n",
"        \"\"\"Return the topK best-scored items each user has not rated in the training data.\n",
"\n",
"        user_code_id / item_code_id map internal codes to original ids.\n",
"        Each returned row has the form [user, item1, score1, item2, score2, ...];\n",
"        users without any unrated item are left out.\n",
"        \"\"\"\n",
"        result=[]\n",
"        for nb_user in range(self.train_ui.shape[0]):\n",
"            # scores of one user only - avoids building a dense users x items matrix\n",
"            scores=self.row_means[nb_user]+self.col_means\n",
"\n",
"            user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n",
"            unseen=np.ones(scores.shape[0], dtype=bool)\n",
"            unseen[user_rated]=False\n",
"            candidates=np.flatnonzero(unseen)\n",
"            if len(candidates)==0:\n",
"                continue\n",
"\n",
"            # stable sort of negated scores: descending by score, ties stay in ascending item order\n",
"            best=candidates[np.argsort(-scores[candidates], kind='stable')[:topK]]\n",
"            item_scores=[(item_code_id[int(item)], scores[item]) for item in best]\n",
"            result.append([user_code_id[nb_user]]+list(chain(*item_scores)))\n",
"        return result\n",
"\n",
"    def estimate(self, user_code_id, item_code_id, test_ui):\n",
"        \"\"\"Return [user, item, estimated_rating] for every nonzero entry of test_ui.\"\"\"\n",
"        return [[user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]]\n",
"                for user, item in zip(*test_ui.nonzero())]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
" [0, 1, 2, 3, 0, 0, 0, 0],\n",
" [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"After subtracting rows and columns:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , -0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Recommend best unseen item:\n"
]
},
{
"data": {
"text/plain": [
"[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Print estimations on unseen items:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>est_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>60</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>40</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20</td>\n",
" <td>0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20</td>\n",
" <td>20</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20</td>\n",
" <td>70</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item est_score\n",
"0 0 60 4.0\n",
"1 10 40 3.0\n",
"2 20 0 3.0\n",
"3 20 20 4.0\n",
"4 20 70 4.0"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"toy_columns=['user', 'item', 'rating', 'timestamp']\n",
"toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=toy_columns)\n",
"toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=toy_columns)\n",
"\n",
"(toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code,\n",
" toy_item_code_id, toy_item_id_code) = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
"\n",
"print('Training data:')\n",
"display(toy_train_ui.todense())\n",
"\n",
"# fit() returns the residual ratings left after removing row and column means\n",
"model=selfBaselineUI()\n",
"print('After subtracting rows and columns:')\n",
"toy_residuals=model.fit(toy_train_ui)\n",
"display(toy_residuals.todense())\n",
"\n",
"print('Recommend best unseen item:')\n",
"display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
"\n",
"print('Print estimations on unseen items:')\n",
"estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
"estimations.columns=['user', 'item', 'est_score']\n",
"display(estimations)\n",
"\n",
"top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
"\n",
"top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n",
"\n",
"# recompute so the saved frame is exactly what estimate() returns; header=False keeps column names out of the file\n",
"estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
"estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Fit the baseline on the ml-100k training matrix (the returned residual matrix is not needed here)\n",
"model=selfBaselineUI()\n",
"model.fit(train_ui)\n",
"\n",
"# Top-10 recommendations per user, one row per user: user, item1, score1, item2, score2, ...\n",
"top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n",
"\n",
"# Estimated ratings for the (user, item) pairs of the test matrix\n",
"estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
"estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# project task 2: implement self-made BaselineIU"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Implement a recommender system similar to BaselineUI which recommends movies the user hasn't seen,\n",
"# but subtract the column means first and the row means second\n",
"# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'\n",
"# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"class selfBaselineIU():\n",
"    \"\"\"Baseline rating predictor: estimate(user, item) = row_means[user] + col_means[item].\n",
"\n",
"    Column (item) means are removed from the ratings first; row (user) means are then\n",
"    computed from what is left.\n",
"    \"\"\"\n",
"\n",
"    def fit(self, train_ui):\n",
"        \"\"\"Learn column and row means from a user x item CSR rating matrix.\n",
"\n",
"        Sets self.col_means (one value per item) and self.row_means (one value per user).\n",
"        Returns a CSR matrix with the same stored entries as train_ui, holding the ratings\n",
"        after both means were subtracted.\n",
"        \"\"\"\n",
"        self.train_ui=train_ui.copy()\n",
"        self.train_iu=train_ui.transpose().tocsr()\n",
"\n",
"        result=self.train_ui.copy()\n",
"\n",
"        # number of stored ratings per user (row) and per item (column)\n",
"        ratings_per_user=np.diff(result.indptr)\n",
"        ratings_per_item=np.diff(self.train_iu.indptr)\n",
"\n",
"        # items without ratings get mean 0\n",
"        self.col_means=np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,\n",
"                                 out=np.zeros(result.shape[1]), where=ratings_per_item!=0)\n",
"\n",
"        # Subtract directly on .data so that only stored entries change and entries that become 0 stay stored.\n",
"        # result.indices holds the column of every stored entry, so this picks the matching column mean\n",
"        # (this replaces the .power(0) indicator matrix, which recent SciPy releases reject)\n",
"        result.data=result.data-self.col_means[result.indices]\n",
"\n",
"        # users without ratings get mean 0 instead of nan from 0/0\n",
"        self.row_means=np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,\n",
"                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)\n",
"\n",
"        # np.repeat lays out each row mean once per stored entry of that row, matching the CSR layout of .data\n",
"        result.data=result.data-np.repeat(self.row_means, ratings_per_user)\n",
"\n",
"        return result\n",
"\n",
"\n",
"    def recommend(self, user_code_id, item_code_id, topK=10):\n",
"        \"\"\"Return the topK best-scored items each user has not rated in the training data.\n",
"\n",
"        user_code_id / item_code_id map internal codes to original ids.\n",
"        Each returned row has the form [user, item1, score1, item2, score2, ...];\n",
"        users without any unrated item are left out.\n",
"        \"\"\"\n",
"        result=[]\n",
"        for nb_user in range(self.train_ui.shape[0]):\n",
"            # scores of one user only - avoids building a dense users x items matrix\n",
"            scores=self.row_means[nb_user]+self.col_means\n",
"\n",
"            user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n",
"            unseen=np.ones(scores.shape[0], dtype=bool)\n",
"            unseen[user_rated]=False\n",
"            candidates=np.flatnonzero(unseen)\n",
"            if len(candidates)==0:\n",
"                continue\n",
"\n",
"            # stable sort of negated scores: descending by score, ties stay in ascending item order\n",
"            best=candidates[np.argsort(-scores[candidates], kind='stable')[:topK]]\n",
"            item_scores=[(item_code_id[int(item)], scores[item]) for item in best]\n",
"            result.append([user_code_id[nb_user]]+list(chain(*item_scores)))\n",
"        return result\n",
"\n",
"    def estimate(self, user_code_id, item_code_id, test_ui):\n",
"        \"\"\"Return [user, item, estimated_rating] for every nonzero entry of test_ui.\"\"\"\n",
"        return [[user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]]\n",
"                for user, item in zip(*test_ui.nonzero())]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ready-made baseline - Surprise implementation"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Estimating biases using als...\n"
]
}
],
"source": [
"import surprise as sp\n",
"import time\n",
"\n",
"# Based on surprise.readthedocs.io\n",
"def get_top_n(predictions, n=10):\n",
"    \"\"\"Turn predictions into rows of the form [user, item1, score1, item2, score2, ...].\n",
"\n",
"    predictions is an iterable of 5-tuples (uid, iid, true_r, est, details);\n",
"    only uid, iid and est are used. At most n items are kept per user, best score first.\n",
"    \"\"\"\n",
"    # group (item, score) pairs by user\n",
"    per_user = defaultdict(list)\n",
"    for uid, iid, _true_r, est, _details in predictions:\n",
"        per_user[uid].append((iid, est))\n",
"\n",
"    # keep the n best-scored items of each user, flattened into a single row\n",
"    rows=[]\n",
"    for uid, user_ratings in per_user.items():\n",
"        best=sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]\n",
"        rows.append([uid]+list(chain(*best)))\n",
"    return rows\n",
"\n",
"\n",
"reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n",
"trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)\n",
"trainset = trainset.build_full_trainset() # <class 'surprise.trainset.Trainset'> -> the object Surprise algorithms are fitted on\n",
"\n",
"testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)\n",
"testset = sp.Trainset.build_testset(testset.build_full_trainset())\n",
"\n",
"algo = sp.BaselineOnly()\n",
"# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n",
"# observe how poor the results of the algorithm above are\n",
"# more details: http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n",
"\n",
"algo.fit(trainset)\n",
"\n",
"antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n",
"predictions = algo.test(antitrainset)\n",
"\n",
"# keep the 10 best-scored unseen items per user\n",
"top_n = get_top_n(predictions, n=10)\n",
"\n",
"top_n=pd.DataFrame(top_n)\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 0.9495\n",
"MAE: 0.7525\n"
]
},
{
"data": {
"text/plain": [
"0.7524871012820799"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compute RMSE on the test set using built-in functions\n",
"predictions = algo.test(testset)\n",
"sp.accuracy.rmse(predictions, verbose=True)\n",
"\n",
"# Also save the estimations to a file as rows of (user, item, estimated rating)\n",
"predictions_df=pd.DataFrame([[uid, iid, est] for uid, iid, true_r, est, _ in predictions])\n",
"predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n",
"\n",
"sp.accuracy.mae(predictions, verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Let's compare with random"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 1.5100\n",
"MAE: 1.2118\n"
]
},
{
"data": {
"text/plain": [
"1.211847558071457"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# In Surprise, the random algorithm (NormalPredictor) predicts values drawn from a normal distribution estimated from the train set\n",
"# Seed both generators first (as the Surprise FAQ recommends) so that the saved files and metrics are reproducible\n",
"SEED=0\n",
"random.seed(SEED)\n",
"np.random.seed(SEED)\n",
"\n",
"algo = sp.NormalPredictor()\n",
"algo.fit(trainset)\n",
"\n",
"antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n",
"predictions = algo.test(antitrainset)\n",
"\n",
"top_n = get_top_n(predictions, n=10)\n",
"\n",
"top_n=pd.DataFrame(top_n)\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n",
"\n",
"# Compute RMSE on the test set using built-in functions\n",
"predictions = algo.test(testset)\n",
"sp.accuracy.rmse(predictions, verbose=True)\n",
"\n",
"# Also save the estimations to a file as rows of (user, item, estimated rating)\n",
"predictions_df=pd.DataFrame([[uid, iid, est] for uid, iid, true_r, est, _ in predictions])\n",
"predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n",
"\n",
"sp.accuracy.mae(predictions, verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}