WSS-project/P1. Baseline.ipynb

1528 lines
47 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preparing dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sparse\n",
"from collections import defaultdict\n",
"from itertools import chain\n",
"import random\n",
"\n",
"train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
"test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Let's prepare dataset\n",
"train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)\n",
"train_and_test['user_code'] = train_and_test['user'].astype(\"category\").cat.codes\n",
"train_and_test['item_code'] = train_and_test['item'].astype(\"category\").cat.codes\n",
"\n",
"user_code_id = dict(enumerate(train_and_test['user'].astype(\"category\").cat.categories))\n",
"user_id_code = dict((v, k) for k, v in user_code_id.items())\n",
"item_code_id = dict(enumerate(train_and_test['item'].astype(\"category\").cat.categories))\n",
"item_id_code = dict((v, k) for k, v in item_code_id.items())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>user_code</th>\n",
" <th>item_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>664</td>\n",
" <td>525</td>\n",
" <td>4</td>\n",
" <td>876526580</td>\n",
" <td>663</td>\n",
" <td>524</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>888068651</td>\n",
" <td>48</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>352</td>\n",
" <td>273</td>\n",
" <td>2</td>\n",
" <td>884290328</td>\n",
" <td>351</td>\n",
" <td>272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>618</td>\n",
" <td>96</td>\n",
" <td>3</td>\n",
" <td>891307749</td>\n",
" <td>617</td>\n",
" <td>95</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>560</td>\n",
" <td>24</td>\n",
" <td>2</td>\n",
" <td>879976772</td>\n",
" <td>559</td>\n",
" <td>23</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp user_code item_code\n",
"0 664 525 4 876526580 663 524\n",
"1 49 1 2 888068651 48 0\n",
"2 352 273 2 884290328 351 272\n",
"3 618 96 3 891307749 617 95\n",
"4 560 24 2 879976772 559 23"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_and_test[:5]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))\n",
"test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Take number of users and items\n",
"(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)\n",
"\n",
"# Create sparse csr matrices\n",
"train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))\n",
"test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Above steps are the same for many algorithms, so I put the code in separate file:\n",
"import helpers\n",
"train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n",
"test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
"train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### CSR matrices - what is it?"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<3x4 sparse matrix of type '<class 'numpy.intc'>'\n",
"\twith 8 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
"col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
"data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
"sample_csr=sparse.csr_matrix((data, (row, col)))\n",
"sample_csr"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ratings matrix with missing entries replaced by zeros:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[4, 1, 3, 0],\n",
" [0, 2, 0, 1],\n",
" [2, 0, 5, 4]], dtype=int32)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of ratings: 8\n",
"Number of users: 3\n",
"Number of items: 4\n"
]
}
],
"source": [
"print('Ratings matrix with missing entries replaced by zeros:')\n",
"display(sample_csr.todense())\n",
"\n",
"print(f'Number of ratings: {sample_csr.nnz}')\n",
"print(f'Number of users: {sample_csr.shape[0]}')\n",
"print(f'Number of items: {sample_csr.shape[1]}')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ratings data: [4 1 3 2 1 2 5 4]\n",
"Regarding items: [0 1 2 1 3 0 2 3]\n",
"Where ratings from 0 to 2 belongs to user 0.\n",
"Where ratings from 3 to 4 belongs to user 1.\n",
"Where ratings from 5 to 7 belongs to user 2.\n"
]
}
],
"source": [
"print('Ratings data:', sample_csr.data)\n",
"\n",
"print('Regarding items:', sample_csr.indices)\n",
"\n",
"for i in range(sample_csr.shape[0]):\n",
" print(f'Where ratings from {sample_csr.indptr[i]} to {sample_csr.indptr[i+1]-1} belongs to user {i}.')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Efficient way to access items rated by user:\n"
]
},
{
"data": {
"text/plain": [
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"471 ns ± 15.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
"Inefficient way to access items rated by user:\n"
]
},
{
"data": {
"text/plain": [
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"48.3 µs ± 1.51 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"user=123\n",
"\n",
"print('Efficient way to access items rated by user:')\n",
"display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n",
"%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n",
"\n",
"print('Inefficient way to access items rated by user:')\n",
"display(train_ui[user].indices)\n",
"%timeit train_ui[user].indices"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Example: subtracting row means"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Our matrix:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[4, 1, 3, 0],\n",
" [0, 2, 0, 1],\n",
" [2, 0, 5, 4]], dtype=int32)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"List of row sums:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 8, 3, 11]])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Our matrix:')\n",
"display(sample_csr.todense())\n",
"print('List of row sums:')\n",
"sample_csr.sum(axis=1).ravel()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Array with row means:\n"
]
},
{
"data": {
"text/plain": [
"array([2.66666667, 1.5 , 3.66666667])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Diagonal csr matrix with inverse of row sums on diagonal:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[2.66666667, 0. , 0. ],\n",
" [0. , 1.5 , 0. ],\n",
" [0. , 0. , 3.66666667]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Let's apply them in nonzero entries:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[2.66666667, 2.66666667, 2.66666667, 0. ],\n",
" [0. , 1.5 , 0. , 1.5 ],\n",
" [3.66666667, 0. , 3.66666667, 3.66666667]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finally after subtraction:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 1.33333333, -1.66666667, 0.33333333, 0. ],\n",
" [ 0. , 0.5 , 0. , -0.5 ],\n",
" [-1.66666667, 0. , 1.33333333, 0.33333333]])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Array with row means:')\n",
"# row sums divided by the number of stored ratings in each row (read from indptr)\n",
"row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)\n",
"display(row_means)\n",
"\n",
"# the diagonal holds the row means themselves, not inverses of row sums\n",
"print('Diagonal sparse matrix with row means on diagonal:')\n",
"display(sparse.diags(row_means).todense())\n",
"\n",
"print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n",
"# left-multiplying the 0/1 pattern of rated entries by the diagonal matrix puts\n",
"# each row's mean in every position where that row has a rating\n",
"to_subtract=sparse.diags(row_means)*(sample_csr>0)\n",
"display(to_subtract.todense())\n",
"\n",
"print(\"Finally after subtraction:\")\n",
"sample_csr-to_subtract.todense()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Transposing"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sample matrix: \n",
" [[4 1 3 0]\n",
" [0 2 0 1]\n",
" [2 0 5 4]]\n",
"\n",
"Indices: \n",
" [0 1 2 1 3 0 2 3]\n",
"\n",
"Transposed matrix: \n",
" [[4 0 2]\n",
" [1 2 0]\n",
" [3 0 5]\n",
" [0 1 4]]\n",
"\n",
"Indices of transposed matrix: \n",
" [0 1 2 1 3 0 2 3]\n",
"\n",
"Reason: <class 'scipy.sparse.csc.csc_matrix'>\n",
"\n",
"After converting to csr: \n",
" [0 2 0 1 0 2 1 2]\n"
]
}
],
"source": [
"import numpy as np\n",
"from scipy import sparse\n",
"row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
"col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
"data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
"sample=sparse.csr_matrix((data, (row, col)))\n",
"print('Sample matrix: \\n', sample.A)\n",
"print('\\nIndices: \\n', sample.indices)\n",
"transposed=sample.transpose()\n",
"print('\\nTransposed matrix: \\n', transposed.A)\n",
"print('\\nIndices of transposed matrix: \\n', transposed.indices)\n",
"\n",
"print('\\nReason: ', type(transposed))\n",
"\n",
"print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Self made top popular"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Create every output directory the notebook writes to. makedirs(exist_ok=True)\n",
"# is idempotent, so a partially created tree (e.g. only the top-level folder\n",
"# exists) is completed instead of being skipped.\n",
"for output_dir in ('./Recommendations generated/ml-100k/',\n",
"                   './Recommendations generated/toy-example/',\n",
"                   './Recommendations generated/Projects/'):\n",
"    os.makedirs(output_dir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"train_iu = train_ui.transpose().tocsr()\n",
"item_popularity = np.diff(train_iu.indptr)  # number of stored training ratings per item\n",
"# rescale the counts so that the most popular item scores the maximal rating\n",
"scaling_factor = train_ui.max()/item_popularity.max()\n",
"\n",
"# top_pop is a list of pairs (item, rescaled_popularity) sorted descending from the most popular\n",
"top_pop = [(i, count*scaling_factor) for i, count in enumerate(item_popularity)]\n",
"top_pop.sort(key=lambda x: x[1], reverse=True)\n",
"\n",
"k = 10  # number of recommendations per user\n",
"result = []\n",
"\n",
"for u in range(train_ui.shape[0]):\n",
"    # a set gives O(1) membership tests instead of scanning a numpy array\n",
"    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]])\n",
"    rec_user = []\n",
"    # iterating over top_pop (instead of indexing with a counter) cannot run past\n",
"    # the end of the list when a user has fewer than k unrated items\n",
"    for item, score in top_pop:\n",
"        if len(rec_user) == k:\n",
"            break\n",
"        if item not in user_rated:\n",
"            rec_user.append((item_code_id[item], score))\n",
"    # row format: user, item1, score1, item2, score2, ...\n",
"    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
"\n",
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n",
"\n",
"\n",
"# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
"\n",
"estimations = [[user_code_id[user], item_code_id[item], item_popularity[item]*scaling_factor]\n",
"               for user, item in zip(*test_ui.nonzero())]\n",
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Self made top rated"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# recomputed here so the cell does not depend on the top-popular cell having run\n",
"train_iu = train_ui.transpose().tocsr()\n",
"# fallback score for items that have no rating in the training set\n",
"global_avg = train_iu.data.sum()/train_iu.nnz\n",
"\n",
"# top_rated is a list of pairs (item, average_rating) sorted descending from the best rated\n",
"top_rated = []\n",
"for i in range(train_iu.shape[0]):\n",
"    ratings = train_iu.data[train_iu.indptr[i]:train_iu.indptr[i+1]]\n",
"    avg = np.mean(ratings) if len(ratings)>0 else global_avg\n",
"    top_rated.append((i, avg))\n",
"\n",
"top_rated.sort(key=lambda x: x[1], reverse=True)\n",
"\n",
"k = 10  # number of recommendations per user\n",
"result = []\n",
"\n",
"for u in range(train_ui.shape[0]):\n",
"    # a set gives O(1) membership tests instead of scanning a numpy array\n",
"    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]])\n",
"    rec_user = []\n",
"    # iterating over top_rated (instead of indexing with a counter) cannot run past\n",
"    # the end of the list when a user has fewer than k unrated items\n",
"    for item, score in top_rated:\n",
"        if len(rec_user) == k:\n",
"            break\n",
"        if item not in user_rated:\n",
"            rec_user.append((item_code_id[item], score))\n",
"    # row format: user, item1, score1, item2, score2, ...\n",
"    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
"\n",
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)\n",
"\n",
"\n",
"# estimations - the predicted score of an item is its average training rating\n",
"d = dict(top_rated)\n",
"estimations = [[user_code_id[user], item_code_id[item], d[item]]\n",
"               for user, item in zip(*test_ui.nonzero())]\n",
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>13</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>814</td>\n",
" <td>5.0</td>\n",
" <td>1122</td>\n",
" <td>5.0</td>\n",
" <td>1189</td>\n",
" <td>5.0</td>\n",
" <td>1201</td>\n",
" <td>5.0</td>\n",
" <td>1293</td>\n",
" <td>...</td>\n",
" <td>1306</td>\n",
" <td>5.0</td>\n",
" <td>1467</td>\n",
" <td>5.0</td>\n",
" <td>1491</td>\n",
" <td>5.0</td>\n",
" <td>1500</td>\n",
" <td>5.0</td>\n",
" <td>1536</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>119</td>\n",
" <td>5.0</td>\n",
" <td>814</td>\n",
" <td>5.0</td>\n",
" <td>1122</td>\n",
" <td>5.0</td>\n",
" <td>1189</td>\n",
" <td>5.0</td>\n",
" <td>1201</td>\n",
" <td>...</td>\n",
" <td>1293</td>\n",
" <td>5.0</td>\n",
" <td>1306</td>\n",
" <td>5.0</td>\n",
" <td>1467</td>\n",
" <td>5.0</td>\n",
" <td>1491</td>\n",
" <td>5.0</td>\n",
" <td>1500</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9 ... 11 12 13 \\\n",
"0 1 814 5.0 1122 5.0 1189 5.0 1201 5.0 1293 ... 1306 5.0 1467 \n",
"1 2 119 5.0 814 5.0 1122 5.0 1189 5.0 1201 ... 1293 5.0 1306 \n",
"\n",
" 14 15 16 17 18 19 20 \n",
"0 5.0 1491 5.0 1500 5.0 1536 5.0 \n",
"1 5.0 1467 5.0 1491 5.0 1500 5.0 \n",
"\n",
"[2 rows x 21 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(result)[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Self-made baseline"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"class selfBaselineUI():\n",
"    \"\"\"Baseline recommender predicting r(u, i) = b_u + b_i.\n",
"\n",
"    b_u is the mean of the ratings given by user u; b_i is the mean of what is\n",
"    left of item i's ratings once the user means have been subtracted.\n",
"    \"\"\"\n",
"\n",
"    def fit(self, train_ui):\n",
"        \"\"\"Estimate user and item biases from a user-item csr matrix of ratings.\n",
"\n",
"        Stores the biases in self.row_means (users) and self.col_means (items);\n",
"        users or items without any stored rating get a bias of 0.\n",
"        Returns a csr matrix with the sparsity pattern of train_ui holding the\n",
"        residuals rating - b_u - b_i.\n",
"        \"\"\"\n",
"        self.train_ui=train_ui.copy()\n",
"        self.train_iu=train_ui.transpose().tocsr()\n",
"\n",
"        result=self.train_ui.copy()\n",
"        ratings_per_user=np.diff(result.indptr)\n",
"        ratings_per_item=np.diff(self.train_iu.indptr)\n",
"\n",
"        # where= keeps the preset 0 for users without ratings instead of NaN from 0/0\n",
"        self.row_means=np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,\n",
"                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)\n",
"\n",
"        # We work directly on .data: sparse arithmetic would drop entries that become 0.\n",
"        # np.repeat expands each row mean to one value per stored rating of that row.\n",
"        result.data=result.data-np.repeat(self.row_means, ratings_per_user)\n",
"\n",
"        # handling items without ratings the same way as users without ratings\n",
"        self.col_means=np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,\n",
"                                 out=np.zeros(result.shape[1]), where=ratings_per_item!=0)\n",
"\n",
"        # result.indices holds the column of every stored rating\n",
"        result.data=result.data-self.col_means[result.indices]\n",
"\n",
"        return result\n",
"\n",
"\n",
"    def recommend(self, user_code_id, item_code_id, topK=10):\n",
"        \"\"\"Return the topK best scored items each user has not rated in training.\n",
"\n",
"        user_code_id / item_code_id map matrix row / column numbers to original ids.\n",
"        Every row of the result has the format (user, item1, score1, item2, score2, ...).\n",
"        Equal scores are ordered by ascending item code. Users who rated every\n",
"        item are omitted.\n",
"        \"\"\"\n",
"        nb_items=self.train_ui.shape[1]\n",
"        result=[]\n",
"        for nb_user in range(self.train_ui.shape[0]):\n",
"            scores=self.row_means[nb_user]+self.col_means\n",
"\n",
"            unseen=np.ones(nb_items, dtype=bool)\n",
"            unseen[self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]]=False\n",
"            candidates=np.flatnonzero(unseen)\n",
"            if len(candidates)==0:\n",
"                continue\n",
"\n",
"            # stable sort of negated scores = descending scores, ties keep item order\n",
"            best=candidates[np.argsort(-scores[candidates], kind='stable')[:topK]]\n",
"\n",
"            row=[user_code_id[nb_user]]\n",
"            for item in best:\n",
"                row.extend((item_code_id[int(item)], scores[item]))\n",
"            result.append(row)\n",
"        return result\n",
"\n",
"    def estimate(self, user_code_id, item_code_id, test_ui):\n",
"        \"\"\"Return [user, item, b_u + b_i] for every stored nonzero entry of test_ui.\"\"\"\n",
"        return [[user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]]\n",
"                for user, item in zip(*test_ui.nonzero())]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
" [0, 1, 2, 3, 0, 0, 0, 0],\n",
" [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"After subtracting rows and columns:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , -0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Recommend best unseen item:\n"
]
},
{
"data": {
"text/plain": [
"[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Print estimations on unseen items:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>est_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>60</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>40</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20</td>\n",
" <td>0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20</td>\n",
" <td>20</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20</td>\n",
" <td>70</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item est_score\n",
"0 0 60 4.0\n",
"1 10 40 3.0\n",
"2 20 0 3.0\n",
"3 20 20 4.0\n",
"4 20 70 4.0"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
"toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
"\n",
"toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n",
"toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
"\n",
"print('Training data:')\n",
"display(toy_train_ui.todense())\n",
"\n",
"model=selfBaselineUI()\n",
"print('After subtracting rows and columns:')\n",
"display(model.fit(toy_train_ui).todense())\n",
"\n",
"print('Recommend best unseen item:')\n",
"display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
"\n",
"print('Print estimations on unseen items:')\n",
"estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
"estimations.columns=['user', 'item', 'est_score']\n",
"display(estimations)\n",
"\n",
"top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
"\n",
"top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n",
"\n",
"estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
"estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"model=selfBaselineUI()\n",
"model.fit(train_ui)\n",
"\n",
"top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n",
"\n",
"estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
"estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# project task 1: implement self-made BaselineIU"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Implement a recommender system, similar to BaselineUI, that recommends movies the user hasn't seen yet, but which first subtracts column (item) means and then row (user) means.\n",
"\n",
"The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv' and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'.\n",
"\n",
"<br><br>\n",
"Additional clarification: \n",
"\n",
"Summarizing, the prediction of the rating of the user u regarding the item i should be equal to b_u + b_i.\n",
"The procedure to get b_u and b_i is the following:\n",
"- We have the original user-item ratings matrix M.\n",
"- For each column representing the item i, we compute the mean of ratings and denote by b_i. From each rating in matrix M we subtract the corresponding column mean (b_i) to receive new matrix M'.\n",
"- For each row of matrix M' representing the user u, we compute the mean of ratings and denote by b_u."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"class selfBaselineIU():\n",
"    \"\"\"Baseline recommender predicting r(u, i) = b_u + b_i.\n",
"\n",
"    b_i is the mean of the ratings of item i; b_u is the mean of what is left\n",
"    of user u's ratings once the item means have been subtracted.\n",
"    \"\"\"\n",
"\n",
"    def fit(self, train_ui):\n",
"        \"\"\"Estimate item and user biases from a user-item csr matrix of ratings.\n",
"\n",
"        Stores the biases in self.col_means (items) and self.row_means (users);\n",
"        items or users without any stored rating get a bias of 0.\n",
"        Returns a csr matrix with the sparsity pattern of train_ui holding the\n",
"        residuals rating - b_i - b_u.\n",
"        \"\"\"\n",
"        self.train_ui=train_ui.copy()\n",
"        self.train_iu=train_ui.transpose().tocsr()\n",
"\n",
"        result=self.train_ui.copy()\n",
"        ratings_per_user=np.diff(result.indptr)\n",
"        ratings_per_item=np.diff(self.train_iu.indptr)\n",
"\n",
"        # where= keeps the preset 0 for items without ratings instead of NaN from 0/0\n",
"        self.col_means=np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,\n",
"                                 out=np.zeros(result.shape[1]), where=ratings_per_item!=0)\n",
"\n",
"        # We work directly on .data: sparse arithmetic would drop entries that become 0.\n",
"        # result.indices holds the column of every stored rating.\n",
"        result.data=result.data-self.col_means[result.indices]\n",
"\n",
"        # handling users without ratings the same way as items without ratings\n",
"        self.row_means=np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,\n",
"                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)\n",
"\n",
"        # np.repeat expands each row mean to one value per stored rating of that row\n",
"        result.data=result.data-np.repeat(self.row_means, ratings_per_user)\n",
"\n",
"        return result\n",
"\n",
"\n",
"    def recommend(self, user_code_id, item_code_id, topK=10):\n",
"        \"\"\"Return the topK best scored items each user has not rated in training.\n",
"\n",
"        user_code_id / item_code_id map matrix row / column numbers to original ids.\n",
"        Every row of the result has the format (user, item1, score1, item2, score2, ...).\n",
"        Equal scores are ordered by ascending item code. Users who rated every\n",
"        item are omitted.\n",
"        \"\"\"\n",
"        nb_items=self.train_ui.shape[1]\n",
"        result=[]\n",
"        for nb_user in range(self.train_ui.shape[0]):\n",
"            scores=self.row_means[nb_user]+self.col_means\n",
"\n",
"            unseen=np.ones(nb_items, dtype=bool)\n",
"            unseen[self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]]=False\n",
"            candidates=np.flatnonzero(unseen)\n",
"            if len(candidates)==0:\n",
"                continue\n",
"\n",
"            # stable sort of negated scores = descending scores, ties keep item order\n",
"            best=candidates[np.argsort(-scores[candidates], kind='stable')[:topK]]\n",
"\n",
"            row=[user_code_id[nb_user]]\n",
"            for item in best:\n",
"                row.extend((item_code_id[int(item)], scores[item]))\n",
"            result.append(row)\n",
"        return result\n",
"\n",
"    def estimate(self, user_code_id, item_code_id, test_ui):\n",
"        \"\"\"Return [user, item, b_u + b_i] for every stored nonzero entry of test_ui.\"\"\"\n",
"        return [[user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]]\n",
"                for user, item in zip(*test_ui.nonzero())]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
" [0, 1, 2, 3, 0, 0, 0, 0],\n",
" [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"After subtracting columns and rows:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[-0.375 , 1.125 , 0. , 0. , -0.375 ,\n",
" 0. , 0. , -0.375 ],\n",
" [ 0. , -0.66666667, 0.83333333, -0.16666667, 0. ,\n",
" 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , 0.66666667, 0. ,\n",
" -0.33333333, -0.33333333, 0. ]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Recommend best unseen item:\n"
]
},
{
"data": {
"text/plain": [
"[[0, 30, 4.375], [10, 40, 4.166666666666667], [20, 40, 5.333333333333333]]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Print estimations on unseen items:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>est_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>60</td>\n",
" <td>4.375000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>40</td>\n",
" <td>4.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20</td>\n",
" <td>0</td>\n",
" <td>3.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20</td>\n",
" <td>20</td>\n",
" <td>2.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20</td>\n",
" <td>70</td>\n",
" <td>4.333333</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item est_score\n",
"0 0 60 4.375000\n",
"1 10 40 4.166667\n",
"2 20 0 3.333333\n",
"3 20 20 2.333333\n",
"4 20 70 4.333333"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Toy example: a tiny dataset small enough to inspect by eye what the self-made baseline does.\n",
"# Both files are tab-separated and have no header row.\n",
"toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
"toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
"\n",
"# helpers.data_to_csr returns six values: the train and test matrices followed by four lookup\n",
"# dictionaries (the names suggest code->id and id->code maps; the helper is not defined in this cell).\n",
"toy_train_iu, toy_test_iu, toy_user_code_id, toy_user_id_code, \\\n",
"toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
"\n",
"print('Training data:')\n",
"display(toy_train_iu.todense())\n",
"\n",
"model=selfBaselineIU()\n",
"print('After subtracting columns and rows:')\n",
"display(model.fit(toy_train_iu).todense())\n",
"\n",
"print('Recommend best unseen item:')\n",
"display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
"\n",
"print('Print estimations on unseen items:')\n",
"# Estimate once; the same frame is shown here and exported to CSV at the end of the cell.\n",
"estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_iu))\n",
"estimations.columns=['user', 'item', 'est_score']\n",
"display(estimations)\n",
"\n",
"# Save the recommendations requested with topK=3, without index or header\n",
"top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
"top_n.to_csv('Recommendations generated/toy-example/Self_BaselineIU_reco.csv', index=False, header=False)\n",
"\n",
"# header=False, so the column labels assigned above are not written to the file\n",
"estimations.to_csv('Recommendations generated/toy-example/Self_BaselineIU_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Fit the self-made baseline on the real training matrix and export its outputs.\n",
"# train_ui, test_ui, user_code_id and item_code_id are defined outside this cell.\n",
"# NOTE(review): the toy cell names its matrices *_iu while this one uses *_ui - confirm both have\n",
"# the orientation that selfBaselineIU.fit() expects.\n",
"model=selfBaselineIU()\n",
"model.fit(train_ui)\n",
"\n",
"# Recommendations requested with topK=10\n",
"top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
"\n",
"# Written without index and without a header row\n",
"top_n.to_csv('Recommendations generated/Projects/Project1_Self_BaselineIU_reco.csv', index=False, header=False)\n",
"\n",
"# Estimates computed against test_ui, saved in the same header-less CSV layout\n",
"estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
"estimations.to_csv('Recommendations generated/Projects/Project1_Self_BaselineIU_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ready-made baseline - Surprise implementation"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Estimating biases using als...\n"
]
}
],
"source": [
"import surprise as sp\n",
"import time\n",
"\n",
"# Based on surprise.readthedocs.io\n",
"def get_top_n(predictions, n=10):\n",
"    \"\"\"Collect, for every user, the n items with the highest estimated score.\n",
"\n",
"    predictions: iterable of 5-tuples (uid, iid, true_r, est, details).\n",
"    n: number of (item, score) pairs kept per user.\n",
"    Returns a list with one row per user, in order of the user's first appearance:\n",
"    [uid, item1, score1, item2, score2, ...], best score first.\n",
"    \"\"\"\n",
"    # Group the (item, estimate) pairs under their user\n",
"    scored_by_user = defaultdict(list)\n",
"    for uid, iid, _true_r, est, _details in predictions:\n",
"        scored_by_user[uid].append((iid, est))\n",
"\n",
"    # Rank each user's pairs by estimate (ties keep input order) and flatten the leading n into one row\n",
"    return [\n",
"        [uid, *chain.from_iterable(sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n])]\n",
"        for uid, pairs in scored_by_user.items()\n",
"    ]\n",
"\n",
"\n",
"# Surprise reads the same tab-separated files directly; each line is: user item rating timestamp\n",
"reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n",
"\n",
"# The algorithm below is fitted on a surprise.trainset.Trainset, so build one from the whole train file\n",
"train_data = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)\n",
"trainset = train_data.build_full_trainset()\n",
"\n",
"# The test file goes through the same loader and is then converted into a test set for algo.test()\n",
"test_data = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)\n",
"testset = test_data.build_full_trainset().build_testset()\n",
"\n",
"algo = sp.BaselineOnly()\n",
"# Alternative to try (SGD with reg=0 for 2000 epochs):\n",
"# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n",
"# observe how bad the results of that variant are\n",
"# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n",
"\n",
"algo.fit(trainset)\n",
"\n",
"# We want to predict ratings of the (user, item) pairs which are not in the train set\n",
"antitrainset = trainset.build_anti_testset()\n",
"predictions = algo.test(antitrainset)\n",
"\n",
"# Keep the 10 best-scored items per user and save them without index or header\n",
"top_n = pd.DataFrame(get_top_n(predictions, n=10))\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 0.9495\n",
"MAE: 0.7525\n"
]
},
{
"data": {
"text/plain": [
"0.7524871012820799"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compute RMSE on the test set using Surprise's built-in accuracy functions.\n",
"# NOTE: run this right after the BaselineOnly cell - a later cell rebinds `algo` to another predictor,\n",
"# while the file written below is named after the baseline.\n",
"predictions = algo.test(testset)\n",
"sp.accuracy.rmse(predictions, verbose=True)\n",
"\n",
"# Let's also save the results in a file: one (user, item, estimate) row per test prediction.\n",
"# A comprehension keeps the unpacking temporaries (including `_`) out of the notebook namespace.\n",
"predictions_df=pd.DataFrame([[uid, iid, est] for uid, iid, _true_r, est, _details in predictions])\n",
"predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n",
"\n",
"# Last expression of the cell, so the MAE is printed and also shown as the cell's result\n",
"sp.accuracy.mae(predictions, verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Let's compare with a random predictor"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 1.5165\n",
"MAE: 1.2172\n"
]
},
{
"data": {
"text/plain": [
"1.2172144988785374"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# In Surprise, the random baseline is NormalPredictor: it predicts random values drawn from a normal\n",
"# distribution estimated from the train set.\n",
"# Seed both RNGs first (as the Surprise FAQ advises for reproducible experiments) so that the saved\n",
"# recommendations and the reported RMSE/MAE are the same on every run.\n",
"random.seed(0)\n",
"np.random.seed(0)\n",
"\n",
"algo = sp.NormalPredictor()\n",
"algo.fit(trainset)\n",
"\n",
"# We want to predict ratings of the (user, item) pairs which are not in the train set\n",
"antitrainset = trainset.build_anti_testset()\n",
"predictions = algo.test(antitrainset)\n",
"\n",
"# Keep the 10 best-scored items per user and save them without index or header\n",
"top_n = pd.DataFrame(get_top_n(predictions, n=10))\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n",
"\n",
"# Compute RMSE on the test set using Surprise's built-in accuracy functions\n",
"predictions = algo.test(testset)\n",
"sp.accuracy.rmse(predictions, verbose=True)\n",
"\n",
"# Let's also save the results in a file: one (user, item, estimate) row per test prediction.\n",
"# A comprehension keeps the unpacking temporaries (including `_`) out of the notebook namespace.\n",
"predictions_df=pd.DataFrame([[uid, iid, est] for uid, iid, _true_r, est, _details in predictions])\n",
"predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n",
"\n",
"# Last expression of the cell, so the MAE is printed and also shown as the cell's result\n",
"sp.accuracy.mae(predictions, verbose=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"metadata": {
"interpreter": {
"hash": "2a3a95f8b675c5b7dd6a35e1675edaf697539b1f0a71c4603e9520a8bbd07d82"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}