WSS-project/P1. Baseline.ipynb

1254 lines
37 KiB
Plaintext
Raw Normal View History

2021-03-20 20:01:22 +01:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preparing dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sparse\n",
"from collections import defaultdict\n",
"from itertools import chain\n",
"import random\n",
"\n",
"# Both splits share the same tab-separated, header-less layout.\n",
"rating_columns = ['user', 'item', 'rating', 'timestamp']\n",
"train_read = pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=rating_columns)\n",
"test_read = pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, names=rating_columns)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Map raw user/item ids to dense 0-based codes shared by train and test.\n",
"train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)\n",
"\n",
"user_categorical = train_and_test['user'].astype(\"category\")\n",
"item_categorical = train_and_test['item'].astype(\"category\")\n",
"train_and_test['user_code'] = user_categorical.cat.codes\n",
"train_and_test['item_code'] = item_categorical.cat.codes\n",
"\n",
"# Lookups in both directions: code -> original id and original id -> code.\n",
"user_code_id = dict(enumerate(user_categorical.cat.categories))\n",
"user_id_code = {user_id: code for code, user_id in user_code_id.items()}\n",
"item_code_id = dict(enumerate(item_categorical.cat.categories))\n",
"item_id_code = {item_id: code for code, item_id in item_code_id.items()}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>user_code</th>\n",
" <th>item_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>664</td>\n",
" <td>525</td>\n",
" <td>4</td>\n",
" <td>876526580</td>\n",
" <td>663</td>\n",
" <td>524</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>888068651</td>\n",
" <td>48</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>352</td>\n",
" <td>273</td>\n",
" <td>2</td>\n",
" <td>884290328</td>\n",
" <td>351</td>\n",
" <td>272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>618</td>\n",
" <td>96</td>\n",
" <td>3</td>\n",
" <td>891307749</td>\n",
" <td>617</td>\n",
" <td>95</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>560</td>\n",
" <td>24</td>\n",
" <td>2</td>\n",
" <td>879976772</td>\n",
" <td>559</td>\n",
" <td>23</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp user_code item_code\n",
"0 664 525 4 876526580 663 524\n",
"1 49 1 2 888068651 48 0\n",
"2 352 273 2 884290328 351 272\n",
"3 618 96 3 891307749 617 95\n",
"4 560 24 2 879976772 559 23"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_and_test.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# train_and_test is concat([train_read, test_read]) with a fresh index, so its first len(train_read)\n",
"# rows are the train rows and the remainder are the test rows. Slicing by position (instead of merging\n",
"# on all columns) cannot duplicate rows when identical (user, item, rating, timestamp) records repeat.\n",
"n_train = len(train_read)\n",
"train_df = train_and_test.iloc[:n_train].reset_index(drop=True)\n",
"test_df = train_and_test.iloc[n_train:].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Matrix dimensions: one row per user code, one column per item code\n",
"U = train_and_test['user_code'].max()+1\n",
"I = train_and_test['item_code'].max()+1\n",
"\n",
"# Sparse user x item rating matrices in CSR format, both with the full (U, I) shape\n",
"train_ui, test_ui = (\n",
"    sparse.csr_matrix((df['rating'], (df['user_code'], df['item_code'])), shape=(U, I))\n",
"    for df in (train_df, test_df)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# The steps above are the same for many algorithms, so the code also lives in a separate file:\n",
"import helpers\n",
"\n",
"train_read, test_read = (\n",
"    pd.read_csv(path, sep='\\t', header=None)\n",
"    for path in ('./Datasets/ml-100k/train.csv', './Datasets/ml-100k/test.csv')\n",
")\n",
"(train_ui, test_ui,\n",
" user_code_id, user_id_code,\n",
" item_code_id, item_id_code) = helpers.data_to_csr(train_read, test_read)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### CSR matrices - what are they?"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<3x4 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 8 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Three parallel sequences, one entry per rating: user row, item column, rating value\n",
"row, col, data = np.array([\n",
"    [0, 0, 0, 1, 1, 2, 2, 2],\n",
"    [0, 1, 2, 1, 3, 2, 0, 3],\n",
"    [4, 1, 3, 2, 1, 5, 2, 4],\n",
"])\n",
"sample_csr = sparse.csr_matrix((data, (row, col)))\n",
"sample_csr"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ratings matrix with missing entries replaced by zeros:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[4, 1, 3, 0],\n",
" [0, 2, 0, 1],\n",
" [2, 0, 5, 4]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2021-03-23 21:52:46 +01:00
"Number of ratings: 8\n",
"Number of users: 3\n",
"Number of items: 4\n"
2021-03-20 20:01:22 +01:00
]
}
],
"source": [
"print('Ratings matrix with missing entries replaced by zeros:')\n",
"display(sample_csr.todense())\n",
"\n",
"# rows are users, columns are items, stored entries are ratings\n",
"n_users, n_items = sample_csr.shape\n",
"print(f'Number of ratings: {sample_csr.nnz}')\n",
"print(f'Number of users: {n_users}')\n",
"print(f'Number of items: {n_items}')"
2021-03-20 20:01:22 +01:00
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ratings data: [4 1 3 2 1 2 5 4]\n",
"Regarding items: [0 1 2 1 3 0 2 3]\n",
"Where ratings from 0 to 2 belongs to user 0.\n",
"Where ratings from 3 to 4 belongs to user 1.\n",
"Where ratings from 5 to 7 belongs to user 2.\n"
]
}
],
"source": [
"print('Ratings data:', sample_csr.data)\n",
"\n",
"print('Regarding items:', sample_csr.indices)\n",
"\n",
"# consecutive indptr values delimit the slice of data/indices owned by each user\n",
"for i, (start, stop) in enumerate(zip(sample_csr.indptr[:-1], sample_csr.indptr[1:])):\n",
"    print(f'Where ratings from {start} to {stop-1} belongs to user {i}.')"
2021-03-20 20:01:22 +01:00
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Efficient way to access items rated by user:\n"
]
},
{
"data": {
"text/plain": [
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2021-03-26 21:00:52 +01:00
"1.13 µs ± 79.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
2021-03-20 20:01:22 +01:00
"Inefficient way to access items rated by user:\n"
]
},
{
"data": {
"text/plain": [
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2021-03-26 21:00:52 +01:00
"149 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
2021-03-20 20:01:22 +01:00
]
}
],
"source": [
"user=123\n",
"\n",
"print('Efficient way to access items rated by user:')\n",
"display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n",
"%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n",
"\n",
"print('Inefficient way to access items rated by user:')\n",
"display(train_ui[user].indices)\n",
"%timeit train_ui[user].indices"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Example: subtracting row means"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Our matrix:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[4, 1, 3, 0],\n",
" [0, 2, 0, 1],\n",
" [2, 0, 5, 4]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"List of row sums:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 8, 3, 11]])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Our matrix:')\n",
"display(sample_csr.todense())\n",
"print('List of row sums:')\n",
"sample_csr.sum(axis=1).ravel()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Array with row means:\n"
]
},
{
"data": {
"text/plain": [
"array([2.66666667, 1.5 , 3.66666667])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Diagonal csr matrix with inverse of row sums on diagonal:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[2.66666667, 0. , 0. ],\n",
" [0. , 1.5 , 0. ],\n",
" [0. , 0. , 3.66666667]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Let's apply them in nonzero entries:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[2.66666667, 2.66666667, 2.66666667, 0. ],\n",
" [0. , 1.5 , 0. , 1.5 ],\n",
" [3.66666667, 0. , 3.66666667, 3.66666667]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finally after subtraction:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 1.33333333, -1.66666667, 0.33333333, 0. ],\n",
" [ 0. , 0.5 , 0. , -0.5 ],\n",
" [-1.66666667, 0. , 1.33333333, 0.33333333]])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Array with row means:')\n",
"# row sum divided by the number of stored ratings in that row (np.diff(indptr) gives the per-row counts)\n",
"row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)\n",
"display(row_means)\n",
"\n",
"# the diagonal holds the row means themselves (not inverses of row sums)\n",
"print('Diagonal matrix with row means on diagonal:')\n",
"display(sparse.diags(row_means).todense())\n",
"\n",
"print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n",
"# multiplying the pattern of rated entries by the diagonal puts each row mean on that row's ratings\n",
"to_subtract=sparse.diags(row_means)*(sample_csr>0)\n",
"display(to_subtract.todense())\n",
"\n",
"print(\"Finally after subtraction:\")\n",
"sample_csr-to_subtract.todense()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Transposing"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sample matrix: \n",
" [[4 1 3 0]\n",
" [0 2 0 1]\n",
" [2 0 5 4]]\n",
"\n",
"Indices: \n",
" [0 1 2 1 3 0 2 3]\n",
"\n",
"Transposed matrix: \n",
" [[4 0 2]\n",
" [1 2 0]\n",
" [3 0 5]\n",
" [0 1 4]]\n",
"\n",
"Indices of transposed matrix: \n",
" [0 1 2 1 3 0 2 3]\n",
"\n",
"Reason: <class 'scipy.sparse.csc.csc_matrix'>\n",
"\n",
"After converting to csr: \n",
" [0 2 0 1 0 2 1 2]\n"
]
}
],
"source": [
"import numpy as np\n",
"from scipy import sparse\n",
"row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
"col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
"data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
"sample=sparse.csr_matrix((data, (row, col)))\n",
"# .toarray() is the documented way to densify; the .A shorthand is deprecated in SciPy\n",
"print('Sample matrix: \\n', sample.toarray())\n",
"print('\\nIndices: \\n', sample.indices)\n",
"transposed=sample.transpose()\n",
"print('\\nTransposed matrix: \\n', transposed.toarray())\n",
"print('\\nIndices of transposed matrix: \\n', transposed.indices)\n",
"\n",
"# the indices did not change because transpose() returned a CSC matrix (see the printed type)\n",
"print('\\nReason: ', type(transposed))\n",
"\n",
"# converting back to CSR regroups the entries by the rows of the transposed matrix\n",
"print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Self made top popular"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Create each output folder independently: makedirs also creates the missing parent, and exist_ok=True\n",
"# makes the cell safe to re-run and still creates a sub-folder when only the parent already exists.\n",
"for output_dir in ('./Recommendations generated/ml-100k/', './Recommendations generated/toy-example/'):\n",
"    os.makedirs(output_dir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
2021-03-23 21:52:46 +01:00
"train_iu = train_ui.transpose().tocsr()\n",
"item_popularity = np.diff(train_iu.indptr)  # number of train ratings per item\n",
"# rescale the counts so that the most popular item scores as high as the highest train rating\n",
"scaling_factor = train_ui.max()/item_popularity.max()\n",
"\n",
"# top_pop is a list of pairs (item, rescaled_popularity) sorted descending from the most popular\n",
"top_pop = [(i, count*scaling_factor) for i, count in enumerate(item_popularity)]\n",
"top_pop.sort(key=lambda x: x[1], reverse=True)\n",
"\n",
"k = 10  # number of recommendations per user\n",
"result = []\n",
"\n",
"for u in range(train_ui.shape[0]):\n",
"    # a set of plain ints gives constant-time membership tests\n",
"    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())\n",
"    rec_user = []\n",
"    # walking top_pop directly (instead of indexing with a counter) ends cleanly\n",
"    # when a user has fewer than k unseen items\n",
"    for item, score in top_pop:\n",
"        if len(rec_user) >= k:\n",
"            break\n",
"        if item not in user_rated:\n",
"            rec_user.append((item_code_id[item], score))\n",
"    # one row per user: user, item1, score1, item2, score2, ...\n",
"    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
"\n",
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n",
"\n",
"\n",
"# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
"\n",
"estimations=[]\n",
"\n",
"for user, item in zip(*test_ui.nonzero()):\n",
"    estimations.append([user_code_id[user], item_code_id[item], item_popularity[item]*scaling_factor])\n",
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 21:52:46 +01:00
"# Self made top rated"
2021-03-20 20:01:22 +01:00
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
2021-03-23 21:52:46 +01:00
"# average of all train ratings, used for items that nobody rated in train\n",
"global_avg = train_iu.data.sum()/train_iu.nnz\n",
"\n",
"top_rated = []\n",
"for i in range(train_iu.shape[0]):\n",
"    ratings = train_iu.data[train_iu.indptr[i]: train_iu.indptr[i+1]]\n",
"    avg = np.mean(ratings) if len(ratings)>0 else global_avg\n",
"    top_rated.append((i, avg))\n",
"\n",
"# top_rated is a list of pairs (item, average_rating) sorted descending from the best rated\n",
"top_rated.sort(key=lambda x: x[1], reverse=True)\n",
"\n",
"k = 10  # number of recommendations per user\n",
"result = []\n",
"\n",
"for u in range(train_ui.shape[0]):\n",
"    # a set of plain ints gives constant-time membership tests\n",
"    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())\n",
"    rec_user = []\n",
"    # walking top_rated directly ends cleanly when a user has fewer than k unseen items\n",
"    for item, score in top_rated:\n",
"        if len(rec_user) >= k:\n",
"            break\n",
"        if item not in user_rated:\n",
"            rec_user.append((item_code_id[item], score))\n",
"    # one row per user: user, item1, score1, item2, score2, ...\n",
"    result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
"\n",
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)\n",
"\n",
"\n",
"# estimations: the predicted score of a (user, item) pair is simply the item's average rating\n",
"estimations=[]\n",
"d = dict(top_rated)\n",
"\n",
"for user, item in zip(*test_ui.nonzero()):\n",
"    estimations.append([user_code_id[user], item_code_id[item], d[item]])\n",
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)"
2021-03-20 20:01:22 +01:00
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>13</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
2021-03-23 21:52:46 +01:00
" <td>814</td>\n",
" <td>5.0</td>\n",
" <td>1122</td>\n",
" <td>5.0</td>\n",
" <td>1189</td>\n",
" <td>5.0</td>\n",
" <td>1201</td>\n",
" <td>5.0</td>\n",
" <td>1293</td>\n",
2021-03-20 20:01:22 +01:00
" <td>...</td>\n",
2021-03-23 21:52:46 +01:00
" <td>1306</td>\n",
" <td>5.0</td>\n",
" <td>1467</td>\n",
" <td>5.0</td>\n",
" <td>1491</td>\n",
" <td>5.0</td>\n",
" <td>1500</td>\n",
" <td>5.0</td>\n",
" <td>1536</td>\n",
" <td>5.0</td>\n",
2021-03-20 20:01:22 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
2021-03-23 21:52:46 +01:00
" <td>119</td>\n",
" <td>5.0</td>\n",
" <td>814</td>\n",
" <td>5.0</td>\n",
" <td>1122</td>\n",
" <td>5.0</td>\n",
" <td>1189</td>\n",
" <td>5.0</td>\n",
" <td>1201</td>\n",
2021-03-20 20:01:22 +01:00
" <td>...</td>\n",
2021-03-23 21:52:46 +01:00
" <td>1293</td>\n",
" <td>5.0</td>\n",
" <td>1306</td>\n",
" <td>5.0</td>\n",
" <td>1467</td>\n",
" <td>5.0</td>\n",
" <td>1491</td>\n",
" <td>5.0</td>\n",
" <td>1500</td>\n",
" <td>5.0</td>\n",
2021-03-20 20:01:22 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
2021-03-23 21:52:46 +01:00
" 0 1 2 3 4 5 6 7 8 9 ... 11 12 13 \\\n",
"0 1 814 5.0 1122 5.0 1189 5.0 1201 5.0 1293 ... 1306 5.0 1467 \n",
"1 2 119 5.0 814 5.0 1122 5.0 1189 5.0 1201 ... 1293 5.0 1306 \n",
2021-03-20 20:01:22 +01:00
"\n",
2021-03-23 21:52:46 +01:00
" 14 15 16 17 18 19 20 \n",
"0 5.0 1491 5.0 1500 5.0 1536 5.0 \n",
"1 5.0 1467 5.0 1491 5.0 1500 5.0 \n",
2021-03-20 20:01:22 +01:00
"\n",
"[2 rows x 21 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(result).head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Self-made baseline"
]
},
{
"cell_type": "code",
2021-03-23 21:52:46 +01:00
"execution_count": 18,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"class selfBaselineUI():\n",
"    \"\"\"Baseline recommender: score(user, item) = user mean rating + item mean residual.\n",
"\n",
"    User (row) means are subtracted first, then item (column) means of what is left.\n",
"    \"\"\"\n",
"\n",
"    def fit(self, train_ui):\n",
"        \"\"\"Estimate user and item biases from a user x item CSR rating matrix.\n",
"\n",
"        Sets self.train_ui, self.train_iu, self.row_means and self.col_means.\n",
"        Returns a CSR matrix with the stored entries of train_ui replaced by the\n",
"        residuals left after subtracting the user mean and then the item mean.\n",
"        \"\"\"\n",
"        self.train_ui=train_ui.copy()\n",
"        self.train_iu=train_ui.transpose().tocsr()\n",
"\n",
"        result=self.train_ui.copy()\n",
"\n",
"        ratings_per_user=np.diff(result.indptr)\n",
"        ratings_per_item=np.diff(self.train_iu.indptr)\n",
"\n",
"        # users without ratings get mean 0 instead of a 0/0 NaN (same convention as for items below)\n",
"        self.row_means=np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,\n",
"                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)\n",
"\n",
"        # Subtract on the stored entries only, so that residuals equal to 0 stay stored.\n",
"        # Row u owns data[indptr[u]:indptr[u+1]], hence repeating every mean by its row length\n",
"        # lines the means up with .data - no sentinel values or pattern matrices are needed.\n",
"        result.data=result.data-np.repeat(self.row_means, ratings_per_user)\n",
"\n",
"        # item means of the residuals; items without ratings keep mean 0\n",
"        self.col_means=np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,\n",
"                                 out=np.zeros(self.train_iu.shape[0]), where=ratings_per_item!=0)\n",
"\n",
"        # .indices holds the item code of every stored entry, in the same order as .data\n",
"        result.data=result.data-self.col_means[result.indices]\n",
"\n",
"        return result\n",
"\n",
"\n",
"    def recommend(self, user_code_id, item_code_id, topK=10):\n",
"        \"\"\"Recommend the topK best-scored items that each user has not rated in train.\n",
"\n",
"        user_code_id / item_code_id map internal codes back to the original ids.\n",
"        Returns a list of rows [user_id, item1, score1, item2, score2, ...]; users who\n",
"        rated every item get no row.\n",
"        \"\"\"\n",
"        n_users, n_items=self.train_ui.shape\n",
"        result=[]\n",
"        for nb_user in range(n_users):\n",
"            user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n",
"            unseen_mask=np.ones(n_items, dtype=bool)\n",
"            unseen_mask[user_rated]=False\n",
"            unseen=np.flatnonzero(unseen_mask)  # item codes in ascending order\n",
"            if unseen.size==0:\n",
"                continue\n",
"            scores=self.row_means[nb_user]+self.col_means[unseen]\n",
"            # stable sort on negated scores: best first, ties keep ascending item-code order\n",
"            best=np.argsort(-scores, kind='stable')[:topK]\n",
"            item_scores=[(item_code_id[int(unseen[pos])], scores[pos]) for pos in best]\n",
"            # Let's store k best items in the format: (user, item1, score1, item2, score2, ...)\n",
"            result.append([user_code_id[nb_user]]+list(chain(*item_scores)))\n",
"        return result\n",
"\n",
"    def estimate(self, user_code_id, item_code_id, test_ui):\n",
"        \"\"\"Return [user_id, item_id, estimated_score] for every stored nonzero entry of test_ui.\"\"\"\n",
"        return [[user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]]\n",
"                for user, item in zip(*test_ui.nonzero())]"
]
},
{
"cell_type": "code",
2021-03-23 21:52:46 +01:00
"execution_count": 19,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
" [0, 1, 2, 3, 0, 0, 0, 0],\n",
" [0, 0, 0, 5, 0, 3, 4, 0]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"After subtracting rows and columns:\n"
]
},
{
"data": {
"text/plain": [
"matrix([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , -0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Recommend best unseen item:\n"
]
},
{
"data": {
"text/plain": [
"[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Print estimations on unseen items:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>est_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>60</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>40</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20</td>\n",
" <td>0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20</td>\n",
" <td>20</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20</td>\n",
" <td>70</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item est_score\n",
"0 0 60 4.0\n",
"1 10 40 3.0\n",
"2 20 0 3.0\n",
"3 20 20 4.0\n",
"4 20 70 4.0"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
"toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
"\n",
"toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n",
"toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
"\n",
"print('Training data:')\n",
"display(toy_train_ui.todense())\n",
"\n",
"model=selfBaselineUI()\n",
"print('After subtracting rows and columns:')\n",
"display(model.fit(toy_train_ui).todense())\n",
"\n",
"print('Recommend best unseen item:')\n",
"display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
"\n",
"print('Print estimations on unseen items:')\n",
"estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui),\n",
"                         columns=['user', 'item', 'est_score'])\n",
"display(estimations)\n",
"\n",
"top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
"\n",
"top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n",
"\n",
"# reuse the estimations computed above; header=False keeps the display-only column names out of the file\n",
"estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
2021-03-23 21:52:46 +01:00
"execution_count": 20,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Fit the self-made baseline on ml-100k, then store recommendations and estimations\n",
"model = selfBaselineUI()\n",
"model.fit(train_ui)\n",
"\n",
"top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
"estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n",
"estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 21:52:46 +01:00
"# project task 1: implement self-made BaselineIU"
2021-03-20 20:01:22 +01:00
]
},
{
"cell_type": "code",
2021-03-23 21:52:46 +01:00
"execution_count": 21,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Implement a recommender system, similar to BaselineUI, that recommends movies the user hasn't seen,\n",
2021-03-23 21:52:46 +01:00
"# but first subtract column means then row means\n",
2021-03-20 20:01:22 +01:00
"# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'\n",
"# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ready-made baseline - Surprise implementation"
]
},
{
"cell_type": "code",
2021-03-23 21:52:46 +01:00
"execution_count": 22,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Estimating biases using als...\n"
]
}
],
"source": [
"import surprise as sp\n",
"import time\n",
"\n",
"# Based on surprise.readthedocs.io\n",
"def get_top_n(predictions, n=10):\n",
"    \"\"\"Turn Surprise predictions into rows [user, item1, score1, item2, score2, ...] with the n best items.\"\"\"\n",
"    # Group the (item, estimated score) pairs by user\n",
"    scored_by_user = defaultdict(list)\n",
"    for uid, iid, _true_r, est, _details in predictions:\n",
"        scored_by_user[uid].append((iid, est))\n",
"\n",
"    rows = []\n",
"    for uid, scored_items in scored_by_user.items():\n",
"        # best scores first; keep only the n leading pairs and flatten them behind the user id\n",
"        best = sorted(scored_items, key=lambda pair: pair[1], reverse=True)[:n]\n",
"        rows.append([uid] + list(chain.from_iterable(best)))\n",
"    return rows\n",
"\n",
"\n",
"reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n",
"\n",
"# Surprise needs its own <class 'surprise.trainset.Trainset'> objects, so both files go through its Dataset API\n",
"trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader).build_full_trainset()\n",
"testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader).build_full_trainset().build_testset()\n",
"\n",
"algo = sp.BaselineOnly()\n",
"# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n",
"# observe how bad the results of the above variant are\n",
"# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n",
"\n",
"algo.fit(trainset)\n",
"\n",
"# We want to predict ratings of the (user, item) pairs which are not in the train set\n",
"antitrainset = trainset.build_anti_testset()\n",
"predictions = algo.test(antitrainset)\n",
"\n",
"top_n = pd.DataFrame(get_top_n(predictions, n=10))\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)"
]
},
{
"cell_type": "code",
2021-03-23 21:52:46 +01:00
"execution_count": 23,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 0.9495\n",
"MAE: 0.7525\n"
]
},
{
"data": {
"text/plain": [
"0.7524871012820799"
]
},
2021-03-23 21:52:46 +01:00
"execution_count": 23,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compute RMSE on the test set using built-in functions\n",
"predictions = algo.test(testset)\n",
"sp.accuracy.rmse(predictions, verbose=True)\n",
"\n",
"# Let's also save the (user, item, estimated rating) triples in a file\n",
"predictions_df = pd.DataFrame([[uid, iid, est] for uid, iid, _true_r, est, _details in predictions])\n",
"predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n",
"\n",
"sp.accuracy.mae(predictions, verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Let's compare with random"
]
},
{
"cell_type": "code",
2021-03-23 21:52:46 +01:00
"execution_count": 24,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2021-03-26 21:00:52 +01:00
"RMSE: 1.5239\n",
"MAE: 1.2268\n"
2021-03-20 20:01:22 +01:00
]
},
{
"data": {
"text/plain": [
2021-03-26 21:00:52 +01:00
"1.2267993503843746"
2021-03-20 20:01:22 +01:00
]
},
2021-03-23 21:52:46 +01:00
"execution_count": 24,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# In Surprise, 'random' is an algorithm predicting a random value drawn from a normal distribution\n",
"# estimated from the train set.\n",
"# Seed both RNGs (as the Surprise FAQ advises for reproducible experiments) so that re-running\n",
"# the notebook yields the same recommendations and metrics.\n",
"random.seed(0)\n",
"np.random.seed(0)\n",
"\n",
"algo = sp.NormalPredictor()\n",
"algo.fit(trainset)\n",
"\n",
"# We want to predict ratings of the (user, item) pairs which are not in the train set\n",
"antitrainset = trainset.build_anti_testset()\n",
"predictions = algo.test(antitrainset)\n",
"\n",
"top_n = get_top_n(predictions, n=10)\n",
"\n",
"top_n=pd.DataFrame(top_n)\n",
"\n",
"top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n",
"\n",
"# Compute RMSE on the test set using built-in functions\n",
"predictions = algo.test(testset)\n",
"sp.accuracy.rmse(predictions, verbose=True)\n",
"\n",
"# Let's also save the results in a file\n",
"predictions_df=[]\n",
"for uid, iid, true_r, est, _ in predictions:\n",
"    predictions_df.append([uid, iid, est])\n",
"\n",
"predictions_df=pd.DataFrame(predictions_df)\n",
"predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n",
"\n",
"sp.accuracy.mae(predictions, verbose=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}