1257 lines
37 KiB
Plaintext
1257 lines
37 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Preparing dataset"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import scipy.sparse as sparse\n",
|
|||
|
"from collections import defaultdict\n",
|
|||
|
"from itertools import chain\n",
|
|||
|
"import random\n",
|
|||
|
"\n",
|
|||
|
"train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
|
|||
|
"test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Let's prepare dataset\n",
|
|||
|
"train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)\n",
|
|||
|
"train_and_test['user_code'] = train_and_test['user'].astype(\"category\").cat.codes\n",
|
|||
|
"train_and_test['item_code'] = train_and_test['item'].astype(\"category\").cat.codes\n",
|
|||
|
"\n",
|
|||
|
"user_code_id = dict(enumerate(train_and_test['user'].astype(\"category\").cat.categories))\n",
|
|||
|
"user_id_code = dict((v, k) for k, v in user_code_id.items())\n",
|
|||
|
"item_code_id = dict(enumerate(train_and_test['item'].astype(\"category\").cat.categories))\n",
|
|||
|
"item_id_code = dict((v, k) for k, v in item_code_id.items())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>user</th>\n",
|
|||
|
" <th>item</th>\n",
|
|||
|
" <th>rating</th>\n",
|
|||
|
" <th>timestamp</th>\n",
|
|||
|
" <th>user_code</th>\n",
|
|||
|
" <th>item_code</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>664</td>\n",
|
|||
|
" <td>525</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>876526580</td>\n",
|
|||
|
" <td>663</td>\n",
|
|||
|
" <td>524</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>49</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>888068651</td>\n",
|
|||
|
" <td>48</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>352</td>\n",
|
|||
|
" <td>273</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>884290328</td>\n",
|
|||
|
" <td>351</td>\n",
|
|||
|
" <td>272</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>618</td>\n",
|
|||
|
" <td>96</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>891307749</td>\n",
|
|||
|
" <td>617</td>\n",
|
|||
|
" <td>95</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>560</td>\n",
|
|||
|
" <td>24</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>879976772</td>\n",
|
|||
|
" <td>559</td>\n",
|
|||
|
" <td>23</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" user item rating timestamp user_code item_code\n",
|
|||
|
"0 664 525 4 876526580 663 524\n",
|
|||
|
"1 49 1 2 888068651 48 0\n",
|
|||
|
"2 352 273 2 884290328 351 272\n",
|
|||
|
"3 618 96 3 891307749 617 95\n",
|
|||
|
"4 560 24 2 879976772 559 23"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_and_test[:5]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))\n",
|
|||
|
"test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Take number of users and items\n",
|
|||
|
"(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)\n",
|
|||
|
"\n",
|
|||
|
"# Create sparse csr matrices\n",
|
|||
|
"train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))\n",
|
|||
|
"test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# The above steps are the same for many algorithms, so I put the code in a separate file:\n",
|
|||
|
"import helpers\n",
|
|||
|
"train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n",
|
|||
|
"test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n",
|
|||
|
"train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### CSR matrices - what are they?"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<3x4 sparse matrix of type '<class 'numpy.longlong'>'\n",
|
|||
|
"\twith 8 stored elements in Compressed Sparse Row format>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
|
|||
|
"col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
|
|||
|
"data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
|
|||
|
"sample_csr=sparse.csr_matrix((data, (row, col)))\n",
|
|||
|
"sample_csr"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Ratings matrix with missing entries replaced by zeros:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[4, 1, 3, 0],\n",
|
|||
|
" [0, 2, 0, 1],\n",
|
|||
|
" [2, 0, 5, 4]], dtype=int64)"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Number of ratings: 8 \n",
|
|||
|
"Number of users: 3 \n",
|
|||
|
"Number of items: 4 \n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print('Ratings matrix with missing entries replaced by zeros:')\n",
|
|||
|
"display(sample_csr.todense())\n",
|
|||
|
"\n",
|
|||
|
"print('\\nNumber of ratings: {} \\nNumber of users: {} \\nNumber of items: {} \\n'\n",
|
|||
|
" .format(sample_csr.nnz, sample_csr.shape[0], sample_csr.shape[1]))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Ratings data: [4 1 3 2 1 2 5 4]\n",
|
|||
|
"Regarding items: [0 1 2 1 3 0 2 3]\n",
|
|||
|
"Where ratings from 0 to 2 belongs to user 0.\n",
|
|||
|
"Where ratings from 3 to 4 belongs to user 1.\n",
|
|||
|
"Where ratings from 5 to 7 belongs to user 2.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print('Ratings data:', sample_csr.data)\n",
|
|||
|
"\n",
|
|||
|
"print('Regarding items:', sample_csr.indices)\n",
|
|||
|
"\n",
|
|||
|
"for i in range(sample_csr.shape[0]):\n",
|
|||
|
" print('Where ratings from {} to {} belongs to user {}.'.format(sample_csr.indptr[i], sample_csr.indptr[i+1]-1, i))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Efficient way to access items rated by user:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
|
|||
|
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"563 ns ± 16.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n",
|
|||
|
"Inefficient way to access items rated by user:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n",
|
|||
|
" 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"70.8 µs ± 2.93 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"user=123\n",
|
|||
|
"\n",
|
|||
|
"print('Efficient way to access items rated by user:')\n",
|
|||
|
"display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n",
|
|||
|
"%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n",
|
|||
|
"\n",
|
|||
|
"print('Inefficient way to access items rated by user:')\n",
|
|||
|
"display(train_ui[user].indices)\n",
|
|||
|
"%timeit train_ui[user].indices"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"###### Example: subtracting row means"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Our matrix:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[4, 1, 3, 0],\n",
|
|||
|
" [0, 2, 0, 1],\n",
|
|||
|
" [2, 0, 5, 4]], dtype=int64)"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"List of row sums:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[ 8, 3, 11]])"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print('Our matrix:')\n",
|
|||
|
"display(sample_csr.todense())\n",
|
|||
|
"print('List of row sums:')\n",
|
|||
|
"sample_csr.sum(axis=1).ravel()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Array with row means:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"array([2.66666667, 1.5 , 3.66666667])"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Diagonal csr matrix with inverse of row sums on diagonal:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[2.66666667, 0. , 0. ],\n",
|
|||
|
" [0. , 1.5 , 0. ],\n",
|
|||
|
" [0. , 0. , 3.66666667]])"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Let's apply them in nonzero entries:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[2.66666667, 2.66666667, 2.66666667, 0. ],\n",
|
|||
|
" [0. , 1.5 , 0. , 1.5 ],\n",
|
|||
|
" [3.66666667, 0. , 3.66666667, 3.66666667]])"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Finally after subtraction:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[ 1.33333333, -1.66666667, 0.33333333, 0. ],\n",
|
|||
|
" [ 0. , 0.5 , 0. , -0.5 ],\n",
|
|||
|
" [-1.66666667, 0. , 1.33333333, 0.33333333]])"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print('Array with row means:')\n",
|
|||
|
"row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)\n",
|
|||
|
"display(row_means)\n",
|
|||
|
"\n",
|
|||
|
"print('Diagonal csr matrix with inverse of row sums on diagonal:')\n",
|
|||
|
"display(sparse.diags(row_means).todense())\n",
|
|||
|
"\n",
|
|||
|
"print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n",
|
|||
|
"to_subtract=sparse.diags(row_means)*sample_csr.power(0)\n",
|
|||
|
"display(to_subtract.todense())\n",
|
|||
|
"\n",
|
|||
|
"print(\"Finally after subtraction:\")\n",
|
|||
|
"sample_csr-to_subtract.todense()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"###### Transposing"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 20,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Sample matrix: \n",
|
|||
|
" [[4 1 3 0]\n",
|
|||
|
" [0 2 0 1]\n",
|
|||
|
" [2 0 5 4]]\n",
|
|||
|
"\n",
|
|||
|
"Indices: \n",
|
|||
|
" [0 1 2 1 3 0 2 3]\n",
|
|||
|
"\n",
|
|||
|
"Transposed matrix: \n",
|
|||
|
" [[4 0 2]\n",
|
|||
|
" [1 2 0]\n",
|
|||
|
" [3 0 5]\n",
|
|||
|
" [0 1 4]]\n",
|
|||
|
"\n",
|
|||
|
"Indices of transposed matrix: \n",
|
|||
|
" [0 1 2 1 3 0 2 3]\n",
|
|||
|
"\n",
|
|||
|
"Reason: <class 'scipy.sparse.csc.csc_matrix'>\n",
|
|||
|
"\n",
|
|||
|
"After converting to csr: \n",
|
|||
|
" [0 2 0 1 0 2 1 2]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"from scipy import sparse\n",
|
|||
|
"row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n",
|
|||
|
"col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n",
|
|||
|
"data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n",
|
|||
|
"sample=sparse.csr_matrix((data, (row, col)))\n",
|
|||
|
"print('Sample matrix: \\n', sample.A)\n",
|
|||
|
"print('\\nIndices: \\n', sample.indices)\n",
|
|||
|
"transposed=sample.transpose()\n",
|
|||
|
"print('\\nTransposed matrix: \\n', transposed.A)\n",
|
|||
|
"print('\\nIndices of transposed matrix: \\n', transposed.indices)\n",
|
|||
|
"\n",
|
|||
|
"print('\\nReason: ', type(transposed))\n",
|
|||
|
"\n",
|
|||
|
"print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Self made top popular"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"TopPop=[]\n",
|
|||
|
"train_iu=train_ui.transpose().tocsr()\n",
|
|||
|
"scaling_factor=train_ui.max()/max(np.diff(train_iu.indptr))\n",
|
|||
|
"\n",
|
|||
|
"for i in range(train_iu.shape[0]):\n",
|
|||
|
" TopPop.append((i, (train_iu.indptr[i+1]-train_iu.indptr[i])*scaling_factor))\n",
|
|||
|
" \n",
|
|||
|
"TopPop.sort(key=lambda x: x[1], reverse=True)\n",
|
|||
|
"# TopPop is a list of pairs (item, rescaled_popularity) sorted in descending order of popularity\n",
|
|||
|
"\n",
|
|||
|
"k=10\n",
|
|||
|
"result=[]\n",
|
|||
|
"\n",
|
|||
|
"for u in range(train_ui.shape[0]):\n",
|
|||
|
" user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n",
|
|||
|
" rec_user=[]\n",
|
|||
|
" item_pos=0\n",
|
|||
|
" while len(rec_user)<10:\n",
|
|||
|
" if TopPop[item_pos][0] not in user_rated:\n",
|
|||
|
" rec_user.append((item_code_id[TopPop[item_pos][0]], TopPop[item_pos][1]))\n",
|
|||
|
" item_pos+=1\n",
|
|||
|
" result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
|
|||
|
"\n",
|
|||
|
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n",
|
|||
|
"\n",
|
|||
|
"estimations=[]\n",
|
|||
|
"\n",
|
|||
|
"for user, item in zip(*test_ui.nonzero()):\n",
|
|||
|
" estimations.append([user_code_id[user], item_code_id[item],\n",
|
|||
|
" (train_iu.indptr[item+1]-train_iu.indptr[item])*scaling_factor])\n",
|
|||
|
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Self made global average"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"GlobalAvg=[]\n",
|
|||
|
"avg=np.sum(train_ui)/train_ui.nnz\n",
|
|||
|
"\n",
|
|||
|
"for i in range(train_iu.shape[0]):\n",
|
|||
|
" GlobalAvg.append((i, avg))\n",
|
|||
|
" \n",
|
|||
|
"k=10\n",
|
|||
|
"result=[]\n",
|
|||
|
"\n",
|
|||
|
"for u in range(train_ui.shape[0]):\n",
|
|||
|
" user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n",
|
|||
|
" rec_user=[]\n",
|
|||
|
" item_pos=0\n",
|
|||
|
" while len(rec_user)<10:\n",
|
|||
|
" if GlobalAvg[item_pos][0] not in user_rated:\n",
|
|||
|
" rec_user.append((item_code_id[GlobalAvg[item_pos][0]], GlobalAvg[item_pos][1]))\n",
|
|||
|
" item_pos+=1\n",
|
|||
|
" result.append([user_code_id[u]]+list(chain(*rec_user)))\n",
|
|||
|
"\n",
|
|||
|
"(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_reco.csv', index=False, header=False)\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# estimations - every test pair is scored with the global training average (so the ranking above is artificial: all items tie)\n",
|
|||
|
"\n",
|
|||
|
"estimations=[]\n",
|
|||
|
"\n",
|
|||
|
"for user, item in zip(*test_ui.nonzero()):\n",
|
|||
|
" estimations.append([user_code_id[user], item_code_id[item], avg])\n",
|
|||
|
"(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_estimations.csv', index=False, header=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <th>20</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>25</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>32</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>33</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>44</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>46</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>50</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>52</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>55</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" <td>11</td>\n",
|
|||
|
" <td>3.529975</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>2 rows × 21 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" 0 1 2 3 4 5 6 7 8 9 ... 11 \\\n",
|
|||
|
"0 1 5 3.529975 10 3.529975 25 3.529975 32 3.529975 33 ... 44 \n",
|
|||
|
"1 2 1 3.529975 2 3.529975 3 3.529975 4 3.529975 5 ... 6 \n",
|
|||
|
"\n",
|
|||
|
" 12 13 14 15 16 17 18 19 20 \n",
|
|||
|
"0 3.529975 46 3.529975 50 3.529975 52 3.529975 55 3.529975 \n",
|
|||
|
"1 3.529975 7 3.529975 8 3.529975 9 3.529975 11 3.529975 \n",
|
|||
|
"\n",
|
|||
|
"[2 rows x 21 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pd.DataFrame(result)[:2]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Project task 1 - self made top rated"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# project task 1: implement TopRated\n",
|
|||
|
"# Implement a recommender system that recommends movies (which the user hasn't seen) with the highest average rating\n",
|
|||
|
"# The output should be saved in 'Recommendations generated/ml-100k/Self_TopRated_reco.csv'\n",
|
|||
|
"# and 'Recommendations generated/ml-100k/Self_TopRated_estimations.csv'"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Self-made baseline"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 33,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"class selfBaselineUI():\n",
|
|||
|
" \n",
|
|||
|
" def fit(self, train_ui):\n",
|
|||
|
" self.train_ui=train_ui.copy()\n",
|
|||
|
" self.train_iu=train_ui.transpose().tocsr()\n",
|
|||
|
" \n",
|
|||
|
" result=self.train_ui.copy()\n",
|
|||
|
" \n",
|
|||
|
" self.row_means=np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)\n",
|
|||
|
" \n",
|
|||
|
" # in csr format after addition or multiplication 0 entries \"disappear\" - so some workarounds are needed \n",
|
|||
|
" # (other option is to define addition/multiplication in a desired way)\n",
|
|||
|
" row_means=self.row_means.copy()\n",
|
|||
|
" \n",
|
|||
|
" max_row_mean=np.max(row_means)\n",
|
|||
|
" row_means[row_means==0]=max_row_mean+1\n",
|
|||
|
" to_subtract_rows=sparse.diags(row_means)*result.power(0)\n",
|
|||
|
" to_subtract_rows.sort_indices() # needed to have valid .data\n",
|
|||
|
" \n",
|
|||
|
" subtract=to_subtract_rows.data\n",
|
|||
|
" subtract[subtract==max_row_mean+1]=0\n",
|
|||
|
" \n",
|
|||
|
" result.data=result.data-subtract\n",
|
|||
|
"# we can't do result=train_ui-to_subtract_rows since then 0 entries will \"disappear\" in csr format\n",
|
|||
|
" self.col_means=np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\\\n",
|
|||
|
" out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr)!=0) # handling items without ratings\n",
|
|||
|
" \n",
|
|||
|
" # again - it is possible that some mean will be zero, so let's use the same workaround\n",
|
|||
|
" col_means=self.col_means.copy()\n",
|
|||
|
" \n",
|
|||
|
" max_col_mean=np.max(col_means)\n",
|
|||
|
" col_means[col_means==0]=max_col_mean+1\n",
|
|||
|
" to_subtract_cols=result.power(0)*sparse.diags(col_means)\n",
|
|||
|
" to_subtract_cols.sort_indices() # needed to have valid .data\n",
|
|||
|
" \n",
|
|||
|
" subtract=to_subtract_cols.data\n",
|
|||
|
" subtract[subtract==max_col_mean+1]=0\n",
|
|||
|
" \n",
|
|||
|
" result.data=result.data-subtract\n",
|
|||
|
"\n",
|
|||
|
" return result\n",
|
|||
|
" \n",
|
|||
|
" \n",
|
|||
|
" def recommend(self, user_code_id, item_code_id, topK=10):\n",
|
|||
|
" estimations=np.tile(self.row_means[:,None], [1, self.train_ui.shape[1]]) +np.tile(self.col_means, [self.train_ui.shape[0], 1])\n",
|
|||
|
" \n",
|
|||
|
" top_k = defaultdict(list)\n",
|
|||
|
" for nb_user, user in enumerate(estimations):\n",
|
|||
|
" \n",
|
|||
|
" user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n",
|
|||
|
" for item, score in enumerate(user):\n",
|
|||
|
" if item not in user_rated:\n",
|
|||
|
" top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n",
|
|||
|
" result=[]\n",
|
|||
|
" # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
|
|||
|
" for uid, item_scores in top_k.items():\n",
|
|||
|
" item_scores.sort(key=lambda x: x[1], reverse=True)\n",
|
|||
|
" result.append([uid]+list(chain(*item_scores[:topK])))\n",
|
|||
|
" return result\n",
|
|||
|
" \n",
|
|||
|
" def estimate(self, user_code_id, item_code_id, test_ui):\n",
|
|||
|
" result=[]\n",
|
|||
|
" for user, item in zip(*test_ui.nonzero()):\n",
|
|||
|
" result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])\n",
|
|||
|
" return result"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Training data:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n",
|
|||
|
" [0, 1, 2, 3, 0, 0, 0, 0],\n",
|
|||
|
" [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"After subtracting rows and columns:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
|
|||
|
" [ 0. , -0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n",
|
|||
|
" [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Recommend best unseen item:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Print estimations on unseen items:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>user</th>\n",
|
|||
|
" <th>item</th>\n",
|
|||
|
" <th>est_score</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>60</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>40</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>20</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>20</td>\n",
|
|||
|
" <td>20</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>20</td>\n",
|
|||
|
" <td>70</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" user item est_score\n",
|
|||
|
"0 0 60 4.0\n",
|
|||
|
"1 10 40 3.0\n",
|
|||
|
"2 20 0 3.0\n",
|
|||
|
"3 20 20 4.0\n",
|
|||
|
"4 20 70 4.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
|
|||
|
"toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n",
|
|||
|
"\n",
|
|||
|
"toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n",
|
|||
|
"toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n",
|
|||
|
"\n",
|
|||
|
"print('Training data:')\n",
|
|||
|
"display(toy_train_ui.todense())\n",
|
|||
|
"\n",
|
|||
|
"model=selfBaselineUI()\n",
|
|||
|
"print('After subtracting rows and columns:')\n",
|
|||
|
"display(model.fit(toy_train_ui).todense())\n",
|
|||
|
"\n",
|
|||
|
"print('Recommend best unseen item:')\n",
|
|||
|
"display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n",
|
|||
|
"\n",
|
|||
|
"print('Print estimations on unseen items:')\n",
|
|||
|
"estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
|
|||
|
"estimations.columns=['user', 'item', 'est_score']\n",
|
|||
|
"display(estimations)\n",
|
|||
|
"\n",
|
|||
|
"top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n",
|
|||
|
"\n",
|
|||
|
"top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n",
|
|||
|
"\n",
|
|||
|
"estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n",
|
|||
|
"estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"model=selfBaselineUI()\n",
|
|||
|
"model.fit(train_ui)\n",
|
|||
|
"\n",
|
|||
|
"top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n",
|
|||
|
"\n",
|
|||
|
"top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n",
|
|||
|
"\n",
|
|||
|
"estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n",
|
|||
|
"estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# project task 2: implement self-made BaselineIU"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Implement recommender system which will recommend movies (which user hasn't seen) which is similar to BaselineUI\n",
|
|||
|
"# but first subtract col means then row means\n",
|
|||
|
"# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'\n",
|
|||
|
"# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Ready-made baseline - Surprise implementation"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Estimating biases using als...\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import surprise as sp\n",
|
|||
|
"import time\n",
|
|||
|
"\n",
|
|||
|
"# Based on surprise.readthedocs.io\n",
|
|||
|
"def get_top_n(predictions, n=10):\n",
|
|||
|
" \n",
|
|||
|
" # Here we create a dictionary which items are lists of pairs (item, score)\n",
|
|||
|
" top_n = defaultdict(list)\n",
|
|||
|
" for uid, iid, true_r, est, _ in predictions:\n",
|
|||
|
" top_n[uid].append((iid, est))\n",
|
|||
|
" \n",
|
|||
|
" result=[]\n",
|
|||
|
" # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n",
|
|||
|
" for uid, user_ratings in top_n.items():\n",
|
|||
|
" user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
|
|||
|
" result.append([uid]+list(chain(*user_ratings[:n]))) \n",
|
|||
|
" return result\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n",
|
|||
|
"trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)\n",
|
|||
|
"trainset = trainset.build_full_trainset() # <class 'surprise.trainset.Trainset'> -> it is needed for using Surprise package\n",
|
|||
|
"\n",
|
|||
|
"testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)\n",
|
|||
|
"testset = sp.Trainset.build_testset(testset.build_full_trainset())\n",
|
|||
|
"\n",
|
|||
|
"algo = sp.BaselineOnly()\n",
|
|||
|
"# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n",
|
|||
|
"# observe how bad results gives above algorithm\n",
|
|||
|
"# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n",
|
|||
|
"\n",
|
|||
|
"algo.fit(trainset)\n",
|
|||
|
"\n",
|
|||
|
"antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n",
|
|||
|
"predictions = algo.test(antitrainset)\n",
|
|||
|
"\n",
|
|||
|
"top_n = get_top_n(predictions, n=10)\n",
|
|||
|
"\n",
|
|||
|
"top_n=pd.DataFrame(top_n)\n",
|
|||
|
"\n",
|
|||
|
"top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"RMSE: 0.9495\n",
|
|||
|
"MAE: 0.7525\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"0.7524871012820799"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Compute RMSE on testset using buildin functions\n",
|
|||
|
"predictions = algo.test(testset)\n",
|
|||
|
"sp.accuracy.rmse(predictions, verbose=True)\n",
|
|||
|
"\n",
|
|||
|
"# Let's also save the results in file\n",
|
|||
|
"predictions_df=[]\n",
|
|||
|
"for uid, iid, true_r, est, _ in predictions:\n",
|
|||
|
" predictions_df.append([uid, iid, est])\n",
|
|||
|
" \n",
|
|||
|
"predictions_df=pd.DataFrame(predictions_df)\n",
|
|||
|
"predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n",
|
|||
|
"\n",
|
|||
|
"sp.accuracy.mae(predictions, verbose=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"##### Let's compare with random"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"RMSE: 1.5186\n",
|
|||
|
"MAE: 1.2188\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"1.2187837474576546"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# in surprise random is an algorithm predicting random value regarding to normal distribution estimated from train set\n",
|
|||
|
"algo = sp.NormalPredictor()\n",
|
|||
|
"algo.fit(trainset)\n",
|
|||
|
"\n",
|
|||
|
"antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n",
|
|||
|
"predictions = algo.test(antitrainset)\n",
|
|||
|
"\n",
|
|||
|
"top_n = get_top_n(predictions, n=10)\n",
|
|||
|
"\n",
|
|||
|
"top_n=pd.DataFrame(top_n)\n",
|
|||
|
"\n",
|
|||
|
"top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n",
|
|||
|
"\n",
|
|||
|
"# Compute RMSE on testset using buildin functions\n",
|
|||
|
"predictions = algo.test(testset)\n",
|
|||
|
"sp.accuracy.rmse(predictions, verbose=True)\n",
|
|||
|
"\n",
|
|||
|
"# Let's also save the results in file\n",
|
|||
|
"predictions_df=[]\n",
|
|||
|
"for uid, iid, true_r, est, _ in predictions:\n",
|
|||
|
" predictions_df.append([uid, iid, est])\n",
|
|||
|
" \n",
|
|||
|
"predictions_df=pd.DataFrame(predictions_df)\n",
|
|||
|
"predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n",
|
|||
|
"\n",
|
|||
|
"sp.accuracy.mae(predictions, verbose=True)"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.6.9"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 4
|
|||
|
}
|