{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preparing dataset" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import scipy.sparse as sparse\n", "from collections import defaultdict\n", "from itertools import chain\n", "import random\n", "\n", "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "# Let's prepare dataset\n", "train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)\n", "train_and_test['user_code'] = train_and_test['user'].astype(\"category\").cat.codes\n", "train_and_test['item_code'] = train_and_test['item'].astype(\"category\").cat.codes\n", "\n", "user_code_id = dict(enumerate(train_and_test['user'].astype(\"category\").cat.categories))\n", "user_id_code = dict((v, k) for k, v in user_code_id.items())\n", "item_code_id = dict(enumerate(train_and_test['item'].astype(\"category\").cat.categories))\n", "item_id_code = dict((v, k) for k, v in item_code_id.items())" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratingtimestampuser_codeitem_code
06645254876526580663524
14912888068651480
23522732884290328351272
361896389130774961795
456024287997677255923
\n", "
" ], "text/plain": [ " user item rating timestamp user_code item_code\n", "0 664 525 4 876526580 663 524\n", "1 49 1 2 888068651 48 0\n", "2 352 273 2 884290328 351 272\n", "3 618 96 3 891307749 617 95\n", "4 560 24 2 879976772 559 23" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_and_test[:5]" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))\n", "test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "# Take number of users and items\n", "(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)\n", "\n", "# Create sparse csr matrices\n", "train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))\n", "test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Above steps are the same for many algorithms, so I put the code in separate file:\n", "import helpers\n", "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n", "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n", "train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CSR matrices - what is it?" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<3x4 sparse matrix of type ''\n", "\twith 8 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n", "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n", "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n", "sample_csr=sparse.csr_matrix((data, (row, col)))\n", "sample_csr" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ratings matrix with missing entries replaced by zeros:\n" ] }, { "data": { "text/plain": [ "matrix([[4, 1, 3, 0],\n", " [0, 2, 0, 1],\n", " [2, 0, 5, 4]], dtype=int64)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Number of ratings: 8 \n", "Number of users: 3 \n", "Number of items: 4 \n", "\n" ] } ], "source": [ "print('Ratings matrix with missing entries replaced by zeros:')\n", "display(sample_csr.todense())\n", "\n", "print('\\nNumber of ratings: {} \\nNumber of users: {} \\nNumber of items: {} \\n'\n", " .format(sample_csr.nnz, sample_csr.shape[0], sample_csr.shape[1]))" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ratings data: [4 1 3 2 1 2 5 4]\n", "Regarding items: [0 1 2 1 3 0 2 3]\n", "Where ratings from 0 to 2 belongs to user 0.\n", "Where ratings from 3 to 4 belongs to user 1.\n", "Where ratings from 5 to 7 belongs to user 2.\n" ] } ], "source": [ "print('Ratings data:', sample_csr.data)\n", "\n", "print('Regarding items:', sample_csr.indices)\n", "\n", "for i in range(sample_csr.shape[0]):\n", " print('Where ratings from {} to {} belongs to user {}.'.format(sample_csr.indptr[i], sample_csr.indptr[i+1]-1, i))" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Efficient way to access items rated by user:\n" ] }, { "data": { "text/plain": [ "array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n", " 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "557 ns ± 15.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n", "Inefficient way to access items rated by user:\n" ] }, { "data": { "text/plain": [ "array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n", " 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "65.2 µs ± 4.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], "source": [ "user=123\n", "\n", "print('Efficient way to access items rated by user:')\n", "display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n", "%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n", "\n", "print('Inefficient way to access items rated by user:')\n", "display(train_ui[user].indices)\n", "%timeit train_ui[user].indices" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Example: subtracting row means" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Our matrix:\n" ] }, { "data": { "text/plain": [ "matrix([[4, 1, 3, 0],\n", " [0, 2, 0, 1],\n", " [2, 0, 5, 4]], dtype=int64)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "List of row sums:\n" ] }, { "data": { "text/plain": [ "matrix([[ 8, 3, 11]])" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Our matrix:')\n", "display(sample_csr.todense())\n", "print('List of row sums:')\n", "sample_csr.sum(axis=1).ravel()" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Array with row means:\n" ] }, { "data": { "text/plain": [ "array([2.66666667, 1.5 , 3.66666667])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Diagonal csr matrix with inverse of row sums on diagonal:\n" ] }, { "data": { "text/plain": [ "matrix([[2.66666667, 0. , 0. ],\n", " [0. , 1.5 , 0. ],\n", " [0. , 0. , 3.66666667]])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Let's apply them in nonzero entries:\n" ] }, { "data": { "text/plain": [ "matrix([[2.66666667, 2.66666667, 2.66666667, 0. ],\n", " [0. , 1.5 , 0. , 1.5 ],\n", " [3.66666667, 0. , 3.66666667, 3.66666667]])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Finally after subtraction:\n" ] }, { "data": { "text/plain": [ "matrix([[ 1.33333333, -1.66666667, 0.33333333, 0. ],\n", " [ 0. , 0.5 , 0. , -0.5 ],\n", " [-1.66666667, 0. , 1.33333333, 0.33333333]])" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Array with row means:')\n", "row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)\n", "display(row_means)\n", "\n", "print('Diagonal csr matrix with inverse of row sums on diagonal:')\n", "display(sparse.diags(row_means).todense())\n", "\n", "print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n", "to_subtract=sparse.diags(row_means)*sample_csr.power(0)\n", "display(to_subtract.todense())\n", "\n", "print(\"Finally after subtraction:\")\n", "sample_csr-to_subtract.todense()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Transposing" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sample matrix: \n", " [[4 1 3 0]\n", " [0 2 0 1]\n", " [2 0 5 4]]\n", "\n", "Indices: \n", " [0 1 2 1 3 0 2 3]\n", "\n", "Transposed matrix: \n", " [[4 0 2]\n", " [1 2 0]\n", " [3 0 5]\n", " [0 1 4]]\n", "\n", "Indices of transposed matrix: \n", " [0 1 2 1 3 0 2 3]\n", "\n", "Reason: \n", "\n", "After converting to csr: \n", " [0 2 0 1 0 2 1 2]\n" ] } ], "source": [ "import numpy as np\n", "from scipy import sparse\n", "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n", "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n", "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n", "sample=sparse.csr_matrix((data, (row, col)))\n", "print('Sample matrix: \\n', sample.A)\n", "print('\\nIndices: \\n', sample.indices)\n", "transposed=sample.transpose()\n", "print('\\nTransposed matrix: \\n', transposed.A)\n", "print('\\nIndices of transposed matrix: \\n', transposed.indices)\n", "\n", "print('\\nReason: ', type(transposed))\n", "\n", "print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Self made top popular" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "import os\n", "if not os.path.exists('./Recommendations generated/'):\n", " os.mkdir('./Recommendations generated/')\n", " os.mkdir('./Recommendations generated/ml-100k/')\n", " os.mkdir('./Recommendations generated/toy-example/')" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "TopPop=[]\n", "train_iu=train_ui.transpose().tocsr()\n", "scaling_factor=train_ui.max()/max(np.diff(train_iu.indptr))\n", "\n", "for i in range(train_iu.shape[0]):\n", " TopPop.append((i, (train_iu.indptr[i+1]-train_iu.indptr[i])*scaling_factor))\n", " \n", "TopPop.sort(key=lambda x: x[1], reverse=True)\n", "#TopPop is an array of pairs (item, rescaled_popularity) sorted descending from the most popular\n", "\n", "k=10\n", "result=[]\n", "\n", "for u in range(train_ui.shape[0]):\n", " user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n", " rec_user=[]\n", " item_pos=0\n", " while len(rec_user)<10:\n", " if TopPop[item_pos][0] not in user_rated:\n", " rec_user.append((item_code_id[TopPop[item_pos][0]], TopPop[item_pos][1]))\n", " item_pos+=1\n", " result.append([user_code_id[u]]+list(chain(*rec_user)))\n", "\n", "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n", "\n", "\n", "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n", "\n", "estimations=[]\n", "\n", "for user, item in zip(*test_ui.nonzero()):\n", " estimations.append([user_code_id[user], item_code_id[item],\n", " (train_iu.indptr[item+1]-train_iu.indptr[item])*scaling_factor])\n", "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Self made global average" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "GlobalAvg=[]\n", "avg=np.sum(train_ui)/train_ui.nnz\n", "\n", "for i in range(train_iu.shape[0]):\n", " GlobalAvg.append((i, avg))\n", " \n", "k=10\n", "result=[]\n", "\n", "for u in range(train_ui.shape[0]):\n", " user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n", " rec_user=[]\n", " item_pos=0\n", " while len(rec_user)<10:\n", " if GlobalAvg[item_pos][0] not in user_rated:\n", " rec_user.append((item_code_id[GlobalAvg[item_pos][0]], GlobalAvg[item_pos][1]))\n", " item_pos+=1\n", " result.append([user_code_id[u]]+list(chain(*rec_user)))\n", "\n", "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_reco.csv', index=False, header=False)\n", "\n", "\n", "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n", "\n", "estimations=[]\n", "\n", "for user, item in zip(*test_ui.nonzero()):\n", " estimations.append([user_code_id[user], item_code_id[item], avg])\n", "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_estimations.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...11121314151617181920
0153.529975103.529975253.529975323.52997533...443.529975463.529975503.529975523.529975553.529975
1213.52997523.52997533.52997543.5299755...63.52997573.52997583.52997593.529975113.529975
\n", "

2 rows × 21 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 11 \\\n", "0 1 5 3.529975 10 3.529975 25 3.529975 32 3.529975 33 ... 44 \n", "1 2 1 3.529975 2 3.529975 3 3.529975 4 3.529975 5 ... 6 \n", "\n", " 12 13 14 15 16 17 18 19 20 \n", "0 3.529975 46 3.529975 50 3.529975 52 3.529975 55 3.529975 \n", "1 3.529975 7 3.529975 8 3.529975 9 3.529975 11 3.529975 \n", "\n", "[2 rows x 21 columns]" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(result)[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Project task 1 - self made top rated" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "# project task 1: implement TopRated\n", "# Implement recommender system which will recommend movies (which user hasn't seen) with the highest average rating\n", "# The output should be saved in 'Recommendations generated/ml-100k/Self_TopRated_reco.csv'\n", "# and 'Recommendations generated/ml-100k/Self_TopRated_estimations.csv'" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "TopRated=[]\n", "train_iu=train_ui.transpose().tocsr()\n", "\n", "for i in range(train_iu.shape[0]):\n", " if(train_iu.indptr[i+1]-train_iu.indptr[i] != 0):\n", " avg = np.sum(train_iu.data[train_iu.indptr[i]:train_iu.indptr[i+1]])/(train_iu.indptr[i+1]-train_iu.indptr[i])\n", " TopRated.append((i, avg))\n", " \n", "TopRated.sort(key=lambda x: x[1], reverse=True)\n", "\n", "k=10\n", "result=[]\n", "\n", "for u in range(train_ui.shape[0]):\n", " user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n", " rec_user=[]\n", " item_pos=0\n", " while len(rec_user)<10:\n", " if TopRated[item_pos][0] not in user_rated:\n", " rec_user.append((item_code_id[TopRated[item_pos][0]], TopRated[item_pos][1]))\n", " item_pos+=1\n", " result.append([user_code_id[u]]+list(chain(*rec_user)))\n", "\n", "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)\n", "\n", "\n", "estimations=[]\n", "\n", "for user, item in zip(*test_ui.nonzero()):\n", " estimations.append([user_code_id[user], item_code_id[item],\n", " (train_iu.indptr[item+1]-train_iu.indptr[item])*scaling_factor])\n", "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...11121314151617181920
018145.011225.011895.012015.01293...13065.014675.014915.015005.015365.0
121195.08145.011225.011895.01201...12935.013065.014675.014915.015005.0
\n", "

2 rows × 21 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 11 12 13 \\\n", "0 1 814 5.0 1122 5.0 1189 5.0 1201 5.0 1293 ... 1306 5.0 1467 \n", "1 2 119 5.0 814 5.0 1122 5.0 1189 5.0 1201 ... 1293 5.0 1306 \n", "\n", " 14 15 16 17 18 19 20 \n", "0 5.0 1491 5.0 1500 5.0 1536 5.0 \n", "1 5.0 1467 5.0 1491 5.0 1500 5.0 \n", "\n", "[2 rows x 21 columns]" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(result)[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Self-made baseline" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "class selfBaselineUI():\n", " \n", " def fit(self, train_ui):\n", " self.train_ui=train_ui.copy()\n", " self.train_iu=train_ui.transpose().tocsr()\n", " \n", " result=self.train_ui.copy()\n", " \n", " self.row_means=np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)\n", " \n", " # in csr format after addition or multiplication 0 entries \"disappear\" - so some workaraunds are needed \n", " # (other option is to define addition/multiplication in a desired way)\n", " row_means=self.row_means.copy()\n", " \n", " max_row_mean=np.max(row_means)\n", " row_means[row_means==0]=max_row_mean+1\n", " to_subtract_rows=sparse.diags(row_means)*result.power(0)\n", " to_subtract_rows.sort_indices() # needed to have valid .data\n", " \n", " subtract=to_subtract_rows.data\n", " subtract[subtract==max_row_mean+1]=0\n", " \n", " result.data=result.data-subtract\n", "# we can't do result=train_ui-to_subtract_rows since then 0 entries will \"disappear\" in csr format\n", " self.col_means=np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\\\n", " out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr)!=0) # handling items without ratings\n", " \n", " # again - it is possible that some mean will be zero, so let's use the same workaround\n", " col_means=self.col_means.copy()\n", " \n", " max_col_mean=np.max(col_means)\n", " col_means[col_means==0]=max_col_mean+1\n", " to_subtract_cols=result.power(0)*sparse.diags(col_means)\n", " to_subtract_cols.sort_indices() # needed to have valid .data\n", " \n", " subtract=to_subtract_cols.data\n", " subtract[subtract==max_col_mean+1]=0\n", " \n", " result.data=result.data-subtract\n", "\n", " return result\n", " \n", " \n", " def recommend(self, user_code_id, item_code_id, topK=10):\n", " estimations=np.tile(self.row_means[:,None], [1, self.train_ui.shape[1]]) +np.tile(self.col_means, [self.train_ui.shape[0], 1])\n", " \n", " top_k = defaultdict(list)\n", " for nb_user, user in enumerate(estimations):\n", " \n", " user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n", " for item, score in enumerate(user):\n", " if item not in user_rated:\n", " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n", " result=[]\n", " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", " for uid, item_scores in top_k.items():\n", " item_scores.sort(key=lambda x: x[1], reverse=True)\n", " result.append([uid]+list(chain(*item_scores[:topK])))\n", " return result\n", " \n", " def estimate(self, user_code_id, item_code_id, test_ui):\n", " result=[]\n", " for user, item in zip(*test_ui.nonzero()):\n", " result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])\n", " return result" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data:\n" ] }, { "data": { "text/plain": [ "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n", " [0, 1, 2, 3, 0, 0, 0, 0],\n", " [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After subtracting rows and columns:\n" ] }, { "data": { "text/plain": [ "matrix([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n", " [ 0. , -0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Recommend best unseen item:\n" ] }, { "data": { "text/plain": [ "[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Print estimations on unseen items:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemest_score
00604.0
110403.0
22003.0
320204.0
420704.0
\n", "
" ], "text/plain": [ " user item est_score\n", "0 0 60 4.0\n", "1 10 40 3.0\n", "2 20 0 3.0\n", "3 20 20 4.0\n", "4 20 70 4.0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", "\n", "toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n", "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n", "\n", "print('Training data:')\n", "display(toy_train_ui.todense())\n", "\n", "model=selfBaselineUI()\n", "print('After subtracting rows and columns:')\n", "display(model.fit(toy_train_ui).todense())\n", "\n", "print('Recommend best unseen item:')\n", "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n", "\n", "print('Print estimations on unseen items:')\n", "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n", "estimations.columns=['user', 'item', 'est_score']\n", "display(estimations)\n", "\n", "top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n", "\n", "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n", "\n", "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n", "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "model=selfBaselineUI()\n", "model.fit(train_ui)\n", "\n", "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n", "\n", "top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n", "\n", "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n", "estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# project task 2: implement self-made BaselineIU" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "# Implement recommender system which will recommend movies (which user hasn't seen) which is similar to BaselineUI\n", "# but first subtract col means then row means\n", "# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'\n", "# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "class selfBaselineIU():\n", " \n", " def fit(self, train_ui):\n", " self.train_ui=train_ui.copy()\n", " self.train_iu=train_ui.transpose().tocsr()\n", " \n", " result=self.train_ui.copy()\n", " \n", " self.col_means=np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\\\n", " out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr)!=0) # handling items without ratings\n", " \n", " col_means=self.col_means.copy()\n", " \n", " max_col_mean=np.max(col_means)\n", " col_means[col_means==0]=max_col_mean+1\n", " to_subtract_cols=result.power(0)*sparse.diags(col_means)\n", " to_subtract_cols.sort_indices()\n", " \n", " subtract=to_subtract_cols.data\n", " subtract[subtract==max_col_mean+1]=0\n", " \n", " result.data=result.data-subtract\n", "\n", " self.row_means=np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)\n", " \n", " row_means=self.row_means.copy()\n", " \n", " max_row_mean=np.max(row_means)\n", " row_means[row_means==0]=max_row_mean+1\n", " to_subtract_rows=sparse.diags(row_means)*result.power(0)\n", " to_subtract_rows.sort_indices()\n", " \n", " subtract=to_subtract_rows.data\n", " subtract[subtract==max_row_mean+1]=0\n", " \n", " result.data=result.data-subtract\n", "\n", " return result\n", " \n", " \n", " def recommend(self, user_code_id, item_code_id, topK=10):\n", " estimations=np.tile(self.row_means[:,None], [1, self.train_ui.shape[1]]) +np.tile(self.col_means, [self.train_ui.shape[0], 1])\n", " \n", " top_k = defaultdict(list)\n", " for nb_user, user in enumerate(estimations):\n", " \n", " user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n", " for item, score in enumerate(user):\n", " if item not in user_rated:\n", " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n", " result=[]\n", "\n", " for uid, item_scores in top_k.items():\n", " item_scores.sort(key=lambda x: x[1], reverse=True)\n", " result.append([uid]+list(chain(*item_scores[:topK])))\n", " return result\n", " \n", " def estimate(self, user_code_id, item_code_id, test_ui):\n", " result=[]\n", " for user, item in zip(*test_ui.nonzero()):\n", " result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])\n", " return result" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data:\n" ] }, { "data": { "text/plain": [ "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n", " [0, 1, 2, 3, 0, 0, 0, 0],\n", " [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After subtracting rows and columns:\n" ] }, { "data": { "text/plain": [ "matrix([[-0.375 , 1.125 , 0. , 0. , -0.375 ,\n", " 0. , 0. , -0.375 ],\n", " [ 0. , -0.66666667, 0.83333333, -0.16666667, 0. ,\n", " 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , 0.66666667, 0. ,\n", " -0.33333333, -0.33333333, 0. ]])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Recommend best unseen item:\n" ] }, { "data": { "text/plain": [ "[[0, 30, 4.375], [10, 40, 4.166666666666667], [20, 40, 5.333333333333333]]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Print estimations on unseen items:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemest_score
00604.375000
110404.166667
22003.333333
320202.333333
420704.333333
\n", "
" ], "text/plain": [ " user item est_score\n", "0 0 60 4.375000\n", "1 10 40 4.166667\n", "2 20 0 3.333333\n", "3 20 20 2.333333\n", "4 20 70 4.333333" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", "\n", "toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n", "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n", "\n", "print('Training data:')\n", "display(toy_train_ui.todense())\n", "\n", "model=selfBaselineIU()\n", "print('After subtracting rows and columns:')\n", "display(model.fit(toy_train_ui).todense())\n", "\n", "print('Recommend best unseen item:')\n", "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n", "\n", "print('Print estimations on unseen items:')\n", "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n", "estimations.columns=['user', 'item', 'est_score']\n", "display(estimations)\n", "\n", "top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n", "\n", "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineIU_reco.csv', index=False, header=False)\n", "\n", "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n", "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineIU_estimations.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "model=selfBaselineIU()\n", "model.fit(train_ui)\n", "\n", "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n", "\n", "top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineIU_reco.csv', index=False, header=False)\n", "\n", "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n", "estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv', index=False, header=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Ready-made baseline - Surprise implementation" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Estimating biases using als...\n" ] } ], "source": [ "import surprise as sp\n", "import time\n", "\n", "# Based on surprise.readthedocs.io\n", "def get_top_n(predictions, n=10):\n", " \n", " # Here we create a dictionary which items are lists of pairs (item, score)\n", " top_n = defaultdict(list)\n", " for uid, iid, true_r, est, _ in predictions:\n", " top_n[uid].append((iid, est))\n", " \n", " result=[]\n", " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", " for uid, user_ratings in top_n.items():\n", " user_ratings.sort(key=lambda x: x[1], reverse=True)\n", " result.append([uid]+list(chain(*user_ratings[:n]))) \n", " return result\n", "\n", "\n", "reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n", "trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)\n", "trainset = trainset.build_full_trainset() # -> it is needed for using Surprise package\n", "\n", "testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)\n", "testset = sp.Trainset.build_testset(testset.build_full_trainset())\n", "\n", "algo = sp.BaselineOnly()\n", "# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n", "# observe how bad results gives above algorithm\n", "# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n", "\n", "algo.fit(trainset)\n", "\n", "antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n", "predictions = algo.test(antitrainset)\n", "\n", "top_n = get_top_n(predictions, n=10)\n", "\n", "top_n=pd.DataFrame(top_n)\n", "\n", "top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE: 0.9495\n", "MAE: 0.7525\n" ] }, { "data": { "text/plain": [ "0.7524871012820799" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Compute RMSE on testset using buildin functions\n", "predictions = algo.test(testset)\n", "sp.accuracy.rmse(predictions, verbose=True)\n", "\n", "# Let's also save the results in file\n", "predictions_df=[]\n", "for uid, iid, true_r, est, _ in predictions:\n", " predictions_df.append([uid, iid, est])\n", " \n", "predictions_df=pd.DataFrame(predictions_df)\n", "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n", "\n", "sp.accuracy.mae(predictions, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Let's compare with random" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE: 1.5228\n", "MAE: 1.2225\n" ] }, { "data": { "text/plain": [ "1.2225008866215548" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# in surprise random is an algorithm predicting random value regarding to normal distribution estimated from train set\n", "algo = sp.NormalPredictor()\n", "algo.fit(trainset)\n", "\n", "antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n", "predictions = algo.test(antitrainset)\n", "\n", "top_n = get_top_n(predictions, n=10)\n", "\n", "top_n=pd.DataFrame(top_n)\n", "\n", "top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n", "\n", "# Compute RMSE on testset using buildin functions\n", "predictions = algo.test(testset)\n", "sp.accuracy.rmse(predictions, verbose=True)\n", "\n", "# Let's also save the results in file\n", "predictions_df=[]\n", "for uid, iid, true_r, est, _ in predictions:\n", " predictions_df.append([uid, iid, est])\n", " \n", "predictions_df=pd.DataFrame(predictions_df)\n", "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n", "\n", "sp.accuracy.mae(predictions, verbose=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }