{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preparing dataset" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import scipy.sparse as sparse\n", "from collections import defaultdict\n", "from itertools import chain\n", "import random\n", "\n", "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratingtimestamp
06645254876526580
14912888068651
23522732884290328
3618963891307749
4560242879976772
...............
799952671275878970529
799967272055883710104
799976061355880926245
7999857973880952006
799994747894887927152
\n", "

80000 rows × 4 columns

\n", "
" ], "text/plain": [ " user item rating timestamp\n", "0 664 525 4 876526580\n", "1 49 1 2 888068651\n", "2 352 273 2 884290328\n", "3 618 96 3 891307749\n", "4 560 24 2 879976772\n", "... ... ... ... ...\n", "79995 267 127 5 878970529\n", "79996 727 205 5 883710104\n", "79997 606 135 5 880926245\n", "79998 579 7 3 880952006\n", "79999 474 789 4 887927152\n", "\n", "[80000 rows x 4 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_read" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# Let's prepare dataset\n", "train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)\n", "train_and_test['user_code'] = train_and_test['user'].astype(\"category\").cat.codes\n", "train_and_test['item_code'] = train_and_test['item'].astype(\"category\").cat.codes\n", "\n", "user_code_id = dict(enumerate(train_and_test['user'].astype(\"category\").cat.categories))\n", "user_id_code = dict((v, k) for k, v in user_code_id.items())\n", "item_code_id = dict(enumerate(train_and_test['item'].astype(\"category\").cat.categories))\n", "item_id_code = dict((v, k) for k, v in item_code_id.items())" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratingtimestampuser_codeitem_code
06645254876526580663524
14912888068651480
23522732884290328351272
361896389130774961795
456024287997677255923
.....................
9999580494487944619480393
999961096275880582133108626
999972394895889178833238488
999986632943889491811662293
999992062451888179772205244
\n", "

100000 rows × 6 columns

\n", "
" ], "text/plain": [ " user item rating timestamp user_code item_code\n", "0 664 525 4 876526580 663 524\n", "1 49 1 2 888068651 48 0\n", "2 352 273 2 884290328 351 272\n", "3 618 96 3 891307749 617 95\n", "4 560 24 2 879976772 559 23\n", "... ... ... ... ... ... ...\n", "99995 804 94 4 879446194 803 93\n", "99996 109 627 5 880582133 108 626\n", "99997 239 489 5 889178833 238 488\n", "99998 663 294 3 889491811 662 293\n", "99999 206 245 1 888179772 205 244\n", "\n", "[100000 rows x 6 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_and_test" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))\n", "test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# Take number of users and items\n", "(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)\n", "\n", "# Create sparse csr matrices\n", "train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))\n", "test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# Above steps are the same for many algorithms, so I put the code in separate file:\n", "import helpers\n", "train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None)\n", "test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None)\n", "train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "943" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CSR matrices - what is it?" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<3x4 sparse matrix of type ''\n", "\twith 8 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n", "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n", "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n", "sample_csr=sparse.csr_matrix((data, (row, col)))\n", "sample_csr" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ratings matrix with missing entries replaced by zeros:\n" ] }, { "data": { "text/plain": [ "matrix([[4, 1, 3, 0],\n", " [0, 2, 0, 1],\n", " [2, 0, 5, 4]], dtype=int64)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Number of ratings: 8 \n", "Number of users: 3 \n", "Number of items: 4 \n", "\n" ] } ], "source": [ "print('Ratings matrix with missing entries replaced by zeros:')\n", "display(sample_csr.todense())\n", "\n", "print('\\nNumber of ratings: {} \\nNumber of users: {} \\nNumber of items: {} \\n'\n", " .format(sample_csr.nnz, sample_csr.shape[0The sparsity of a matrix can be quantified with a score, which is the number of zero values in the matrix], sample_csr.shape[1]))" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ratings data: [4 1 3 2 1 2 5 4]\n", "Regarding items: [0 1 2 1 3 0 2 3]\n", "Where ratings from 0 to 2 belongs to user 0.\n", "Where ratings from 3 to 4 belongs to user 1.\n", "Where ratings from 5 to 7 belongs to user 2.\n" ] } ], "source": [ "print('Ratings data:', sample_csr.data)\n", "\n", "print('Regarding items:', sample_csr.indices)\n", "\n", "for i in range(sample_csr.shape[0]):\n", " print('Where ratings from {} to {} belongs to user {}.'.format(sample_csr.indptr[i], sample_csr.indptr[i+1]-1, i))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Efficient way to access items rated by user:\n" ] }, { "data": { "text/plain": [ "array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n", " 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "864 ns ± 28.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)\n", "Inefficient way to access items rated by user:\n" ] }, { "data": { "text/plain": [ "array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167,\n", " 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "95.7 µs ± 4.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], "source": [ "user=123\n", "\n", "print('Efficient way to access items rated by user:')\n", "display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])\n", "%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]\n", "\n", "print('Inefficient way to access items rated by user:')\n", "display(train_ui[user].indices)\n", "%timeit train_ui[user].indices" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Example: subtracting row means" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Our matrix:\n" ] }, { "data": { "text/plain": [ "matrix([[4, 1, 3, 0],\n", " [0, 2, 0, 1],\n", " [2, 0, 5, 4]], dtype=int64)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "List of row sums:\n" ] }, { "data": { "text/plain": [ "matrix([[ 8, 3, 11]])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Our matrix:')\n", "display(sample_csr.todense())\n", "print('List of row sums:')\n", "sample_csr.sum(axis=1).ravel()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Array with row means:\n" ] }, { "data": { "text/plain": [ "array([2.66666667, 1.5 , 3.66666667])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Diagonal csr matrix with inverse of row sums on diagonal:\n" ] }, { "data": { "text/plain": [ "matrix([[2.66666667, 0. , 0. ],\n", " [0. , 1.5 , 0. ],\n", " [0. , 0. , 3.66666667]])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Let's apply them in nonzero entries:\n" ] }, { "data": { "text/plain": [ "matrix([[2.66666667, 2.66666667, 2.66666667, 0. ],\n", " [0. , 1.5 , 0. , 1.5 ],\n", " [3.66666667, 0. , 3.66666667, 3.66666667]])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Finally after subtraction:\n" ] }, { "data": { "text/plain": [ "matrix([[ 1.33333333, -1.66666667, 0.33333333, 0. ],\n", " [ 0. , 0.5 , 0. , -0.5 ],\n", " [-1.66666667, 0. , 1.33333333, 0.33333333]])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Array with row means:')\n", "row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)\n", "display(row_means)\n", "\n", "print('Diagonal csr matrix with inverse of row sums on diagonal:')\n", "display(sparse.diags(row_means).todense())\n", "\n", "print(\"\"\"Let's apply them in nonzero entries:\"\"\")\n", "to_subtract=sparse.diags(row_means)*sample_csr.power(0)\n", "display(to_subtract.todense())\n", "\n", "print(\"Finally after subtraction:\")\n", "sample_csr-to_subtract.todense()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###### Transposing" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sample matrix: \n", " [[4 1 3 0]\n", " [0 2 0 1]\n", " [2 0 5 4]]\n", "\n", "Indices: \n", " [0 1 2 1 3 0 2 3]\n", "\n", "Transposed matrix: \n", " [[4 0 2]\n", " [1 2 0]\n", " [3 0 5]\n", " [0 1 4]]\n", "\n", "Indices of transposed matrix: \n", " [0 1 2 1 3 0 2 3]\n", "\n", "Reason: \n", "\n", "After converting to csr: \n", " [0 2 0 1 0 2 1 2]\n" ] } ], "source": [ "import numpy as np\n", "from scipy import sparse\n", "row = np.array([0, 0, 0, 1, 1, 2, 2, 2])\n", "col = np.array([0, 1, 2, 1, 3, 2, 0, 3])\n", "data = np.array([4, 1, 3, 2,1, 5, 2, 4])\n", "sample=sparse.csr_matrix((data, (row, col)))\n", "print('Sample matrix: \\n', sample.A)\n", "print('\\nIndices: \\n', sample.indices)\n", "transposed=sample.transpose()\n", "print('\\nTransposed matrix: \\n', transposed.A)\n", "print('\\nIndices of transposed matrix: \\n', transposed.indices)\n", "\n", "print('\\nReason: ', type(transposed))\n", "\n", "print('\\nAfter converting to csr: \\n', transposed.tocsr().indices)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Self made top popular" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import os\n", "if not os.path.exists('./Recommendations generated/'):\n", " os.mkdir('./Recommendations generated/')\n", " os.mkdir('./Recommendations generated/ml-100k/')\n", " os.mkdir('./Recommendations generated/toy-example/')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "TopPop=[]\n", "train_iu=train_ui.transpose().tocsr()\n", "scaling_factor=train_ui.max()/max(np.diff(train_iu.indptr))\n", "\n", "for i in range(train_iu.shape[0]):\n", " TopPop.append((i, (train_iu.indptr[i+1]-train_iu.indptr[i])*scaling_factor))\n", " \n", "TopPop.sort(key=lambda x: x[1], reverse=True)\n", "#TopPop is an array of pairs (item, rescaled_popularity) sorted descending from the most popular\n", "\n", "k=10\n", "result=[]\n", "\n", "for u in range(train_ui.shape[0]):\n", " user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n", " rec_user=[]\n", " item_pos=0\n", " while len(rec_user)<10:\n", " if TopPop[item_pos][0] not in user_rated:\n", " rec_user.append((item_code_id[TopPop[item_pos][0]], TopPop[item_pos][1]))\n", " item_pos+=1\n", " result.append([user_code_id[u]]+list(chain(*rec_user)))\n", "\n", "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)\n", "\n", "\n", "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n", "\n", "estimations=[]\n", "\n", "for user, item in zip(*test_ui.nonzero()):\n", " estimations.append([user_code_id[user], item_code_id[item],\n", " (train_iu.indptr[item+1]-train_iu.indptr[item])*scaling_factor])\n", "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Self made global average" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "GlobalAvg=[]\n", "avg=np.sum(train_ui)/train_ui.nnz\n", "\n", "for i in range(train_iu.shape[0]):\n", " GlobalAvg.append((i, avg))\n", " \n", "k=10\n", "result=[]\n", "\n", "for u in range(train_ui.shape[0]):\n", " user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n", " rec_user=[]\n", " item_pos=0\n", " while len(rec_user)<10:\n", " if GlobalAvg[item_pos][0] not in user_rated:\n", " rec_user.append((item_code_id[GlobalAvg[item_pos][0]], GlobalAvg[item_pos][1]))\n", " item_pos+=1\n", " result.append([user_code_id[u]]+list(chain(*rec_user)))\n", "\n", "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_reco.csv', index=False, header=False)\n", "\n", "\n", "# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking\n", "\n", "estimations=[]\n", "\n", "for user, item in zip(*test_ui.nonzero()):\n", " estimations.append([user_code_id[user], item_code_id[item], avg])\n", "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_estimations.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...11121314151617181920
0153.529975103.529975253.529975323.52997533...443.529975463.529975503.529975523.529975553.529975
1213.52997523.52997533.52997543.5299755...63.52997573.52997583.52997593.529975113.529975
\n", "

2 rows × 21 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 11 \\\n", "0 1 5 3.529975 10 3.529975 25 3.529975 32 3.529975 33 ... 44 \n", "1 2 1 3.529975 2 3.529975 3 3.529975 4 3.529975 5 ... 6 \n", "\n", " 12 13 14 15 16 17 18 19 20 \n", "0 3.529975 46 3.529975 50 3.529975 52 3.529975 55 3.529975 \n", "1 3.529975 7 3.529975 8 3.529975 9 3.529975 11 3.529975 \n", "\n", "[2 rows x 21 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(result)[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Project task 1 - self made top rated" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# project task 1: implement TopRated\n", "# Implement recommender system which will recommend movies (which user hasn't seen) with the highest average rating\n", "# The output should be saved in 'Recommendations generated/ml-100k/Self_TopRated_reco.csv'\n", "# and 'Recommendations generated/ml-100k/Self_TopRated_estimations.csv'" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/michal/anaconda3/lib/python3.6/site-packages/numpy/core/fromnumeric.py:3335: RuntimeWarning: Mean of empty slice.\n", " out=out, **kwargs)\n", "/home/michal/anaconda3/lib/python3.6/site-packages/numpy/core/_methods.py:161: RuntimeWarning: invalid value encountered in double_scalars\n", " ret = ret.dtype.type(ret / rcount)\n" ] } ], "source": [ "TopRated=[]\n", "train_transposed=train_ui.transpose().tocsr()\n", "for i in range(train_iu.shape[0]):\n", " average = np.mean(train_transposed.data[train_transposed.indptr[i]:train_transposed.indptr[i+1]])\n", " TopRated.append((i, average))\n", "#print(TopRated)\n", "TopRated.sort(key=lambda x: x[1], reverse=True)\n", "\n", "k=10\n", "result=[]\n", "\n", "for u in range(train_ui.shape[0]):\n", " user_rated=train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]\n", " rec_user=[]\n", " item_pos=0\n", " while len(rec_user)<10:\n", " if TopRated[item_pos][0] not in user_rated:\n", " rec_user.append((item_code_id[TopRated[item_pos][0]], TopRated[item_pos][1]))\n", " item_pos+=1\n", " result.append([user_code_id[u]]+list(chain(*rec_user)))\n", " \n", "(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)\n", "\n", "estimations=[]\n", "\n", "for user, item in zip(*test_ui.nonzero()):\n", " avg = np.mean(train_iu.data[train_iu.indptr[item]:train_iu.indptr[item+1]])\n", " estimations.append([user_code_id[user], item_code_id[item], avg])\n", "(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...11121314151617181920
018145.011225.011895.012015.0408...3184.4853564834.4666675134.4000006034.392638504.385106
121195.08145.011225.011895.01201...1144.5090914084.5000001694.4945053184.4853564834.466667
\n", "

2 rows × 21 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 11 12 \\\n", "0 1 814 5.0 1122 5.0 1189 5.0 1201 5.0 408 ... 318 4.485356 \n", "1 2 119 5.0 814 5.0 1122 5.0 1189 5.0 1201 ... 114 4.509091 \n", "\n", " 13 14 15 16 17 18 19 20 \n", "0 483 4.466667 513 4.400000 603 4.392638 50 4.385106 \n", "1 408 4.500000 169 4.494505 318 4.485356 483 4.466667 \n", "\n", "[2 rows x 21 columns]" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(result)[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Self-made baseline" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "class selfBaselineUI():\n", " \n", " def fit(self, train_ui):\n", " self.train_ui=train_ui.copy()\n", " self.train_iu=train_ui.transpose().tocsr()\n", " \n", " result=self.train_ui.copy()\n", " \n", " self.row_means=np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)\n", " \n", " # in csr format after addition or multiplication 0 entries \"disappear\" - so some workaraunds are needed \n", " # (other option is to define addition/multiplication in a desired way)\n", " row_means=self.row_means.copy()\n", " \n", " max_row_mean=np.max(row_means)\n", " row_means[row_means==0]=max_row_mean+1\n", " to_subtract_rows=sparse.diags(row_means)*result.power(0)\n", " to_subtract_rows.sort_indices() # needed to have valid .data\n", " \n", " subtract=to_subtract_rows.data\n", " subtract[subtract==max_row_mean+1]=0\n", " \n", " result.data=result.data-subtract\n", "# we can't do result=train_ui-to_subtract_rows since then 0 entries will \"disappear\" in csr format\n", " self.col_means=np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\\\n", " out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr)!=0) # handling items without ratings\n", " \n", " # again - it is possible that some mean will be zero, so let's use the same workaround\n", " col_means=self.col_means.copy()\n", " \n", " max_col_mean=np.max(col_means)\n", " col_means[col_means==0]=max_col_mean+1\n", " to_subtract_cols=result.power(0)*sparse.diags(col_means)\n", " to_subtract_cols.sort_indices() # needed to have valid .data\n", " \n", " subtract=to_subtract_cols.data\n", " subtract[subtract==max_col_mean+1]=0\n", " \n", " result.data=result.data-subtract\n", "\n", " return result\n", " \n", " \n", " def recommend(self, user_code_id, item_code_id, topK=10):\n", " estimations=np.tile(self.row_means[:,None], [1, self.train_ui.shape[1]]) +np.tile(self.col_means, [self.train_ui.shape[0], 1])\n", " \n", " top_k = defaultdict(list)\n", " for nb_user, user in enumerate(estimations):\n", " \n", " user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n", " for item, score in enumerate(user):\n", " if item not in user_rated:\n", " top_k[user_code_id[nb_user]].append((item_code_id[item], score))\n", " result=[]\n", " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", " for uid, item_scores in top_k.items():\n", " item_scores.sort(key=lambda x: x[1], reverse=True)\n", " result.append([uid]+list(chain(*item_scores[:topK])))\n", " return result\n", " \n", " def estimate(self, user_code_id, item_code_id, test_ui):\n", " result=[]\n", " for user, item in zip(*test_ui.nonzero()):\n", " result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])\n", " return result" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data:\n" ] }, { "data": { "text/plain": [ "matrix([[3, 4, 0, 0, 5, 0, 0, 4],\n", " [0, 1, 2, 3, 0, 0, 0, 0],\n", " [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "After subtracting rows and columns:\n" ] }, { "data": { "text/plain": [ "matrix([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n", " [ 0. , -0.5, 0. , 0. , 0. , 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Recommend best unseen item:\n" ] }, { "data": { "text/plain": [ "[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Print estimations on unseen items:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemest_score
00604.0
110403.0
22003.0
320204.0
420704.0
\n", "
" ], "text/plain": [ " user item est_score\n", "0 0 60 4.0\n", "1 10 40 3.0\n", "2 20 0 3.0\n", "3 20 20 4.0\n", "4 20 70 4.0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", "toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, names=['user', 'item', 'rating', 'timestamp'])\n", "\n", "toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \\\n", "toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)\n", "\n", "print('Training data:')\n", "display(toy_train_ui.todense())\n", "\n", "model=selfBaselineUI()\n", "print('After subtracting rows and columns:')\n", "display(model.fit(toy_train_ui).todense())\n", "\n", "print('Recommend best unseen item:')\n", "display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))\n", "\n", "print('Print estimations on unseen items:')\n", "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n", "estimations.columns=['user', 'item', 'est_score']\n", "display(estimations)\n", "\n", "top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))\n", "\n", "top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)\n", "\n", "estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))\n", "estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "model=selfBaselineUI()\n", "model.fit(train_ui)\n", "\n", "top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))\n", "\n", "top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)\n", "\n", "estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))\n", "estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# project task 2: implement self-made BaselineIU" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Implement recommender system which will recommend movies (which user hasn't seen) which is similar to BaselineUI\n", "# but first subtract col means then row means\n", "# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'\n", "# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "class selfBaselineIU():\n", " def fit(self, train_ui):\n", " self.train_ui = train_ui.copy()\n", " self.train_iu = train_ui.transpose().tocsr()\n", "\n", " result = self.train_ui.copy()\n", "\n", " # we can't do result=train_ui-to_subtract_rows since then 0 entries will \"disappear\" in csr format\n", " self.col_means = np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),\n", " out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr) != 0) # handling items without ratings\n", "\n", " # again - it is possible that some mean will be zero, so let's use the same workaround\n", " col_means = self.col_means.copy()\n", "\n", " max_col_mean = np.max(col_means)\n", " col_means[col_means == 0] = max_col_mean+1\n", " to_subtract_cols = result.power(0)*sparse.diags(col_means)\n", " to_subtract_cols.sort_indices() # needed to have valid .data\n", "\n", " subtract = to_subtract_cols.data\n", " subtract[subtract == max_col_mean+1] = 0\n", "\n", " result.data = result.data-subtract\n", "\n", " self.row_means = np.asarray(result.sum(axis=1).ravel())[\n", " 0]/np.diff(result.indptr)\n", "\n", " # in csr format after addition or multiplication 0 entries \"disappear\" - so some workaraunds are needed\n", " # (other option is to define addition/multiplication in a desired way)\n", " row_means = self.row_means.copy()\n", "\n", " max_row_mean = np.max(row_means)\n", " row_means[row_means == 0] = max_row_mean+1\n", " to_subtract_rows = sparse.diags(row_means)*result.power(0)\n", " to_subtract_rows.sort_indices() # needed to have valid .data\n", "\n", " subtract = to_subtract_rows.data\n", " subtract[subtract == max_row_mean+1] = 0\n", "\n", " result.data = result.data-subtract\n", "\n", " return result\n", "\n", " def recommend(self, user_code_id, item_code_id, topK=10):\n", " estimations = np.tile(self.row_means[:, None], [\n", " 1, self.train_ui.shape[1]]) + np.tile(self.col_means, [self.train_ui.shape[0], 1])\n", "\n", " top_k = defaultdict(list)\n", " for nb_user, user in enumerate(estimations):\n", "\n", " user_rated = self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]\n", " for item, score in enumerate(user):\n", " if item not in user_rated:\n", " top_k[user_code_id[nb_user]].append(\n", " (item_code_id[item], score))\n", " result = []\n", " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", " for uid, item_scores in top_k.items():\n", " item_scores.sort(key=lambda x: x[1], reverse=True)\n", " result.append([uid]+list(chain(*item_scores[:topK])))\n", " return result\n", "\n", " def estimate(self, user_code_id, item_code_id, test_ui):\n", " result = []\n", " for user, item in zip(*test_ui.nonzero()):\n", " result.append([user_code_id[user], item_code_id[item],\n", " self.row_means[user]+self.col_means[item]])\n", " return result" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Ready-made baseline - Surprise implementation" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Estimating biases using als...\n" ] } ], "source": [ "import surprise as sp\n", "import time\n", "\n", "# Based on surprise.readthedocs.io\n", "def get_top_n(predictions, n=10):\n", " \n", " # Here we create a dictionary which items are lists of pairs (item, score)\n", " top_n = defaultdict(list)\n", " for uid, iid, true_r, est, _ in predictions:\n", " top_n[uid].append((iid, est))\n", " \n", " result=[]\n", " # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)\n", " for uid, user_ratings in top_n.items():\n", " user_ratings.sort(key=lambda x: x[1], reverse=True)\n", " result.append([uid]+list(chain(*user_ratings[:n]))) \n", " return result\n", "\n", "\n", "reader = sp.Reader(line_format='user item rating timestamp', sep='\\t')\n", "trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)\n", "trainset = trainset.build_full_trainset() # -> it is needed for using Surprise package\n", "\n", "testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)\n", "testset = sp.Trainset.build_testset(testset.build_full_trainset())\n", "\n", "algo = sp.BaselineOnly()\n", "# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})\n", "# observe how bad results gives above algorithm\n", "# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1\n", "\n", "algo.fit(trainset)\n", "\n", "antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n", "predictions = algo.test(antitrainset)\n", "\n", "top_n = get_top_n(predictions, n=10)\n", "\n", "top_n=pd.DataFrame(top_n)\n", "\n", "top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE: 0.9495\n", "MAE: 0.7525\n" ] }, { "data": { "text/plain": [ "0.7524871012820799" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Compute RMSE on testset using buildin functions\n", "predictions = algo.test(testset)\n", "sp.accuracy.rmse(predictions, verbose=True)\n", "\n", "# Let's also save the results in file\n", "predictions_df=[]\n", "for uid, iid, true_r, est, _ in predictions:\n", " predictions_df.append([uid, iid, est])\n", " \n", "predictions_df=pd.DataFrame(predictions_df)\n", "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)\n", "\n", "sp.accuracy.mae(predictions, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Let's compare with random" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE: 1.5250\n", "MAE: 1.2234\n" ] }, { "data": { "text/plain": [ "1.2233519309444567" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# in surprise random is an algorithm predicting random value regarding to normal distribution estimated from train set\n", "algo = sp.NormalPredictor()\n", "algo.fit(trainset)\n", "\n", "antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set\n", "predictions = algo.test(antitrainset)\n", "\n", "top_n = get_top_n(predictions, n=10)\n", "\n", "top_n=pd.DataFrame(top_n)\n", "\n", "top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)\n", "\n", "# Compute RMSE on testset using buildin functions\n", "predictions = algo.test(testset)\n", "sp.accuracy.rmse(predictions, verbose=True)\n", "\n", "# Let's also save the results in file\n", "predictions_df=[]\n", "for uid, iid, true_r, est, _ in predictions:\n", " predictions_df.append([uid, iid, est])\n", " \n", "predictions_df=pd.DataFrame(predictions_df)\n", "predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)\n", "\n", "sp.accuracy.mae(predictions, verbose=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 4 }