{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Building train and test sets" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# if you don't have some library installed try using pip or pip3 to install it - you can do it from the notebook\n", "# example: !pip install tqdm\n", "# also on labs it's better to use python3 kernel - ipython3 notebook\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import scipy.sparse as sparse\n", "import time\n", "import random\n", "import evaluation_measures as ev\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "\n", "# df = pd.DataFrame(np.loadtxt( './Datasets/ml-1m.dat',delimiter='::'))\n", "df=pd.read_csv('./Datasets/ml-100k/u.data',delimiter='\\t', header=None)\n", "df.columns=['user', 'item', 'rating', 'timestamp']\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "train, test = train_test_split(df, test_size=0.2, random_state=30)\n", "\n", "train.to_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, index=False)\n", "test.to_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Interaction properties" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### What does the data look like?" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratingtimestamp
01962423881250949
11863023891717742
2223771878887116
3244512880606923
41663461886397596
\n", "
" ], "text/plain": [ " user item rating timestamp\n", "0 196 242 3 881250949\n", "1 186 302 3 891717742\n", "2 22 377 1 878887116\n", "3 244 51 2 880606923\n", "4 166 346 1 886397596" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sample properties" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "We have 943 users, 1682 items and 100000 ratings.\n", "\n", "Average number of ratings per user is 106.04. \n", "\n", "Average number of ratings per item is 59.453.\n", "\n", "Data sparsity (% of missing entries) is 93.6953%.\n" ] } ], "source": [ "users, items, ratings=len(set(df['user'])), len(set(df['item'])), len(df)\n", "\n", "print('We have {} users, {} items and {} ratings.\\n'.format(users, items, ratings))\n", "\n", "print('Average number of ratings per user is {}. \\n'.format(round(ratings/users,2)))\n", "print('Average number of ratings per item is {}.\\n'.format(round(ratings/items,4)))\n", "\n", "# Sparsity is the share of EMPTY user-item cells, i.e. 1 - density;\n", "# ratings/(users*items) on its own is the density (share of filled cells).\n", "density=ratings/(users*items)\n", "print('Data sparsity (% of missing entries) is {}%.'.format(round(100*(1-density),4)))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "def plot_rating_counts(counts, title):\n", "    \"\"\"Plot a 100-bin histogram of `counts` with the median and quartiles marked.\n", "\n", "    counts -- pandas Series with one rating count per user (or per item)\n", "    title -- text drawn above the figure\n", "    \"\"\"\n", "    plt.figure(figsize=(16,8))\n", "    plt.hist(counts, bins=100)\n", "\n", "    # Let's add median\n", "    t=counts.median()\n", "    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n", "    plt.text(t*1.1, plt.ylim()[1]*0.9, 'Median: {:.0f}'.format(t))\n", "\n", "    # Let's add also some percentiles\n", "    t=counts.quantile(0.25)\n", "    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n", "    plt.text(t*1.1, plt.ylim()[1]*0.95, '25% quantile: {:.0f}'.format(t))\n", "\n", "    t=counts.quantile(0.75)\n", "    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n", "    plt.text(t*1.05, plt.ylim()[1]*0.95, '75% quantile: {:.0f}'.format(t))\n", "\n", "    plt.title(title, fontsize=30)\n", "    plt.show()\n", "\n", "\n", "ratings_per_user=df.groupby(['user']).count()['rating']\n", "plot_rating_counts(ratings_per_user, 'Number of ratings per user')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# per-item counts get their own name so the per-user Series above is not overwritten\n", "ratings_per_item=df.groupby(['item']).count()['rating']\n", "plot_rating_counts(ratings_per_item, 'Number of ratings per item')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "rating\n", "1 0.06110\n", "2 0.11370\n", "3 0.27145\n", "4 0.34174\n", "5 0.21201\n", "Name: user, dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(['rating']).count()['user']/len(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Item attributes" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "genres = pd.read_csv('./Datasets/ml-100k/u.genre', sep='|', header=None,\n", " encoding='latin-1')\n", "genres=dict(zip(genres[1], genres[0]))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: 'unknown',\n", " 1: 'Action',\n", " 2: 'Adventure',\n", " 3: 'Animation',\n", " 4: \"Children's\",\n", " 5: 'Comedy',\n", " 6: 'Crime',\n", " 7: 'Documentary',\n", " 8: 'Drama',\n", " 9: 'Fantasy',\n", " 10: 'Film-Noir',\n", " 11: 'Horror',\n", " 12: 'Musical',\n", " 13: 'Mystery',\n", " 14: 
'Romance',\n", " 15: 'Sci-Fi',\n", " 16: 'Thriller',\n", " 17: 'War',\n", " 18: 'Western'}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genres" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "movies = pd.read_csv('./Datasets/ml-100k/u.item', sep='|', encoding='latin-1', header=None)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...14151617181920212223
01Toy Story (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?Toy%20Story%2...00011...0000000000
12GoldenEye (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?GoldenEye%20(...01100...0000000100
23Four Rooms (1995)01-Jan-1995NaNhttp://us.imdb.com/M/title-exact?Four%20Rooms%...00000...0000000100
\n", "

3 rows × 24 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 \\\n", "0 1 Toy Story (1995) 01-Jan-1995 NaN \n", "1 2 GoldenEye (1995) 01-Jan-1995 NaN \n", "2 3 Four Rooms (1995) 01-Jan-1995 NaN \n", "\n", " 4 5 6 7 8 9 ... \\\n", "0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... \n", "1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... \n", "2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... \n", "\n", " 14 15 16 17 18 19 20 21 22 23 \n", "0 0 0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 1 0 0 \n", "2 0 0 0 0 0 0 0 1 0 0 \n", "\n", "[3 rows x 24 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies[:3]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# u.item keeps five descriptive columns (0-4) followed by one 0/1 flag column per genre,\n", "# so the flag of genre id g is column g+5. Copy the flags out under their genre names\n", "# instead of overwriting them in place, which made this cell destructive on re-run.\n", "genre_columns={genre_id+5: name for genre_id, name in genres.items()}\n", "genre_flags=movies[list(genre_columns)].rename(columns=genre_columns)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# join the names of the genres flagged with 1, in column order\n", "movies['genre']=genre_flags.apply(lambda flags: ', '.join(flags[flags==1].index), axis = 1)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "movies=movies[[0,1,'genre']]\n", "movies.columns=['id', 'title', 'genres']" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitlegenres
01Toy Story (1995)Animation, Children's, Comedy
12GoldenEye (1995)Action, Adventure, Thriller
23Four Rooms (1995)Thriller
34Get Shorty (1995)Action, Comedy, Drama
45Copycat (1995)Crime, Drama, Thriller
\n", "
" ], "text/plain": [ " id title genres\n", "0 1 Toy Story (1995) Animation, Children's, Comedy\n", "1 2 GoldenEye (1995) Action, Adventure, Thriller\n", "2 3 Four Rooms (1995) Thriller\n", "3 4 Get Shorty (1995) Action, Comedy, Drama\n", "4 5 Copycat (1995) Crime, Drama, Thriller" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.to_csv('./Datasets/ml-100k/movies.csv', index=False)\n", "movies[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Toy example" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import os\n", "# exist_ok=True makes this a no-op when the folder is already there (no separate\n", "# exists() check to race against); makedirs also creates a missing parent folder\n", "os.makedirs('./Datasets/toy-example/', exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "toy_train=pd.DataFrame([[0,0,3,0], [0,10,4,0], [0,40,5,0], [0,70,4,0],\n", " [10,10,1,0], [10,20,2,0], [10,30,3,0],\n", " [20,30,5,0], [20,50,3,0], [20,60,4,0]])\n", "toy_test=pd.DataFrame([[0,60,3,0],\n", " [10,40,5,0],\n", " [20,0,5,0], [20,20,4,0], [20,70,2,0]])\n", "\n", "toy_train.to_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, index=False)\n", "toy_test.to_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 4 }