workshops_recommender_systems/.ipynb_checkpoints/P0. Data preparation-checkpoint.ipynb

696 lines
74 KiB
Plaintext
Raw Normal View History

2020-06-08 17:39:37 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building train and test sets"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'sklearn'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-1-f3289905bb9e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mrandom\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mevaluation_measures\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mev\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Documents\\workshop\\workshops_recommender_systems\\evaluation_measures.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmath\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpreprocessing\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnormalize\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mtqdm\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mdatetime\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdate\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'"
]
}
],
"source": [
"# if you don't have some library installed try using pip or pip3 to install it - you can do it from the notebook\n",
"# example: !pip install tqdm\n",
"# also on labs it's better to use python3 kernel - ipython3 notebook\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sparse\n",
"import time\n",
"import random\n",
"import evaluation_measures as ev\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# df = pd.DataFrame(np.loadtxt( './Datasets/ml-1m.dat',delimiter='::'))\n",
"df=pd.read_csv('./Datasets/ml-100k/u.data',delimiter='\\t', header=None)\n",
"df.columns=['user', 'item', 'rating', 'timestamp']\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"train, test = train_test_split(df, test_size=0.2, random_state=30)\n",
"\n",
"train.to_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, index=False)\n",
"test.to_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Interactions properties"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### How data looks like?"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>196</td>\n",
" <td>242</td>\n",
" <td>3</td>\n",
" <td>881250949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>186</td>\n",
" <td>302</td>\n",
" <td>3</td>\n",
" <td>891717742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>22</td>\n",
" <td>377</td>\n",
" <td>1</td>\n",
" <td>878887116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>244</td>\n",
" <td>51</td>\n",
" <td>2</td>\n",
" <td>880606923</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>166</td>\n",
" <td>346</td>\n",
" <td>1</td>\n",
" <td>886397596</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp\n",
"0 196 242 3 881250949\n",
"1 186 302 3 891717742\n",
"2 22 377 1 878887116\n",
"3 244 51 2 880606923\n",
"4 166 346 1 886397596"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample properties"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We have 943 users, 1682 items and 100000 ratings.\n",
"\n",
"Average number of ratings per user is 106.04. \n",
"\n",
"Average number of ratings per item is 59.453.\n",
"\n",
"Data sparsity (% of missing entries) is 6.3047%.\n"
]
}
],
"source": [
"users, items, ratings=len(set(df['user'])), len(set(df['item'])), len(df)\n",
"\n",
"print('We have {} users, {} items and {} ratings.\\n'.format(users, items, ratings))\n",
"\n",
"print('Average number of ratings per user is {}. \\n'.format(round(ratings/users,2)))\n",
"print('Average number of ratings per item is {}.\\n'.format(round(ratings/items,4)))\n",
"print('Data sparsity (% of missing entries) is {}%.'.format(round(100*ratings/(users*items),4)))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6gAAAHvCAYAAABZg/LVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3X2cFXXd//H3hzvTsERFA9TWzBvkxoU9FKYZ5WWa3KSZib80zQKzvEpNryhC8KbC1CukrgxIL7AbrEsvkRszb9JU1HQXgcuIUnRVbkQE7xBQ0c/vj5ndPbOcnXNgd2dmmdfz8TiPPWdmzpz3+ewsnM+Zme+YuwsAAAAAgLR1SjsAAAAAAAASDSoAAAAAICNoUAEAAAAAmUCDCgAAAADIBBpUAAAAAEAm0KACAAAAADKBBhUAEmJmk8zMw9uwtPN0NGa2p5n9yMyeMLM3zOy9sJavpp2tvZjZzKJtpirtPAAAtDcaVABtrugDdcPt4xU8Z3TR8pMSiIkOxMz2lVQn6QeSqiV1l2SphtoBZjYs/KJiEg0nAADb6pJ2AAC58GNJx6YdAh3aeElV4f2Fkn4raa0kl/ROSpl2xDBJE8P790uqTysIAABZRIMKIAmfMbN/c/d70g6CDuvE8Ocrkj7r7pvSDJMUdz9b0tkpxwAAIDEc4gugPRU3ET9JLQV2BvuHP/+Zl+YUAIA8okEF0J5ekHRbeL9gZl9IMww6tG7hz7dSTQEAANoVDSqA9vZDSe+F9680s847uqKiQZTub+2yZnZ/wzLh405mdk44/SUze9PM/s/MxpvZ7s2e+yEzu8LMlprZ62b2mpk9YGan7cB7OtHMbjezlWb2VvhztpkduR3r2DvM+aCZvWhmb5vZuvDxfzTPX+L59WEt6sPH7zOzb5vZQ2a2Nhwt9/7tfW9F69/NzC40s/vCfG+FNX7IzL5vZh9s4XmNox4XTf5UiUG4hm1nnrOLnnt2OK1gZr82s6fD331kvRb4ZDiK8F/MbHX4Pt40s2fN7GYzG2lmJQduangvajr/VJLuK/Fe7m/2vNhRfMNBlyKDi5nZAWZ2rZktD/O9amYPm9k3zayiU3vM7GQzWxD+/reE28hvLRzwrFQNW1jP583sj2b2jJltCte1ysyWmNlvzOzLZtajkkwtrD9SNzPraWaXh3+7r4V/n3VmNs7Mdt2O9Z4Q1v4pC0aM3mRmK8JpR5d57nZvXxVmqnhE50qWteDv/JtmdreZrQm3543h7/oxM7vOzD5nZl3LvNaRZna9mS0Lt7UtZva8mf3BzIaXeW6p7fdQM5tiZv8If3+x2xiAnZC7c+PGjVub3hQMXOOSloePZxZNO7uF54wuWmZSmfXevx0ZSi6rYICahmW6S7qn6HHz2yJJPcLnHSnppZhlr4nJNKlouWGS/itmPe9KmljB+zxb0usx63FJL0o6MmYd9eFy9ZIOlPRkiXWUrXkL6x4qaVWZfC8rOK80rl5xt2Hbmens4u1R0jhJW+PWK+m/K8zyJ0kfaMV7ub/Z82YWzasqsd5hRfMnSTpBwXm6La3/Lkm7xNSmq6Q/xjx/q6TvNq9hifXsKml+he/5gjb4t+Z+SYMkrYx5nackHVhmfT0l3VtB5l9L6tpW21eF7zV2W9jO7eagsB6V/H6qW3iN90uaXcHz50vavYV1NN9+v6Lg1JDm69hmG+PGjdvOe2OQJABJmCjpdAWHaU4ys9+7+9spZyr23wpGGV6o4MP5i5I+LOlb4c9BkqaY2URJf1bwPn4t6SFJb0v6pKQxCgae+66Z/dnd7y7zmt+RdJKC5uzXkpZK2k1Bg3GKgiNcJpnZenf/RakVmNl3JE0JH74l6VZJD0paL2nPcF2fl7SvpHvMbIi7L4vJtIuk/5XUL3xvt0pareBD+75l3k+pfNWS/qKgWZGkJyT9XtLzkj4k6UuSjpK0l6T5ZvZZd7+/aBU3S1oc3m84VPzvCvbKF3tye7MV+ZKkz0l6TdIsBZeyeVfSEeG0BrsqqPFfJT0maYWkNxXU5hBJZ6qp5jcp+N0Wa3gvoyU17GmfUCL7y614L9WSLlFw+Z1pkh4JMxckfUNBQ3GcghGRL21hHdMlnRre36Kg0XlEQU0Kkr4m6RpJt5TJ8mNJDXvP1igYdfnvkjYq+ELoowq+7Dmm8rcX64MKtt0+CprwOZI2SDo0zHxA+Jr3mlm1u7/efAVmtqeC93pQOGmZpP+R9C8FR4H0U9Bw7heus4vKD2BV6faVmHAv//8oqIcUbJe3SHpGwYjYPST1lfRpBdtUqXXsouBLvaHhpOcVNKt/V7DNfVRBs3mogu1gjpkd5+7vlVhdg6MUbJvvSrpBwb/HW8J1vLgDbxVAR5V2h8yNG7ed76amb72XF02bWjT92yWek+YeVJf0gxLL9FTT3r+tCj7IvSRpYIllzyxa1x0tvOakZq/5pKR9Six3koIPiq6gCTqgxDI1Rcssl3RwC685XEET7ZL+1sIy9c1yXdgG20AnRffETpHUqcRyE4qWeV7S+1r7u68g29nN3u8/JPUu85xPStojZv77Fd3z+KkKtoFhFWSdWbR8VYn5w5q9l+dKbQuSPla0vWxQib2oCr6kaVjPOkn9SyxTVWJ7ObvZMp0lvRrOqy+1jTf7GzusFb/L5nvavlVime6S7ita5uctrOu2omV+2ML22l3Bl1QNy53QFttXhe81dluodFkFXzQ0zJsnqXPMeg6XtFeJ6T8rWsf1krqVWKargqa8YblvVLD9rpF0eGtrxY0bt4594xxUAEm5UkGzJUnjzax7mmGa+bO7/7j5RHdfJ6lh72VnBXs9znf3pSWW/Y2CQ+Yk6dgKzvXbKuk0d3+pxLrmSLo2fLibpPNKPH+igj04b0ka4e5PlVhG7r5A0uTw4cfM7BNlct3m7j8rs0wlRijY4yRJjypoerfZe+LuV0haED7cX9IZbfDa28MljXb31bELuT/o7q/GzH9TwV61hm38zLaLuF3OKLUtuPtjkv4QPuyhoGFt7sKi++e7+zZ7pt29XuX3GvZUsEdTkm4vtY0XrW+duy8vs75K3ezu/1XiNTYq+AKsYa/p18xsj+JlzGywmvZ63+juV7awvTasq2Hv50VlMlW0fSXso0X3b3T3d1ta0N2Xufv64mlm1kvSN8OH97r7eV7iiBh3f0fS1xXsmZXK10qSzvX4ozwA5AANKoBEhB9SGw5H3UfSBSnGaa7kIbShhUX31yr+0MaHwp/d1HSYYEv+7O5/j5k/RcGhbpJ0cvGMcFCZhsMnb3f3p8u81m+L7n+2zLI/LzO/UsUjNl/t7h6z7OSi+0mP9Pyguy9pixW5+xuS/i98+PG2WOd2esLdH4yZ/5ei+4cXzzCz96lp21ij4BDQkjw4DHubL2mKbG7pddrZtS3NcPe1avo72FXBodjFir9QuCbuRdz9FUl3hA+PCQ93bUmbbV9tqPgyTf1aXKplX1LTqNot1lxqbFIbvhg5uMzgTs8p2KMLIOc4BxVAkq5WsDdwT0kXm9kv3X1Dypkk6W8x89YW3a8rtVelhWXLjUx6b9xMd3/RzP4hqb+kQ8zsg+7esNfmKDV9wbjFzJqf79hc8SicfWOWe1fBOXhtoWEPnUsqdz7uw2o6NzHpxi6uoYsIG5EvKTiv9wgF5+V2V3DOZ3P7tUm67fNomfmriu433z6PUNN28tcy27kUHCI/sNQMd3/NzB5TsA38m5ndpuCLjwfDhqU9vKbg/M44f1HTnr8hCs4LbvDJ8Ofbkg41s0PLrGuXop8fUXAYbykVb18JekjBlwi7SpoYfuE1q9SRIS34ZNH9fSr496d4W+ur4LDvkrnKfJEFICdoUAEkJvzgepWkqxQcAjhO0n+km0pSMKhQS4qvuxm3XPNl31dm2XJ7PRuW6a+gAfqQmg4rrCpa5ivhrVJxjfN6d9+yHeuK0yv8+WK4Z7FF7v6ema1Q0CTtaWbdSh0y2E5WlV9EMrMBCgaNOrjC9X5ghxPtuHIDLMVtn72L7j+j8sot8y0FX8J8QMGhsydJetPM/qagQbpH0sIKGuF
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"items_per_user=df.groupby(['user']).count()['rating']\n",
"\n",
"plt.figure(figsize=(16,8))\n",
"plt.hist(items_per_user, bins=100)\n",
"\n",
"# Let's add median\n",
"t=items_per_user.median()\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.9, 'Median: {:.0f}'.format(t))\n",
"\n",
"# Let's add also some percentiles\n",
"t=items_per_user.quantile(0.25)\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.95, '25% quantile: {:.0f}'.format(t))\n",
"\n",
"t=items_per_user.quantile(0.75)\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.05, plt.ylim()[1]*0.95, '75% quantile: {:.0f}'.format(t))\n",
"\n",
"plt.title('Number of ratings per user', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6gAAAHvCAYAAABZg/LVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3XmcVNWZ//HvwxYXXEARWVQcRGNQbOnWYEwMMeNAFHBBE5y4EB2aREk0ZtHEHwJqMjrRCaLGsRkVyDgaR2PCYsCVuMStG8EYV1AMS4uIoiAoQZ/fH/c2XTTVVQVcTtfBz/v1qldX3XvufU49dWn6qXPvuebuAgAAAACgpbVq6Q4AAAAAACBRoAIAAAAAygQFKgAAAACgLFCgAgAAAADKAgUqAAAAAKAsUKACAAAAAMoCBSoABGBmY83M00f/lu5PbMyso5n9wsyeM7NVZvZpmsuVLd23bcXMJuUcMz1auj9oOWbWP+dYGNvS/QGAbalNS3cAwPbFzJreXLmfuz9dZJthku5IX45z97Hbom+Ik5l1lvSUpB4t3JWtkn4x0T99OcndF7ZYZ7DdMbMLJe0uaaW7j2/p/gDAlqJABbCt/VLS11u6E4japWosTp+Q9D+SlklySf9ooT5tif6SxqTPZ0ta2FIdwXbpQkn7SXpTEgUqgGhRoALY1o41s3929wdbuiOI1vHpz/ck/Yu7r2nJzoTi7sMlDW/hbqAMuPtsSdbS/QCAELgGFcC2kltE/HuL9QLbg33Sn698VopTAAA+qyhQAWwriyTdmz6vMrNTWrIziFq79OfHLdoLAACwzVGgAtiW/p+kT9PnV5pZ6y3dUc4MlrO3tq2ZzW5ok75uZWbnpMvfNrMPzeyvZnapme3SZNu9zewKM3vezD4ws/fN7FEz+9YWvKfjzeyPZrbYzD5Of95hZkdtxj72TPv5mJm9ZWbrzGx5+vqnTfufZ/uFaS4Wpq93MLMfmNnjZrYsnS139ua+t5z972RmPzSzR9L+fZzm+HEz+5mZ7dbMdhtmPc5Z/NWcz3aLZkQ2s+E52w5Pl1WZ2X+b2fz0s99ov5b4SjqL8MNmtjR9Hx+a2RtmdqeZDTazvKdgNrwXNV5/KkmP5Hkvs5tsV3AW33wzu5rZvmZ2rZm9nPZvpZn9xczOM7OSLusxs5PNbEb6+X+UHiP/Y2ZfbC6HzeznRDO7y8xeN7M16b6WmNk8M/utmX3bzDqU0qdm9r9R3sysk5ldnv7bfT/991lnZpeY2Y6bsd+Bae5fs2TG6DVmtiBd9uUi22728VVin5qdxbfh37CS608lab88x1azn5WZtTOzc81sqpktSj+nlZb8jrs237HXZPtNjlMzG2pmf0r/raxNj8dfmVmnJtvuZmY/ST+n98xstZk9a2bfNTP+RgU+q9ydBw8ePDJ7KJm4xiW9nL6elLNseDPbDMtpM7bIfmdvRh/ytlUyQU1Dm/aSHsx53fQxR1KHdLujJL1doO01Bfo0Nqddf0k3FtjPJ5LGlPA+h0v6oMB+XNJbko4qsI+FabuFkvaX9EKefRTNeTP77idpSZH+vaPkutJC+Sr06L+ZfRqeezxKukTS+kL7lXRbiX35k6Rdt+K9zG6y3aScdT3y7Ld/zvqxkgYquU63uf3fL+lzBXLTVtJdBbZfL+lHTXOYZz87Sppe4nu+MIPfNbMlHS5pcYE4r0nav8j+Okl6qIQ+/7ektlkdXyW+140+62b+DRd75PusqiS9XmS7jyWNLNC33OO0p5JJzJrb10JJ+6XbHSRpfoG2d0myLT0+ePDgEe+DSZIAbGtjJJ2u5DTNsWb2v+6+roX7lOs2JbMMP6HkD6K3lIxEnJ/+PFzSeDMbI2mWkvfx35Iel7RO0lckjVAy6dyPzGyWuz9QJOYFkk5SUpz9t6TnJe2kpMAYquTslrFmtsLdb8i3AzO7QI0zdX4s6R5Jj0laIaljuq8TJXWW9KCZHeHuLxbo0+ck/V5S7/S93SNpqZI/2jsXeT/5+lch6WElxYokPSfpfyX9XdLekr4p6WhJe0iabmb/4slEMA3ulDQ3fd5wqvjflIzK53phc/uW45uSviHpfUmTJdUp+XLgsHRZgx2V5PjPkp6RtEDSh0pyc6CkM9WY8ylKPttcDe9lmKSGkfbRefr+zla8lwpJP1Eykc7Nkp5M+1wl6buSdpZ0nJIZkS9rZh81kk5Ln3+kpPB4UklOqiSdK+kaSXcX6csvJZ2QPq9XUrD8TdJqJV8IHaDky55jSn97Be2m5NjtpqQI/4Okd5UUQOdK2jeN+ZCZVbj7B013YGYdlbzXnumiFyX9n6RXlZwF0ltJwdk93WcbFZ/AqtTja2tVK/n9UaPkmFyeLmtqTu4LS87UeDDdVkqK8z8puTxjByWf0Vnp+v8ys4/dfVKRvlwl6VQlv9P+R8mMwnsr+R15iJLfqVPM7ERJDyj5zH6X9mOVpL5KfvfurORYvF/J70gAnyUtXSHz4MFj+3qo8dvvl3OWTchZ/oM827TkCKpL+nmeNp3UOPq3XkmB8bakPnnanpmzr/uaiTm2ScwXJO2Vp91JSm6d4kqKoH3ztKnMafOypF7NxDxBSRHtkp5ups3CJv36YQbHQCttPBI7XlKrPO1G57T5u6QdtvazL6Fvw5u835ckdS2yzVck7V5g/c7aeOTxqyUcA/1L6OuknPY98qzv3+S9vJnvWJB0ZM7x8q7yjKIq+ZKmYT/LJR2Sp02PPMfL8CZtWktaqcbRsk2O8Sb/xj6/FZ9l0xG38/O0aS/pkZw21zezr3tz2vy/Zo7X9kq+pGpoNzCL46vE95r7WY9tpk3DZ7OwhP3tkv6bcyVfHHyjmXYHpMdVQ7s9ixynLum/muZPyZc883La1CqZSO/YPPs7RsmXAi7pxa3NHQ8ePOJ7cH4/gBCuVFJsSdKlZta+JTvTxCx3/2XThe6+XFLD6GVrJaMeo9z9+Txtf6vkFEJJ+noJ1/qtl/Qtd387z77+IOna9OVOkr6XZ/sxSkZwPpY0yN1fy9NG7j5DyYiGJB1pZl8q0q973f3XRdqUYpCSESdJekpJ0ftp00bufoWkGenLfSSdkUHszeGShrn70oKN3B9z95UF1n+oZFSt4Rg/M7subpYz8h0L7v6MklEqSeqgpGBt6oc5z0e5+yYj0+6+UMVHDTspGdGUpD/mO8Zz9rfc3V8usr9S3enuN+aJsVrJF2ANo6bnmtnuuW3MrK8aR71vdfcrmzleG/bVMPp5UZE+lXR8tZARapwd+3vu/qd8jdx9vqTvpC93Vv6R2VwvKDl+Nsqfu69V4+8iKfmSbay7P5wn5qNKRlQl6WAz26dpGwDbNwpUANtc+kdqw+moeym5oXy5yHsKbeqJnOfLVPjUxsfTn+3UeJpgc2a5+98KrB+v5FRASTo5d0U6qUzD6ZN/TP+ALOR/cp7/S5G21xdZX6rcGZt/5e5eoG3uH62hZ3p+zN3nZbEjd18l6a/pyy9msc/N9Jy7P1ZgfW4h8IXcFWa2gxqPjXolp7bm5clp2Jt8SZNjbXNxtrFrm1vh7svU+O9gRyWnYufK/ULhmkJB3P09SfelL48xs88VaJ7Z8bUNNLznekm3F2qYFpENRXax3yE3u/v6Ztbl/j79RMmp6M15POd5yOMIQBngGlQAofxKyWhgR0k/NrPfuPu7LdwnSXq6wLplOc/r8o2qNNO22MykDxVa6e5vmdlLSq7ZOtDMdnP3hlGbo9X45eJHZtb0esem2uY8P7hAu0+UXIOXhYYROldynVkhf1HjtYmhC7tCBd1G0kLkm0qu6z1MyXW57ZVc89lU90x6t3meKrJ+Sc7zpsfnYWo8Tv5c5DiXklPk++Rb4e7vm9kzSo6Bfzaze5V88fGYu/+jyH631PtKru8s5GFJ56XPj1ByXXCDr6Q/10k6yMwOKrKvz+X8/Cclp/HmU/LxFZIlM2c3fH71koZY/gmoc61Ofxb6HSKV/vv0lZzfacXabvFMzwDiRIEKIIj0D9erJV2t5BTASyT9tGV7JSmZVKg5uffdLNSuadsdirQ
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"items_per_user=df.groupby(['item']).count()['rating']\n",
"\n",
"plt.figure(figsize=(16,8))\n",
"plt.hist(items_per_user, bins=100)\n",
"\n",
"# Let's add median\n",
"t=items_per_user.median()\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.9, 'Median: {:.0f}'.format(t))\n",
"\n",
"# Let's add also some percentiles\n",
"t=items_per_user.quantile(0.25)\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.95, '25% quantile: {:.0f}'.format(t))\n",
"\n",
"t=items_per_user.quantile(0.75)\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.05, plt.ylim()[1]*0.95, '75% quantile: {:.0f}'.format(t))\n",
"\n",
"plt.title('Number of ratings per item', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rating\n",
"1 0.06110\n",
"2 0.11370\n",
"3 0.27145\n",
"4 0.34174\n",
"5 0.21201\n",
"Name: user, dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(['rating']).count()['user']/len(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Item attributes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"genres = pd.read_csv('./Datasets/ml-100k/u.genre', sep='|', header=None,\n",
" encoding='latin-1')\n",
"genres=dict(zip(genres[1], genres[0]))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'unknown',\n",
" 1: 'Action',\n",
" 2: 'Adventure',\n",
" 3: 'Animation',\n",
" 4: \"Children's\",\n",
" 5: 'Comedy',\n",
" 6: 'Crime',\n",
" 7: 'Documentary',\n",
" 8: 'Drama',\n",
" 9: 'Fantasy',\n",
" 10: 'Film-Noir',\n",
" 11: 'Horror',\n",
" 12: 'Musical',\n",
" 13: 'Mystery',\n",
" 14: 'Romance',\n",
" 15: 'Sci-Fi',\n",
" 16: 'Thriller',\n",
" 17: 'War',\n",
" 18: 'Western'}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"movies = pd.read_csv('./Datasets/ml-100k/u.item', sep='|', encoding='latin-1', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" <th>21</th>\n",
" <th>22</th>\n",
" <th>23</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Toy%20Story%2...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?GoldenEye%20(...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Four%20Rooms%...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 \\\n",
"0 1 Toy Story (1995) 01-Jan-1995 NaN \n",
"1 2 GoldenEye (1995) 01-Jan-1995 NaN \n",
"2 3 Four Rooms (1995) 01-Jan-1995 NaN \n",
"\n",
" 4 5 6 7 8 9 ... \\\n",
"0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... \n",
"1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... \n",
"2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... \n",
"\n",
" 14 15 16 17 18 19 20 21 22 23 \n",
"0 0 0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 1 0 0 \n",
"2 0 0 0 0 0 0 0 1 0 0 \n",
"\n",
"[3 rows x 24 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies[:3]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"for i in range(19):\n",
" movies[i+5]=movies[i+5].apply(lambda x: genres[i] if x==1 else '')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"movies['genre']=movies.iloc[:, 5:].apply(lambda x: ', '.join(x[x!='']), axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"movies=movies[[0,1,'genre']]\n",
"movies.columns=['id', 'title', 'genres']"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation, Children's, Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>Action, Adventure, Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Get Shorty (1995)</td>\n",
" <td>Action, Comedy, Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Copycat (1995)</td>\n",
" <td>Crime, Drama, Thriller</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id title genres\n",
"0 1 Toy Story (1995) Animation, Children's, Comedy\n",
"1 2 GoldenEye (1995) Action, Adventure, Thriller\n",
"2 3 Four Rooms (1995) Thriller\n",
"3 4 Get Shorty (1995) Action, Comedy, Drama\n",
"4 5 Copycat (1995) Crime, Drama, Thriller"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.to_csv('./Datasets/ml-100k/movies.csv', index=False)\n",
"movies[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Toy example"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.exists('./Datasets/toy-example/'):\n",
" os.mkdir('./Datasets/toy-example/')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"toy_train=pd.DataFrame([[0,0,3,0], [0,10,4,0], [0,40,5,0], [0,70,4,0],\n",
" [10,10,1,0], [10,20,2,0], [10,30,3,0],\n",
" [20,30,5,0], [20,50,3,0], [20,60,4,0]])\n",
"toy_test=pd.DataFrame([[0,60,3,0],\n",
" [10,40,5,0],\n",
" [20,0,5,0], [20,20,4,0], [20,70,2,0]])\n",
"\n",
"toy_train.to_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, index=False)\n",
"toy_test.to_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}