WSS-project/P0. Data preparation.ipynb

685 lines
71 KiB
Plaintext
Raw Permalink Normal View History

2021-03-20 20:01:22 +01:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building train and test sets"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 1,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
2021-03-23 21:52:46 +01:00
"# if you don't have some library installed try using pip (or pip3) to install it - you can do it from the notebook\n",
"# example: !pip install tqdm\n",
"# also on labs it's better to use python3 kernel - ipython3 notebook\n",
"\n",
2021-03-20 20:01:22 +01:00
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sparse\n",
"import time\n",
"import random\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import helpers\n",
"\n",
"os.makedirs('./Datasets/', exist_ok = True)\n",
"\n",
"helpers.download_movielens_100k_dataset()\n",
"\n",
"df=pd.read_csv('./Datasets/ml-100k/u.data',delimiter='\\t', header=None)\n",
"df.columns=['user', 'item', 'rating', 'timestamp']\n",
"\n",
"train, test = train_test_split(df, test_size=0.2, random_state=30)\n",
"\n",
"train.to_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, index=False)\n",
"test.to_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Interactions properties"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### How data looks like?"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 2,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>196</td>\n",
" <td>242</td>\n",
" <td>3</td>\n",
" <td>881250949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>186</td>\n",
" <td>302</td>\n",
" <td>3</td>\n",
" <td>891717742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>22</td>\n",
" <td>377</td>\n",
" <td>1</td>\n",
" <td>878887116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>244</td>\n",
" <td>51</td>\n",
" <td>2</td>\n",
" <td>880606923</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>166</td>\n",
" <td>346</td>\n",
" <td>1</td>\n",
" <td>886397596</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp\n",
"0 196 242 3 881250949\n",
"1 186 302 3 891717742\n",
"2 22 377 1 878887116\n",
"3 244 51 2 880606923\n",
"4 166 346 1 886397596"
]
},
2021-03-20 20:13:28 +01:00
"execution_count": 2,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample properties"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 3,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We have 943 users, 1682 items and 100000 ratings.\n",
"\n",
2021-03-23 21:52:46 +01:00
"Average number of ratings per user is 106.0445. \n",
2021-03-20 20:01:22 +01:00
"\n",
"Average number of ratings per item is 59.453.\n",
"\n",
2021-03-27 11:58:44 +01:00
"Data sparsity (% of missing entries) is 93.6953%.\n"
2021-03-20 20:01:22 +01:00
]
}
],
"source": [
2021-03-23 21:52:46 +01:00
"users, items, ratings=df['user'].nunique(), df['item'].nunique(), len(df)\n",
2021-03-20 20:01:22 +01:00
"\n",
2021-03-23 21:52:46 +01:00
"print(f'We have {users} users, {items} items and {ratings} ratings.\\n')\n",
2021-03-20 20:01:22 +01:00
"\n",
2021-03-23 21:52:46 +01:00
"print(f'Average number of ratings per user is {round(ratings/users,4)}. \\n')\n",
"print(f'Average number of ratings per item is {round(ratings/items,4)}.\\n')\n",
2021-03-27 11:58:44 +01:00
"print(f'Data sparsity (% of missing entries) is {round(100*(1-ratings/(users*items)),4)}%.')"
2021-03-20 20:01:22 +01:00
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 4,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAHvCAYAAACsfXllAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nOzdeZgcZbn+8fvJAoKgLIaYBGQQWUJImCSNBBGMC8vJJohL+ImAeBL0wEEE0ShCAqIGgSPimkQQUFlUDpgFkUWQXZgJCQdDFIID2QghrNmAwPP7o2omXVM9PZ3MTL01qe/nuvqa7qrq6rufqUn66ap6y9xdAAAAAACE0CN0AAAAAABAcdGUAgAAAACCoSkFAAAAAARDUwoAAAAACIamFAAAAAAQDE0pAAAAACAYmlIAyIiZTTEzj28jQ+fpbsxsJzP7npk9amavmdnbcS1fDp2tq5jZVWXbTF3oPAAAdAWaUgCdruxDdPPtoBqeM75s+SkZxEQ3YmZ9JTVK+rakeknbSbKgoTaDmY2Mv5yYQpMJAECkV+gAAArh+5I+HjoEurVzJNXF9++X9FtJKyS5pDcDZdocIyVNju/fLakpVBAAAPKCphRAFj5mZp9w9ztCB0G3NSr++ZKkI9x9bcgwWXH3kySdFDgGAABdisN3AXSl8sbh+8FSYEuwW/zzn0VpSAEAKAqaUgBdabGkm+L7B5rZp0KGQbe2Vfzz9aApAABAp6MpBdDVviPp7fj+hWbWc3NXVDYQ0t0dXdbM7m5eJn7cw8xOjqc/b2ZrzOxxM/uOmW3f6rnvNbPvmtljZvaqmb1iZveY2Wc34z2NMrM/mdkSM3s9/nmdmR28Cet4j5mdY2b3mtlzZvaGma2MH3/DzLZr5/lNcS2a4sfvMLPTzew+M1sRj3J796a+t7L1b2tmXzOzu+J8r8c1vs/MvmVm727jeS2jFZdN/kiFgbRGbmKek8qee1I8rWRmvzKzp+LffWK9Fjk0Hv33r2a2LH4fa8zs32Z2vZmNrfKaU+L3Mbls8l0V3svdrZ5XdfTdeOCkxABhZvY+M7vUzBbG+V42swfM7L/MrKbTdszsGDObE//+18fbyG8tHrSsUg3bWM8nzez3Zva0ma2N17XUzOab2R/iTDvXkqmN9SfqZmZ9zOwCM/u/+G/zVTNrNLNJZrbNJqz3qLj2T1o00vNaM1sUT/twO8/d5O2rxkw1j8Rcy7Lx3/l/mdntZrY83p5Xx7/rR8zsCjP7jJltVen5Zes52Mx+YWYL4m1tvZk9a2Y3mNnodp5bafvdx8wuM7Mn4t9f1W0MwBbC3blx48atU2+KBp9xSQvjx1eVTTupjeeML1tmSjvrvXsTMlRcVtEgM83LbCfpjrLHrW9zJe0YP+9gSc9XWfaSKpmmlC03UtLPqqznLUmTa3ifJ0l6tcp6XNJzkg6uso6meLkmSXtIerzCOtqteRvrHiFpaTv5XlB0nmi1elW7jdzETCeVb4+SJknaUG29kn5dY5Y/S3pXB97L3a2ed1XZvLoK6x1ZNn+KpKMUnXfb1vpvk7R1ldr0lvT7Ks/fIOms1jWssJ5tJM2u8T2f0Qn/1twtaaikJVVe50lJe7Szvj6S7qwh868k9e6s7avG91p1W9jE7WbPuB61/H7q23iNd0q6vobnz5a0fRvraL39nqDotI/W60htY9y4cduybgx0BCALUyQdp+gQzClmdq27vxE2UsKvFY0OfL+iD+TPSdpd0qnxz6GSLjOzyZL+ouh9/ErSfZLekHSopAmKBo87y8xu9fYHdfqqpKMVNWS/kvSYpG0VNRXHKjqSZYqZrXL3n1ZagZl9VdJl8cO1kv4o6QFJqyS9J17XOEl9Jd1hZge6+4IqmbaW9L+SBsXv7UZJyxR9UO/bzvuplG+opL8qalAk6VFJ10p6VtJ7JX1W0iGSdpY028yOcPe7y1ZxvaR58f3mw8D/oWjve7nHNzVbmc8pqtMrkq5WdNmZtyQdEE9rto2iQ4f/JulhSYskrVFUm70lfUHSTvG6rlH0uy3X/F7Gx68pSedWyP5CB95LvaSzFV0qZ5qkB+PMJUlfVtREHK5oJOPz2ljHdEmfie+vV9TcPKioJiVJX5J0iaJtrZrvS2reS7Zc0WjJ/5C0WtGXQB9Q9AXPYbW/varerWjbHaCo8b5Z0ouS9okzvy9+zTvNrN7dX229AjPbSdF73TOeND9e51OKjvbYX1GTOSBeZy+1PwhVrdtXZszMJP1BUT2kaLv8o6SnFY1kvaOkgZI+qmibqrSOrRV9kTcinrRI0g2SnojX8QFFDebeiraDm83scHd/u8Lqmh2iaNt8S9IViv49Xq/od/jcZrxVAN1J6K6YGzduW95NG7/dXlg27fKy6adXeE7IPaUu6dsVlumjjXv5Nij68Pa8pCEVlv1C2bpuaeM1p7R6zccl7VJhuaMVfbBzRY3P+yosUypb5tFKy8TLjVHUOLukh9pYpqlVrq91wjbQQ8k9rpdJ6lFhuXPLlnlW0js6+ruvIdtJrd7vE5L6t/OcQyXtUGX+O5Xcw/iRGraBkTVkvaps+boK80e2ei/PSNqrwnIfLNteXlSFvaWKvphpXs9KSftXWKauwvZyUqtlekp6OZ7XVGkbb/U3tm8Hfpet96idWmGZ7STdVbbMT9pY103x/LclfbWNZbZT9MVU87qO6oztq8b3WnVbqHVZRf92NM+bJalnlfXsJ2nnCtN/VLaOH0rqVWGZ3ooa8eblvlzD9rtc0n4drRU3bty6341zSgFk5XuKGixJOsfaOc8xY39x99TowO6+UlLzXsqeivZunObuj1VY9jeKDoeTokvgtHckygZJn3P35yus62ZJl8YPt5X0lQrPP0/RnprXJI1x92crvYi7z5Y0NX54kJl9qJ1cN7n7j9pZphZjFO1xlaSHFDW6qb0k7v5dSXPih7tJOr4TXntTuKTx7r6s6kLu97r7y1Xmr1G096x5G/9C50XcJMe7+5OtJ7r7w4r2ZEnRnrAPVnju18run+buqT3Q7t6k9vcO9lG051KS/lRpGy9b30p3X9jO+mp1vbv/rMJrrFb0pVfz3tEvmdkO5cuY2TBt3Lv9I3f/cRt5m9fVvJfzzHYy1bR9ZewDZfevdPe32lrQ3Re4+6ryaWbWT9J/xQ//192/4e4bKjz3TUn/qWgPrNR+rSTpFK9+NAeALRRNKYBMuPsKbTzUdBdJZwSM01rFw2Nj95fdX6Hqhy3eF//cWhsPAWzLX9z9H1XmX6boMDZJOqZ8hpntqI2HRl7n7kvbea3flt0/op1lf9LO/FqVj7R8sbt7lWWnlt3PeoTme919fmesyN1fk/R/8cODOmOdm+hRd7+3yvy/lt3fr3yGmb1DG7eNZYoO76zIo0OsU1/MlFnX1ut0sUvbmhH/+9P8d7CNokNqyzV/ieDV1hOv6yVJt8QPD4sPZW1Lp21fnaj8kkqD2lyqbZ/VxtGwL6m2YNyYNn8Zslc7AzQ9o2jPLYAC4pxSAFm6WNFev50kfd3Mfu7uLwbOJEl/rzJvRdn9xkp7+9pYdsd2XvPOajPd/Tkze0LReWx7m9m73b1578wh2vil4ltm1vr8xdZ6l90fWGW5txSdU9cZmvfEuaTb21n2AW081zDrZq5aE5cQNx+flfRJRXvN+yrKbBUW37VT0m2ah9qZX/7lRevt8wBt3E7uaWc7l6LD34dUmuHur5jZw4q2gU+Y2U2Kvuy4N25SusIris7XrOav2riH70BF5/k2OzT++bKkD0anXVa1ddnP9ys6RLeSmrevDN2n6IuDbSRNjr/kurrSESBtOLTs/q41/PtTvq0NVHRId8Vc7Xx5BWALRlMKIDPxh9WLJF2k6PC+SZK+ETaVpGhgoLaUXxez2nKtl31HO8s+1c785mX2V9T0vFcbDxmsK1vmK6p8eG9bqjXLq9x9/Sasq5p+8c/n4j2IbXL3t81skaLGaCcz28qzGwirvb3MkiQzG6xo4Ke9alzvuzY70eZrb5Ckattn/7L7T6t97S1zqqIvXt6l6LDYoyWtMbO/K2q
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"items_per_user=df.groupby(['user']).count()['rating']\n",
"\n",
"plt.figure(figsize=(16,8))\n",
"plt.hist(items_per_user, bins=100)\n",
"\n",
"# Let's add median\n",
"t=items_per_user.median()\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.9, 'Median: {:.0f}'.format(t))\n",
"\n",
"# Let's add also some percentiles\n",
"t=items_per_user.quantile(0.25)\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.95, '25% quantile: {:.0f}'.format(t))\n",
"\n",
"t=items_per_user.quantile(0.75)\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.05, plt.ylim()[1]*0.95, '75% quantile: {:.0f}'.format(t))\n",
"\n",
"plt.title('Number of ratings per user', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 5,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAHvCAYAAACsfXllAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nOzde5wU5ZX/8e8BREW8oYgwqBhEgyiO0CrG1RCzBhYB7wluvLC6DJto1JgYSfwp4CWriSZeYxxWBXa9xNXVABrwHqPxkhkEY7xCgkFARBTlohL0/P6oGmiGnu4Giqeq8PN+vfpFd9VTdZ4+XTPM6afqKXN3AQAAAACQhlZpdwAAAAAA8MVFUQoAAAAASA1FKQAAAAAgNRSlAAAAAIDUUJQCAAAAAFJDUQoAAAAASA1FKQAEYGZjzMzjR/+0+5M3ZtbBzK4wsxfNbKmZfR7ncknafdtUzGx80THTLe3+ID1m1r/oWBiTdn8AIGlt0u4AgM2LmTW/+XE/d3++wjbDJN0Vvxzr7mM2Rd+QT2bWSdJzkrql3JWNEn8Z0T9+Od7d56TWGWx2zOw8STtIWuLu16bdHwBYHxSlADa1n0r6etqdQK5dpDUF6TOS/kfSQkku6R8p9WlD9Jc0On7+pKQ5aXUEm6XzJO0h6S1JFKUAcoWiFMCmdqSZ/bO7P5p2R5Bbg+J/P5D0DXdfkWZnQnH34ZKGp9wNZIC7PynJ0u4HAGwqXFMKYFMpLhx+mlovsDnYLf739S9KQQoAwBcJRSmATWWupPvj5weZ2fFpdga51jb+99NUewEAADYJilIAm9L/k/R5/PxyM2u9oTsqmnnyyY1ta2ZPNrWJX7cyszPi5e+a2XIze9nM/p+Zbdts213N7DIze8nMPjKzD83sKTP75ga8p0Fm9lsze9vMPo3/vcvMDl2PfexsZheZ2R/M7B0zW2lmi+LXPzKz9hW2nxPnYk78eiszO8fMnjazhfEst0+u73sr2n87M/u+mT0R9+/TOMdPm9mPzWz7FrZbPVtx0eKvFn22GzSTsZkNL9p2eLysYGb/ZWaz4s9+rf1a5PB49t/HzWx+/D6Wm9nfzOxuMxtSJuaY+H2MLlr8RIn38mSz7crOvltqRlYz293MrjGz1+L+LTGzP5rZd82sqkt2zOw4M3sw/vw/iY+R/zGzQ1rKYQv7OcbM7jGzv5rZinhf88xsppn9b9ynnarpUwv7XytvZtbRzC41sz/HP5sfmVmjmY0ys63XY78D49y/adFMzyvMbHa87J8qbLvex1eVfWpx9t2mn2FF15NK0h4ljq0WPysza2tmZ5rZJDObG39OSyz6HXdNqWOv2fbrHKdmdoKZ/S7+Wfk4Ph5/bmYdm227vZldEH9OH5jZMjP7k5n9h5nxNyrwReLuPHjw4JHYQ9HkMy7ptfj1+KJlw1vYZlhRmzEV9vvkevShZFtFk8w0tWkv6dGi180f0yXtGG93qKR3y7S9ukyfxhS16y/ppjL7+UzS6Cre53BJH5XZj0t6R9KhZfYxJ243R9Kekl4usY+KOW9h3/0kzavQv/cUXSdaLl/lHv3Xs0/Di49HSaMkrSq3X0m3V9mX30nabiPey5PNthtftK5bif32L1o/RtJARdfdtrT/hyVtWSY3W0i6p8z2qyT9oHkOS+xna0lTqnzP5yXwu+ZJSQdKertMnDcl7Vlhfx0lPVZFn/9L0hZJHV9Vvte1PusWfoYrPUp9VgVJf62w3aeSRpbpW/Fx2l3RRGQt7WuOpD3i7faRNKtM23sk2YYeHzx48MjXg4mOAGxqYySdrOgUzDFmdqe7r0y3S2u5XdHswM8o+iPoHUUjDmfF/x4o6VozGy1pmqL38V+Snpa0UtLhkkYomjjuB2Y21StP6nSupGMVFWT/JeklSe0UFRUnKDqLZYyZLXb3G0vtwMzO1ZoZNldIulfSHyUtlrRzvK+hkjpJetTMDnL3V8r0aUtJ/yepV/ze7pM0X9Ef6p0qvJ9S/TtQ0uOKChRJelHSnZL+LmlXSd+UdJiknSRNMbNveDSZS5O7Jc2InzedBv4XRaPvxV5e374V+ZaiPH0oaYKkRkVfCBwQL2uytaI/zH8v6QVJsyUtV5SbvSWdKqlDvK+Jij7bYk3vZVgcU5IuLtH39zbivdRKukDRZDi3SHo27nNB0n9I2kbSUYpmMr6khX3USzopfv6JomLjWUU5KUg6U9LVio61cn4q6ej4+QJFRcpfJC1T9CXQXoq+4Dmi+rdX1vaKjt0aRYX3A5LeV1T0nClp9zjmY2ZW6+4fNd+BmXVQ9F67x4tmxvucpehsj/0UFZk18T7bqPIkVNUeXxurTtHvj3pFx+SieFlz04tfWHRGxqPxtq7o99vDir5I2lrRZ3RqvP7XZvapu4+v0JcrJZ2o6Hfa/yiaCXhXRb8j91P0O3WimR0j6RFF+fxN3I+lkvoo+t27jaJj8WFFvyMBbO7Srop58OCxeT205lvu14qWXV+0/JwS26Q5UuqSflKiTUetGeVbpaioeFdS7xJtTy3a10MtxBzTLObLknYp0e5YRbc5cUWFz+4l2hSK2rxYqk3cbrCiwtklPddCmznN+vX9BI6BVlp7xPVaSa1KtLu4qM3fJW21sZ99FX0b3uz9viqpS4VtDpe0Q5n122jtEcavVnEM9K+ir+OL2ncrsb5/s/fylqQeJdodXHS8vK8So6WKvphp2s8iSfuVaNOtxPEyvFmb1pKWaM2o2DrHeLOfsS9vxGfZfGTtrBJt2kt6oqjNDS3s6/54/eeSzm2hTXtFhVvTvgYmcXxV+V6LP+sxLbRp+mzmVLG/beOfOVc0ut7SMbtXfFy5oi8Vdq5wnLqkX6vZz7uiIndmUZsGRV+mHVlif0fEn4NLemVjc8eDB498PDhfH0AIVygqsCTpIqtwnWNg09x9ndmB3X2RpKZRytaKRjfOdveXSrT9b0WnB0rRLXAqnYWyStK33P3dEvt6QNI18ct2kr5TYvtLFI3ULJU02N3/XiqIu09RNHIhSYeY2Vcq9Ot+d/9lhTbVGKxoxFWSnlNU6H7evJG7XybpwfjlbpJOSSD2+nBJw9x9ftlG7n9w9yVl1i9XNHrWdIyfmlwX18sp7v5m84Xu/oKi0ShJ2lFRkdrc94uen+3u64xAu/scVR4d7Kho5FKSflvqGC/a3yJ3f63C/qp1t7vfVCLGMkVfejWNjp5pZjsUtzGzPlozuv1Ld7+uhf427atplPP8Cn2q6vhKyQitmdX6NHf/falG7j5L0r/FL7dR6RHYYi8rOn7W+nl394+15neRJPVVVFw/XiLmU4pGTiWpp5nt1rwNgM0PRSmATc7dF2rNqaa7KLrJe1aUPD029kzR84Uqf9ri0/G/W2rNKYAtmebufymz/lpFp/lJ0nHFK8xsR605NfIud59XIdb/FD3/RoW2N1RYX63imZZ/7u5epm3xH6qhZ2j+g7vPTGJH7r5U0p/jl4cksc/19KK7/6HM+uI//vctXmFmW2nNsTFf0v+2tBOPTrFe54uZIh+3FGcTu6alFfHvn6afg60VnVJbrOlLBC+3n3hfH0h6KH55hJltWaZ5YsfXJtD0nt9w98nlGsaFY1NhXel3yC3uvqqFdcW/Tz9TdJp5S54ueh7yOAKQEq4pBRDKzxWN+nWQ9EMz+5W7v59ynyTp+TLrFhY9byw12tdC2x0rxHys3Ep3f8fMXlV0DdbeZra9uzeNzhymNV8ofmZmza9fbG6Louc9y7T7TNE1dUloGolzRdeNlfNHrbnWMHQxV66IW0tcfHxT0jGKRs07KeqzlWjeNZHerZ/nKqwv/vKi+fF5gNYcJ09VOM6l6PT33qVWuPuHZvaComPgn83sfkVfdvzB3f9RYb8b6kNF12uW87ik78bPD1J0nW+Tw+N/l0g62KzUR7qWLYv+/ZKiU3RLqfr4CsmiGa+bPr+FVfwOkaKfUan87xCp+t+nrxf9TqvUttLvUwCbAYpSAEHEf6xeJekqRaf3jZL0o3R7JSmaGKglxffFLNeuedutKrSdVWF9U5v9FBU9u2r
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
2021-03-26 21:00:52 +01:00
"users_per_item=df.groupby(['item']).count()['rating']\n",
2021-03-20 20:01:22 +01:00
"\n",
"plt.figure(figsize=(16,8))\n",
2021-03-26 21:00:52 +01:00
"plt.hist(users_per_item, bins=100)\n",
2021-03-20 20:01:22 +01:00
"\n",
"# Let's add median\n",
2021-03-26 21:00:52 +01:00
"t=users_per_item.median()\n",
2021-03-20 20:01:22 +01:00
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.9, 'Median: {:.0f}'.format(t))\n",
"\n",
"# Let's add also some percentiles\n",
2021-03-26 21:00:52 +01:00
"t=users_per_item.quantile(0.25)\n",
2021-03-20 20:01:22 +01:00
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.95, '25% quantile: {:.0f}'.format(t))\n",
"\n",
2021-03-26 21:00:52 +01:00
"t=users_per_item.quantile(0.75)\n",
2021-03-20 20:01:22 +01:00
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.05, plt.ylim()[1]*0.95, '75% quantile: {:.0f}'.format(t))\n",
"\n",
"plt.title('Number of ratings per item', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 6,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rating\n",
"1 0.06110\n",
"2 0.11370\n",
"3 0.27145\n",
"4 0.34174\n",
"5 0.21201\n",
"Name: user, dtype: float64"
]
},
2021-03-20 20:13:28 +01:00
"execution_count": 6,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(['rating']).count()['user']/len(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Item attributes"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 7,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"genres = pd.read_csv('./Datasets/ml-100k/u.genre', sep='|', header=None,\n",
" encoding='latin-1')\n",
"genres=dict(zip(genres[1], genres[0]))"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 8,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'unknown',\n",
" 1: 'Action',\n",
" 2: 'Adventure',\n",
" 3: 'Animation',\n",
" 4: \"Children's\",\n",
" 5: 'Comedy',\n",
" 6: 'Crime',\n",
" 7: 'Documentary',\n",
" 8: 'Drama',\n",
" 9: 'Fantasy',\n",
" 10: 'Film-Noir',\n",
" 11: 'Horror',\n",
" 12: 'Musical',\n",
" 13: 'Mystery',\n",
" 14: 'Romance',\n",
" 15: 'Sci-Fi',\n",
" 16: 'Thriller',\n",
" 17: 'War',\n",
" 18: 'Western'}"
]
},
2021-03-20 20:13:28 +01:00
"execution_count": 8,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 9,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"movies = pd.read_csv('./Datasets/ml-100k/u.item', sep='|', encoding='latin-1', header=None)"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 10,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" <th>21</th>\n",
" <th>22</th>\n",
" <th>23</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Toy%20Story%2...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?GoldenEye%20(...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Four%20Rooms%...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 \\\n",
"0 1 Toy Story (1995) 01-Jan-1995 NaN \n",
"1 2 GoldenEye (1995) 01-Jan-1995 NaN \n",
"2 3 Four Rooms (1995) 01-Jan-1995 NaN \n",
"\n",
" 4 5 6 7 8 9 ... \\\n",
"0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... \n",
"1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... \n",
"2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... \n",
"\n",
" 14 15 16 17 18 19 20 21 22 23 \n",
"0 0 0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 1 0 0 \n",
"2 0 0 0 0 0 0 0 1 0 0 \n",
"\n",
"[3 rows x 24 columns]"
]
},
2021-03-20 20:13:28 +01:00
"execution_count": 10,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies[:3]"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 11,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"for i in range(19):\n",
" movies[i+5]=movies[i+5].apply(lambda x: genres[i] if x==1 else '')"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 12,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"movies['genre']=movies.iloc[:, 5:].apply(lambda x: ', '.join(x[x!='']), axis = 1)"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 13,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"movies=movies[[0,1,'genre']]\n",
"movies.columns=['id', 'title', 'genres']"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 14,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation, Children's, Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>Action, Adventure, Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Get Shorty (1995)</td>\n",
" <td>Action, Comedy, Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Copycat (1995)</td>\n",
" <td>Crime, Drama, Thriller</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id title genres\n",
"0 1 Toy Story (1995) Animation, Children's, Comedy\n",
"1 2 GoldenEye (1995) Action, Adventure, Thriller\n",
"2 3 Four Rooms (1995) Thriller\n",
"3 4 Get Shorty (1995) Action, Comedy, Drama\n",
"4 5 Copycat (1995) Crime, Drama, Thriller"
]
},
2021-03-20 20:13:28 +01:00
"execution_count": 14,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.to_csv('./Datasets/ml-100k/movies.csv', index=False)\n",
"movies[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Toy example"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 15,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"os.makedirs('./Datasets/toy-example/', exist_ok = True)"
]
},
{
"cell_type": "code",
2021-03-20 20:13:28 +01:00
"execution_count": 16,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"toy_train=pd.DataFrame([[0,0,3,0], [0,10,4,0], [0,40,5,0], [0,70,4,0],\n",
" [10,10,1,0], [10,20,2,0], [10,30,3,0],\n",
" [20,30,5,0], [20,50,3,0], [20,60,4,0]])\n",
"toy_test=pd.DataFrame([[0,60,3,0],\n",
" [10,40,5,0],\n",
" [20,0,5,0], [20,20,4,0], [20,70,2,0]])\n",
"\n",
"toy_train.to_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, index=False)\n",
"toy_test.to_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}