workshops_recommender_systems/P0. Data preparation.ipynb

692 lines
71 KiB
Plaintext
Raw Normal View History

2020-06-13 22:14:04 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building train and test sets"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# If a library is missing, install it with pip (or pip3) - you can do this directly from the notebook.\n",
"# Example: !pip install tqdm\n",
"# In the labs it is also better to use the Python 3 kernel (ipython3 notebook).\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sparse\n",
"import time\n",
"import random\n",
"import evaluation_measures as ev\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Alternative input (the ml-1m.dat file), left disabled:\n",
"# df = pd.DataFrame(np.loadtxt( './Datasets/ml-1m.dat',delimiter='::'))\n",
"\n",
"# u.data is tab-separated and has no header row, so the columns are named on load\n",
"df = pd.read_csv(\n",
"    './Datasets/ml-100k/u.data',\n",
"    sep='\\t',\n",
"    header=None,\n",
"    names=['user', 'item', 'rating', 'timestamp'],\n",
")\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# 80/20 split; the fixed random_state keeps the split identical between runs\n",
"train, test = train_test_split(df, random_state=30, test_size=0.2)\n",
"\n",
"# Store both parts in the same headerless, tab-separated layout as the input file\n",
"train.to_csv('./Datasets/ml-100k/train.csv', index=False, header=None, sep='\\t')\n",
"test.to_csv('./Datasets/ml-100k/test.csv', index=False, header=None, sep='\\t')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Interactions properties"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What does the data look like?"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>196</td>\n",
" <td>242</td>\n",
" <td>3</td>\n",
" <td>881250949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>186</td>\n",
" <td>302</td>\n",
" <td>3</td>\n",
" <td>891717742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>22</td>\n",
" <td>377</td>\n",
" <td>1</td>\n",
" <td>878887116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>244</td>\n",
" <td>51</td>\n",
" <td>2</td>\n",
" <td>880606923</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>166</td>\n",
" <td>346</td>\n",
" <td>1</td>\n",
" <td>886397596</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp\n",
"0 196 242 3 881250949\n",
"1 186 302 3 891717742\n",
"2 22 377 1 878887116\n",
"3 244 51 2 880606923\n",
"4 166 346 1 886397596"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample properties"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We have 943 users, 1682 items and 100000 ratings.\n",
"\n",
"Average number of ratings per user is 106.04. \n",
"\n",
"Average number of ratings per item is 59.453.\n",
"\n",
"Data sparsity (% of missing entries) is 6.3047%.\n"
]
}
],
"source": [
"users, items, ratings=len(set(df['user'])), len(set(df['item'])), len(df)\n",
"\n",
"print('We have {} users, {} items and {} ratings.\\n'.format(users, items, ratings))\n",
"\n",
"print('Average number of ratings per user is {}. \\n'.format(round(ratings/users,2)))\n",
"print('Average number of ratings per item is {}.\\n'.format(round(ratings/items,4)))\n",
"print('Data sparsity (% of missing entries) is {}%.'.format(round(100*ratings/(users*items),4)))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAHvCAYAAACsfXllAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nOzde7wVdb3/8fdHQNOovKEBZtvMu+IGloZZSnlMk0uamfhL06MHzPKkWZ4oUtCsMPVE1smA9IhdtNKjcjHzirc03RvRYxxS0a1yERGvCKjg5/fHzN6s2bP22ot9me9s5vV8PNZjrzUza9Z7ffZsWJ81M98xdxcAAAAAACFsFjoAAAAAAKC4aEoBAAAAAMHQlAIAAAAAgqEpBQAAAAAEQ1MKAAAAAAiGphQAAAAAEAxNKQBkxMwmmZnHt+Gh8/Q0Zratmf3IzB41szfN7L24lq+FztZdzOzqsm2mLnQeAAC6A00pgC5X9iG6+faJGp4zpmz5SRnERA9iZjtKapT0fUn1kvpKsqChOsDMhsdfTkyiyQQAINI7dAAAhfBjSYeFDoEebYKkuvj+A5J+J2m5JJf0bqBMHTFc0sT4/lxJTaGCAACQFzSlALLwWTP7F3e/I3QQ9FhHxT9flfQ5d18dMkxW3P0USacEjgEAQLfi8F0A3am8cfhJsBTYFHwk/vnPojSkAAAUBU0pgO70gqQb4/slM/tiyDDo0TaPf74dNAUAAOhyNKUAutsPJL0X37/IzHp1dEVlAyHN7eyyZja3eZn48WZmdmo8/SUze8vM/tfMJpjZB1o998Nm9kMze9zM3jCz183sXjM7vgPv6Sgzu9nMFpvZ2/HPa83soI1Yx/ZxzvvM7EUze8fMVsSP/6N1/grPb4pr0RQ/fp+ZfdPM7jez5fEot3M39r2VrX8rM/uWmd0d53s7rvH9ZvY9M/tQG89rGa24bPKhFQbSGr6ReU4pe+4p8bSSmf3GzJ6Of/eJ9Vrk0/Hov3eZ2dL4fbxlZs+a2XVmNsrMKg6+1PxetOF8Ukm6u8J7mdvqeVVH340HTkoMEGZmO5vZZWa2MM73mpn9zcy+bmY1nbZjZseY2Zz497823kZ+Z/GgZZVq2MZ6vmBmfzKzZ8xsdbyuJWb2mJn91sy+Ymbb1JKpjfUn6mZm/czswvhv9/X477PRzMab2ZYbsd4j49o/ZdFIz6vNbFE87VPtPHejt68aM9U8EnMty1r0d/51M7vdzJbF2/Oq+Hf9sJn93Mw+b2Z92nmtg8zsCjNbEG9ra83seTP7o5mNaOe5lbbfPcxsipn9X/z7q7qNAdhEuDs3bty4delN0eAzLmlh/PjqsmmntPGcMWXLTGpnvXM3IkPFZRUNMtO8TF9Jd5Q9bn2bJ2mb+HkHSXqpyrKXVsk0qWy54ZL+q8p61kuaWMP7PEXSG1XW45JelHRQlXU0xcs1SdpF0hMV1tFuzdtY9zBJS9rJ97Ki80Sr1avabfhGZjqlfHuUNF7SumrrlfTfNWb5i6QPduK9zG31vKvL5tVVWO/wsvmTJB2p6LzbttZ/m6QtqtSmj6Q/VXn+Oknfbl3DCuvZUtLsGt/z2V3wb81cSYMlLa7yOk9J2qWd9fWTdGcNmX8jqU9XbV81vteq28JGbje7xvWo5fdT38ZrvF/StTU8f7akD7Sxjtbb71cVnfbReh2pbYwbN26b1o2BjgBkYaKkExQdgjnJzP7g7u8EzlTuvxWNDvyAog/kL0r6qKRvxD8HS5piZhMl/VXR+/iNpPslvSPp05LGKho87ttm9ld3v72d1zxL0tGKGrLfSHpc0laKmopjFR3JMsnMVrr7LyutwMzOkjQlfvi2pBsk3SdppaRt43V9QdKOku4wswPcfUGVTFtI+h9J+8Tv7QZJSxV9UN+xnfdTKV+9pLsUNSiS9KikP0h6XtKHJX1Z0sGStpM028w+5+5zy1ZxnaT58f3mw8
D/oWjve7knNjZbmS9L+ryk1yXNUHTZmfWS9o+nNdtSUY3vkfSwpEWS3lJUm90lnaQNNb9G0e+2XPN7GSOpeY/6eRWyv9yJ91Iv6VxFl8qZKunBOHNJ0tcUNRGHKxrJ+Pw21jFN0nHx/bWKmpsHFdWkJOk0SZdKur6dLD+W1LyXbJmi0ZL/IWmVoi+BPq7oC55Dan97VX1I0bY7UFHjfZOkVyTtEWfeOX7NO82s3t3faL0CM9tW0XvdNZ60QNKfJT2p6GiPfRQ1mTvF6+yt9gehqnX7yky8N//PiuohRdvl9ZKeUTSS9TaS9pL0GUXbVKV1bKHoi7xh8aTnFTWo/1C0zX1cUYO5h6Lt4CYzO9zd36uwumYHK9o210u6UtG/x2vjdbzYgbcKoCcJ3RVz48Zt07tpw7fbC8umXV42/ZsVnhNyT6lL+n6FZfppw16+dYo+vL0kaVCFZU8qW9ctbbzmpFav+YSkHSosd7SiD4euqPHZucIyQ8uWWShptzZec4Sixtkl/b2NZZpa5fpWF2wDmym5x3WKpM0qLHde2TLPS3pfZ3/3NWQ7pdX7/T9JA9p5zqclbV1l/vuV3MN4aA3bwPAasl5dtnxdhfnDW72X5yptC5IOLNteXlGFvaWKvphpXs8KSftWWKauwvZySqtlekl6LZ7XVGkbb/U3tmcnfpet96h9o8IyfSXdXbbML9pY141ly/ygje21r6IvppqXO7Irtq8a32vVbaHWZRV9udA8b5akXlXWs7ek7SpM/1nZOq6QtHmFZfooasSbl/taDdvvMkl7d7ZW3Lhx63k3zikFkJWLFDVYkjTBzPqGDNPKX939x60nuvsKSc17KXsp2rtxprs/XmHZ3yo6HE6SDqvh3L11ko5395cqrOsmSZfFD7eSdEaF509UtKfmbUkj3f2pCsvI3edImhw/PNDMPtlOrhvd/WftLFOLkYr2LEnSQ4oa3dReEnf/oaQ58cOPSDqxC157Y7ikMe6+tOpC7ve5+2tV5r+laO9Z8zZ+UtdF3CgnVtoW3P1hSX+MH26jqElt7Vtl989099QeaHdvUvt7B/sp2nMpSTdX2sbL1rfC3Re2s75aXefu/1XhNVYp+tKree/oaWa2dfkyZjZEG/ZuX+XuF7WxvTavq3kv5zntZKpp+8rYx8vuX+Xu69ta0N0XuPvK8mlm1l/S1+OHd7r7GV7hyBd3f1fSvynaAyu1XytJOt2rH80BYBNFUwogE/EH0+ZDTXeQdHbAOK1VPDw29kDZ/eWqftji/fHPzbXhEMC2/NXd/1Fl/hRFh7FJ0jHlM+KBYZoPjbzZ3Z9u57V+V3b/c+0s+4t25teqfKTlS9zdqyw7uex+1iM03+fuj3XFitz9TUn/Gz/8RFescyM96u73VZl/V9n9vctnmNn7tGHbWKbo8M6KPDrEOvXFTJk1bb1ON7usrRnuvlwb/g62VHSYdbnyLxEurfYi7v6qpFvih4fEh7K2pcu2ry5Ufkmlfdpcqm1f1obRsNusudTSmDZ/GbJbOwM0Padozy2AAuKcUgBZukTRXr9tJX3HzH7l7q8EziRJf68yb3nZ/cZKe0/aWLa9EUXvrDbT3V80s/+TtK+k3c3sQ+7evHfmYG34UnGtmbU+f7G18tEz96qy3HpF59R1heY9cS6pvfNr/6YN5xpm3cxVa+IS4ubjy4rO091f0Xm2fRWdw9naTl2SbuM81M78JWX3W2+f+2vDdnJPO9u5FB3+PqjSDHd/3cweVrQN/IuZ3ajoy4774ialO7yu6HzNau7Shj18Byg6z7fZp+Of70jaw8z2aGddW5T9/JiiQ3QrqXn7ytD9ir442FLSxPhLrhmVjgBpw6fL7u9Qw78/5dvaXooO6a6Yq50vrwBswmhKAWQm/rB6saSLFR3eN17Sf4RNJSkaGKgt5dfFrLZc62Xf186y7e3dbF5mX0VNz4e14ZDBurJlvhrfalWtWV7p7ms3Yl3V9I9/vhjvQWyTu79nZo
sUNUbbmtnmlQ4H7CZL2l9EMrP9FA38tFuN6/1ghxN1XHuDJFXbPgeU3X9G7WtvmW8o+uLlg4oOiz1a0ltm9ndFTdE
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Number of ratings given by each user\n",
"items_per_user = df.groupby('user')['rating'].count()\n",
"\n",
"plt.figure(figsize=(16, 8))\n",
"plt.hist(items_per_user, bins=100)\n",
"\n",
"# Dashed vertical markers for the median and the two quartiles. Each entry is\n",
"# (value, label template, x factor, y factor); the factors place the label\n",
"# relative to its line and to the top of the y-axis.\n",
"for t, label, x_factor, y_factor in [\n",
"    (items_per_user.median(), 'Median: {:.0f}', 1.1, 0.9),\n",
"    (items_per_user.quantile(0.25), '25% quantile: {:.0f}', 1.1, 0.95),\n",
"    (items_per_user.quantile(0.75), '75% quantile: {:.0f}', 1.05, 0.95),\n",
"]:\n",
"    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"    plt.text(t*x_factor, plt.ylim()[1]*y_factor, label.format(t))\n",
"\n",
"plt.title('Number of ratings per user', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAHvCAYAAACsfXllAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nOzdeZwU9Z3/8feHKx6oEUXkUDF4RFEcYTAkJoaYzUIU8CAmuPFgzTJslERza/wheGV1o7t4r8NGwawrcTVGjgQ8IvGIijMIxluIKMeIiEFBUAJ+fn9UDdMMPd0N1HyrCl/Px6MffdS36vPtTxdDf/pb9S1zdwEAAAAAkIY2aXcAAAAAAPDJRVEKAAAAAEgNRSkAAAAAIDUUpQAAAACA1FCUAgAAAABSQ1EKAAAAAEgNRSkABGBm483M49vAtPuTN2bWycyuNLNnzWy1mX0c53JV2n1rLWY2qWCf6Zl2f5AeMxtYsC+MT7s/AJC0dml3AMCOxcyaX/x4gLs/XWadEZLuip9e6u7jW6NvyCcz6yLpKUk9U+7Kdol/jBgYP53k7otS6wx2OGZ2gaRPS1rl7hPS7g8AbA2KUgCt7ReSvpp2J5BrF6upIH1C0v9IWi7JJf09pT5ti4GSxsWPZ0talFZHsEO6QNIBkt6QRFEKIFcoSgG0tuPN7B/c/aG0O4LcOiG+/5ukf3T3tWl2JhR3HylpZMrdQAa4+2xJlnY/AKC1cE4pgNZSWDj8W2q9wI5gv/j+lU9KQQoAwCcJRSmA1rJY0n3x42ozOzXNziDXOsT3H6XaCwAA0CooSgG0pv8n6eP48RVm1nZbN1Qw8+Ts7W1rZrMb28TP25jZOfHrb5vZB2b2FzO72Mx2a7buvmZ2uZk9Z2bvm9l7ZvaomX1rG97TCWZ2v5ktMbOP4vu7zOzzW7GNveN+PmZmb5nZejNbET//afP+F1l/UZyLRfHznczs+2b2uJktj2e5nb21761g+7uY2Q/M7JG4fx/FOX7czC4ysz1aWG/TbMUFL3+54LPdppmMzWxkwboj49eqzey/zWxB/Nlvtl2LfCme/fePZrYsfh8fmNnrZjbFzIaaWdHDKxvfi5rOJ5WkR4q8l9nN1is5+26xGVnNbH8zu9bMXo77t8rM/mxm55pZRafsmNkpZjYj/vw/jPeR/zGzz7WUwxa2c5KZ3W1mfzWztfG2lprZfDP7tZl928z2rKRPLWx/s7yZWWczuyz+t/te/O+z3swuNLOdt2K7g+Pcv2bRTM9rzWxh/NoXy6y71ftXhX1qcfbdxn/Dis4nlaQDiuxbLX5WZtbBzL5jZlPNbHH8Oa2y6G/ctcX2vWbrb7GfmtlwM/tD/G9lXbw//tLMOjdbdw8z+0n8Of3NzNaY2TNm9q9mxndU4JPE3blx48YtsZuiyWdc0svx80kFr41sYZ0RBW3Gl9nu7K3oQ9G2iiaZaWzTUdJDBc+b3+ZK2jNe7/OS3i7R9poSfRpf0G6gpJtKbGejpHEVvM+Rkt4vsR2X9Jakz5fYxqK43SJJB0p6vsg2yua8hW0PkLS0TP/eUXSeaKl8lboN3Mo+jSzcHyVdKGlDqe1Kur3CvvxB0u7b8V5mN1tvUsGynkW2O7Bg+XhJgxWdd9vS9h+Q9KkSuWkv6e4S62+Q9KPmOSyynZ0lTa/wPV+QwN+a2ZKOlrSkRJzXJB1YZnudJT1cQZ//W1L7pPavCt/rZp91C/+Gy92KfVbVkv5aZr2PJI0u0bfC/bSXoonIWtrWIkkHxOsdKmlBibZ3S7Jt3T+4ceOWrxsTHQFobeMkna7oEMzxZva/7r4+5T4Vul3R7MBPKPoS9JaiEYfz4vujJU0ws3GSZil6H/8t6XFJ6yV9SdIoRRPH/cjMZrn7g2Vini/pZEUF2X9Lek7SLoqKiuGKjmIZb2Yr3f3GYhsws/PVNMPmR5LulfSYpJWSOsXbOklSF0kPmV
l/d3+xRJ8+Jem3knrH7+1eScsUfVHvUub9FOtflaQ/KipQJOlZSf8r6U1J+0r6pqRjJe0labqZ/aNHk7k0miJpXvy48TDwFxSNvhd6fmv7VuCbkr4u6T1JkyXVK/pB4Kj4tUY7K8rxnyTNkbRQ0geKcnOIpDPVlPM7FH22hRrfywhJjSPqY4v0/Z3teC9Vkn6iaDKcWyU9Gfe5WtK/StpV0tcUzWR8SQvbqJV0Wvz4Q0XFxpOKclIt6TuSrpF0T5m+/ELSifHjBkVFyguS1ij6EeggRT/wHFf52ytpD0X7bndFhffvJL2rqOj5jqT945gPm1mVu7/ffANm1knRe+0Vv/SipP+T9Kqioz16Kyoye8TbbKfyk1BVun9trxpFfz9qFe2TK+LXmptb+MSiIzIeiteVooL8D4pOvdhJ0Wd0Vrz8v8zsI3efVKYvV0n6hqK/af+jaCbgfRX9jTxC0d/UO8zsJEkPKvrMfhP3Y7Wkvor+9u6qaF98QNHfSAA7urSrYm7cuO1YNzX9yv1ywWvXF7z+/SLrpDlS6pJ+XqRNZzWN8m1QVFS8LalPkbZnFmzr9y3EHN8s5vOS9inS7mRFlzlxRYXP/kXa9Cto87Kkg1uIeaKiwtklPd1Cm0XN+vWDBPaBNtp8xHWCpDZF2o0taPOmpJ2297OvoG8jm73flyR1K7POlyR9usTyXbX5COOXK9gHBlbQ10kF7XsWWT6w2Xt5o9i+IOmYgv3lXRUZLVX0w0zjdlZIOqJIm55F9peRzdq0lbRKTaNiW+zjzf6NfXY7PsvmI2vnFWnTUdIjBW1uaGFb9xW0+X8t7K8dFf0w1dhucBL7V4XvtfCzHt9Cm8bPZlEF29st/jfnin4s+HoL7Q6K96vGdnuX2U9d0n81z5+iH3bmF7SpUzQZ3vFFtnecoh8CXNKL25s7bty45ePG8foAQrhCUYElSRebWcc0O9PMLHf/RfMX3X2FpMZRyraKRjfGuPtzRdr+WtHhgZL01QrO3dsg6Vvu/naRbf1O0rXx010kfbfI+uMUjdR8JGmIu79WpI3cfYaikQtJOsbMvlCmX/e5+3+WaVOJIYpGliTpKUWF7sfNG7n75ZJmxE/3k3RGArG3hksa4e7LSjZyf8zdV5VY/oGi0bPGffzM5Lq4Vc4oti+4+xxFo1GStKeiIrW5HxQ8HuPuW4xAu/silR8d7Kxo5FKS7i+2jxdsb4W7v1xme5Wa4u43FYmxRtGPXo2jo98xs08XtjGzvmoa3b7N3a9oYX9t3FbjKOcPy/Spov0rJaPUNKv1d939D8UaufsCSf8cP91VxUdgCz2vaP/ZLH/uvk5Nf4uk6Ie18e7+xyIxH1U0cipJh5nZfs3bANjxUJQCaHXxF9PGQ033UXSR96woenhs7ImCx8tV+rDFx+P7Dmo6BLAls9z9hRLLJyg6zE+STilcEE8M03ho5P3xl8ZS/qfg8T+WaXtDmeWVKpxp+Zfu7iXaFn5RDT1D82PuPj+JDbn7akl/iZ9+LoltbqVn3f2xEssLv/wfXrjAzHZS077RoOiw1aI8OsR6ix9mCqxrKU4ru7alBe6+XE3/DnZWdJh1ocIfEa4pFcTd/ybp9/HT48zsUyWaJ7Z/tYLG99wg6c5SDePCsbGwLvc35FZ339DCssK/pxsVHWbekscLHofcjwCkhHNKAYTyS0Wjfp0k/djMbnb3d1PukyQ9XWLZ8oLH9cVGT1poW25G0YdLLXT3t8zsJUXnYB1iZnu4e+PozLFq+kHxQzNrfv5ic+0LHh9Wot1GRefUJaFxJM4VnTdWyp/VdK5h6GKuVBG3mbj4+Kai83SPUnSebUdF53A21yOR3m2dp8osX1rwuPn+eZSa9pM/ldnPpejw9z7FFrj7e2Y2R9E+8A9mdp+iHzsec/e/l9nutnpP0fmapfxR0rnx4/6KzvNt9KX4fr2kQ83s0DLb+lTB/WcUHaJbTMX7V0gWzXjd+Pk1SBpmxSeOLrQmvi
/1N0Sq/O/pKwV/08q13eYZmgHkB0UpgCDiL6tXS7pa0eF9F0r6abq9khRNDNSSwutilmrXvO1OZdqWG91sbHOEoqJ
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Number of ratings received by each item. Kept under its own name so the\n",
"# per-user series computed for the previous plot is not overwritten.\n",
"ratings_per_item = df.groupby('item')['rating'].count()\n",
"\n",
"plt.figure(figsize=(16, 8))\n",
"plt.hist(ratings_per_item, bins=100)\n",
"\n",
"# Dashed vertical markers for the median and the two quartiles. Each entry is\n",
"# (value, label template, x factor, y factor); the factors place the label\n",
"# relative to its line and to the top of the y-axis.\n",
"for t, label, x_factor, y_factor in [\n",
"    (ratings_per_item.median(), 'Median: {:.0f}', 1.1, 0.9),\n",
"    (ratings_per_item.quantile(0.25), '25% quantile: {:.0f}', 1.1, 0.95),\n",
"    (ratings_per_item.quantile(0.75), '75% quantile: {:.0f}', 1.05, 0.95),\n",
"]:\n",
"    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"    plt.text(t*x_factor, plt.ylim()[1]*y_factor, label.format(t))\n",
"\n",
"plt.title('Number of ratings per item', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rating\n",
"1 0.06110\n",
"2 0.11370\n",
"3 0.27145\n",
"4 0.34174\n",
"5 0.21201\n",
"Name: user, dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Share of all ratings that fall on each rating value\n",
"df.groupby('rating')['user'].count() / len(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Item attributes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# u.genre is pipe-separated without a header: column 0 is the genre name, column 1 its id\n",
"genres = pd.read_csv('./Datasets/ml-100k/u.genre', header=None, sep='|',\n",
"                     encoding='latin-1')\n",
"# Turn the table into an {id: name} lookup\n",
"genres = {genre_id: name for name, genre_id in zip(genres[0], genres[1])}"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'unknown',\n",
" 1: 'Action',\n",
" 2: 'Adventure',\n",
" 3: 'Animation',\n",
" 4: \"Children's\",\n",
" 5: 'Comedy',\n",
" 6: 'Crime',\n",
" 7: 'Documentary',\n",
" 8: 'Drama',\n",
" 9: 'Fantasy',\n",
" 10: 'Film-Noir',\n",
" 11: 'Horror',\n",
" 12: 'Musical',\n",
" 13: 'Mystery',\n",
" 14: 'Romance',\n",
" 15: 'Sci-Fi',\n",
" 16: 'Thriller',\n",
" 17: 'War',\n",
" 18: 'Western'}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# u.item is pipe-separated, latin-1 encoded and has no header row\n",
"movies = pd.read_csv(\n",
"    './Datasets/ml-100k/u.item',\n",
"    sep='|',\n",
"    header=None,\n",
"    encoding='latin-1',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" <th>21</th>\n",
" <th>22</th>\n",
" <th>23</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Toy%20Story%2...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?GoldenEye%20(...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Four%20Rooms%...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 \\\n",
"0 1 Toy Story (1995) 01-Jan-1995 NaN \n",
"1 2 GoldenEye (1995) 01-Jan-1995 NaN \n",
"2 3 Four Rooms (1995) 01-Jan-1995 NaN \n",
"\n",
" 4 5 6 7 8 9 ... \\\n",
"0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... \n",
"1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... \n",
"2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... \n",
"\n",
" 14 15 16 17 18 19 20 21 22 23 \n",
"0 0 0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 1 0 0 \n",
"2 0 0 0 0 0 0 0 1 0 0 \n",
"\n",
"[3 rows x 24 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Columns 5..23 of u.item hold one 0/1 flag per genre (19 genres, ids 0..18).\n",
"# Replace every flag by the genre name when it is set, and by '' otherwise.\n",
"for i in range(19):\n",
"    movies[i+5] = movies[i+5].apply(lambda flag: '' if flag != 1 else genres[i])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Join the non-empty genre names of each movie into one comma-separated string.\n",
"# Columns 5..23 are selected by label instead of the open-ended iloc[:, 5:], so that\n",
"# re-running this cell does not fold the already added 'genre' column into itself.\n",
"movies['genre'] = movies[list(range(5, 24))].apply(lambda row: ', '.join(row[row != '']), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the id, the title and the combined genre string, under readable column names\n",
"movies = movies[[0, 1, 'genre']].rename(columns={0: 'id', 1: 'title', 'genre': 'genres'})"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation, Children's, Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>Action, Adventure, Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Get Shorty (1995)</td>\n",
" <td>Action, Comedy, Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Copycat (1995)</td>\n",
" <td>Crime, Drama, Thriller</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id title genres\n",
"0 1 Toy Story (1995) Animation, Children's, Comedy\n",
"1 2 GoldenEye (1995) Action, Adventure, Thriller\n",
"2 3 Four Rooms (1995) Thriller\n",
"3 4 Get Shorty (1995) Action, Comedy, Drama\n",
"4 5 Copycat (1995) Crime, Drama, Thriller"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Save the cleaned item attributes (comma-separated, with header, without the index)\n",
"movies.to_csv('./Datasets/ml-100k/movies.csv', index=False)\n",
"movies.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Toy example"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# makedirs(exist_ok=True) also creates a missing ./Datasets parent and does nothing\n",
"# when the folder is already there, replacing the separate exists()/mkdir() pair.\n",
"os.makedirs('./Datasets/toy-example/', exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Tiny hand-made dataset, written in the same headerless, tab-separated form as the\n",
"# ml-100k split (rows presumably are user, item, rating, timestamp - TODO confirm;\n",
"# the last value is always 0 here).\n",
"toy_train = pd.DataFrame([\n",
"    [0, 0, 3, 0], [0, 10, 4, 0], [0, 40, 5, 0], [0, 70, 4, 0],\n",
"    [10, 10, 1, 0], [10, 20, 2, 0], [10, 30, 3, 0],\n",
"    [20, 30, 5, 0], [20, 50, 3, 0], [20, 60, 4, 0],\n",
"])\n",
"toy_test = pd.DataFrame([\n",
"    [0, 60, 3, 0],\n",
"    [10, 40, 5, 0],\n",
"    [20, 0, 5, 0], [20, 20, 4, 0], [20, 70, 2, 0],\n",
"])\n",
"\n",
"toy_train.to_csv('./Datasets/toy-example/train.csv', index=False, header=None, sep='\\t')\n",
"toy_test.to_csv('./Datasets/toy-example/test.csv', index=False, header=None, sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}