WSR-432813/P0. Data preparation.ipynb

708 lines
71 KiB
Plaintext
Raw Normal View History

2021-06-11 01:28:24 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building train and test sets"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# If a library is missing, install it with pip (or pip3) - this can be done directly from the notebook.\n",
"# Example: !pip install tqdm\n",
"# In the labs it is better to use a Python 3 kernel (an IPython 3 notebook).\n",
"# Note: time, random and os are part of the Python standard library and need no installation.\n",
"#!pip install pandas\n",
"#!pip install numpy\n",
"#!pip install scipy\n",
"#!pip install matplotlib\n",
"#!pip install scikit-learn\n",
"#!pip install scikit-surprise\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sparse\n",
"import time\n",
"import random\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import helpers\n",
"\n",
"# Create the dataset directory if needed, then let the helpers module download MovieLens 100k.\n",
"os.makedirs('./Datasets/', exist_ok=True)\n",
"helpers.download_movielens_100k_dataset()\n",
"\n",
"# u.data is read as tab-separated with no header row; column names are supplied explicitly.\n",
"df = pd.read_csv(\n",
"    './Datasets/ml-100k/u.data',\n",
"    sep='\\t',\n",
"    header=None,\n",
"    names=['user', 'item', 'rating', 'timestamp'],\n",
")\n",
"\n",
"# 80/20 split; the fixed random_state makes it reproducible.\n",
"train, test = train_test_split(df, random_state=30, test_size=0.2)\n",
"\n",
"# Both parts are written tab-separated, without header row or index column.\n",
"for part, path in ((train, './Datasets/ml-100k/train.csv'), (test, './Datasets/ml-100k/test.csv')):\n",
"    part.to_csv(path, sep='\\t', header=None, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Interactions properties"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What does the data look like?"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>196</td>\n",
" <td>242</td>\n",
" <td>3</td>\n",
" <td>881250949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>186</td>\n",
" <td>302</td>\n",
" <td>3</td>\n",
" <td>891717742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>22</td>\n",
" <td>377</td>\n",
" <td>1</td>\n",
" <td>878887116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>244</td>\n",
" <td>51</td>\n",
" <td>2</td>\n",
" <td>880606923</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>166</td>\n",
" <td>346</td>\n",
" <td>1</td>\n",
" <td>886397596</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp\n",
"0 196 242 3 881250949\n",
"1 186 302 3 891717742\n",
"2 22 377 1 878887116\n",
"3 244 51 2 880606923\n",
"4 166 346 1 886397596"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample properties"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We have 943 users, 1682 items and 100000 ratings.\n",
"\n",
"Average number of ratings per user is 106.0445. \n",
"\n",
"Average number of ratings per item is 59.453.\n",
"\n",
"Data sparsity (% of missing entries) is 93.6953%.\n"
]
}
],
"source": [
"# Dataset size: distinct users, distinct items and the number of rating rows.\n",
"users = df['user'].nunique()\n",
"items = df['item'].nunique()\n",
"ratings = len(df)\n",
"\n",
"print(f'We have {users} users, {items} items and {ratings} ratings.\\n')\n",
"\n",
"# Averages are ratings per distinct user / item; sparsity is the share of empty user-item pairs.\n",
"print(f'Average number of ratings per user is {round(ratings/users,4)}. \\n')\n",
"print(f'Average number of ratings per item is {round(ratings/items,4)}.\\n')\n",
"print(f'Data sparsity (% of missing entries) is {round(100*(1-ratings/(users*items)),4)}%.')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAHvCAYAAACsfXllAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAABNeUlEQVR4nO3dfZwVdd3/8fdHbkzD8iYkbtQ1M8UbXOComGVkl8kFSJrdwC9TswusSyvTSsoULCtMuzKzTEhTK7HSNG7M27TUMt1FMENKsVXuRATvEO/Qz++PmV3O7Jw9u7C7852z83o+HvvYc2bmzHnvd2ZhP+f7ne+YuwsAAAAAgBC2Ch0AAAAAAFBcFKUAAAAAgGAoSgEAAAAAwVCUAgAAAACCoSgFAAAAAARDUQoAAAAACIaiFAAyYmbTzczjr9Gh89QaM9vRzL5jZg+a2Ytm9mbcls+FztZdzOzKsnOmLnQeAAC6A0UpgC5X9kd089fBHXjNxLLtp2cQEzXEzAZIapT0DUn1kvpJspCZtoSZjY4/nJhOkQkAQKR36AAACuG7kj4UOgRq2lmS6uLH90r6laTVklzS64EybYnRkqbFj++S1BQqCAAAeUFRCiALh5vZf7n77aGDoGaNjb8/K+nD7r4hZJisuPuJkk4MHAMAgG7F8F0A3am8cPhusBToCXaJv/+rKAUpAABFQVEKoDstk3RD/PhAM/toyDCoaX3j768GTQEAALocRSmA7vZNSW/Gj88zs15buqOyiZDu6uy2ZnZX8zbx863M7KR4+dNm9pKZPWxm3zSz7Vq99p1m9m0ze8jMXjCz583sL2b2iS34mcaa2R/MbLmZvRp/n21mh2zGPt5hZmeZ2d1m9pSZvWZma+LnXzOzfu28vilui6b4+VvM7Itmdo+ZrY5nub1rc3+2sv1va2ZfNrM743yvxm18j5l93cze3sbrWmYrLlv8gQoTaY3ezDwnlr32xHhZycx+bmaPxcc+sV+LvD+e/fdPZrYy/jleMrP/mNm1ZnZUlfecHv8c08oW31nhZ7mr1euqzr4bT5yUmCDMzHY1sx+Y2ZI433Nm9lcz+18z69BlO2Z2jJnNj4//K/E58iuLJy2r1IZt7OcjZvZbM3vczDbE+1phZovM7Hdxpp06kqmN/Sfazcz6m9m3zOwf8e/mC2bWaGZTzWybzdjvmLjtH7VopucNZrY0Xva+dl672edXBzN1eCbmjmxr0e/5/5rZbWa2Kj6f18fH+gEzu9zMPm5mfSu9vmw/h5jZpWa2OD7XXjGzJ83sN2Y2rp3XVjp/9zKzi8zskfj4VT3HAPQQ7s4XX3zx1aVfiiafcUlL4udXli07sY3XTCzbZno7+71rMzJU3FbRJDPN2/STdHvZ89ZfCyTtEL/uEElPV9n2wiqZppdtN1rST6rs5w1J0zrwc54o6YUq+3FJT0k6pMo+muLtmiTtLunhCvtot83b2PcoSSvayfeMoutEq7VXta/Rm5npxPLzUdJUSRur7VfSLzqY5Y+S3taJn+WuVq+7smxdXYX9ji5bP13SGEXX3ba1/1slbV2lbfpI+m2V12+UdEbrNqywn20kzevgz3xaF/xbc5ek4ZKWV3mfRyXt3s7++ku6owOZfy6pT1edXx38WaueC5t53uwRt0dHjk99G+/xVknXduD18yRt18Y+Wp+/xyu67KP1PlLnGF988dWzvpjoCEAWpkuapGgI5nQzu8bdXwsbKeEXimYHvlfRH+RPSdpN0inx9+GSLjKzaZJuUfRz/FzSPZJek/R+SZMVTR53hpnd7O1P6vQlSUcrKsh+LukhSdsqKiqOVTSSZbqZrXX3SyrtwMy+JOmi+OkGSddJ+quktZLeEe9rgqQBkm43swPdfXGVTFtL+r2kfeOf7XpJKxX9oT6gnZ+nUr7hkv6kqECRpAclXSPpSUnvlPQJSYdK2knSPDP7sLvfVbaLayUtjB/fEH//p6Le93IPb262Mp
9U1E7PS7pK0W1n3pB0QLys2TaKhg7/WdL9kpZKeklR27xH0qcl7Rjv62pFx7Zc888yMX5PSTq7QvZnOvGz1Ev6qqJb5Vwm6W9x5pKkzykqIo5QNJPxOW3sY6akj8ePX1FU3PxNUZuUJH1W0oWKzrVqviupuZdslaLZkv8pab2iD4HeregDnsM6/NNV93ZF5+5gRYX3jZLWSdorzrxr/J53mFm9u7/QegdmtqOin3WPeNGieJ+PKRrtsZ+iInNwvM/ean8Sqo6eX5kxM5P0O0XtIUXn5XWSHlc0k/UOkoZK+qCic6rSPrZW9EHeqHjRUkm/kfRIvI93Kyow36PoPLjRzI5w9zcr7K7ZoYrOzTckXa7o3+NXFB3Dpzb7BwVQW0JXxXzxxVfP+9KmT7eXlC27uGz5Fyu8JmRPqUv6RoVt+mtTL99GRX+8PS1pWIVtP122r5vaeM/prd7zYUk7V9juaEV/2LmiwmfXCtuUyrZ5sNI28XbjFRXOLum+NrZpapXry11wDmylZI/rRZK2qrDd2WXbPCnpLZ099h3IdmKrn/cRSYPaec37JW1fZf1blexh/EAHzoHRHch6Zdn2dRXWj271szwhac8K2x1Udr6sU4XeUkUfzDTvZ42k/SpsU1fhfDmx1Ta9JD0Xr2uqdI63+h3buxPHsnWP2ikVtukn6c6ybX7cxr5uiNe/KelLbWzTT9EHU837GtMV51cHf9aq50JHt1X0b0fzurmSelXZzz6Sdqqw/Idl+/i+pN4VtumjqBBv3u5zHTh/V0nap7NtxRdffNXeF9eUAsjKdxQVWJJ0lrVznWPGbnH31OzA7r5GUnMvZS9FvRunuvtDFbb9paLhcFJ0C5z2RqJslPRJd3+6wr5ulPSD+Om2kj5f4fXnKOqpeVHSeHd/stKbuPs8STPipweb2XvbyXWDu/+wnW06YryiHldJuk9RoZvqJXH3b0uaHz/dRdJxXfDem8MlTXT3lVU3cr/b3Z+rsv4lRb1nzef4p7ss4eY5zt0fbb3Q3e9X1JMlRT1hB1V47ZfLHp/q7qkeaHdvUvu9g/0V9VxK0h8qneNl+1vj7kva2V9HXevuP6nwHusVfejV3Dv6WTPbvnwbMxuhTb3bP3T3H7WRt3lfzb2cp7eTqUPnV8beXfb4Cnd/o60N3X2xu68tX2ZmAyX9b/z09+7+NXffWOG1r0v6H0U9sFL7bSVJJ3v10RwAeiiKUgCZcPfV2jTUdGdJpwULk1ZxeGzs3rLHq1V92OI98fettWkIYFtucfd/Vll/kaJhbJJ0TPkKM9tBm4ZGznb3Fe2816/KHn+4nW1/3M76jiqfafkCd/cq284oe5z1DM13u/uirtiRu78o6R/x04O7Yp+b6UF3v7vK+j+VPd6nfIWZvUWbzo2VioZ3VuTREOvUBzNlXm7rfbrZD9paEf/70/x7sI2iIbXlmj9E8Gr7iff1rKSb4qeHxUNZ29Jl51cXKr+l0r5tbtW2T2jTbNgXVtswLkybPwzZs50Jmp5Q1HMLoIC4phRAli5Q1Ou3o6SvmNlP3X1d4EyS9Pcq61aXPW6s1NvXxrY7tPOed1Rb6e5Pmdkjiq5je4+Zvd3dm3tnDtWmDxXfMLOj23mvPmWPh1bZ7g1F19R1heaeOJd0Wzvb/lWbrjXMupirVsQlxMXHJyR9RFGv+QBFma3C5kO6JN3mua+d9eUfXrQ+Pw/QpvPkL+2c51I0/H1YpRXu/ryZ3a/oHPgvM7tB0Ycdd8dFSnd4XtH1mtX8SZt6+A5UdJ1vs/fH35+TdFB02WVVW5d9f5eiIbqVdPj8ytA9ij442EbStPhDrqsqjQBpw/vLHg/pwL8/5efaUEVDuivmaufDKwA9GEUpgMzEf6yeL+l8RcP7pkr6WthUkqKJgdpSfl/Matu13vYt7Wz7WDvrm7fZT1HR805tGjJYV7bN51V5eG9bqhXLa939lc3YVzUD4+9PxT2IbXL3N81sqaLCaEcz6+vZTY
TVXi+zJMnM9lc08dOeHdzv27Y40ZZrb5KkaufnoLLHj6t97W1ziqIPXt6maFjs0ZJeMrO/KyqKbpd0bweK345a2oG
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Number of rating rows for every user.\n",
"items_per_user = df.groupby('user')['rating'].count()\n",
"\n",
"plt.figure(figsize=(16,8))\n",
"plt.hist(items_per_user, bins=100)\n",
"\n",
"# Dashed vertical markers for the median and both quartiles.\n",
"# Each entry: (value, x-offset factor of the label, label height as a fraction of the y-limit, label template).\n",
"markers = [\n",
"    (items_per_user.median(), 1.1, 0.9, 'Median: {:.0f}'),\n",
"    (items_per_user.quantile(0.25), 1.1, 0.95, '25% quantile: {:.0f}'),\n",
"    (items_per_user.quantile(0.75), 1.05, 0.95, '75% quantile: {:.0f}'),\n",
"]\n",
"for t, x_factor, y_factor, template in markers:\n",
"    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"    plt.text(t*x_factor, plt.ylim()[1]*y_factor, template.format(t))\n",
"\n",
"plt.title('Number of ratings per user', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAHvCAYAAACsfXllAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAABOLUlEQVR4nO3de5wU5ZX/8e8RJF7whiLCoGIQDaI4wqi4JoaYNbAKqNFkdaNidBk2kURzlcSfAlGzuokbrzEMUYFdo3F1NVwMeF+jiZcZBGO8gsFwE/CCgqgEPb8/qgaasae7h6l5qmrm8369+jXdVU/Xefp0Tc+cfqqeMncXAAAAAABp2CbtDgAAAAAAOi6KUgAAAABAaihKAQAAAACpoSgFAAAAAKSGohQAAAAAkBqKUgAAAABAaihKASAAM5toZh7fhqbdn7wxs25mdrmZPWNma83s4ziXa9LuW1sxs6kF+0yftPuD9JjZ0IJ9YWLa/QGApHVOuwMA2hcza3rx4yHu/mSZ55wm6bb44SR3n9gWfUM+mVkPSU9I6pNyV1ol/jJiaPxwqrsvTqsvaH/M7AJJu0pa4+5Xp9oZAGghilIAbe2nkr6YdieQaxdpc0H6uKT/lrRSkkv6e0p92hpDJU2I7z8iaXFaHUG7dIGkfSW9JunqVHsCAC1EUQqgrR1rZv/o7g+k3RHk1vHxz7clfcnd16fZmVDc/WxJZ6fcDWSAuz8iydLuBwC0Fc4pBdBWCguHn6bWC7QHe8c/X+ooBSkAAB0JRSmAtrJE0t3x/cPN7Mtpdga51iX++WGqvQAAAG2CohRAW/p/kj6O719mZp22dkMFM08+0tq2ZvZIY5v48TZmdk68fJWZvWdmz5nZ/zOznZo8dy8zu9TMnjWzd83sHTN71My+uhWv6Xgz+52ZLTWzD+Oft5nZUS3Yxh5mdpGZ/cHMXjezDWa2On78QzPrWub5i+NcLI4fb2dm3zazx8xsZTzL7SMtfW0F29/BzL5jZg/H/fswzvFjZvYjM9ulmedtmq24YPHnC97brZrJ2MzOLnju2fGyGjP7tZktjN/7LbZrkc/Fs/8+ZGbL49fxnpn91cxuN7ORJWJOjF/HhILFDxd5LY80eV7J2XeLzchqZvuY2VVm9mLcvzVm9kcz+6aZVXTKjpmdbGaz4/f/g3gf+W8zO7K5HDaznRPN7A4ze9XM1sfbWmZmC8zsf+I+7V5Jn5rZ/hZ5M7PuZvYTM/tz/Lv5rpk1mNl4M9u+BdsdHuf+FYtmel5vZoviZZ8t89wW718V9qnZ2Xfj98cVnU8qSfsW2beafa/MrIuZnWtmM8xsSfw+rbHoM+6qYvtek+d/Yj81s1PM7Pfx78r78f74MzPr3uS5u5jZD+L36W0zW2dmT5vZv5kZ/6MCHYm7c+PGjVtiN0WTz7ikF+PHUwuWnd3Mc04raDOxzHYfaUEfirZVNMlMY5uukh4oeNz0Nk/SbvHzjpK0qkTbn5fo08SCdkMl3VBiOx9JmlDB6zxb0rsltuOSXpd0VIltLI7bLZa0n6TnimyjbM6b2fYQScvK9O8NReeJlspXqdvQFvbp7ML9UdJ4SRtLbVfSLRX25feSdm7Fa3mkyfOmFqzrU2S7QwvWT5Q0XNF5t81t/z5JnyqRm20l3VHi+Rslfa9pDotsZ3tJsyp8zRck8FnziKTDJC0tEecVSfuV2V53SQ9W0OdfS9o2qf2rwte6xXvdzO9wuVux96pG0qtlnvehpLEl+la4n/ZVNBFZc9taLGnf+HkHSlpYou0dkmxr9w9u3Ljl68ZERwDa2kRJpys6BHOimf3G3Tek26Ut3KJoduDHFf0T9LqiEYfz4p+HSbrazCZImqvodfxa0mOSNkj6nKQxiiaO+56ZzfHykzqdL+kkRQXZryU9K2kHRUXFKYqOYploZm+6+/XFNmBm52vzDJvrJd0p6Y+S3pS0R7ytUZJ6SHrAzA539+dL9OlTkv5X0o
D4td0labmif9R7lHk9xfp3mKSHFBUokvSMpN9I+pukvSR9VdLRknaXNMvMvuTRZC6Nbpc0P75/d/zzL4pG3ws919K+FfhnRXl6R9I0SQ2KvhA4NF7WaHtF/5j/n6SnJC2S9J6i3Bwg6UxJ3eJtTVf03hZqfC2nxTEl6eIifX+jFa+lWtIPFE2GM1nSn+I+10j6N0k7SjpO0UzGlzSzjTpJX4nvf6Co2PiTopzUSDpX0s8V7Wul/FTSCfH9FYqKlL9IWqfoS6D9FX3Bc0zFr660XRTtu1WKCu97JL2lqOg5V9I+ccwHzaza3d9tugEz66botfaNFy2It7lQ0dEeBysqMqvibXZW+UmoKt2/WqtW0edHnaJ9cnW8rKl5hQ8sOiLjgfi5rujz7T5FXyRtr+g9OjNe/ysz+9Ddp5bpyxWSTlX0mfbfimYC3kvRZ+TBij5Tp5vZiZLuV5TP38b9WCtpkKLP3h0V7Yv3KfqMBNDepV0Vc+PGrX3dtPlb7hcLll1bsPzbRZ6T5kipS/pxkTbdtXmUb6OiomKVpIFF2p5ZsK17m4k5sUnM5yTtWaTdSYouc+KKCp99irSpKWjzTLE2cbsRigpnl/REM20WN+nXdxLYB7bRliOuV0vapki7iwva/E3Sdq197yvo29lNXu8LknqVec7nJO1aYv2O2nKE8fMV7ANDK+jr1IL2fYqsH9rktbwmqV+RdkcU7C9vqchoqaIvZhq3s1rSwUXa9Cmyv5zdpE0nSWu0eVTsE/t4k9+xz7TivWw6snZekTZdJT1c0Oa6ZrZ1d7z+Y0nnN9Omq6LCrXFbw5PYvyp8rYXv9cRm2jS+N4sr2N5O8e+cKxpdb26f3T/er1zRlwp7lNlPXdKv1OT3XVGRu6CgTb2iL9OOLbK9Y+L3wSU939rccePGLR83jtcHEMLligosSbrIypznGNhcd//E7MDuvlpS4yhlJ0WjG+Pc/dkibf9L0eGBUnQJnHJHoWyU9M/uvqrItu6RdFX8cAdJ3yjy/EsUjdSslTTC3f9WLIi7z1I0ciFJR5rZP5Tp193u/osybSoxQtGIqyQ9oajQ/bhpI3e/VNLs+OHeks5IIHZLuKTT3H15yUbuf3D3NSXWv6do9KxxHz8zsR62zBnu/krThe7+lKLRKEnaTVGR2tR3Cu6Pc/dPjEC7+2KVHx3srmjkUpJ+V2wfL9jeand/scz2KnW7u99QJMY6RV96NY6Onmtmuxa2MbNB2jy6/Qt3v6aZ/jZuq3GU87tl+lTR/pWSMdo8q/VZ7v5/xRq5+0JJX48f7qjiI7CFnlO0/2zx++7u72vzZ5EkDVZUXD9UJOajikZOJam/me3dtA2A9oeiFECbc/eV2nyo6Z6KLvKeFUUPj409XnB/pUoftvhY/PNT2nwIYHPmuvtfSqy/WtFhfpJ0cuEKM9tNmw+NvM3dl5WJ9d8F979Upu11ZdZXqnCm5Z+5u5doW/iPaugZmv/g7guS2JC7r5X05/jhkUlss4Wecfc/lFhf+M//QYUrzGw7bd43lkv6n+Y24tEh1p/4YqbA+83FaWNXNbci/vxp/D3YXtEhtYUav0TwUtuJt/W2pHvjh8eY2adKNE9s/2oDja/5ZXefWaphXDg2FtblPkMmu/vGZtYVfp5+pOgw8+Y8VnA/5H4EICWcUwoglJ8pGvXrJun7ZvZLd38r5T5J0pMl1q0suN9QbLSvmba7lYn5YKmV7v66mb2g6BysA8xsF3dvHJ05Wpu/UPzIzE4qE2vbgvv9S7T7SNE5dUloHIlzReeNlfJHbT7XMHQxV6qI20JcfHxV0omKRs17KOqzFWneO5HetcwTZdYXfnnRdP88VJv3k0fL7OdSdPj7wGIr3P0dM3tK0T7wj2Z2t6IvO/7g7n8vs92t9Y6i8zVLeUjSN+P7hys6z7fR5+KfayQdYVbsLd3Cpwp+flrRIbrFVLx/hWTRjNeN79/KCj5DpOh3VCr9GSJV/nn6UsFnWrm25T
5PAbQDFKUAgoj/Wb1S0pWKDu8bL+mH6fZKUjQxUHMKr4tZql3TttuVabuwzPrGNgcrKnr20uZDBvsUtPmGih/e25x
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Number of rating rows for every item.\n",
"users_per_item = df.groupby('item')['rating'].count()\n",
"\n",
"plt.figure(figsize=(16,8))\n",
"plt.hist(users_per_item, bins=100)\n",
"\n",
"# Dashed vertical markers for the median and both quartiles.\n",
"# Each entry: (value, x-offset factor of the label, label height as a fraction of the y-limit, label template).\n",
"markers = [\n",
"    (users_per_item.median(), 1.1, 0.9, 'Median: {:.0f}'),\n",
"    (users_per_item.quantile(0.25), 1.1, 0.95, '25% quantile: {:.0f}'),\n",
"    (users_per_item.quantile(0.75), 1.05, 0.95, '75% quantile: {:.0f}'),\n",
"]\n",
"for t, x_factor, y_factor, template in markers:\n",
"    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"    plt.text(t*x_factor, plt.ylim()[1]*y_factor, template.format(t))\n",
"\n",
"plt.title('Number of ratings per item', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rating\n",
"1 0.06110\n",
"2 0.11370\n",
"3 0.27145\n",
"4 0.34174\n",
"5 0.21201\n",
"Name: user, dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Share of all ratings that fall on each rating value.\n",
"df.groupby('rating')['user'].count() / len(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Item attributes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"genre_table = pd.read_csv('./Datasets/ml-100k/u.genre', sep='|', header=None, encoding='latin-1')\n",
"# Column 0 holds the genre name and column 1 its numeric id; build an id -> name lookup.\n",
"genres = dict(zip(genre_table[1], genre_table[0]))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'unknown',\n",
" 1: 'Action',\n",
" 2: 'Adventure',\n",
" 3: 'Animation',\n",
" 4: \"Children's\",\n",
" 5: 'Comedy',\n",
" 6: 'Crime',\n",
" 7: 'Documentary',\n",
" 8: 'Drama',\n",
" 9: 'Fantasy',\n",
" 10: 'Film-Noir',\n",
" 11: 'Horror',\n",
" 12: 'Musical',\n",
" 13: 'Mystery',\n",
" 14: 'Romance',\n",
" 15: 'Sci-Fi',\n",
" 16: 'Thriller',\n",
" 17: 'War',\n",
" 18: 'Western'}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# u.item is read as pipe-separated, Latin-1 encoded text with no header row.\n",
"movies = pd.read_csv(\n",
"    './Datasets/ml-100k/u.item',\n",
"    sep='|',\n",
"    header=None,\n",
"    encoding='latin-1',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" <th>21</th>\n",
" <th>22</th>\n",
" <th>23</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Toy%20Story%2...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?GoldenEye%20(...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Four%20Rooms%...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 \\\n",
"0 1 Toy Story (1995) 01-Jan-1995 NaN \n",
"1 2 GoldenEye (1995) 01-Jan-1995 NaN \n",
"2 3 Four Rooms (1995) 01-Jan-1995 NaN \n",
"\n",
" 4 5 6 7 8 9 ... \\\n",
"0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... \n",
"1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... \n",
"2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... \n",
"\n",
" 14 15 16 17 18 19 20 21 22 23 \n",
"0 0 0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 1 0 0 \n",
"2 0 0 0 0 0 0 0 1 0 0 \n",
"\n",
"[3 rows x 24 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Columns 5..23 hold one 0/1 flag per genre id 0..18; replace each flag by the genre name ('' when unset).\n",
"for genre_id in range(19):\n",
"    column, label = genre_id + 5, genres[genre_id]\n",
"    # An already-converted label is accepted too, so re-running this cell does not blank every genre.\n",
"    movies[column] = movies[column].apply(\n",
"        lambda flag, label=label: label if flag == 1 or flag == label else ''\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Join the non-empty genre names of the 19 genre columns (positions 5..23) into one string per movie.\n",
"# The explicit upper bound keeps a previously added 'genre' column out of the join when the cell is re-run.\n",
"movies['genre'] = movies.iloc[:, 5:24].apply(lambda row: ', '.join(row[row != '']), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Keep the id, the title and the joined genre string, under descriptive column names.\n",
"movies = movies[[0, 1, 'genre']].rename(columns={0: 'id', 1: 'title', 'genre': 'genres'})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation, Children's, Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>Action, Adventure, Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Get Shorty (1995)</td>\n",
" <td>Action, Comedy, Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Copycat (1995)</td>\n",
" <td>Crime, Drama, Thriller</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id title genres\n",
"0 1 Toy Story (1995) Animation, Children's, Comedy\n",
"1 2 GoldenEye (1995) Action, Adventure, Thriller\n",
"2 3 Four Rooms (1995) Thriller\n",
"3 4 Get Shorty (1995) Action, Comedy, Drama\n",
"4 5 Copycat (1995) Crime, Drama, Thriller"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Write the id/title/genres table to disk, then preview its first rows.\n",
"movies.to_csv('./Datasets/ml-100k/movies.csv', index=False)\n",
"movies.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Toy example"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Create the output directory for the toy data set (no error if it already exists).\n",
"os.makedirs('./Datasets/toy-example/', exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Tiny hand-made data set, written in the same tab-separated, header-less layout as the ml-100k split.\n",
"# Rows are presumably [user, item, rating, timestamp] - TODO confirm against the notebooks that read them.\n",
"toy_train_rows = [\n",
"    [0, 0, 3, 0], [0, 10, 4, 0], [0, 40, 5, 0], [0, 70, 4, 0],\n",
"    [10, 10, 1, 0], [10, 20, 2, 0], [10, 30, 3, 0],\n",
"    [20, 30, 5, 0], [20, 50, 3, 0], [20, 60, 4, 0],\n",
"]\n",
"toy_test_rows = [\n",
"    [0, 60, 3, 0],\n",
"    [10, 40, 5, 0],\n",
"    [20, 0, 5, 0], [20, 20, 4, 0], [20, 70, 2, 0],\n",
"]\n",
"toy_train = pd.DataFrame(toy_train_rows)\n",
"toy_test = pd.DataFrame(toy_test_rows)\n",
"\n",
"toy_train.to_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, index=False)\n",
"toy_test.to_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}