WSS-project/P0. Data preparation.ipynb

699 lines
71 KiB
Plaintext
Raw Normal View History

2021-03-20 20:01:22 +01:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building train and test sets"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 1,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
2021-03-23 21:52:46 +01:00
"# if you don't have some library installed try using pip (or pip3) to install it - you can do it from the notebook\n",
"# example: !pip install tqdm\n",
"# also on labs it's better to use python3 kernel - ipython3 notebook\n",
"\n",
2021-03-20 20:01:22 +01:00
"import pandas as pd\n",
"import numpy as np\n",
"import scipy.sparse as sparse\n",
"import time\n",
"import random\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import helpers\n",
"\n",
"os.makedirs('./Datasets/', exist_ok = True)\n",
"\n",
"helpers.download_movielens_100k_dataset()\n",
"\n",
"df=pd.read_csv('./Datasets/ml-100k/u.data',delimiter='\\t', header=None)\n",
"df.columns=['user', 'item', 'rating', 'timestamp']\n",
"\n",
"train, test = train_test_split(df, test_size=0.2, random_state=30)\n",
"\n",
"train.to_csv('./Datasets/ml-100k/train.csv', sep='\\t', header=None, index=False)\n",
"test.to_csv('./Datasets/ml-100k/test.csv', sep='\\t', header=None, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Interactions properties"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### How data looks like?"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 2,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user</th>\n",
" <th>item</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>196</td>\n",
" <td>242</td>\n",
" <td>3</td>\n",
" <td>881250949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>186</td>\n",
" <td>302</td>\n",
" <td>3</td>\n",
" <td>891717742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>22</td>\n",
" <td>377</td>\n",
" <td>1</td>\n",
" <td>878887116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>244</td>\n",
" <td>51</td>\n",
" <td>2</td>\n",
" <td>880606923</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>166</td>\n",
" <td>346</td>\n",
" <td>1</td>\n",
" <td>886397596</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" user item rating timestamp\n",
"0 196 242 3 881250949\n",
"1 186 302 3 891717742\n",
"2 22 377 1 878887116\n",
"3 244 51 2 880606923\n",
"4 166 346 1 886397596"
]
},
2021-06-09 22:16:22 +02:00
"execution_count": 2,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sample properties"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 3,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We have 943 users, 1682 items and 100000 ratings.\n",
"\n",
2021-03-23 21:52:46 +01:00
"Average number of ratings per user is 106.0445. \n",
2021-03-20 20:01:22 +01:00
"\n",
"Average number of ratings per item is 59.453.\n",
"\n",
2021-03-27 11:58:44 +01:00
"Data sparsity (% of missing entries) is 93.6953%.\n"
2021-03-20 20:01:22 +01:00
]
}
],
"source": [
2021-03-23 21:52:46 +01:00
"users, items, ratings=df['user'].nunique(), df['item'].nunique(), len(df)\n",
2021-03-20 20:01:22 +01:00
"\n",
2021-03-23 21:52:46 +01:00
"print(f'We have {users} users, {items} items and {ratings} ratings.\\n')\n",
2021-03-20 20:01:22 +01:00
"\n",
2021-03-23 21:52:46 +01:00
"print(f'Average number of ratings per user is {round(ratings/users,4)}. \\n')\n",
"print(f'Average number of ratings per item is {round(ratings/items,4)}.\\n')\n",
2021-03-27 11:58:44 +01:00
"print(f'Data sparsity (% of missing entries) is {round(100*(1-ratings/(users*items)),4)}%.')"
2021-03-20 20:01:22 +01:00
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 4,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
2021-06-07 19:19:39 +02:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAHvCAYAAACsfXllAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABM+UlEQVR4nO3deZgcZbn+8fshC4JB2UJMwjKIyB4mSYNBFKIcJJIQUUSTnyAcPAkqKIhyjHIgQVGD4DHiwiERDkExqHCALMi+CAjCTAgIMQKBAbIQwk4gLAnP74+qmXRN9fRMZqm3eur7ua6+uruquvrut2qSfvqtesvcXQAAAAAAhLBJ6AAAAAAAgOKiKAUAAAAABENRCgAAAAAIhqIUAAAAABAMRSkAAAAAIBiKUgAAAABAMBSlAJARM5tmZh7fRofOU2vMbGsz+5GZPWBmr5nZu3Fbvhw6W08xs0vL9pm60HkAAOgJFKUAul3Zl+jm20c68JoJZctPyyAmaoiZDZLUKOn7kuolDZBkITN1hpmNjn+cmEaRCQBApG/oAAAK4ceSDgkdAjXtDEl18eO7Jf1e0ipJLumdQJk6Y7SkqfHj2yU1hQoCAEBeUJQCyMInzezf3P3m0EFQsw6P71+S9Cl3fyNkmKy4+/GSjg8cAwCAHsXhuwB6Unnh8JNgKdAb7BDf/6soBSkAAEVBUQqgJz0j6er4ccnMPhcyDGpa//j+raApAABAt6MoBdDT/kvSu/Hjc8ysT2dXVDYQ0u1dXdbMbm9eJn6+iZmdEE9/zsxeN7N/mNkZZrZFq9d+wMx+aGYPmdmrZvaKmf3VzL7Yic90uJlda2bLzOyt+H6OmR2wEevYNs55p5k9a2Zvm9nq+Pl/ts5f4fVNcVs0xc/fY2bfNLO7zGxVPMrt7Rv72crWv7mZfcvMbovzvRW38V1m9j0ze38br2sZrbhs8sEVBtIavZF5ji977fHxtJKZ/dbMHo+3fWK9Fvl4PPrvrWa2Iv4cr5vZk2Z2hZkdYWYVB19q/izacD6pJN1W4bPc3up1VUffjQdOSgwQZmY7mtnPzGxJnO9lM/ubmX3dzDp02o6ZfdbMFsTb/814H/m9xYOWVWrDNtbzGTP7k5k9YWZvxOtabmYPmtnvzOxLZrZVRzK1sf5Eu5nZQDP7gUV/u6/Ef5+NZjbFzDbbiPWOidv+MYtGen7DzJbG0z7Wzms3ev/qYKYOj8TckWUt+jv/upndZGYr4/15Tbyt7zOzX5jZp82sXzvvdYCZXWhmi+N97U0ze9rM/mhmY9t5baX9dzczm2Fm/4y3X9V9DEAv4e7cuHHj1q03RYPPuKQl8fNLy6Yd38ZrJpQtM62d9d6+ERkqLqtokJnmZQZIurnseevbQklbxa87QNJzVZY9v0qmaWXLjZb06yrrWS9pagc+5/GSXq2yHpf0rKQDqqyjKV6uSdLOkh6usI5227yNdY+StLydfM8rOk+0WntVu43eyEzHl++PkqZIWldtvZL+t4NZ/iLpfV34LLe3et2lZfPqKqx3dNn8aZLGKDrvtq313yhp0ypt00/Sn6q8fp2kb7duwwrr2UzS/A5+5lO74d+a2yUNl7Ssyvs8JmnndtY3UNItHcj8W0n9umv/6uBnrbovbOR+s0vcHh3ZPvVtvMd7Jc3pwOvnS9qijXW03n+/rOi0j9brSO1j3Lhx6103BjoCkIWpkiYqOgRzmpn9wd3fDpyp3P8qGh34bkVfyJ+VtJOkk+L74ZJmmNlUSTco+hy/lXSXpLclfVzSJEWDx33bzG5w95vaec9TJB2pqCD7raSHJG2uqKg4StGRLNPM7AV3/1WlFZjZKZJmxE/fknSVpDslvSBp63hdn5E0SNLNZrafuy+ukmlTSf8naa/4s10laYWiL+qD2vk8lfLVS7pVUYEiSQ9I+oOkpyV9QNIXJB0oaRtJ883sU+5+e9kqrpC0KH58dXz/iKLe93IPb2y2Ml+Q9GlJr0iareiyM+sl7RtPa7aZoja+Q9J9kpZKel1R23xY0rHa0OaXKdq25Zo/ywRJzT3qZ1bI/nwXPku9pNMVXSrnIkn3xJlLkr6qqIg4VNFIxme1sY6Zko6OH7+pqLi5R1GblCR9RdL5kq5sJ8uPJTX3kq1UNFryI5LWKPoR6EOKfuA5qMOfrrr3K9p3hyoqvK+R9KKk3eLMO8bveYuZ1bv7q61XYGZbK/qsu8STFkv6s6RHFR3tsZeiInP7eJ191f4gVB3dvzIT9+b/WVF7SNF+eaWkJxSNZL2VpD0kfULRPlVpHZsq+iFvVDzpaUUF6iOK9rkPKSowd1O0H1xjZoe6+7sVVtfsQEX75npJFyv69/jNeB3PbvQHBVBbQlfF3Lhx6303bfh1e0nZtAvKpn+zwmtC9pS6pO9XWGagNvTyrVP05e05ScMqLHts2bqua+M9p7V6z4clbVdhuSMVfTl0RYXPjhWWGVm2zBJJu7bxnmMVFc4u6e9tLNPUKte3umEf2ETJHtcZkjapsNyZZcs8Lek9Xd32Hch2fKvP+09JQ9p5zcclbVll/nuV7GE8uAP7wOgOZL20bPm6CvNHt/osT1XaFyTtX7a/vKgKvaWKfphpXs9qSXtXWKauwv5yfKtl+kh6OZ7XVGkfb/U3tnsXtmXrHrWTKiwzQNJtZcv8so11XV22zH+1sb8OUPTDVPNyY7pj/+rgZ626L3R0WUU/LjTPmyepT5X17ClpmwrTf162jgsl9a+wTD9FhXjzcl/twP67UtKeXW0rbty41d6Nc0oBZOUcRQWWJJ1hZgNChmnlBnf/ceuJ7r5aUnMvZR9FvRsnu/tDFZb9naLD4STpkA6cu7dO0hfd/bkK67pG0s/ip5tL+lqF109V1FPzlqRx7v5YhWXk7gskTY+f7m9mH20n19Xu/vN2lumIcYp6liTpXkWFbqqXxN1/KGlB/HQHScd0w3tvDJc0wd1XVF3I/U53f7nK/NcV9Z417+PHdlvCjXNMpX3B3e+T9Mf46VaKitTWvlX2+GR3T/VAu3uT2u8dHKio51KSrq20j5etb7W7L2lnfR11hbv/usJ7rFH0o1dz7+hXzGzL8mXMbIQ29G5f4u7ntLG/Nq+ruZfztHYydWj/ytiHyh5f4u7r21rQ3Re7+wvl08xssKSvx09vcfeveYUjX9z9HUn/oagHVmq/rSTpRK9+NAeAXoqiFEAm4i+mM+Kn20k6NViYtIqHx8buLnu8StUPW7wrvu+vDYcAtuUGd3+kyvwZig5jk6TPls+IB4ZpPjTyWnd/vJ33+n3Z40+1s+wv25nfUeUjLZ/n7l5l2ellj7MeoflOd3+wO1bk7q9J+kf89CPdsc6N9IC731ll/q1lj/csn2Fm79GGfWOlosM7K/LoEOvUDzNl1rb1Pj3sZ23NcPdV2vB3sJmiw6zLlf+IcH61N3H3lyRdFz89KD6UtS3dtn91o/JLKu3V5lJt+4I2jIbdZptLLYVp848hu7YzQNNTinpuARQQ55QCyNJ5inr9tpb0HTP7jbu/GDiTJP29yrxVZY8bK/WetLFseyOK3lJtprs/a2b/lLS3pA+b2fvdvbl35kBt+FHxTTM7sp33Kh89c48qy61XdE5dd2juiXNJ7Z1f+zdtONcw62KuWhGXEBcfX1B0nu6+is6zHaDoHM7Wtu+WdBvn3nbmLy973Hr/3Fcb9pM72tnPpejw92GVZrj7K2Z2n6J94N/M7GpFP3bcGRcpPeEVRedrVnOrNvTw7afoPN9mH4/v35a0m5nt1s66Ni27/6CiQ3Qr6fD+laG7FP1wsJmkqfGPXLMrHQHSho+XPd6uA//+lO9reyg6pLtirnZ+vALQi1GUAshM/GX1XEnnKjq8b4qk/wybSlI0MFBbyq+LWW251su+p51l2+vdbF5mb0VFzwe04ZDBurJlvhzfOqpasfyCu7+5EeuqZnB8/2zcg9gmd3/XzJYqKoy2NrP+lQ4H7CHL219EMrN9FA38tGsH1/u+TifqvPYGSaq2fw4pe/yE2tfeMicp+uHlfYoOiz1S0utm9ndFRdHNku7uQPHbUUs7UNCU/80NaTW
2021-03-20 20:01:22 +01:00
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"items_per_user=df.groupby(['user']).count()['rating']\n",
"\n",
"plt.figure(figsize=(16,8))\n",
"plt.hist(items_per_user, bins=100)\n",
"\n",
"# Let's add median\n",
"t=items_per_user.median()\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.9, 'Median: {:.0f}'.format(t))\n",
"\n",
"# Let's add also some percentiles\n",
"t=items_per_user.quantile(0.25)\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.95, '25% quantile: {:.0f}'.format(t))\n",
"\n",
"t=items_per_user.quantile(0.75)\n",
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.05, plt.ylim()[1]*0.95, '75% quantile: {:.0f}'.format(t))\n",
"\n",
"plt.title('Number of ratings per user', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 5,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
2021-06-07 19:19:39 +02:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAHvCAYAAACsfXllAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABOH0lEQVR4nO3de5wU5ZX/8e8RJF7whiJyUTGIRlEcYTBkTQwxm0AU0ISY4EbFNcuwURLNVYw/BaLu6ibu4n0ZEhWzrsbVuHJJwEtkvUTFGQTjXYgYLiMgBgVFCXh+f1QNNENPTw9T81TVzOf9evWru6uq6zx9+mHo00/VU+buAgAAAAAgDbuk3QAAAAAAQPtFUQoAAAAASA1FKQAAAAAgNRSlAAAAAIDUUJQCAAAAAFJDUQoAAAAASA1FKQAEYGaTzMzj25C025M3ZtbFzK4ys+fMbL2ZfRzncl3abWstZnZ7QZ/pnXZ7kB4zG1LQFyal3R4ASFrHtBsAoG0xs4YXPx7s7s808ZrRku6Kn05290mt0Tbkk5l1k/S0pN4pN6VF4h8jhsRPb3f3pWm1BW2PmV0kaV9J69x9SqqNAYBmoigF0Nr+RdIX024Ecu1SbStIn5T0X5JWSXJJf0upTTtjiKSJ8eN5kpam1RC0SRdJOlTSm5KmpNoSAGgmilIAre1kM/t7d3847YYgt06J7/8q6cvu/kGajQnF3c+VdG7KzUAGuPs8SZZ2OwCgtXBOKYDWUlg4/GtqrUBbcHB8/2p7KUgBAGhPKEoBtJZlku6PH1ea2dfSbAxyrVN8/1GqrQAAAK2CohRAa/p/kj6OH19pZh12dkcFM0/Oa+m2Zjavfpv4+S5mdl68fLWZvW9mfzKzS81srwavPcjMrjCz583sPTN718weM7Nv7sR7OsXMHjCz5Wb2UXx/l5l9phn7OCBu5+Nm9paZbTKzNfHznzRsf5HXL41zsTR+vpuZfc/MnjCzVfEst/Oa+94K9r+HmX3fzB6N2/dRnOMnzOwSM9unkddtna24YPHnCz7bnZrJ2MzOLXjtufGySjP7pZktjj/77fZrkc/Fs//+wcxWxu/jfTN7w8zuNrMRZlb08Mr696Jt55NK0qNF3su8Bq8rOftusRlZzewQM7vWzF6J27fOzP5oZuebWVmn7JjZV81sdvz5fxj3kf8ys083lsNG9nOamd1jZn82sw/ifa0ws0Vm9msz+5aZ7VdOmxrZ/3Z5M7OuZvYzi/7tvhv/+6w1swlmtnsz9jsszv3rFs30/IGZLYmXfbaJ1za7f5XZpkZn340/H1d0PqkkHVqkbzX6WZlZJzP7tpnNMLNl8ee0zqK/cdcW63sNXr9DPzWzUWb2+/jfysa4P/7czLo2eO0+Zvbj+HP6q5ltMLNnzeyfzYzvqEB74u7cuHHjlthN0eQzLumV+PntBcvObeQ1owu2mdTEfuc1ow1Ft1U0yUz9Np0lPVzwvOFtgaT94td9RtLqEtv+okSbJhVsN0TSTSX2s0XSxDLe57mS3iuxH5f0lqTPlNjH0ni7pZIOk/RCkX00mfNG9j1Y0oom2ve2ovNES+Wr1G1IM9t0bmF/lDRB0uZS+5V0W5lt+b2kvVvwXuY1eN3tBet6F9nvkIL1kyQNU3TebWP7f1DSJ0rkZldJ95R4/WZJP2yYwyL72V3SrDLf80UJ/K2ZJ+l4SctLxHld0mFN7K+rpEfKaPMvJe2aVP8q871u91k38m+4qVuxz6pS0p+beN1HksaVaFthP+2jaCKyxva1VNKh8euOlLS4xLb3SLKd7R/cuHHL142JjgC0tomSzlR0COYkM/tvd9+UcpsK3aZoduAnFX0JekvRiMMF8f3xkqaY2URJcxW9j19KekLSJkmfkzRW0cRxPzSzue7+UBMxL5R0uqKC7JeSnpe0h6KiYpSio1gmmdlad7+x2A7M7EJtm2HzI0n3SXpc0lpJXeJ9nSapm6SHzWyQu79Uok2fkPRbSf3i93afpJWKvqh3a+L9FGtfhaQ/KCpQJOk5Sf8t6S+SDpL0DUknStpf0iwz+7JHk7nUu1vSwvjx/fH9i4pG3wu90Ny2FfiGpK9IelfSdEm1in4QOC5eVm93RTn+P0nzJS2R9L6i3Bwh6Wxty/kdij7bQvXvZbSk+hH1y4q0/e0WvJcKST9WNBnOVElPxW2ulPTPkvaU9CVFMxlf3sg+qiWdET/+UFGx8ZSinFRK+rakX0i6t4m2/IukU+PHdYqKlBclbVD0I9Dhin7gOansd1faPor6bk9Fhff/SnpHUdHzbUmHxDEfMbMKd3+v4Q7MrIui99onXvSSpP+R9Jqioz36KSoye8X77KimJ6Eqt3+1VJWivx/VivrkmnhZQwsKn1h0RMbD8WulqCD/vaJTL3ZT9BmdE6//TzP7yN1vb6ItV0v6uqK/af+laCbggxT9jTxG0d/UO8zsNEkPKfrMfhO3Y72kAYr+9u6pqC8+qOhvJIC2Lu2qmBs3bm3rpm2/cr9SsOz6guXfK/KaNEdKXdJPi2zTVdtG+TYrKipWS+pfZNuzC/b1u0ZiTmoQ8wVJBxbZ7nRFlzlxRYXPIUW2GViwzSuS+jYS81RFhbNLeqaRbZY2aNf3E+gDu2j7EdcpknYpst1lBdv8RdJuLf3sy2jbuQ3e78uSejTxms9J2rfE+j21/Qjj58voA0PKaOvtBdv3LrJ+SIP38maxviDphIL+8o6KjJYq+mGmfj9rJB1TZJveRfrLuQ226SBpnbaNiu3Qxxv8G/tUCz7LhiNrFxTZprOkRwu2uaGRfd1fsM3/a6S/dlb0w1T9dsOS6F9lvtfCz3pSI9vUfzZLy9jfXvG/OVf0Y8FXGtnu8Lhf1W93QBP91CX9Z8P8KfphZ1HBNjWKJsM7ucj+TlL0Q4BLeqmluePGjVs+bhyvDyCEKxUVWJJ0qZl1TrMxDcx1939puNDd10iqH6XsoGh0Y7y7P19k218rOjxQkr5Yxrl7myV9091XF9nX/0q6Nn66h6TvFHn9REUjNR9JGu7urxfZRu4+W9HIhSSdYGZ/10S77nf3/2him3IMVzSyJElPKyp0P264kbtfIWl2/PRgSWclELs5XNJod19ZciP3x919XYn17ysaPavv42cn1sLmOatYX3D3+YpGoyRpP0VFakPfL3g83t13GIF296VqenSwq6KRS0l6oFgfL9jfGnd/pYn9letud7+pSIwNin70qh8d/baZ7Vu4jZkN0LbR7Vvd/cpG+mv9vupHOX/QRJvK6l8pGatts1p/x91/X2wjd18s6R/jp3uq+AhsoRcU9Z/t8ufuG7Xtb5EU/bA2yd3/UCTmY4pGTiXpKDM7uOE2ANoeilIArS7+Yjolfnqgoou8Z0XRw2NjTxY8XqXShy0+Ed930rZDABsz191fLLF+iqLD/CTpq4Ur4olh6g+NfCD+0ljKfxU8/nIT297QxPpyFc60/HN39xLbFn5RDT1D8+PuviiJHbn7ekl/ip9+Ool9NtNz7v54ifWFX/6PLlxhZrtpW9+oU3TYalEeHWK9ww8zBTY2FqeVXdvYCndfpW3/DnZXdJh1ocIfEX5RKoi7/1XS7+KnJ5nZJ0psnlj/agX177lO0p2lNowLx/rCuqm/IVPdfXMj6wr/nm5RdJh5Y54oeByyHwFICeeUAgjl54pG/bpI+pGZ3ezu76TcJkl6psS6VQWPa4uNnjSybVMzij5SaqW7v2VmLys6B+sIM9vH3etHZ07Uth8UPzSz05uItWvB46NKbLdF0Tl1SagfiXNF542V8kdtO9cwdDFXqojbTlx8fEPRebrHKTrPtrOiczgb6pVI65rn6SbWryh43LB/Hqdt/eT/mujnUnT4e/9iK9z9XTObr6gP/L2Z3a/ox47H3f1vTex3Z72r6HzNUv4g6fz48SBF5/nW+1x8v0nSkWZ2ZBP7+kTB/ScVHaJbTNn9KySLZryu//zqJI204hNHF9oQ35f6GyKV//f01YK/aU1tu9MzNAPID4pSAEHEX1avkXSNosP7Jkj6SbqtkhRNDNSYwutiltqu4ba7NbFtU6Ob9dsco6joOUjbDhn
2021-03-20 20:01:22 +01:00
"text/plain": [
"<Figure size 1152x576 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
2021-03-26 21:00:52 +01:00
"users_per_item=df.groupby(['item']).count()['rating']\n",
2021-03-20 20:01:22 +01:00
"\n",
"plt.figure(figsize=(16,8))\n",
2021-03-26 21:00:52 +01:00
"plt.hist(users_per_item, bins=100)\n",
2021-03-20 20:01:22 +01:00
"\n",
"# Let's add median\n",
2021-03-26 21:00:52 +01:00
"t=users_per_item.median()\n",
2021-03-20 20:01:22 +01:00
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.9, 'Median: {:.0f}'.format(t))\n",
"\n",
"# Let's add also some percentiles\n",
2021-03-26 21:00:52 +01:00
"t=users_per_item.quantile(0.25)\n",
2021-03-20 20:01:22 +01:00
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.1, plt.ylim()[1]*0.95, '25% quantile: {:.0f}'.format(t))\n",
"\n",
2021-03-26 21:00:52 +01:00
"t=users_per_item.quantile(0.75)\n",
2021-03-20 20:01:22 +01:00
"plt.axvline(t, color='k', linestyle='dashed', linewidth=1)\n",
"plt.text(t*1.05, plt.ylim()[1]*0.95, '75% quantile: {:.0f}'.format(t))\n",
"\n",
"plt.title('Number of ratings per item', fontsize=30)\n",
"plt.show()"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 6,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rating\n",
"1 0.06110\n",
"2 0.11370\n",
"3 0.27145\n",
"4 0.34174\n",
"5 0.21201\n",
"Name: user, dtype: float64"
]
},
2021-06-09 22:16:22 +02:00
"execution_count": 6,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(['rating']).count()['user']/len(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Item attributes"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 7,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"genres = pd.read_csv('./Datasets/ml-100k/u.genre', sep='|', header=None,\n",
" encoding='latin-1')\n",
"genres=dict(zip(genres[1], genres[0]))"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 8,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'unknown',\n",
" 1: 'Action',\n",
" 2: 'Adventure',\n",
" 3: 'Animation',\n",
" 4: \"Children's\",\n",
" 5: 'Comedy',\n",
" 6: 'Crime',\n",
" 7: 'Documentary',\n",
" 8: 'Drama',\n",
" 9: 'Fantasy',\n",
" 10: 'Film-Noir',\n",
" 11: 'Horror',\n",
" 12: 'Musical',\n",
" 13: 'Mystery',\n",
" 14: 'Romance',\n",
" 15: 'Sci-Fi',\n",
" 16: 'Thriller',\n",
" 17: 'War',\n",
" 18: 'Western'}"
]
},
2021-06-09 22:16:22 +02:00
"execution_count": 8,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 9,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"movies = pd.read_csv('./Datasets/ml-100k/u.item', sep='|', encoding='latin-1', header=None)"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 10,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" <th>21</th>\n",
" <th>22</th>\n",
" <th>23</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Toy%20Story%2...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?GoldenEye%20(...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>01-Jan-1995</td>\n",
" <td>NaN</td>\n",
" <td>http://us.imdb.com/M/title-exact?Four%20Rooms%...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 \\\n",
"0 1 Toy Story (1995) 01-Jan-1995 NaN \n",
"1 2 GoldenEye (1995) 01-Jan-1995 NaN \n",
"2 3 Four Rooms (1995) 01-Jan-1995 NaN \n",
"\n",
" 4 5 6 7 8 9 ... \\\n",
"0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... \n",
"1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... \n",
"2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... \n",
"\n",
" 14 15 16 17 18 19 20 21 22 23 \n",
"0 0 0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 1 0 0 \n",
"2 0 0 0 0 0 0 0 1 0 0 \n",
"\n",
"[3 rows x 24 columns]"
]
},
2021-06-09 22:16:22 +02:00
"execution_count": 10,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies[:3]"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 11,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"for i in range(19):\n",
" movies[i+5]=movies[i+5].apply(lambda x: genres[i] if x==1 else '')"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 12,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"movies['genre']=movies.iloc[:, 5:].apply(lambda x: ', '.join(x[x!='']), axis = 1)"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 13,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"movies=movies[[0,1,'genre']]\n",
"movies.columns=['id', 'title', 'genres']"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 14,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation, Children's, Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>Action, Adventure, Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Four Rooms (1995)</td>\n",
" <td>Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Get Shorty (1995)</td>\n",
" <td>Action, Comedy, Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Copycat (1995)</td>\n",
" <td>Crime, Drama, Thriller</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id title genres\n",
"0 1 Toy Story (1995) Animation, Children's, Comedy\n",
"1 2 GoldenEye (1995) Action, Adventure, Thriller\n",
"2 3 Four Rooms (1995) Thriller\n",
"3 4 Get Shorty (1995) Action, Comedy, Drama\n",
"4 5 Copycat (1995) Crime, Drama, Thriller"
]
},
2021-06-09 22:16:22 +02:00
"execution_count": 14,
2021-03-20 20:01:22 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.to_csv('./Datasets/ml-100k/movies.csv', index=False)\n",
"movies[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Toy example"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 15,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"os.makedirs('./Datasets/toy-example/', exist_ok = True)"
]
},
{
"cell_type": "code",
2021-06-09 22:16:22 +02:00
"execution_count": 16,
2021-03-20 20:01:22 +01:00
"metadata": {},
"outputs": [],
"source": [
"toy_train=pd.DataFrame([[0,0,3,0], [0,10,4,0], [0,40,5,0], [0,70,4,0],\n",
" [10,10,1,0], [10,20,2,0], [10,30,3,0],\n",
" [20,30,5,0], [20,50,3,0], [20,60,4,0]])\n",
"toy_test=pd.DataFrame([[0,60,3,0],\n",
" [10,40,5,0],\n",
" [20,0,5,0], [20,20,4,0], [20,70,2,0]])\n",
"\n",
"toy_train.to_csv('./Datasets/toy-example/train.csv', sep='\\t', header=None, index=False)\n",
"toy_test.to_csv('./Datasets/toy-example/test.csv', sep='\\t', header=None, index=False)"
]
2021-06-07 19:19:39 +02:00
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
2021-03-20 20:01:22 +01:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2021-06-07 19:19:39 +02:00
"version": "3.8.8"
2021-03-20 20:01:22 +01:00
}
},
"nbformat": 4,
"nbformat_minor": 4
}