REK-proj-2/project_1_data_preparation.ipynb
2021-06-28 20:18:14 +02:00

2187 lines
76 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "alike-morgan",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%matplotlib inline\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from IPython.display import Markdown, display, HTML\n",
"from collections import defaultdict\n",
"\n",
"# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n",
"import os\n",
"os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "friendly-herald",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from data_preprocessing.dataset_specification import DatasetSpecification\n",
"from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit\n",
"from data_preprocessing.people_identifier import PeopleIdentifier"
]
},
{
"cell_type": "markdown",
"id": "prepared-signal",
"metadata": {},
"source": [
"# Load original data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "solid-crisis",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>reservation_id</th>\n",
" <th>group_id</th>\n",
" <th>room_id</th>\n",
" <th>room_group_id</th>\n",
" <th>date_from</th>\n",
" <th>date_to</th>\n",
" <th>booking_date</th>\n",
" <th>booking_time</th>\n",
" <th>n_people</th>\n",
" <th>n_children_1</th>\n",
" <th>n_children_2</th>\n",
" <th>n_children_3</th>\n",
" <th>discount</th>\n",
" <th>accomodation_price</th>\n",
" <th>meal_price</th>\n",
" <th>service_price</th>\n",
" <th>paid</th>\n",
" <th>rate_plan</th>\n",
" <th>client_id</th>\n",
" <th>client_name</th>\n",
" <th>email</th>\n",
" <th>phone</th>\n",
" <th>is_company</th>\n",
" <th>reservation_status</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>14160</td>\n",
" <td></td>\n",
" <td>135</td>\n",
" <td>135</td>\n",
" <td>2017-09-01</td>\n",
" <td>2018-03-30</td>\n",
" <td>2017-07-04</td>\n",
" <td>2017-07-04 10:52:00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>Standard</td>\n",
" <td>51665</td>\n",
" <td>86bd787ca115281ad9642c5fd6e79e6f2d87841c2fd9c6812b32b6a761109b62</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16075</td>\n",
" <td></td>\n",
" <td>118</td>\n",
" <td>118</td>\n",
" <td>2018-02-10</td>\n",
" <td>2018-02-12</td>\n",
" <td>2017-08-17</td>\n",
" <td>2017-08-17 15:01:00</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>992.29</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>Standard</td>\n",
" <td>54117</td>\n",
" <td>ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c</td>\n",
" <td></td>\n",
" <td>318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>16076</td>\n",
" <td></td>\n",
" <td>270</td>\n",
" <td>270</td>\n",
" <td>2018-02-28</td>\n",
" <td>2018-03-02</td>\n",
" <td>2017-08-17</td>\n",
" <td>2017-08-17 15:08:00</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>693.40</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>693.40</td>\n",
" <td>Standard</td>\n",
" <td>54118</td>\n",
" <td>4db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639a</td>\n",
" <td>f9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3</td>\n",
" <td>cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e62387</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>16635</td>\n",
" <td></td>\n",
" <td>294</td>\n",
" <td>294</td>\n",
" <td>2018-02-14</td>\n",
" <td>2018-02-15</td>\n",
" <td>2017-08-29</td>\n",
" <td>2017-08-29 13:58:00</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>366.80</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>Standard</td>\n",
" <td>54790</td>\n",
" <td>e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdee</td>\n",
" <td>f6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc35</td>\n",
" <td>1c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>16964</td>\n",
" <td></td>\n",
" <td>183</td>\n",
" <td>183</td>\n",
" <td>2018-02-03</td>\n",
" <td>2018-02-09</td>\n",
" <td>2017-09-04</td>\n",
" <td>2017-09-04 15:52:00</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>1064.60</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>Standard</td>\n",
" <td>55177</td>\n",
" <td>5380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf</td>\n",
" <td>6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a72</td>\n",
" <td>3aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa6165793</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>17173</td>\n",
" <td></td>\n",
" <td>64</td>\n",
" <td>64</td>\n",
" <td>2018-01-29</td>\n",
" <td>2018-02-02</td>\n",
" <td>2017-09-07</td>\n",
" <td>2017-09-07 13:21:00</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>713.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>Standard</td>\n",
" <td>55412</td>\n",
" <td>4aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d9</td>\n",
" <td>0d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086</td>\n",
" <td>ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e6</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>17308</td>\n",
" <td></td>\n",
" <td>111</td>\n",
" <td>111</td>\n",
" <td>2018-03-28</td>\n",
" <td>2018-03-31</td>\n",
" <td>2017-09-11</td>\n",
" <td>2017-09-11 10:31:00</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>800.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>800.00</td>\n",
" <td>Standard</td>\n",
" <td>55560</td>\n",
" <td>1f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a44</td>\n",
" <td>6163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9</td>\n",
" <td>516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f44</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>120165</td>\n",
" <td></td>\n",
" <td>162</td>\n",
" <td>162</td>\n",
" <td>2018-11-16</td>\n",
" <td>2018-11-17</td>\n",
" <td>2018-02-19</td>\n",
" <td>2018-02-19 17:44:00</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>402.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>402.00</td>\n",
" <td>Standard</td>\n",
" <td>63419</td>\n",
" <td>d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd9</td>\n",
" <td>5213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd1972251</td>\n",
" <td>6416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f8290633</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>120183</td>\n",
" <td></td>\n",
" <td>45</td>\n",
" <td>45</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-18</td>\n",
" <td>2018-02-19</td>\n",
" <td>2018-02-19 17:44:00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>660.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>660.00</td>\n",
" <td>Standard</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>120184</td>\n",
" <td></td>\n",
" <td>64</td>\n",
" <td>64</td>\n",
" <td>2018-08-17</td>\n",
" <td>2018-08-18</td>\n",
" <td>2018-02-19</td>\n",
" <td>2018-02-19 17:44:00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>320.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>320.00</td>\n",
" <td>Standard</td>\n",
" <td>61778</td>\n",
" <td>364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d46895</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>120185</td>\n",
" <td></td>\n",
" <td>126</td>\n",
" <td>126</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-18</td>\n",
" <td>2018-02-19</td>\n",
" <td>2018-02-19 17:44:00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>720.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>720.00</td>\n",
" <td>Standard</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>120186</td>\n",
" <td></td>\n",
" <td>65</td>\n",
" <td>65</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-18</td>\n",
" <td>2018-02-19</td>\n",
" <td>2018-02-19 17:44:00</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>480.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>480.00</td>\n",
" <td>Standard</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>120194</td>\n",
" <td></td>\n",
" <td>267</td>\n",
" <td>267</td>\n",
" <td>2018-08-10</td>\n",
" <td>2018-08-11</td>\n",
" <td>2018-02-19</td>\n",
" <td>2018-02-19 17:44:00</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>595.98</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>595.98</td>\n",
" <td>Standard</td>\n",
" <td>61378</td>\n",
" <td>f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0</td>\n",
" <td>a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5</td>\n",
" <td>446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>120209</td>\n",
" <td></td>\n",
" <td>370</td>\n",
" <td>362</td>\n",
" <td>2018-08-03</td>\n",
" <td>2018-08-05</td>\n",
" <td>2018-02-19</td>\n",
" <td>2018-02-19 17:44:00</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>915.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>915.00</td>\n",
" <td>Standard</td>\n",
" <td>61105</td>\n",
" <td>8373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc9</td>\n",
" <td>37827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f7</td>\n",
" <td>27673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc8666173766</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>120232</td>\n",
" <td></td>\n",
" <td>321</td>\n",
" <td>321</td>\n",
" <td>2018-07-22</td>\n",
" <td>2018-07-30</td>\n",
" <td>2018-02-19</td>\n",
" <td>2018-02-19 17:44:00</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>3579.40</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3579.40</td>\n",
" <td>Standard</td>\n",
" <td>61807</td>\n",
" <td>cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d9751</td>\n",
" <td></td>\n",
" <td>5aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df5</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Location of the raw hotel reservation export, relative to the notebook's working directory.\n",
"data_path = os.path.join(\"data\", \"hotel_data\")\n",
"\n",
"original_data = pd.read_csv(os.path.join(data_path, \"hotel_data_original.csv\"), index_col=0)\n",
"\n",
"# Normalise the literal backslash-N placeholder (presumably the export's NULL marker) and NaN to empty strings.\n",
"original_data = original_data.replace({\"\\\\N\": \"\"})\n",
"original_data = original_data.fillna(\"\")\n",
"\n",
"numeric_columns = [\"n_people\", \"n_children_1\", \"n_children_2\", \"n_children_3\",\n",
"                   \"discount\", \"accomodation_price\", \"meal_price\", \"service_price\",\n",
"                   \"paid\"]\n",
"\n",
"# Values that cannot be parsed as numbers (e.g. the empty strings introduced above) become NaN.\n",
"# Plain column assignment replaces the column together with its dtype, which `.loc[:, column] = ...`\n",
"# no longer guarantees in recent pandas versions.\n",
"for column in numeric_columns:\n",
"    original_data[column] = pd.to_numeric(original_data[column], errors=\"coerce\")\n",
"\n",
"# An explicit datetime unit is required: pandas >= 2.0 raises a TypeError for the unit-less np.datetime64.\n",
"original_data = original_data.astype(\n",
"    {\n",
"        \"date_from\": \"datetime64[ns]\",\n",
"        \"date_to\": \"datetime64[ns]\",\n",
"        \"booking_time\": \"datetime64[ns]\",\n",
"        \"booking_date\": \"datetime64[ns]\",\n",
"        \"n_people\": np.int64,\n",
"        \"n_children_1\": np.int64,\n",
"        \"n_children_2\": np.int64,\n",
"        \"n_children_3\": np.int64,\n",
"        \"discount\": np.float64,\n",
"        \"accomodation_price\": np.float64,\n",
"        \"meal_price\": np.float64,\n",
"        \"service_price\": np.float64,\n",
"        \"paid\": np.float64,\n",
"    }\n",
")\n",
"\n",
"display(HTML(original_data.head(15).to_html()))"
]
},
{
"cell_type": "markdown",
"id": "endangered-lingerie",
"metadata": {},
"source": [
"# Preprocess the data\n",
"\n",
"- Identify users by client_id, name hash, phone hash, email hash.\n",
"- Fix date_to - originally it points to the last full day of stay, not the departure date.\n",
"- Add length of stay.\n",
"- Add book to arrival.\n",
"- Add number of rooms (important for group reservations).\n",
"- Add indicator for stays encompassing a weekend.\n",
"- Add night price.\n",
"- Fix book to arrival to be not smaller than 0.\n",
"- Filter out companies as recommendations for such clients should work differently.\n",
"- Aggregate group reservations into single interactions.\n",
"\n",
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
"In the file data_preprocessing/data_preprocessing_toolkit write code for the add_length_of_stay and add_night_price methods:\n",
" - add_length_of_stay - should add 'length_of_stay' variable to the DataFrame, which counts the number of nights the customer stayed at the hotel,\n",
" - add_night_price - should add 'night_price' column to the dataset DataFrame, which shows the average accommodation price per night per room (there can be many rooms in group reservations - 'n_rooms' column).\n",
"You have to pass all assertions."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "swedish-iceland",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_people</th>\n",
" <th>n_children_1</th>\n",
" <th>n_children_2</th>\n",
" <th>n_children_3</th>\n",
" <th>accomodation_price</th>\n",
" <th>meal_price</th>\n",
" <th>service_price</th>\n",
" <th>paid</th>\n",
" <th>n_rooms</th>\n",
" <th>discount</th>\n",
" <th>room_id</th>\n",
" <th>room_group_id</th>\n",
" <th>date_from</th>\n",
" <th>date_to</th>\n",
" <th>booking_date</th>\n",
" <th>rate_plan</th>\n",
" <th>length_of_stay</th>\n",
" <th>book_to_arrival</th>\n",
" <th>weekend_stay</th>\n",
" <th>user_id</th>\n",
" <th>client_id</th>\n",
" <th>client_name</th>\n",
" <th>email</th>\n",
" <th>phone</th>\n",
" <th>is_company</th>\n",
" <th>night_price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>992.29</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>118</td>\n",
" <td>118</td>\n",
" <td>2018-02-10</td>\n",
" <td>2018-02-13</td>\n",
" <td>2017-08-17</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>177</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>54117</td>\n",
" <td>ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c</td>\n",
" <td></td>\n",
" <td>318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a</td>\n",
" <td>0</td>\n",
" <td>330.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>693.40</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>693.40</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>270</td>\n",
" <td>270</td>\n",
" <td>2018-02-28</td>\n",
" <td>2018-03-03</td>\n",
" <td>2017-08-17</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>195</td>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>54118</td>\n",
" <td>4db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639a</td>\n",
" <td>f9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3</td>\n",
" <td>cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e62387</td>\n",
" <td>0</td>\n",
" <td>231.13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>366.80</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>294</td>\n",
" <td>294</td>\n",
" <td>2018-02-14</td>\n",
" <td>2018-02-16</td>\n",
" <td>2017-08-29</td>\n",
" <td>Standard</td>\n",
" <td>2</td>\n",
" <td>169</td>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>54790</td>\n",
" <td>e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdee</td>\n",
" <td>f6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc35</td>\n",
" <td>1c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac</td>\n",
" <td>0</td>\n",
" <td>183.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1064.60</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>183</td>\n",
" <td>183</td>\n",
" <td>2018-02-03</td>\n",
" <td>2018-02-10</td>\n",
" <td>2017-09-04</td>\n",
" <td>Standard</td>\n",
" <td>7</td>\n",
" <td>152</td>\n",
" <td>True</td>\n",
" <td>4</td>\n",
" <td>55177</td>\n",
" <td>5380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf</td>\n",
" <td>6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a72</td>\n",
" <td>3aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa6165793</td>\n",
" <td>0</td>\n",
" <td>152.09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>713.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>64</td>\n",
" <td>64</td>\n",
" <td>2018-01-29</td>\n",
" <td>2018-02-03</td>\n",
" <td>2017-09-07</td>\n",
" <td>Standard</td>\n",
" <td>5</td>\n",
" <td>144</td>\n",
" <td>True</td>\n",
" <td>5</td>\n",
" <td>55412</td>\n",
" <td>4aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d9</td>\n",
" <td>0d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086</td>\n",
" <td>ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e6</td>\n",
" <td>0</td>\n",
" <td>142.60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>800.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>800.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>111</td>\n",
" <td>111</td>\n",
" <td>2018-03-28</td>\n",
" <td>2018-04-01</td>\n",
" <td>2017-09-11</td>\n",
" <td>Standard</td>\n",
" <td>4</td>\n",
" <td>198</td>\n",
" <td>True</td>\n",
" <td>6</td>\n",
" <td>55560</td>\n",
" <td>1f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a44</td>\n",
" <td>6163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9</td>\n",
" <td>516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f44</td>\n",
" <td>0</td>\n",
" <td>200.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>402.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>402.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>162</td>\n",
" <td>162</td>\n",
" <td>2018-11-16</td>\n",
" <td>2018-11-18</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>2</td>\n",
" <td>270</td>\n",
" <td>True</td>\n",
" <td>7</td>\n",
" <td>63419</td>\n",
" <td>d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd9</td>\n",
" <td>5213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd1972251</td>\n",
" <td>6416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f8290633</td>\n",
" <td>0</td>\n",
" <td>201.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>660.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>660.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>45</td>\n",
" <td>45</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-19</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>178</td>\n",
" <td>True</td>\n",
" <td>8</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>220.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>320.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>320.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>64</td>\n",
" <td>64</td>\n",
" <td>2018-08-17</td>\n",
" <td>2018-08-19</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>2</td>\n",
" <td>179</td>\n",
" <td>True</td>\n",
" <td>9</td>\n",
" <td>61778</td>\n",
" <td>364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d46895</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>160.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>720.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>720.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>126</td>\n",
" <td>126</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-19</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>178</td>\n",
" <td>True</td>\n",
" <td>8</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>240.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>480.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>480.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>65</td>\n",
" <td>65</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-19</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>178</td>\n",
" <td>True</td>\n",
" <td>8</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>160.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>595.98</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>595.98</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>267</td>\n",
" <td>267</td>\n",
" <td>2018-08-10</td>\n",
" <td>2018-08-12</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>2</td>\n",
" <td>172</td>\n",
" <td>True</td>\n",
" <td>10</td>\n",
" <td>61378</td>\n",
" <td>f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0</td>\n",
" <td>a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5</td>\n",
" <td>446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad</td>\n",
" <td>0</td>\n",
" <td>297.99</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>915.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>915.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>370</td>\n",
" <td>362</td>\n",
" <td>2018-08-03</td>\n",
" <td>2018-08-06</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>165</td>\n",
" <td>True</td>\n",
" <td>11</td>\n",
" <td>61105</td>\n",
" <td>8373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc9</td>\n",
" <td>37827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f7</td>\n",
" <td>27673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc8666173766</td>\n",
" <td>0</td>\n",
" <td>305.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3579.40</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3579.40</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>321</td>\n",
" <td>321</td>\n",
" <td>2018-07-22</td>\n",
" <td>2018-07-31</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>9</td>\n",
" <td>153</td>\n",
" <td>True</td>\n",
" <td>12</td>\n",
" <td>61807</td>\n",
" <td>cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d9751</td>\n",
" <td></td>\n",
" <td>5aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df5</td>\n",
" <td>0</td>\n",
" <td>397.71</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>213</td>\n",
" <td>213</td>\n",
" <td>2018-07-13</td>\n",
" <td>2018-07-16</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>144</td>\n",
" <td>True</td>\n",
" <td>14</td>\n",
" <td>61391</td>\n",
" <td>4d9a25ef49b785020f71d87e7202cd209bdc4197b2649c006fb9863c767324ea</td>\n",
" <td>25dec48fdcf1d2e7dbe97aff20d52cae0b2660f5eae1d84161a0be139ad94c32</td>\n",
" <td>97edf16b92140283616026230d86f5ded26d8e09863a76bff14535e706fa2027</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset_specification = DatasetSpecification()\n",
"dp_toolkit = DataPreprocessingToolkit()\n",
"\n",
"id_column_names = dataset_specification.get_id_columns()\n",
"\n",
"people_identifier = PeopleIdentifier()\n",
"\n",
"# The whole preprocessing pipeline as one chain, run on a copy so that original_data stays untouched.\n",
"# Every step receives the DataFrame returned by the previous step; the order of the steps matters.\n",
"preprocessed_data = (\n",
"    original_data.copy()\n",
"    .pipe(people_identifier.add_pid, id_column_names, \"user_id\")\n",
"    .pipe(dp_toolkit.fix_date_to)\n",
"    .pipe(dp_toolkit.add_length_of_stay)  # Code this method\n",
"    .pipe(dp_toolkit.add_book_to_arrival)\n",
"    .pipe(dp_toolkit.add_nrooms)\n",
"    .pipe(dp_toolkit.add_weekend_stay)\n",
"    .pipe(dp_toolkit.clip_book_to_arrival)\n",
"    .pipe(dp_toolkit.sum_npeople)\n",
"    .pipe(dp_toolkit.filter_out_company_clients)\n",
"    .pipe(dp_toolkit.filter_out_long_stays)\n",
"    .pipe(dp_toolkit.aggregate_group_reservations)\n",
"    .pipe(dp_toolkit.add_night_price)  # Code this method (remember that there can be many rooms)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Checks for the two methods to be implemented: (row position, expected value).\n",
"for row, expected_length_of_stay in [(1, 3), (2, 2), (3, 7)]:\n",
"    assert preprocessed_data.iloc[row]['length_of_stay'] == expected_length_of_stay\n",
"\n",
"for row, expected_night_price in [(0, 330.76), (1, 231.13), (2, 183.40)]:\n",
"    assert preprocessed_data.iloc[row]['night_price'] == expected_night_price\n",
"\n",
"display(HTML(preprocessed_data.head(15).to_html()))"
]
},
{
"cell_type": "markdown",
"id": "coupled-river",
"metadata": {},
"source": [
"# Bucket important features to reduce the offer space size\n",
"\n",
"Without this step every pair (user_id, item_id) would have at most a single interaction. The base item space has around $2^{25} \\approx 33.5 \\text{mln}$ elements. Therefore, values for selected features are aggregated into buckets:\n",
"\n",
"```python\n",
"column_values_dict = {\n",
" 'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],\n",
" 'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],\n",
" 'rate_plan': ['Standard', 'Nonref'],\n",
" 'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],\n",
" 'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],\n",
" 'weekend_stay': ['True', 'False']\n",
"}\n",
"```\n",
"\n",
"Explanation:\n",
" - term - the term of the arrival date,\n",
" - length_of_stay_bucket - aggregated length of stay,\n",
" - rate_plan - rate plan which distinguishes if a given booking was refundable or nonrefundable (in reality rate plans are much more complex, they define prices for all rooms for every date, they include features like free breakfast, wine in the room etc.),\n",
" - room_segment - for every room its average price is calculated, then every room is assigned to an appropriate price range, which is a proxy for room quality,\n",
" - n_people_bucket - aggregated number of people in a reservation,\n",
" - weekend_stay - indicates if the stay encompassed a weekend.\n",
"\n",
"The buckets are chosen based on expert knowledge of people working in the hotel industry for many years. Alternatively, clustering techniques could be used, but on a relatively small dataset expert methods are significantly better.\n",
"\n",
"The above aggregations reduce the number of possible items to $8 * 4 * 2 * 5 * 4 * 2 = 2560$.\n",
"\n",
"### The recommenders will be trained and evaluated on such aggregated data. To get a proper offer for a user one would have to decode those buckets into specific values, but this is a much easier task and can be achieved based on simple rules.\n",
"\n",
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
"In the file data_preprocessing/data_preprocessing_toolkit write code for the map_night_price_to_room_segment_buckets method. You must calculate the average of night prices for every **room_group_id** and map those prices to buckets (you can apply the map_value_to_bucket method which is available in the data_preprocessing_toolkit, the buckets are available under self.room_segment_buckets). The new column should be named 'room_segment'. You have to pass all assertions."
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "interracial-rendering",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_people</th>\n",
" <th>n_children_1</th>\n",
" <th>n_children_2</th>\n",
" <th>n_children_3</th>\n",
" <th>accomodation_price</th>\n",
" <th>meal_price</th>\n",
" <th>service_price</th>\n",
" <th>paid</th>\n",
" <th>n_rooms</th>\n",
" <th>discount</th>\n",
" <th>room_id</th>\n",
" <th>room_group_id</th>\n",
" <th>date_from</th>\n",
" <th>date_to</th>\n",
" <th>booking_date</th>\n",
" <th>rate_plan</th>\n",
" <th>length_of_stay</th>\n",
" <th>book_to_arrival</th>\n",
" <th>weekend_stay</th>\n",
" <th>user_id</th>\n",
" <th>client_id</th>\n",
" <th>client_name</th>\n",
" <th>email</th>\n",
" <th>phone</th>\n",
" <th>is_company</th>\n",
" <th>night_price</th>\n",
" <th>term</th>\n",
" <th>length_of_stay_bucket</th>\n",
" <th>room_segment</th>\n",
" <th>n_people_bucket</th>\n",
" <th>item</th>\n",
" <th>item_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>992.29</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>118</td>\n",
" <td>118</td>\n",
" <td>2018-02-10</td>\n",
" <td>2018-02-13</td>\n",
" <td>2017-08-17</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>177</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>54117</td>\n",
" <td>ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c</td>\n",
" <td></td>\n",
" <td>318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a</td>\n",
" <td>0</td>\n",
" <td>330.76</td>\n",
" <td>WinterVacation</td>\n",
" <td>[2-3]</td>\n",
" <td>[260-360]</td>\n",
" <td>[5-inf]</td>\n",
" <td>WinterVacation [2-3] Standard [260-360] [5-inf] True</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>693.40</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>693.40</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>270</td>\n",
" <td>270</td>\n",
" <td>2018-02-28</td>\n",
" <td>2018-03-03</td>\n",
" <td>2017-08-17</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>195</td>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>54118</td>\n",
" <td>4db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639a</td>\n",
" <td>f9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3</td>\n",
" <td>cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e62387</td>\n",
" <td>0</td>\n",
" <td>231.13</td>\n",
" <td>WinterVacation</td>\n",
" <td>[2-3]</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>WinterVacation [2-3] Standard [160-260] [3-4] True</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>366.80</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>294</td>\n",
" <td>294</td>\n",
" <td>2018-02-14</td>\n",
" <td>2018-02-16</td>\n",
" <td>2017-08-29</td>\n",
" <td>Standard</td>\n",
" <td>2</td>\n",
" <td>169</td>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>54790</td>\n",
" <td>e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdee</td>\n",
" <td>f6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc35</td>\n",
" <td>1c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac</td>\n",
" <td>0</td>\n",
" <td>183.40</td>\n",
" <td>WinterVacation</td>\n",
" <td>[2-3]</td>\n",
" <td>[160-260]</td>\n",
" <td>[2-2]</td>\n",
" <td>WinterVacation [2-3] Standard [160-260] [2-2] False</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1064.60</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>183</td>\n",
" <td>183</td>\n",
" <td>2018-02-03</td>\n",
" <td>2018-02-10</td>\n",
" <td>2017-09-04</td>\n",
" <td>Standard</td>\n",
" <td>7</td>\n",
" <td>152</td>\n",
" <td>True</td>\n",
" <td>4</td>\n",
" <td>55177</td>\n",
" <td>5380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf</td>\n",
" <td>6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a72</td>\n",
" <td>3aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa6165793</td>\n",
" <td>0</td>\n",
" <td>152.09</td>\n",
" <td>WinterVacation</td>\n",
" <td>[4-7]</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>WinterVacation [4-7] Standard [160-260] [3-4] True</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>713.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>64</td>\n",
" <td>64</td>\n",
" <td>2018-01-29</td>\n",
" <td>2018-02-03</td>\n",
" <td>2017-09-07</td>\n",
" <td>Standard</td>\n",
" <td>5</td>\n",
" <td>144</td>\n",
" <td>True</td>\n",
" <td>5</td>\n",
" <td>55412</td>\n",
" <td>4aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d9</td>\n",
" <td>0d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086</td>\n",
" <td>ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e6</td>\n",
" <td>0</td>\n",
" <td>142.60</td>\n",
" <td>WinterVacation</td>\n",
" <td>[4-7]</td>\n",
" <td>[0-160]</td>\n",
" <td>[2-2]</td>\n",
" <td>WinterVacation [4-7] Standard [0-160] [2-2] True</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>800.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>800.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>111</td>\n",
" <td>111</td>\n",
" <td>2018-03-28</td>\n",
" <td>2018-04-01</td>\n",
" <td>2017-09-11</td>\n",
" <td>Standard</td>\n",
" <td>4</td>\n",
" <td>198</td>\n",
" <td>True</td>\n",
" <td>6</td>\n",
" <td>55560</td>\n",
" <td>1f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a44</td>\n",
" <td>6163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9</td>\n",
" <td>516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f44</td>\n",
" <td>0</td>\n",
" <td>200.00</td>\n",
" <td>Easter</td>\n",
" <td>[4-7]</td>\n",
" <td>[260-360]</td>\n",
" <td>[5-inf]</td>\n",
" <td>Easter [4-7] Standard [260-360] [5-inf] True</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>402.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>402.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>162</td>\n",
" <td>162</td>\n",
" <td>2018-11-16</td>\n",
" <td>2018-11-18</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>2</td>\n",
" <td>270</td>\n",
" <td>True</td>\n",
" <td>7</td>\n",
" <td>63419</td>\n",
" <td>d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd9</td>\n",
" <td>5213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd1972251</td>\n",
" <td>6416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f8290633</td>\n",
" <td>0</td>\n",
" <td>201.00</td>\n",
" <td>OffSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>[260-360]</td>\n",
" <td>[5-inf]</td>\n",
" <td>OffSeason [2-3] Standard [260-360] [5-inf] True</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>660.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>660.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>45</td>\n",
" <td>45</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-19</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>178</td>\n",
" <td>True</td>\n",
" <td>8</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>220.00</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>[160-260]</td>\n",
" <td>[1-1]</td>\n",
" <td>HighSeason [2-3] Standard [160-260] [1-1] True</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>320.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>320.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>64</td>\n",
" <td>64</td>\n",
" <td>2018-08-17</td>\n",
" <td>2018-08-19</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>2</td>\n",
" <td>179</td>\n",
" <td>True</td>\n",
" <td>9</td>\n",
" <td>61778</td>\n",
" <td>364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d46895</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>160.00</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>[0-160]</td>\n",
" <td>[1-1]</td>\n",
" <td>HighSeason [2-3] Standard [0-160] [1-1] True</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>720.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>720.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>126</td>\n",
" <td>126</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-19</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>178</td>\n",
" <td>True</td>\n",
" <td>8</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>240.00</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>[160-260]</td>\n",
" <td>[1-1]</td>\n",
" <td>HighSeason [2-3] Standard [160-260] [1-1] True</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>480.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>480.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>65</td>\n",
" <td>65</td>\n",
" <td>2018-08-16</td>\n",
" <td>2018-08-19</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>178</td>\n",
" <td>True</td>\n",
" <td>8</td>\n",
" <td>61777</td>\n",
" <td>f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0</td>\n",
" <td>160.00</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>[160-260]</td>\n",
" <td>[1-1]</td>\n",
" <td>HighSeason [2-3] Standard [160-260] [1-1] True</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>595.98</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>595.98</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>267</td>\n",
" <td>267</td>\n",
" <td>2018-08-10</td>\n",
" <td>2018-08-12</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>2</td>\n",
" <td>172</td>\n",
" <td>True</td>\n",
" <td>10</td>\n",
" <td>61378</td>\n",
" <td>f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0</td>\n",
" <td>a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5</td>\n",
" <td>446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad</td>\n",
" <td>0</td>\n",
" <td>297.99</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>HighSeason [2-3] Standard [160-260] [3-4] True</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>915.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>915.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>370</td>\n",
" <td>362</td>\n",
" <td>2018-08-03</td>\n",
" <td>2018-08-06</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>165</td>\n",
" <td>True</td>\n",
" <td>11</td>\n",
" <td>61105</td>\n",
" <td>8373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc9</td>\n",
" <td>37827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f7</td>\n",
" <td>27673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc8666173766</td>\n",
" <td>0</td>\n",
" <td>305.00</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>HighSeason [2-3] Standard [160-260] [3-4] True</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3579.40</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3579.40</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>321</td>\n",
" <td>321</td>\n",
" <td>2018-07-22</td>\n",
" <td>2018-07-31</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>9</td>\n",
" <td>153</td>\n",
" <td>True</td>\n",
" <td>12</td>\n",
" <td>61807</td>\n",
" <td>cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d9751</td>\n",
" <td></td>\n",
" <td>5aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df5</td>\n",
" <td>0</td>\n",
" <td>397.71</td>\n",
" <td>HighSeason</td>\n",
" <td>[8-inf]</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>HighSeason [8-inf] Standard [160-260] [3-4] True</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.00</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>213</td>\n",
" <td>213</td>\n",
" <td>2018-07-13</td>\n",
" <td>2018-07-16</td>\n",
" <td>2018-02-19</td>\n",
" <td>Standard</td>\n",
" <td>3</td>\n",
" <td>144</td>\n",
" <td>True</td>\n",
" <td>14</td>\n",
" <td>61391</td>\n",
" <td>4d9a25ef49b785020f71d87e7202cd209bdc4197b2649c006fb9863c767324ea</td>\n",
" <td>25dec48fdcf1d2e7dbe97aff20d52cae0b2660f5eae1d84161a0be139ad94c32</td>\n",
" <td>97edf16b92140283616026230d86f5ded26d8e09863a76bff14535e706fa2027</td>\n",
" <td>0</td>\n",
" <td>0.00</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>[0-160]</td>\n",
" <td>[3-4]</td>\n",
" <td>HighSeason [2-3] Standard [0-160] [3-4] True</td>\n",
" <td>11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"preprocessed_data = dp_toolkit.map_date_to_term_datasets(preprocessed_data)\n",
"preprocessed_data = dp_toolkit.map_length_of_stay_to_nights_buckets(preprocessed_data)\n",
"preprocessed_data = dp_toolkit.map_night_price_to_room_segment_buckets(preprocessed_data) # Code this method\n",
"preprocessed_data = dp_toolkit.map_npeople_to_npeople_buckets(preprocessed_data)\n",
"\n",
"assert preprocessed_data.iloc[0]['room_segment'] == '[260-360]'\n",
"assert preprocessed_data.iloc[1]['room_segment'] == '[160-260]'\n",
"assert preprocessed_data.iloc[4]['room_segment'] == '[0-160]'\n",
"\n",
"preprocessed_data = dp_toolkit.map_item_to_item_id(preprocessed_data)\n",
"\n",
"preprocessed_data.to_csv(os.path.join(data_path, \"hotel_data_preprocessed.csv\"))\n",
"\n",
"display(HTML(preprocessed_data.head(15).to_html()))"
]
},
{
"cell_type": "markdown",
"id": "offshore-biography",
"metadata": {},
"source": [
"# Base statistics"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "acknowledged-crime",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of users: 14188\n",
"\n",
"Number of items: 771\n",
"\n",
"Number of interactions: 16102\n",
"\n"
]
},
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_users</th>\n",
" </tr>\n",
" <tr>\n",
" <th>item_id</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>679</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>581</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>581</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>413</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>226</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124</th>\n",
" <td>224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>204</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>196</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>n_items</th>\n",
" </tr>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>706</th>\n",
" <td>337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1736</th>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7779</th>\n",
" <td>27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50</th>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115</th>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1413</th>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3336</th>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2930</th>\n",
" <td>13</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"print(\"Number of users: {}\".format(len(preprocessed_data['user_id'].unique())))\n",
"print()\n",
"print(\"Number of items: {}\".format(len(preprocessed_data['item_id'].unique())))\n",
"print()\n",
"print(\"Number of interactions: {}\".format(len(preprocessed_data)))\n",
"print()\n",
"\n",
"n_user = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('item_id').count().sort_values(by='user_id', ascending=False)\n",
"n_user = n_user.rename(columns={'user_id': 'n_users'})\n",
"display(HTML(n_user.head(10).to_html()))\n",
"\n",
"n_item = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('user_id').count().sort_values(by='item_id', ascending=False)\n",
"n_item = n_item.rename(columns={'item_id': 'n_items'})\n",
"display(HTML(n_item.head(10).to_html()))"
]
},
{
"cell_type": "markdown",
"id": "blessed-knitting",
"metadata": {},
"source": [
"# Prepare the dataset for recommenders\n",
"\n",
"One could consider many features describing each interaction but from the business perspective term, length_of_stay_bucket, room_segment, weekend_stay are the most important."
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "victorian-bottom",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>term</th>\n",
" <th>length_of_stay_bucket</th>\n",
" <th>rate_plan</th>\n",
" <th>room_segment</th>\n",
" <th>n_people_bucket</th>\n",
" <th>weekend_stay</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>WinterVacation</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[260-360]</td>\n",
" <td>[5-inf]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>WinterVacation</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>WinterVacation</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[2-2]</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>WinterVacation</td>\n",
" <td>[4-7]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>WinterVacation</td>\n",
" <td>[4-7]</td>\n",
" <td>Standard</td>\n",
" <td>[0-160]</td>\n",
" <td>[2-2]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>Easter</td>\n",
" <td>[4-7]</td>\n",
" <td>Standard</td>\n",
" <td>[260-360]</td>\n",
" <td>[5-inf]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>6</td>\n",
" <td>OffSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[260-360]</td>\n",
" <td>[5-inf]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[1-1]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>8</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[0-160]</td>\n",
" <td>[1-1]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[1-1]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[1-1]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>10</td>\n",
" <td>9</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>11</td>\n",
" <td>9</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" <td>HighSeason</td>\n",
" <td>[8-inf]</td>\n",
" <td>Standard</td>\n",
" <td>[160-260]</td>\n",
" <td>[3-4]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>14</td>\n",
" <td>11</td>\n",
" <td>HighSeason</td>\n",
" <td>[2-3]</td>\n",
" <td>Standard</td>\n",
" <td>[0-160]</td>\n",
" <td>[3-4]</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']\n",
"\n",
"interactions_df = preprocessed_data.loc[\n",
" :, ['user_id', 'item_id'] + item_features]\n",
"\n",
"column_values_dict = {\n",
" 'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],\n",
" 'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],\n",
" 'rate_plan': ['Standard', 'Nonref'],\n",
" 'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],\n",
" 'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],\n",
" 'weekend_stay': ['True', 'False']\n",
"}\n",
"\n",
"interactions_df.loc[:, 'term'] = pd.Categorical(\n",
" interactions_df['term'], categories=column_values_dict['term'])\n",
"interactions_df.loc[:, 'length_of_stay_bucket'] = pd.Categorical(\n",
" interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])\n",
"interactions_df.loc[:, 'rate_plan'] = pd.Categorical(\n",
" interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])\n",
"interactions_df.loc[:, 'room_segment'] = pd.Categorical(\n",
" interactions_df['room_segment'], categories=column_values_dict['room_segment'])\n",
"interactions_df.loc[:, 'n_people_bucket'] = pd.Categorical(\n",
" interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])\n",
"interactions_df.loc[:, 'weekend_stay'] = pd.Categorical(\n",
" interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])\n",
"\n",
"interactions_df.to_csv(os.path.join(data_path, \"hotel_data_interactions_df.csv\"))\n",
"\n",
"display(HTML(interactions_df.head(15).to_html()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "rek_uno",
"language": "python",
"name": "rek_uno"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}