{ "cells": [ { "cell_type": "code", "execution_count": 8, "id": "alike-morgan", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "%matplotlib inline\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from IPython.display import Markdown, display, HTML\n", "from collections import defaultdict\n", "\n", "# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n", "import os\n", "os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'" ] }, { "cell_type": "code", "execution_count": 9, "id": "friendly-herald", "metadata": { "scrolled": true }, "outputs": [], "source": [ "from data_preprocessing.dataset_specification import DatasetSpecification\n", "from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit\n", "from data_preprocessing.people_identifier import PeopleIdentifier" ] }, { "cell_type": "markdown", "id": "prepared-signal", "metadata": {}, "source": [ "# Load original data" ] }, { "cell_type": "code", "execution_count": 10, "id": "solid-crisis", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
reservation_idgroup_idroom_idroom_group_iddate_fromdate_tobooking_datebooking_timen_peoplen_children_1n_children_2n_children_3discountaccomodation_pricemeal_priceservice_pricepaidrate_planclient_idclient_nameemailphoneis_companyreservation_status
0141601351352017-09-012018-03-302017-07-042017-07-04 10:52:001000NaN0.000.00.00.00Standard5166586bd787ca115281ad9642c5fd6e79e6f2d87841c2fd9c6812b32b6a761109b6201
1160751181182018-02-102018-02-122017-08-172017-08-17 15:01:005000NaN992.290.00.01.00Standard54117ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a01
2160762702702018-02-282018-03-022017-08-172017-08-17 15:08:0040000.0693.400.00.0693.40Standard541184db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639af9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e6238702
3166352942942018-02-142018-02-152017-08-292017-08-29 13:58:002000NaN366.800.00.01.00Standard54790e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdeef6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc351c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac01
4169641831832018-02-032018-02-092017-09-042017-09-04 15:52:004000NaN1064.600.00.01.00Standard551775380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a723aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa616579301
51717364642018-01-292018-02-022017-09-072017-09-07 13:21:002000NaN713.000.00.01.00Standard554124aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d90d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e601
6173081111112018-03-282018-03-312017-09-112017-09-11 10:31:0050000.0800.000.00.0800.00Standard555601f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a446163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f4404
71201651621622018-11-162018-11-172018-02-192018-02-19 17:44:0050000.0402.000.00.0402.00Standard63419d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd95213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd19722516416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f829063304
812018345452018-08-162018-08-182018-02-192018-02-19 17:44:0010000.0660.000.00.0660.00Standard61777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab04
912018464642018-08-172018-08-182018-02-192018-02-19 17:44:0010000.0320.000.00.0320.00Standard61778364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d4689504
101201851261262018-08-162018-08-182018-02-192018-02-19 17:44:0010000.0720.000.00.0720.00Standard61777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab04
1112018665652018-08-162018-08-182018-02-192018-02-19 17:44:0010000.0480.000.00.0480.00Standard61777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab04
121201942672672018-08-102018-08-112018-02-192018-02-19 17:44:0030000.0595.980.00.0595.98Standard61378f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad04
131202093703622018-08-032018-08-052018-02-192018-02-19 17:44:0040000.0915.000.00.0915.00Standard611058373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc937827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f727673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc866617376604
141202323213212018-07-222018-07-302018-02-192018-02-19 17:44:0030000.03579.400.00.03579.40Standard61807cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d97515aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df503
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data_path = os.path.join(\"data\", \"hotel_data\")\n", "\n", "original_data = pd.read_csv(os.path.join(data_path, \"hotel_data_original.csv\"), index_col=0)\n", "\n", "# Blank out cells that hold a literal backslash-N (presumably the export's NULL marker - confirm against the data source).\n", "# The raw string stops Python from reading the backslash as the start of a named-unicode escape.\n", "original_data = original_data.replace({r\"\\N\": \"\"})\n", "original_data = original_data.fillna(\"\")\n", "\n", "numeric_columns = [\"n_people\", \"n_children_1\", \"n_children_2\", \"n_children_3\",\n", "                   \"discount\", \"accomodation_price\", \"meal_price\", \"service_price\",\n", "                   \"paid\"]\n", "\n", "# errors=\"coerce\" turns every value that cannot be parsed as a number (including the empty strings set above) into NaN.\n", "for column in numeric_columns:\n", "    original_data[column] = pd.to_numeric(original_data[column], errors=\"coerce\")\n", "\n", "# The datetime unit is spelled out because pandas 2.x refuses to cast to the unit-less np.datetime64.\n", "original_data = original_data.astype(\n", "    {\n", "        \"date_from\": \"datetime64[ns]\",\n", "        \"date_to\": \"datetime64[ns]\",\n", "        \"booking_time\": \"datetime64[ns]\",\n", "        \"booking_date\": \"datetime64[ns]\",\n", "        \"n_people\": np.int64,\n", "        \"n_children_1\": np.int64,\n", "        \"n_children_2\": np.int64,\n", "        \"n_children_3\": np.int64,\n", "        \"discount\": np.float64,\n", "        \"accomodation_price\": np.float64,\n", "        \"meal_price\": np.float64,\n", "        \"service_price\": np.float64,\n", "        \"paid\": np.float64,\n", "    }\n", ")\n", "\n", "display(HTML(original_data.head(15).to_html()))" ] }, { "cell_type": "markdown", "id": "endangered-lingerie", "metadata": {}, "source": [ "# Preprocess the data\n", "\n", "- Identify users by client_id, name hash, phone hash, email hash.\n", "- Fix date_to - originally it points to the last full day of stay, not the departure date.\n", "- Add length of stay.\n", "- Add book to arrival.\n", "- Add number of rooms (important for group reservations).\n", "- Add indicator for stays encompassing a weekend.\n", "- Add night price.\n", "- Fix book to arrival to be not smaller than 0.\n", "- Filter out companies as recommendations for such clients should work differently.\n", "- Aggregate group reservations into single interactions.\n", "\n", "**Task:**
\n", "In the file data_preprocessing/data_preprocessing_toolkit write code for the add_length_of_stay and add_night_price methods:\n", " - add_length_of_stay - should add 'length_of_stay' variable to the DataFrame, which counts the number of nights the customer stayed at the hotel,\n", " - add_night_price - should add 'night_price' column to the dataset DataFrame, which shows the average accomodation price per night per room (there can be many rooms in group reservations - 'n_rooms' column).\n", "You have to pass all assertions." ] }, { "cell_type": "code", "execution_count": 11, "id": "swedish-iceland", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_peoplen_children_1n_children_2n_children_3accomodation_pricemeal_priceservice_pricepaidn_roomsdiscountroom_idroom_group_iddate_fromdate_tobooking_daterate_planlength_of_staybook_to_arrivalweekend_stayuser_idclient_idclient_nameemailphoneis_companynight_price
05000992.290.00.01.001NaN1181182018-02-102018-02-132017-08-17Standard3177True154117ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a0330.76
14000693.400.00.0693.4010.02702702018-02-282018-03-032017-08-17Standard3195True2541184db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639af9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e623870231.13
22000366.800.00.01.001NaN2942942018-02-142018-02-162017-08-29Standard2169False354790e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdeef6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc351c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac0183.40
340001064.600.00.01.001NaN1831832018-02-032018-02-102017-09-04Standard7152True4551775380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a723aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa61657930152.09
42000713.000.00.01.001NaN64642018-01-292018-02-032017-09-07Standard5144True5554124aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d90d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e60142.60
55000800.000.00.0800.0010.01111112018-03-282018-04-012017-09-11Standard4198True6555601f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a446163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f440200.00
65000402.000.00.0402.0010.01621622018-11-162018-11-182018-02-19Standard2270True763419d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd95213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd19722516416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f82906330201.00
71000660.000.00.0660.0010.045452018-08-162018-08-192018-02-19Standard3178True861777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab0220.00
81000320.000.00.0320.0010.064642018-08-172018-08-192018-02-19Standard2179True961778364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d468950160.00
91000720.000.00.0720.0010.01261262018-08-162018-08-192018-02-19Standard3178True861777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab0240.00
101000480.000.00.0480.0010.065652018-08-162018-08-192018-02-19Standard3178True861777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab0160.00
113000595.980.00.0595.9810.02672672018-08-102018-08-122018-02-19Standard2172True1061378f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad0297.99
124000915.000.00.0915.0010.03703622018-08-032018-08-062018-02-19Standard3165True11611058373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc937827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f727673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc86661737660305.00
1330003579.400.00.03579.4010.03213212018-07-222018-07-312018-02-19Standard9153True1261807cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d97515aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df50397.71
1440000.00NaNNaN1.0010.02132132018-07-132018-07-162018-02-19Standard3144True14613914d9a25ef49b785020f71d87e7202cd209bdc4197b2649c006fb9863c767324ea25dec48fdcf1d2e7dbe97aff20d52cae0b2660f5eae1d84161a0be139ad94c3297edf16b92140283616026230d86f5ded26d8e09863a76bff14535e706fa202700.00
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset_specification = DatasetSpecification()\n", "dp_toolkit = DataPreprocessingToolkit()\n", "\n", "id_column_names = dataset_specification.get_id_columns()\n", "\n", "people_identifier = PeopleIdentifier()\n", "\n", "# The pipeline starts from a copy of original_data; every step receives the frame returned by the previous one.\n", "preprocessed_data = (\n", "    original_data.copy()\n", "    .pipe(people_identifier.add_pid, id_column_names, \"user_id\")\n", "    .pipe(dp_toolkit.fix_date_to)\n", "    .pipe(dp_toolkit.add_length_of_stay)  # Code this method\n", "    .pipe(dp_toolkit.add_book_to_arrival)\n", "    .pipe(dp_toolkit.add_nrooms)\n", "    .pipe(dp_toolkit.add_weekend_stay)\n", "    .pipe(dp_toolkit.clip_book_to_arrival)\n", "    .pipe(dp_toolkit.sum_npeople)\n", "    .pipe(dp_toolkit.filter_out_company_clients)\n", "    .pipe(dp_toolkit.filter_out_long_stays)\n", "    .pipe(dp_toolkit.aggregate_group_reservations)\n", "    .pipe(dp_toolkit.add_night_price)  # Code this method (remember that there can be many rooms)\n", "    .reset_index(drop=True)\n", ")\n", "\n", "# Spot checks on the first rows (row position -> expected value).\n", "for row, nights in {1: 3, 2: 2, 3: 7}.items():\n", "    assert preprocessed_data.iloc[row]['length_of_stay'] == nights\n", "\n", "for row, price in {0: 330.76, 1: 231.13, 2: 183.40}.items():\n", "    assert preprocessed_data.iloc[row]['night_price'] == price\n", "\n", "display(HTML(preprocessed_data.head(15).to_html()))" ] }, { "cell_type": "markdown", "id": "coupled-river", "metadata": {}, "source": [ "# Bucket important 
features to reduce the offer space size\n", "\n", "Without this step every pair (user_id, item_id) would have at most a single interaction. The base item space has around $2^{25} \\approx 33.6 \\text{mln}$ elements. Therefore, values for selected features are aggregated into buckets:\n", "\n", "```python\n", "column_values_dict = {\n", " 'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],\n", " 'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],\n", " 'rate_plan': ['Standard', 'Nonref'],\n", " 'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],\n", " 'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],\n", " 'weekend_stay': ['True', 'False']\n", "}\n", "```\n", "\n", "Explanation:\n", " - term - the term of the arrival date,\n", " - length_of_stay_bucket - aggregated length of stay,\n", " - rate_plan - rate plan which distinguishes if a given booking was refundable or nonrefundable (in reality rate plans are much more complex, they define prices for all rooms for every date, they include features like free breakfast, wine in the room etc.),\n", " - room_segment - for every room its average price is calculated, then every room is assigned to an appropriate price range, which is a proxy for room quality,\n", " - n_people_bucket - aggregated number of people in a reservation,\n", " - weekend_stay - indicates if the stay encompassed a weekend.\n", "\n", "The buckets are chosen based on expert knowledge of people working in the hotel industry for many years. Alternatively, clustering techniques could be used, but on a relatively small dataset expert methods are significantly better.\n", "\n", "The above aggregations reduce the number of possible items to $8 * 4 * 2 * 5 * 4 * 2 = 2560$.\n", "\n", "### The recommenders will be trained and evaluated on such aggregated data. 
To get a proper offer for a user one would have to decode those buckets into specific values, but this is a much easier task and can be achieved based on simple rules.\n", "\n", "**Task:**
\n", "In the file data_preprocessing/data_preprocessing_toolkit write code for the map_night_price_to_room_segment_buckets method. You must calculate average of night prices for every **room_group_id** and map those prices to buckets (you can apply the map_value_to_bucket method which is available in the data_preprocessing_toolkit, the buckets are available under self.room_segment_buckets). The new column should be named 'room_segment'. You have to pass all assertions." ] }, { "cell_type": "code", "execution_count": 12, "id": "interracial-rendering", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_peoplen_children_1n_children_2n_children_3accomodation_pricemeal_priceservice_pricepaidn_roomsdiscountroom_idroom_group_iddate_fromdate_tobooking_daterate_planlength_of_staybook_to_arrivalweekend_stayuser_idclient_idclient_nameemailphoneis_companynight_pricetermlength_of_stay_bucketroom_segmentn_people_bucketitemitem_id
05000992.290.00.01.001NaN1181182018-02-102018-02-132017-08-17Standard3177True154117ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a0330.76WinterVacation[2-3][260-360][5-inf]WinterVacation [2-3] Standard [260-360] [5-inf] True0
14000693.400.00.0693.4010.02702702018-02-282018-03-032017-08-17Standard3195True2541184db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639af9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e623870231.13WinterVacation[2-3][160-260][3-4]WinterVacation [2-3] Standard [160-260] [3-4] True1
22000366.800.00.01.001NaN2942942018-02-142018-02-162017-08-29Standard2169False354790e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdeef6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc351c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac0183.40WinterVacation[2-3][160-260][2-2]WinterVacation [2-3] Standard [160-260] [2-2] False2
340001064.600.00.01.001NaN1831832018-02-032018-02-102017-09-04Standard7152True4551775380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a723aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa61657930152.09WinterVacation[4-7][160-260][3-4]WinterVacation [4-7] Standard [160-260] [3-4] True3
42000713.000.00.01.001NaN64642018-01-292018-02-032017-09-07Standard5144True5554124aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d90d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e60142.60WinterVacation[4-7][0-160][2-2]WinterVacation [4-7] Standard [0-160] [2-2] True4
55000800.000.00.0800.0010.01111112018-03-282018-04-012017-09-11Standard4198True6555601f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a446163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f440200.00Easter[4-7][260-360][5-inf]Easter [4-7] Standard [260-360] [5-inf] True5
65000402.000.00.0402.0010.01621622018-11-162018-11-182018-02-19Standard2270True763419d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd95213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd19722516416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f82906330201.00OffSeason[2-3][260-360][5-inf]OffSeason [2-3] Standard [260-360] [5-inf] True6
71000660.000.00.0660.0010.045452018-08-162018-08-192018-02-19Standard3178True861777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab0220.00HighSeason[2-3][160-260][1-1]HighSeason [2-3] Standard [160-260] [1-1] True7
81000320.000.00.0320.0010.064642018-08-172018-08-192018-02-19Standard2179True961778364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d468950160.00HighSeason[2-3][0-160][1-1]HighSeason [2-3] Standard [0-160] [1-1] True8
91000720.000.00.0720.0010.01261262018-08-162018-08-192018-02-19Standard3178True861777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab0240.00HighSeason[2-3][160-260][1-1]HighSeason [2-3] Standard [160-260] [1-1] True7
101000480.000.00.0480.0010.065652018-08-162018-08-192018-02-19Standard3178True861777f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab0160.00HighSeason[2-3][160-260][1-1]HighSeason [2-3] Standard [160-260] [1-1] True7
113000595.980.00.0595.9810.02672672018-08-102018-08-122018-02-19Standard2172True1061378f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad0297.99HighSeason[2-3][160-260][3-4]HighSeason [2-3] Standard [160-260] [3-4] True9
124000915.000.00.0915.0010.03703622018-08-032018-08-062018-02-19Standard3165True11611058373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc937827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f727673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc86661737660305.00HighSeason[2-3][160-260][3-4]HighSeason [2-3] Standard [160-260] [3-4] True9
1330003579.400.00.03579.4010.03213212018-07-222018-07-312018-02-19Standard9153True1261807cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d97515aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df50397.71HighSeason[8-inf][160-260][3-4]HighSeason [8-inf] Standard [160-260] [3-4] True10
1440000.00NaNNaN1.0010.02132132018-07-132018-07-162018-02-19Standard3144True14613914d9a25ef49b785020f71d87e7202cd209bdc4197b2649c006fb9863c767324ea25dec48fdcf1d2e7dbe97aff20d52cae0b2660f5eae1d84161a0be139ad94c3297edf16b92140283616026230d86f5ded26d8e09863a76bff14535e706fa202700.00HighSeason[2-3][0-160][3-4]HighSeason [2-3] Standard [0-160] [3-4] True11
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "preprocessed_data = dp_toolkit.map_date_to_term_datasets(preprocessed_data)\n", "preprocessed_data = dp_toolkit.map_length_of_stay_to_nights_buckets(preprocessed_data)\n", "preprocessed_data = dp_toolkit.map_night_price_to_room_segment_buckets(preprocessed_data) # Code this method\n", "preprocessed_data = dp_toolkit.map_npeople_to_npeople_buckets(preprocessed_data)\n", "\n", "assert preprocessed_data.iloc[0]['room_segment'] == '[260-360]'\n", "assert preprocessed_data.iloc[1]['room_segment'] == '[160-260]'\n", "assert preprocessed_data.iloc[4]['room_segment'] == '[0-160]'\n", "\n", "preprocessed_data = dp_toolkit.map_item_to_item_id(preprocessed_data)\n", "\n", "preprocessed_data.to_csv(os.path.join(data_path, \"hotel_data_preprocessed.csv\"))\n", "\n", "display(HTML(preprocessed_data.head(15).to_html()))" ] }, { "cell_type": "markdown", "id": "offshore-biography", "metadata": {}, "source": [ "# Base statistics" ] }, { "cell_type": "code", "execution_count": 13, "id": "acknowledged-crime", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of users: 14188\n", "\n", "Number of items: 772\n", "\n", "Number of interactions: 16102\n", "\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_users
item_id
99679
28581
103581
41413
249226
9225
124224
1205
109204
16197
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n_items
user_id
706337
173629
777927
9624
123
5023
11522
141316
333613
293013
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "print(\"Number of users: {}\".format(len(preprocessed_data['user_id'].unique())))\n", "print()\n", "print(\"Number of items: {}\".format(len(preprocessed_data['item_id'].unique())))\n", "print()\n", "print(\"Number of interactions: {}\".format(len(preprocessed_data)))\n", "print()\n", "\n", "n_user = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('item_id').count().sort_values(by='user_id', ascending=False)\n", "n_user = n_user.rename(columns={'user_id': 'n_users'})\n", "display(HTML(n_user.head(10).to_html()))\n", "\n", "n_item = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('user_id').count().sort_values(by='item_id', ascending=False)\n", "n_item = n_item.rename(columns={'item_id': 'n_items'})\n", "display(HTML(n_item.head(10).to_html()))" ] }, { "cell_type": "markdown", "id": "blessed-knitting", "metadata": {}, "source": [ "# Prepare the dataset for recommenders\n", "\n", "One could consider many features describing each interaction but from the business perspective term, length_of_stay_bucket, room_segment, weekend_stay are the most important." 
] }, { "cell_type": "code", "execution_count": 14, "id": "victorian-bottom", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_idtermlength_of_stay_bucketrate_planroom_segmentn_people_bucketweekend_stay
010WinterVacation[2-3]Standard[260-360][5-inf]True
121WinterVacation[2-3]Standard[160-260][3-4]True
232WinterVacation[2-3]Standard[160-260][2-2]False
343WinterVacation[4-7]Standard[160-260][3-4]True
454WinterVacation[4-7]Standard[0-160][2-2]True
565Easter[4-7]Standard[260-360][5-inf]True
676OffSeason[2-3]Standard[260-360][5-inf]True
787HighSeason[2-3]Standard[160-260][1-1]True
898HighSeason[2-3]Standard[0-160][1-1]True
987HighSeason[2-3]Standard[160-260][1-1]True
1087HighSeason[2-3]Standard[160-260][1-1]True
11109HighSeason[2-3]Standard[160-260][3-4]True
12119HighSeason[2-3]Standard[160-260][3-4]True
131210HighSeason[8-inf]Standard[160-260][3-4]True
141411HighSeason[2-3]Standard[0-160][3-4]True
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']\n", "\n", "interactions_df = preprocessed_data.loc[\n", " :, ['user_id', 'item_id'] + item_features]\n", "\n", "column_values_dict = {\n", " 'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],\n", " 'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],\n", " 'rate_plan': ['Standard', 'Nonref'],\n", " 'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],\n", " 'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],\n", " 'weekend_stay': ['True', 'False']\n", "}\n", "\n", "interactions_df.loc[:, 'term'] = pd.Categorical(\n", " interactions_df['term'], categories=column_values_dict['term'])\n", "interactions_df.loc[:, 'length_of_stay_bucket'] = pd.Categorical(\n", " interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])\n", "interactions_df.loc[:, 'rate_plan'] = pd.Categorical(\n", " interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])\n", "interactions_df.loc[:, 'room_segment'] = pd.Categorical(\n", " interactions_df['room_segment'], categories=column_values_dict['room_segment'])\n", "interactions_df.loc[:, 'n_people_bucket'] = pd.Categorical(\n", " interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])\n", "interactions_df.loc[:, 'weekend_stay'] = pd.Categorical(\n", " interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])\n", "\n", "interactions_df.to_csv(os.path.join(data_path, \"hotel_data_interactions_df.csv\"))\n", "\n", "display(HTML(interactions_df.head(15).to_html()))" ] }, { "cell_type": "code", "execution_count": null, "id": "incredible-feeling", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", 
"execution_count": null, "id": "bf4e6e97", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "e554cad9", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "REK", "language": "python", "name": "rek" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }