REK-proj-1/project_1_data_preparation.ipynb
Aleksander Piotrowski f6ce2585b8 first commit
2021-05-18 16:18:33 +02:00

77 KiB

%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
from data_preprocessing.dataset_specification import DatasetSpecification
from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit
from data_preprocessing.people_identifier import PeopleIdentifier

Load original data

data_path = os.path.join("data", "hotel_data")

original_data = pd.read_csv(os.path.join(data_path, "hotel_data_original.csv"), index_col=0)

original_data = original_data.replace({"\\\\N": ""})
original_data = original_data.fillna("")

numeric_columns = ["n_people", "n_children_1", "n_children_2", "n_children_3",
                   "discount", "accomodation_price", "meal_price", "service_price",
                   "paid"]

for column in numeric_columns:
    original_data.loc[:, column] = pd.to_numeric(original_data.loc[:, column], errors="coerce")

original_data = original_data.astype(
        {
            "date_from": np.datetime64,
            "date_to": np.datetime64,
            "booking_time": np.datetime64,
            "booking_date": np.datetime64,
            "n_people": np.int64,
            "n_children_1": np.int64,
            "n_children_2": np.int64,
            "n_children_3": np.int64,
            "discount": np.float64,
            "accomodation_price": np.float64,
            "meal_price": np.float64,
            "service_price": np.float64,
            "paid": np.float64,
        }
    )

display(HTML(original_data.head(15).to_html()))
reservation_id group_id room_id room_group_id date_from date_to booking_date booking_time n_people n_children_1 n_children_2 n_children_3 discount accomodation_price meal_price service_price paid rate_plan client_id client_name email phone is_company reservation_status
0 14160 135 135 2017-09-01 2018-03-30 2017-07-04 2017-07-04 10:52:00 1 0 0 0 NaN 0.00 0.0 0.0 0.00 Standard 51665 86bd787ca115281ad9642c5fd6e79e6f2d87841c2fd9c6812b32b6a761109b62 0 1
1 16075 118 118 2018-02-10 2018-02-12 2017-08-17 2017-08-17 15:01:00 5 0 0 0 NaN 992.29 0.0 0.0 1.00 Standard 54117 ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c 318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a 0 1
2 16076 270 270 2018-02-28 2018-03-02 2017-08-17 2017-08-17 15:08:00 4 0 0 0 0.0 693.40 0.0 0.0 693.40 Standard 54118 4db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639a f9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3 cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e62387 0 2
3 16635 294 294 2018-02-14 2018-02-15 2017-08-29 2017-08-29 13:58:00 2 0 0 0 NaN 366.80 0.0 0.0 1.00 Standard 54790 e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdee f6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc35 1c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac 0 1
4 16964 183 183 2018-02-03 2018-02-09 2017-09-04 2017-09-04 15:52:00 4 0 0 0 NaN 1064.60 0.0 0.0 1.00 Standard 55177 5380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf 6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a72 3aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa6165793 0 1
5 17173 64 64 2018-01-29 2018-02-02 2017-09-07 2017-09-07 13:21:00 2 0 0 0 NaN 713.00 0.0 0.0 1.00 Standard 55412 4aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d9 0d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086 ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e6 0 1
6 17308 111 111 2018-03-28 2018-03-31 2017-09-11 2017-09-11 10:31:00 5 0 0 0 0.0 800.00 0.0 0.0 800.00 Standard 55560 1f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a44 6163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9 516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f44 0 4
7 120165 162 162 2018-11-16 2018-11-17 2018-02-19 2018-02-19 17:44:00 5 0 0 0 0.0 402.00 0.0 0.0 402.00 Standard 63419 d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd9 5213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd1972251 6416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f8290633 0 4
8 120183 45 45 2018-08-16 2018-08-18 2018-02-19 2018-02-19 17:44:00 1 0 0 0 0.0 660.00 0.0 0.0 660.00 Standard 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 4
9 120184 64 64 2018-08-17 2018-08-18 2018-02-19 2018-02-19 17:44:00 1 0 0 0 0.0 320.00 0.0 0.0 320.00 Standard 61778 364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d46895 0 4
10 120185 126 126 2018-08-16 2018-08-18 2018-02-19 2018-02-19 17:44:00 1 0 0 0 0.0 720.00 0.0 0.0 720.00 Standard 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 4
11 120186 65 65 2018-08-16 2018-08-18 2018-02-19 2018-02-19 17:44:00 1 0 0 0 0.0 480.00 0.0 0.0 480.00 Standard 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 4
12 120194 267 267 2018-08-10 2018-08-11 2018-02-19 2018-02-19 17:44:00 3 0 0 0 0.0 595.98 0.0 0.0 595.98 Standard 61378 f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0 a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5 446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad 0 4
13 120209 370 362 2018-08-03 2018-08-05 2018-02-19 2018-02-19 17:44:00 4 0 0 0 0.0 915.00 0.0 0.0 915.00 Standard 61105 8373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc9 37827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f7 27673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc8666173766 0 4
14 120232 321 321 2018-07-22 2018-07-30 2018-02-19 2018-02-19 17:44:00 3 0 0 0 0.0 3579.40 0.0 0.0 3579.40 Standard 61807 cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d9751 5aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df5 0 3

Preprocess the data

  • Identify users by client_id, name hash, phone hash, email hash.
  • Fix date_to - originally it points to the last full day of stay, not the departure date.
  • Add length of stay.
  • Add book to arrival.
  • Add number of rooms (important for group reservations).
  • Add indicator for stays encompasing a weekend.
  • Add night price.
  • Fix book to arrival to be not smaller than 0.
  • Filter out companies as recommendations for such clients should work differently.
  • Aggregate group reservations into single interactions.

Task:
In the file data_preprocessing/data_preprocessing_toolkit write code for the add_length_of_stay and add_night_price methods:

  • add_length_of_stay - should add 'length_of_stay' variable to the DataFrame, which counts the number of nights the customer stayed at the hotel,
  • add_night_price - should add 'night_price' column to the dataset DataFrame, which shows the average accomodation price per night per room (there can be many rooms in group reservations - 'n_rooms' column). You have to pass all assertions.
preprocessed_data = original_data.copy()

dataset_specification = DatasetSpecification()
dp_toolkit = DataPreprocessingToolkit()

id_column_names = dataset_specification.get_id_columns()

people_identifier = PeopleIdentifier()
preprocessed_data = people_identifier.add_pid(preprocessed_data, id_column_names, "user_id")

preprocessed_data = dp_toolkit.fix_date_to(preprocessed_data)
preprocessed_data = dp_toolkit.add_length_of_stay(preprocessed_data)  # Code this method
preprocessed_data = dp_toolkit.add_book_to_arrival(preprocessed_data)
preprocessed_data = dp_toolkit.add_nrooms(preprocessed_data)
preprocessed_data = dp_toolkit.add_weekend_stay(preprocessed_data)
preprocessed_data = dp_toolkit.clip_book_to_arrival(preprocessed_data)

preprocessed_data = dp_toolkit.sum_npeople(preprocessed_data)

preprocessed_data = dp_toolkit.filter_out_company_clients(preprocessed_data)
preprocessed_data = dp_toolkit.filter_out_long_stays(preprocessed_data)

preprocessed_data = dp_toolkit.aggregate_group_reservations(preprocessed_data)

preprocessed_data = dp_toolkit.add_night_price(preprocessed_data)  # Code this method (remember that there can be many rooms)

preprocessed_data = preprocessed_data.reset_index(drop=True)

assert preprocessed_data.iloc[1]['length_of_stay'] == 3
assert preprocessed_data.iloc[2]['length_of_stay'] == 2
assert preprocessed_data.iloc[3]['length_of_stay'] == 7

assert preprocessed_data.iloc[0]['night_price'] == 330.76
assert preprocessed_data.iloc[1]['night_price'] == 231.13
assert preprocessed_data.iloc[2]['night_price'] == 183.40

display(HTML(preprocessed_data.head(15).to_html()))
n_people n_children_1 n_children_2 n_children_3 accomodation_price meal_price service_price paid n_rooms discount room_id room_group_id date_from date_to booking_date rate_plan length_of_stay book_to_arrival weekend_stay user_id client_id client_name email phone is_company night_price
0 5 0 0 0 992.29 0.0 0.0 1.00 1 NaN 118 118 2018-02-10 2018-02-13 2017-08-17 Standard 3 177 True 1 54117 ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c 318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a 0 330.76
1 4 0 0 0 693.40 0.0 0.0 693.40 1 0.0 270 270 2018-02-28 2018-03-03 2017-08-17 Standard 3 195 True 2 54118 4db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639a f9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3 cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e62387 0 231.13
2 2 0 0 0 366.80 0.0 0.0 1.00 1 NaN 294 294 2018-02-14 2018-02-16 2017-08-29 Standard 2 169 False 3 54790 e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdee f6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc35 1c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac 0 183.40
3 4 0 0 0 1064.60 0.0 0.0 1.00 1 NaN 183 183 2018-02-03 2018-02-10 2017-09-04 Standard 7 152 True 4 55177 5380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf 6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a72 3aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa6165793 0 152.09
4 2 0 0 0 713.00 0.0 0.0 1.00 1 NaN 64 64 2018-01-29 2018-02-03 2017-09-07 Standard 5 144 True 5 55412 4aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d9 0d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086 ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e6 0 142.60
5 5 0 0 0 800.00 0.0 0.0 800.00 1 0.0 111 111 2018-03-28 2018-04-01 2017-09-11 Standard 4 198 True 6 55560 1f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a44 6163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9 516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f44 0 200.00
6 5 0 0 0 402.00 0.0 0.0 402.00 1 0.0 162 162 2018-11-16 2018-11-18 2018-02-19 Standard 2 270 True 7 63419 d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd9 5213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd1972251 6416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f8290633 0 201.00
7 1 0 0 0 660.00 0.0 0.0 660.00 1 0.0 45 45 2018-08-16 2018-08-19 2018-02-19 Standard 3 178 True 8 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 220.00
8 1 0 0 0 320.00 0.0 0.0 320.00 1 0.0 64 64 2018-08-17 2018-08-19 2018-02-19 Standard 2 179 True 9 61778 364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d46895 0 160.00
9 1 0 0 0 720.00 0.0 0.0 720.00 1 0.0 126 126 2018-08-16 2018-08-19 2018-02-19 Standard 3 178 True 8 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 240.00
10 1 0 0 0 480.00 0.0 0.0 480.00 1 0.0 65 65 2018-08-16 2018-08-19 2018-02-19 Standard 3 178 True 8 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 160.00
11 3 0 0 0 595.98 0.0 0.0 595.98 1 0.0 267 267 2018-08-10 2018-08-12 2018-02-19 Standard 2 172 True 10 61378 f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0 a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5 446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad 0 297.99
12 4 0 0 0 915.00 0.0 0.0 915.00 1 0.0 370 362 2018-08-03 2018-08-06 2018-02-19 Standard 3 165 True 11 61105 8373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc9 37827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f7 27673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc8666173766 0 305.00
13 3 0 0 0 3579.40 0.0 0.0 3579.40 1 0.0 321 321 2018-07-22 2018-07-31 2018-02-19 Standard 9 153 True 12 61807 cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d9751 5aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df5 0 397.71
14 4 0 0 0 0.00 NaN NaN 1.00 1 0.0 213 213 2018-07-13 2018-07-16 2018-02-19 Standard 3 144 True 14 61391 4d9a25ef49b785020f71d87e7202cd209bdc4197b2649c006fb9863c767324ea 25dec48fdcf1d2e7dbe97aff20d52cae0b2660f5eae1d84161a0be139ad94c32 97edf16b92140283616026230d86f5ded26d8e09863a76bff14535e706fa2027 0 0.00

Bucket important features to reduce the offer space size

Without this step every pair (user_id, item_id) would have at most a single interaction. The base item space has around $2^{25} \sim 3.3 \text{mln}$ elements. Therefore, values for selected features are aggregated into buckets:

column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}

Explanation:

  • term - the term of the arrival date,
  • length_of_stay_bucket - aggregated length of stay,
  • rate_plan - rate plan which distinguishes if a given booking was refundable or nonrefundable (in reality rate plans are much more complex, they define prices for all rooms for every date, they include features like free breakfast, wine in the room etc.),
  • room_segment - for every room its average price is calculated, then every room assigned to an appropriate price range, which is a proxy for room quality,
  • n_people_bucket - aggregated number of people in a reservation,
  • weekend_stay - indicates if the stay encompassed a weekend.

The buckets are chosen based on expert knowledge of people working in the hotel industry for many years. Alternatively, clustering techniques could be used, but on a relatively small dataset expert methods are significantly better.

The above aggregations reduce the number of possible items to $8 * 4 * 2 * 5 * 4 * 2 = 2560$.

The recommenders will be trained and evaluated on such aggregated data. To get a proper offer for a user one would have to decode those buckets into specific values, but this is a much easier task and can be achieved based on simple rules.

Task:
In the file data_preprocessing/data_preprocessing_toolkit write code for the map_night_price_to_room_segment_buckets method. You must calculate average of night prices for every room_group_id and map those prices to buckets (you can apply the map_value_to_bucket method which is available in the data_preprocessing_toolkit, the buckets are available under self.room_segment_buckets). The new column should be named 'room_segment'. You have to pass all assertions.

preprocessed_data = dp_toolkit.map_date_to_term_datasets(preprocessed_data)
preprocessed_data = dp_toolkit.map_length_of_stay_to_nights_buckets(preprocessed_data)
preprocessed_data = dp_toolkit.map_night_price_to_room_segment_buckets(preprocessed_data)  # Code this method
preprocessed_data = dp_toolkit.map_npeople_to_npeople_buckets(preprocessed_data)

assert preprocessed_data.iloc[0]['room_segment'] == '[260-360]'
assert preprocessed_data.iloc[1]['room_segment'] == '[160-260]'
assert preprocessed_data.iloc[4]['room_segment'] == '[0-160]'

preprocessed_data = dp_toolkit.map_item_to_item_id(preprocessed_data)

preprocessed_data.to_csv(os.path.join(data_path, "hotel_data_preprocessed.csv"))

display(HTML(preprocessed_data.head(15).to_html()))
n_people n_children_1 n_children_2 n_children_3 accomodation_price meal_price service_price paid n_rooms discount room_id room_group_id date_from date_to booking_date rate_plan length_of_stay book_to_arrival weekend_stay user_id client_id client_name email phone is_company night_price term length_of_stay_bucket room_segment n_people_bucket item item_id
0 5 0 0 0 992.29 0.0 0.0 1.00 1 NaN 118 118 2018-02-10 2018-02-13 2017-08-17 Standard 3 177 True 1 54117 ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d805a02b3b49160e87b1c 318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b3010f6f9459c4be03a 0 330.76 WinterVacation [2-3] [260-360] [5-inf] WinterVacation [2-3] Standard [260-360] [5-inf] True 0
1 4 0 0 0 693.40 0.0 0.0 693.40 1 0.0 270 270 2018-02-28 2018-03-03 2017-08-17 Standard 3 195 True 2 54118 4db36724fc28085e053a3003dce55368ee207cce37d355f876f48164372d639a f9c0564c66d6a830c4964a30ac261038dd7cf762b0641cc1fb85542bc71d3ca3 cb550ba6d303bf230379073bcbdd55c37229eab3f173dc24f40a9251a6e62387 0 231.13 WinterVacation [2-3] [160-260] [3-4] WinterVacation [2-3] Standard [160-260] [3-4] True 1
2 2 0 0 0 366.80 0.0 0.0 1.00 1 NaN 294 294 2018-02-14 2018-02-16 2017-08-29 Standard 2 169 False 3 54790 e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4aead5f61fa3e0ebdee f6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6a0e86ae7cabc19bc35 1c56315c10c9d8153ca7820648900befbd9109fb6cfb814c29dbdf73084c5aac 0 183.40 WinterVacation [2-3] [160-260] [2-2] WinterVacation [2-3] Standard [160-260] [2-2] False 2
3 4 0 0 0 1064.60 0.0 0.0 1.00 1 NaN 183 183 2018-02-03 2018-02-10 2017-09-04 Standard 7 152 True 4 55177 5380adccf08ea3000791aad3ccc478e3b6a8de440910aaa03fdcae6e3dc484bf 6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f84ec6496f4f66575a72 3aff5ce689580e51de899de8ec75e8a8eaa470e4e99df47983721a2aa6165793 0 152.09 WinterVacation [4-7] [160-260] [3-4] WinterVacation [4-7] Standard [160-260] [3-4] True 3
4 2 0 0 0 713.00 0.0 0.0 1.00 1 NaN 64 64 2018-01-29 2018-02-03 2017-09-07 Standard 5 144 True 5 55412 4aebfe125cf6c059588792b9fb871afe282a8806299dfe488a4609ffebe680d9 0d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12009e67ffdfe74bc086 ea16c664798581a9d93a3128d772b8b89e05743edbbfae43ab422049878af3e6 0 142.60 WinterVacation [4-7] [0-160] [2-2] WinterVacation [4-7] Standard [0-160] [2-2] True 4
5 5 0 0 0 800.00 0.0 0.0 800.00 1 0.0 111 111 2018-03-28 2018-04-01 2017-09-11 Standard 4 198 True 6 55560 1f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278634bfe50e2ea446a44 6163ca5013b2bc940219a59d0e30ec401ecd01bb498e03670c42a7357b5923f9 516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d145641f5adbddf6f44 0 200.00 Easter [4-7] [260-360] [5-inf] Easter [4-7] Standard [260-360] [5-inf] True 5
6 5 0 0 0 402.00 0.0 0.0 402.00 1 0.0 162 162 2018-11-16 2018-11-18 2018-02-19 Standard 2 270 True 7 63419 d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd922ba20ce6c7a5813dd9 5213ac7a6db98631330ac74a241ffdf840e1857481a0b59c76092b8bd1972251 6416a3bc7ea31b09ae63628a143b160d7976978cdbd298cc1dd429c9f8290633 0 201.00 OffSeason [2-3] [260-360] [5-inf] OffSeason [2-3] Standard [260-360] [5-inf] True 6
7 1 0 0 0 660.00 0.0 0.0 660.00 1 0.0 45 45 2018-08-16 2018-08-19 2018-02-19 Standard 3 178 True 8 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 220.00 HighSeason [2-3] [160-260] [1-1] HighSeason [2-3] Standard [160-260] [1-1] True 7
8 1 0 0 0 320.00 0.0 0.0 320.00 1 0.0 64 64 2018-08-17 2018-08-19 2018-02-19 Standard 2 179 True 9 61778 364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba381c11614ac442d46895 0 160.00 HighSeason [2-3] [0-160] [1-1] HighSeason [2-3] Standard [0-160] [1-1] True 8
9 1 0 0 0 720.00 0.0 0.0 720.00 1 0.0 126 126 2018-08-16 2018-08-19 2018-02-19 Standard 3 178 True 8 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 240.00 HighSeason [2-3] [160-260] [1-1] HighSeason [2-3] Standard [160-260] [1-1] True 7
10 1 0 0 0 480.00 0.0 0.0 480.00 1 0.0 65 65 2018-08-16 2018-08-19 2018-02-19 Standard 3 178 True 8 61777 f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74bef0a1aa44da3296b9ab 0 160.00 HighSeason [2-3] [160-260] [1-1] HighSeason [2-3] Standard [160-260] [1-1] True 7
11 3 0 0 0 595.98 0.0 0.0 595.98 1 0.0 267 267 2018-08-10 2018-08-12 2018-02-19 Standard 2 172 True 10 61378 f929533e542ff1302069567400b5ba584dc27dcbd63aae039937b055ccb9fdb0 a90206b6164e13331d087034d9d9a963a9a4bcf8b2969f184b5b342440ca01e5 446e3878681ff4dac3067bf75b8ec3b7e5be2cf8a02c75ecdfe1df9265449dad 0 297.99 HighSeason [2-3] [160-260] [3-4] HighSeason [2-3] Standard [160-260] [3-4] True 9
12 4 0 0 0 915.00 0.0 0.0 915.00 1 0.0 370 362 2018-08-03 2018-08-06 2018-02-19 Standard 3 165 True 11 61105 8373dfd3bf2b44222dce774ee032a32b74a495f6ddd355c2ecdbb33bc1cb8fc9 37827cbb5aec4ed83d13193d766324efb93251cd44b28b11ef376365afffe7f7 27673c0003a9f52f8e73269e2e5799dd8d4c2b0bc0dc071ee6e6fc8666173766 0 305.00 HighSeason [2-3] [160-260] [3-4] HighSeason [2-3] Standard [160-260] [3-4] True 9
13 3 0 0 0 3579.40 0.0 0.0 3579.40 1 0.0 321 321 2018-07-22 2018-07-31 2018-02-19 Standard 9 153 True 12 61807 cdd642820a12bb4a8f2407ed1d02b67435991afef52d9ce896a6c495575d9751 5aeffc70468d19eba135a79c74024c206abe83c29fc6f02a48de4933b88b5df5 0 397.71 HighSeason [8-inf] [160-260] [3-4] HighSeason [8-inf] Standard [160-260] [3-4] True 10
14 4 0 0 0 0.00 NaN NaN 1.00 1 0.0 213 213 2018-07-13 2018-07-16 2018-02-19 Standard 3 144 True 14 61391 4d9a25ef49b785020f71d87e7202cd209bdc4197b2649c006fb9863c767324ea 25dec48fdcf1d2e7dbe97aff20d52cae0b2660f5eae1d84161a0be139ad94c32 97edf16b92140283616026230d86f5ded26d8e09863a76bff14535e706fa2027 0 0.00 HighSeason [2-3] [0-160] [3-4] HighSeason [2-3] Standard [0-160] [3-4] True 11

Base statistics

print("Number of users: {}".format(len(preprocessed_data['user_id'].unique())))
print()
print("Number of items: {}".format(len(preprocessed_data['item_id'].unique())))
print()
print("Number of interactions: {}".format(len(preprocessed_data)))
print()

n_user = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('item_id').count().sort_values(by='user_id', ascending=False)
n_user = n_user.rename(columns={'user_id': 'n_users'})
display(HTML(n_user.head(10).to_html()))

n_item = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('user_id').count().sort_values(by='item_id', ascending=False)
n_item = n_item.rename(columns={'item_id': 'n_items'})
display(HTML(n_item.head(10).to_html()))
Number of users: 14188

Number of items: 772

Number of interactions: 16102

n_users
item_id
99 679
28 581
103 581
41 413
249 226
9 225
124 224
1 205
109 204
16 197
n_items
user_id
706 337
1736 29
7779 27
96 24
1 23
50 23
115 22
1413 16
3336 13
2930 13

Prepare the dataset for recommenders

One could consider many features describing each interaction but from the business perspective term, length_of_stay_bucket, room_segment, weekend_stay are the most important.

item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

interactions_df = preprocessed_data.loc[
    :, ['user_id', 'item_id'] + item_features]

column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}

interactions_df.loc[:, 'term'] = pd.Categorical(
    interactions_df['term'], categories=column_values_dict['term'])
interactions_df.loc[:, 'length_of_stay_bucket'] = pd.Categorical(
    interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])
interactions_df.loc[:, 'rate_plan'] = pd.Categorical(
    interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])
interactions_df.loc[:, 'room_segment'] = pd.Categorical(
    interactions_df['room_segment'], categories=column_values_dict['room_segment'])
interactions_df.loc[:, 'n_people_bucket'] = pd.Categorical(
    interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])
interactions_df.loc[:, 'weekend_stay'] = pd.Categorical(
    interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])

interactions_df.to_csv(os.path.join(data_path, "hotel_data_interactions_df.csv"))

display(HTML(interactions_df.head(15).to_html()))
user_id item_id term length_of_stay_bucket rate_plan room_segment n_people_bucket weekend_stay
0 1 0 WinterVacation [2-3] Standard [260-360] [5-inf] True
1 2 1 WinterVacation [2-3] Standard [160-260] [3-4] True
2 3 2 WinterVacation [2-3] Standard [160-260] [2-2] False
3 4 3 WinterVacation [4-7] Standard [160-260] [3-4] True
4 5 4 WinterVacation [4-7] Standard [0-160] [2-2] True
5 6 5 Easter [4-7] Standard [260-360] [5-inf] True
6 7 6 OffSeason [2-3] Standard [260-360] [5-inf] True
7 8 7 HighSeason [2-3] Standard [160-260] [1-1] True
8 9 8 HighSeason [2-3] Standard [0-160] [1-1] True
9 8 7 HighSeason [2-3] Standard [160-260] [1-1] True
10 8 7 HighSeason [2-3] Standard [160-260] [1-1] True
11 10 9 HighSeason [2-3] Standard [160-260] [3-4] True
12 11 9 HighSeason [2-3] Standard [160-260] [3-4] True
13 12 10 HighSeason [8-inf] Standard [160-260] [3-4] True
14 14 11 HighSeason [2-3] Standard [0-160] [3-4] True