14 KiB
%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
from data_preprocessing.dataset_specification import DatasetSpecification
from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit
from data_preprocessing.people_identifier import PeopleIdentifier
Load original data
# Load the raw hotel data and give every column a usable dtype.
data_path = os.path.join("data", "hotel_data")
original_data = pd.read_csv(os.path.join(data_path, "hotel_data_original.csv"), index_col=0)

# Cells equal to the literal below are blanked (presumably the export's NULL
# marker - confirm against the raw file), then remaining missing cells are
# filled with empty strings as well.
original_data = original_data.replace({"\\\\N": ""})
original_data = original_data.fillna("")

numeric_columns = [
    "n_people", "n_children_1", "n_children_2", "n_children_3",
    "discount", "accomodation_price", "meal_price", "service_price",
    "paid",
]
for column in numeric_columns:
    # errors="coerce" turns unparseable entries into NaN. Assigning the column
    # directly (instead of through .loc[:, column]) replaces it, so the parsed
    # numeric dtype is kept rather than being written back into the old column.
    original_data[column] = pd.to_numeric(original_data[column], errors="coerce")

# The datetime dtype must carry an explicit unit: pandas 2.x raises a TypeError
# for the unit-less np.datetime64, while "datetime64[ns]" works on old and new
# versions alike.
original_data = original_data.astype(
    {
        "date_from": "datetime64[ns]",
        "date_to": "datetime64[ns]",
        "booking_time": "datetime64[ns]",
        "booking_date": "datetime64[ns]",
        "n_people": np.int64,
        "n_children_1": np.int64,
        "n_children_2": np.int64,
        "n_children_3": np.int64,
        "discount": np.float64,
        "accomodation_price": np.float64,
        "meal_price": np.float64,
        "service_price": np.float64,
        "paid": np.float64,
    }
)

# Preview the first 15 rows.
display(HTML(original_data.head(15).to_html()))
Preprocess the data
- Identify users by client_id, name hash, phone hash, email hash.
- Fix date_to - originally it points to the last full day of stay, not the departure date.
- Add length of stay.
- Add book to arrival.
- Add number of rooms (important for group reservations).
- Add indicator for stays encompassing a weekend.
- Add night price.
- Fix book to arrival to be not smaller than 0.
- Filter out companies as recommendations for such clients should work differently.
- Aggregate group reservations into single interactions.
Task:
In the file data_preprocessing/data_preprocessing_toolkit write code for the add_length_of_stay and add_night_price methods:
- add_length_of_stay - should add 'length_of_stay' variable to the DataFrame, which counts the number of nights the customer stayed at the hotel,
- add_night_price - should add 'night_price' column to the dataset DataFrame, which shows the average accommodation price per night per room (there can be many rooms in group reservations - 'n_rooms' column). You have to pass all assertions.
# Build the helper objects and tag every row of a copy of original_data with a
# person identifier stored in the "user_id" column.
dataset_specification = DatasetSpecification()
dp_toolkit = DataPreprocessingToolkit()
id_column_names = dataset_specification.get_id_columns()
people_identifier = PeopleIdentifier()
preprocessed_data = people_identifier.add_pid(original_data.copy(), id_column_names, "user_id")

# Toolkit steps, applied strictly in this order; each one receives the current
# DataFrame and its return value is fed to the next step.
preprocessing_steps = (
    dp_toolkit.fix_date_to,
    dp_toolkit.add_length_of_stay,  # Code this method
    dp_toolkit.add_book_to_arrival,
    dp_toolkit.add_nrooms,
    dp_toolkit.add_weekend_stay,
    dp_toolkit.clip_book_to_arrival,
    dp_toolkit.sum_npeople,
    dp_toolkit.filter_out_company_clients,
    dp_toolkit.filter_out_long_stays,
    dp_toolkit.aggregate_group_reservations,
    dp_toolkit.add_night_price,  # Code this method (remember that there can be many rooms)
)
for step in preprocessing_steps:
    preprocessed_data = step(preprocessed_data)

preprocessed_data = preprocessed_data.reset_index(drop=True)

# Sanity checks on the first rows: positional row index -> expected value.
expected_length_of_stay = {1: 3, 2: 2, 3: 7}
for row_position, expected_nights in expected_length_of_stay.items():
    assert preprocessed_data.iloc[row_position]['length_of_stay'] == expected_nights

expected_night_price = {0: 330.76, 1: 231.13, 2: 183.40}
for row_position, expected_price in expected_night_price.items():
    assert preprocessed_data.iloc[row_position]['night_price'] == expected_price

display(HTML(preprocessed_data.head(15).to_html()))
Bucket important features to reduce the offer space size
Without this step every pair (user_id, item_id) would have at most a single interaction. The base item space has around $2^{25} \approx 33.5 \text{ mln}$ elements. Therefore, values for selected features are aggregated into buckets:
# Allowed bucket labels for every item feature, keyed by column name.
column_values_dict = dict(
    term=[
        'WinterVacation', 'Easter', 'OffSeason', 'HighSeason',
        'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas',
    ],
    length_of_stay_bucket=['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    rate_plan=['Standard', 'Nonref'],
    room_segment=['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    n_people_bucket=['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    # Note: the labels are the strings 'True'/'False', not booleans.
    weekend_stay=['True', 'False'],
)
Explanation:
- term - the term of the arrival date,
- length_of_stay_bucket - aggregated length of stay,
- rate_plan - rate plan which distinguishes if a given booking was refundable or nonrefundable (in reality rate plans are much more complex, they define prices for all rooms for every date, they include features like free breakfast, wine in the room etc.),
- room_segment - for every room its average price is calculated, then every room is assigned to an appropriate price range, which is a proxy for room quality,
- n_people_bucket - aggregated number of people in a reservation,
- weekend_stay - indicates if the stay encompassed a weekend.
The buckets are chosen based on expert knowledge of people working in the hotel industry for many years. Alternatively, clustering techniques could be used, but on a relatively small dataset expert methods are significantly better.
The above aggregations reduce the number of possible items to $8 * 4 * 2 * 5 * 4 * 2 = 2560$.
The recommenders will be trained and evaluated on such aggregated data. To get a proper offer for a user one would have to decode those buckets into specific values, but this is a much easier task and can be achieved based on simple rules.
Task:
In the file data_preprocessing/data_preprocessing_toolkit write code for the map_night_price_to_room_segment_buckets method. You must calculate the average of night prices for every room_group_id and map those prices to buckets (you can apply the map_value_to_bucket method which is available in the data_preprocessing_toolkit, the buckets are available under self.room_segment_buckets). The new column should be named 'room_segment'. You have to pass all assertions.
# Bucketing steps, applied in order; each receives the current DataFrame and
# its return value is passed to the next one.
bucketing_steps = (
    dp_toolkit.map_date_to_term_datasets,
    dp_toolkit.map_length_of_stay_to_nights_buckets,
    dp_toolkit.map_night_price_to_room_segment_buckets,  # Code this method
    dp_toolkit.map_npeople_to_npeople_buckets,
)
for bucketing_step in bucketing_steps:
    preprocessed_data = bucketing_step(preprocessed_data)

# Check the room segments before item ids are attached:
# positional row index -> expected bucket label.
expected_room_segment = {0: '[260-360]', 1: '[160-260]', 4: '[0-160]'}
for row_position, expected_segment in expected_room_segment.items():
    assert preprocessed_data.iloc[row_position]['room_segment'] == expected_segment

preprocessed_data = dp_toolkit.map_item_to_item_id(preprocessed_data)

# Persist the result next to the original data and preview the first 15 rows.
preprocessed_path = os.path.join(data_path, "hotel_data_preprocessed.csv")
preprocessed_data.to_csv(preprocessed_path)
display(HTML(preprocessed_data.head(15).to_html()))
Base statistics
# Headline counts; every line is followed by one blank line.
n_unique_users = len(preprocessed_data['user_id'].unique())
n_unique_items = len(preprocessed_data['item_id'].unique())
print(f"Number of users: {n_unique_users}", end="\n\n")
print(f"Number of items: {n_unique_items}", end="\n\n")
print(f"Number of interactions: {len(preprocessed_data)}", end="\n\n")

# Both rankings below are computed from the same two columns.
user_item_pairs = preprocessed_data.loc[:, ['user_id', 'item_id']]

# Rows per item_id (count of non-null user_id values), largest first; top 10 shown.
n_user = (
    user_item_pairs
    .groupby('item_id')
    .count()
    .rename(columns={'user_id': 'n_users'})
    .sort_values(by='n_users', ascending=False)
)
display(HTML(n_user.head(10).to_html()))

# Rows per user_id (count of non-null item_id values), largest first; top 10 shown.
n_item = (
    user_item_pairs
    .groupby('user_id')
    .count()
    .rename(columns={'item_id': 'n_items'})
    .sort_values(by='n_items', ascending=False)
)
display(HTML(n_item.head(10).to_html()))
Prepare the dataset for recommenders
One could consider many features describing each interaction but from the business perspective term, length_of_stay_bucket, room_segment, weekend_stay are the most important.
# Columns that describe an item (an offer) in the interactions dataset.
item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

# Take an explicit copy: the frame is modified below, and a copy guarantees the
# writes never alias preprocessed_data or trigger SettingWithCopyWarning.
interactions_df = preprocessed_data.loc[:, ['user_id', 'item_id'] + item_features].copy()

# Allowed bucket labels for every item feature, keyed by column name.
column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False'],
}

# Convert every item feature to a categorical with its fixed category list.
# Plain column assignment replaces the column, so it really receives the
# categorical dtype; `df.loc[:, col] = ...` writes into the existing column in
# place on pandas >= 2.0 and would leave the old dtype untouched.
# pd.Categorical maps values that are not among the given categories to NaN.
for feature, categories in column_values_dict.items():
    interactions_df[feature] = pd.Categorical(interactions_df[feature], categories=categories)

# Persist the interactions next to the other data files and preview 15 rows.
interactions_df.to_csv(os.path.join(data_path, "hotel_data_interactions_df.csv"))
display(HTML(interactions_df.head(15).to_html()))