%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
from data_preprocessing.dataset_specification import DatasetSpecification
from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit
from data_preprocessing.people_identifier import PeopleIdentifier
# Load the raw hotel reservations and normalise the column dtypes.
data_path = os.path.join("data", "hotel_data")
original_data = pd.read_csv(os.path.join(data_path, "hotel_data_original.csv"), index_col=0)

# Literal "\N" cells (presumably the export's NULL marker - confirm against the
# data source) and real NaNs are both turned into empty strings before casting.
original_data = original_data.replace({"\\N": ""})
original_data = original_data.fillna("")

numeric_columns = [
    "n_people", "n_children_1", "n_children_2", "n_children_3",
    "discount", "accomodation_price", "meal_price", "service_price",
    "paid",
]
for column in numeric_columns:
    # Plain column assignment replaces the column together with its dtype.
    # `.loc[:, column] = ...` writes in place on pandas >= 2.0 and would leave
    # the column as `object`.
    original_data[column] = pd.to_numeric(original_data[column], errors="coerce")

# pandas >= 2.0 rejects the unit-less `np.datetime64` dtype in `astype`; the
# explicit nanosecond unit yields the same result on older versions as well.
datetime_dtype = "datetime64[ns]"
original_data = original_data.astype(
    {
        "date_from": datetime_dtype,
        "date_to": datetime_dtype,
        "booking_time": datetime_dtype,
        "booking_date": datetime_dtype,
        "n_people": np.int64,
        "n_children_1": np.int64,
        "n_children_2": np.int64,
        "n_children_3": np.int64,
        "discount": np.float64,
        "accomodation_price": np.float64,
        "meal_price": np.float64,
        "service_price": np.float64,
        "paid": np.float64,
    }
)

display(HTML(original_data.head(15).to_html()))
**Task:**
In the file data_preprocessing/data_preprocessing_toolkit write code for the add_length_of_stay and add_night_price methods:
# Derive the preprocessed dataset from a copy of the raw data: attach a user
# identifier, then run the toolkit transformations in a fixed order.
preprocessed_data = original_data.copy()

dataset_specification = DatasetSpecification()
dp_toolkit = DataPreprocessingToolkit()

id_column_names = dataset_specification.get_id_columns()

people_identifier = PeopleIdentifier()
preprocessed_data = people_identifier.add_pid(preprocessed_data, id_column_names, "user_id")

# Each step takes the current frame and returns the next one; the order below
# is the order in which the steps are applied.
preprocessing_steps = (
    dp_toolkit.fix_date_to,
    dp_toolkit.add_length_of_stay,  # Code this method
    dp_toolkit.add_book_to_arrival,
    dp_toolkit.add_nrooms,
    dp_toolkit.add_weekend_stay,
    dp_toolkit.clip_book_to_arrival,
    dp_toolkit.sum_npeople,
    dp_toolkit.filter_out_company_clients,
    dp_toolkit.filter_out_long_stays,
    dp_toolkit.aggregate_group_reservations,
    dp_toolkit.add_night_price,  # Code this method (remember that there can be many rooms)
)
for apply_step in preprocessing_steps:
    preprocessed_data = apply_step(preprocessed_data)

preprocessed_data = preprocessed_data.reset_index(drop=True)

# Sanity checks for the two methods to be implemented (row position -> expected value).
expected_length_of_stay = ((1, 3), (2, 2), (3, 7))
for position, nights in expected_length_of_stay:
    assert preprocessed_data.iloc[position]['length_of_stay'] == nights

expected_night_price = ((0, 330.76), (1, 231.13), (2, 183.40))
for position, price in expected_night_price:
    assert preprocessed_data.iloc[position]['night_price'] == price

display(HTML(preprocessed_data.head(15).to_html()))
Without this step every pair (user_id, item_id) would have at most a single interaction. The base item space has around $2^{25} \approx 33.5 \text{ mln}$ elements. Therefore, values for selected features are aggregated into buckets:
# Allowed bucket values for every item feature, keyed by feature column name.
column_values_dict = dict(
    term=[
        'WinterVacation', 'Easter', 'OffSeason', 'HighSeason',
        'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas',
    ],
    length_of_stay_bucket=['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    rate_plan=['Standard', 'Nonref'],
    room_segment=['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    n_people_bucket=['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    weekend_stay=['True', 'False'],
)
Explanation:
The buckets are chosen based on expert knowledge of people working in the hotel industry for many years. Alternatively, clustering techniques could be used, but on a relatively small dataset expert methods are significantly better.
The above aggregations reduce the number of possible items to $8 * 4 * 2 * 5 * 4 * 2 = 2560$.
**Task:**
In the file data_preprocessing/data_preprocessing_toolkit write code for the map_night_price_to_room_segment_buckets method. You must calculate the average of night prices for every room_group_id and map those averages to buckets (you can use the map_value_to_bucket method, which is available in data_preprocessing_toolkit; the buckets are available under self.room_segment_buckets). The new column should be named 'room_segment'. You have to pass all assertions.
# Map the raw feature values to buckets; the steps run in the listed order,
# each taking the current frame and returning the next one.
bucketing_steps = (
    dp_toolkit.map_date_to_term_datasets,
    dp_toolkit.map_length_of_stay_to_nights_buckets,
    dp_toolkit.map_night_price_to_room_segment_buckets,  # Code this method
    dp_toolkit.map_npeople_to_npeople_buckets,
)
for apply_step in bucketing_steps:
    preprocessed_data = apply_step(preprocessed_data)

# Sanity checks for the room segment mapping (row position -> expected bucket).
expected_room_segment = ((0, '[260-360]'), (1, '[160-260]'), (4, '[0-160]'))
for position, segment in expected_room_segment:
    assert preprocessed_data.iloc[position]['room_segment'] == segment

preprocessed_data = dp_toolkit.map_item_to_item_id(preprocessed_data)

# Persist the result next to the raw data, then preview the first rows.
preprocessed_path = os.path.join(data_path, "hotel_data_preprocessed.csv")
preprocessed_data.to_csv(preprocessed_path)

display(HTML(preprocessed_data.head(15).to_html()))
# Print the number of distinct users, distinct items and interaction rows,
# each followed by an empty line.
summary_lines = (
    ("Number of users: {}", len(preprocessed_data['user_id'].unique())),
    ("Number of items: {}", len(preprocessed_data['item_id'].unique())),
    ("Number of interactions: {}", len(preprocessed_data)),
)
for template, count in summary_lines:
    print(template.format(count))
    print()
# Interaction rows per item, largest first (top 10 shown).
n_user = (
    preprocessed_data[['user_id', 'item_id']]
    .groupby('item_id')
    .count()
    .sort_values(by='user_id', ascending=False)
    .rename(columns={'user_id': 'n_users'})
)
display(HTML(n_user.head(10).to_html()))

# Interaction rows per user, largest first (top 10 shown).
n_item = (
    preprocessed_data[['user_id', 'item_id']]
    .groupby('user_id')
    .count()
    .sort_values(by='item_id', ascending=False)
    .rename(columns={'item_id': 'n_items'})
)
display(HTML(n_item.head(10).to_html()))
One could consider many features describing each interaction, but from the business perspective term, length_of_stay_bucket, room_segment, and weekend_stay are the most important.
# Build the interactions table: user id, item id and the bucketed item
# features, with every feature column stored as a categorical.
item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

# Explicit copy so the column assignments below act on an independent frame
# rather than on a slice of `preprocessed_data` (avoids SettingWithCopyWarning).
interactions_df = preprocessed_data.loc[:, ['user_id', 'item_id'] + item_features].copy()

# Allowed values per feature; anything outside these lists becomes NaN once
# the column is converted to a categorical.
column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}

# NOTE(review): the 'weekend_stay' categories are strings; confirm the column
# holds 'True'/'False' strings rather than booleans, otherwise every value
# would fall outside the categories.
for feature in item_features:
    # Assign the whole column: on pandas >= 2.0 `df.loc[:, feature] = ...`
    # sets values in place and keeps the old dtype, so the column would never
    # actually become categorical.
    interactions_df[feature] = pd.Categorical(
        interactions_df[feature], categories=column_values_dict[feature])

interactions_df.to_csv(os.path.join(data_path, "hotel_data_interactions_df.csv"))

display(HTML(interactions_df.head(15).to_html()))