REK-proj-2/data_preprocessing/data_preprocessing_toolkit.py
2021-06-28 20:18:14 +02:00

279 lines
11 KiB
Python

# Load libraries ---------------------------------------------
from datetime import datetime, timedelta
from dateutil.easter import easter
from data_preprocessing.dataset_specification import DatasetSpecification
import pandas as pd
import numpy as np
# ------------------------------------------------------------
class DataPreprocessingToolkit(object):
def __init__(self):
dataset_specification = DatasetSpecification()
self.sum_columns = dataset_specification.get_sum_columns()
self.mean_columns = dataset_specification.get_mean_columns()
self.mode_columns = dataset_specification.get_mode_columns()
self.first_columns = dataset_specification.get_first_columns()
self.nights_buckets = dataset_specification.get_nights_buckets()
self.npeople_buckets = dataset_specification.get_npeople_buckets()
self.room_segment_buckets = dataset_specification.get_room_segment_buckets()
self.arrival_terms = dataset_specification.get_arrival_terms()
self.item_features_columns = dataset_specification.get_items_df_feature_columns()
# #########################
# Entire datasets functions
# #########################
def fix_date_to(self, df):
df.loc[:, "date_to"] = df["date_to"].apply(lambda x: x + timedelta(days=1))
return df
def add_length_of_stay(self, df):
df.loc[:, "length_of_stay"] = (df["date_to"] - df["date_from"]).dt.days
return df
def add_book_to_arrival(self, df):
df.loc[:, "book_to_arrival"] = (df["date_from"] - df["booking_date"]).dt.days
return df
def add_nrooms(self, df):
df.loc[:, "n_rooms"] = 1
return df
def add_weekend_stay(self, df):
s = df["date_from"].dt.dayofweek
e = df["date_to"].dt.dayofweek
dt = (df["date_to"] - df["date_from"]).dt.days
df.loc[:, "weekend_stay"] = (((s >= 4) & (s != 6)) | (e >= 5) | ((e < s) & (s != 6)) | (dt >= 6))
df.loc[:, "weekend_stay"] = df["weekend_stay"].replace({True: 'True', False: 'False'})
return df
def add_night_price(self, df):
df.loc[:, "night_price"] = np.round(df["accomodation_price"] / df["length_of_stay"] / df["n_rooms"], 2)
return df
def clip_book_to_arrival(self, df):
df.loc[:, "book_to_arrival"] = np.maximum(df["book_to_arrival"], 0)
return df
def sum_npeople(self, df):
df.loc[:, "n_people"] = np.maximum(df["n_people"] + df["n_children_1"] + df["n_children_2"] + df["n_children_3"], 1)
return df
def filter_out_company_clients(self, df):
df = df.loc[df["is_company"] == 0]
return df
def filter_out_long_stays(self, df):
df = df.loc[df["length_of_stay"] <= 21]
return df
def leave_one_from_group_reservations(self, df):
unique_group_rows = []
df.loc[:, "group_id"] = df["group_id"].fillna(-1)
group_ids = []
for idx, row in df.iterrows():
if row["group_id"] != -1:
if row["group_id"] not in group_ids:
unique_group_rows.append(row)
group_ids.append(row["group_id"])
else:
unique_group_rows.append(row)
cleaned_dataset = pd.DataFrame(unique_group_rows, columns=df.columns)
return df
def aggregate_group_reservations(self, df):
non_group_reservations = df.loc[df["group_id"] == "",
self.sum_columns + self.mean_columns + self.mode_columns + self.first_columns]
group_reservations = df.loc[df["group_id"] != ""]
agg_datasets = [group_reservations.loc[:, ["group_id"] + self.sum_columns].groupby("group_id").sum(),
group_reservations.loc[:, ["group_id"] + self.mean_columns].groupby("group_id").mean(),
group_reservations.loc[:, ["group_id"] + self.mode_columns].groupby("group_id").agg(lambda x: x.value_counts().index[0]),
group_reservations.loc[:, ["group_id"] + self.first_columns].groupby("group_id").first()]
group_reservations = agg_datasets[0]
for i in range(1, len(agg_datasets)):
group_reservations = group_reservations.merge(agg_datasets[i], on="group_id")
group_reservations = group_reservations.reset_index(drop=True)
df = pd.concat([non_group_reservations, group_reservations])
return df
def leave_only_ota(self, df):
df = df.loc[df.loc[:, "Source"].apply(lambda x: "booking" in x.lower() or "expedia" in x.lower())]
return df
def map_date_to_term_datasets(self, df):
df.loc[:, "date_from"] = df["date_from"].astype(str).apply(lambda x: x[:10])
df.loc[:, 'term'] = df['date_from'].apply(lambda x: self.map_date_to_term(x))
return df
def map_length_of_stay_to_nights_buckets(self, df):
df.loc[:, 'length_of_stay_bucket'] = df['length_of_stay'].apply(lambda x: self.map_value_to_bucket(x, self.nights_buckets))
return df
def map_night_price_to_room_segment_buckets(self, df):
night_prices = df.loc[df['accomodation_price'] > 1]\
.groupby('room_group_id')['night_price'].mean().reset_index()
night_prices.columns = ['room_group_id', 'room_night_price']
df = pd.merge(df, night_prices, on=['room_group_id'], how='left')
df.loc[df['room_night_price'].isnull(), 'room_night_price'] = 0.0
df.loc[:, 'room_segment'] = df['room_night_price'].apply(
lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
df = df.drop(columns=['room_night_price'])
return df
# def map_night_price_to_room_segment_buckets(self, df):
# night_prices = df.loc[df['accomodation_price'] > 1]\
# .groupby(['term', 'room_group_id'])['night_price'].mean().reset_index()
# night_prices.columns = ['term', 'room_group_id', 'termnight_price']
# df = pd.merge(df, night_prices, on=['term', 'room_group_id'], how='left')
# df.loc[:, 'room_segment'] = df['termnight_price'].apply(
# lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
# df = df.drop(columns=['termnight_price'])
# return df
def map_npeople_to_npeople_buckets(self, df):
df.loc[:, 'n_people_bucket'] = df['n_people'].apply(lambda x: self.map_value_to_bucket(x, self.npeople_buckets))
return df
def map_item_to_item_id(self, df):
df.loc[:, 'item'] = df[self.item_features_columns].astype(str).agg(' '.join, axis=1)
ids = df['item'].unique().tolist()
mapping = {ids[i]: i for i in range(len(ids))}
df['item_id'] = df['item'].apply(lambda x: mapping[x])
return df
def add_interaction_id(self, df):
df.loc[:, 'interaction_id'] = range(df.shape[0])
return df
# ################
# Column functions
# ################
def bundle_period(self, diff):
diff = float(diff)
if int(diff) < 0:
return "<0"
elif int(diff) <= 7:
return diff
elif 7 < int(diff) <= 14:
return "<14"
elif 14 < int(diff) <= 30:
return "<30"
elif 30 < int(diff) <= 60:
return "<60"
elif 60 < int(diff) <= 180:
return "<180"
elif int(diff) > 180:
return ">180"
def bundle_price(self, price):
mod = 300.0
return int((price + mod / 2) / mod) * mod
def map_date_to_season(self, date):
day = int(date[8:10])
month = int(date[5:7])
if (month == 12 and day >= 21) or (month == 1) or (month == 2) or (month == 3 and day <= 19):
return "Winter"
if (month == 3 and day >= 20) or (month == 4) or (month == 5) or (month == 6 and day <= 20):
return "Spring"
if (month == 6 and day >= 21) or (month == 7) or (month == 8) or (month == 9 and day <= 22):
return "Summer"
if (month == 9 and day >= 23) or (month == 10) or (month == 11) or (month == 12 and day <= 20):
return "Autumn"
def map_value_to_bucket(self, value, buckets):
if value == "":
return str(buckets[0]).replace(", ", "-")
for bucket in buckets:
if bucket[0] <= value <= bucket[1]:
return str(bucket).replace(", ", "-")
def map_date_to_term(self, date):
m = int(date[5:7])
d = int(date[8:10])
term = None
for arrival_term in self.arrival_terms:
if arrival_term == "Easter":
year = int(date[:4])
easter_date = easter(year)
easter_start = easter_date + timedelta(days=-4)
easter_end = easter_date + timedelta(days=1)
esm = easter_start.month
esd = easter_start.day
eem = easter_end.month
eed = easter_end.day
if ((m > esm) or (m == esm and d >= esd)) and ((m < eem) or (m == eem and d <= eed)):
term = arrival_term
break
elif arrival_term == "NewYear":
sm = self.arrival_terms[arrival_term][0]["start"]["m"]
sd = self.arrival_terms[arrival_term][0]["start"]["d"]
em = self.arrival_terms[arrival_term][0]["end"]["m"]
ed = self.arrival_terms[arrival_term][0]["end"]["d"]
if ((m > sm) or (m == sm and d >= sd)) or ((m < em) or (m == em and d <= ed)):
term = arrival_term
break
else:
is_match = False
for i in range(len(self.arrival_terms[arrival_term])):
sm = self.arrival_terms[arrival_term][i]["start"]["m"]
sd = self.arrival_terms[arrival_term][i]["start"]["d"]
em = self.arrival_terms[arrival_term][i]["end"]["m"]
ed = self.arrival_terms[arrival_term][i]["end"]["d"]
if ((m > sm) or (m == sm and d >= sd)) and ((m < em) or (m == em and d <= ed)):
term = arrival_term
is_match = True
break
if is_match:
break
return term
def map_dates_to_terms(self, dates):
terms = []
for date in dates:
term = self.map_date_to_term(date)
terms.append(term)
return terms
def filter_out_historical_dates(self, date_list):
"""
Filters out past dates from a list of dates.
"""
future_dates = []
for date in date_list:
if date >= datetime.now():
future_dates.append(date.strftime("%Y-%m-%d"))
return future_dates