Add recommender with HR@10 0.116
commit 4cf2994aca
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
.ipynb_checkpoints
__pycache__
52
README.md
Normal file
@ -0,0 +1,52 @@
# Recommender Systems class - Project 2

## Preparing your system

1. Install [Docker](https://docs.docker.com/engine/install/).

2. Fork this repository to your GitHub account.

3. Run the Jupyter Docker image:

```bash
docker run \
    -d -p 8888:8888 \
    -v DIRECTORY:/home/jovyan/REK \
    --name REK \
    jupyter/minimal-notebook
```

Remember to change **DIRECTORY** to the directory where all the project files can be found. You can use `$(pwd)` if your current directory is the right one.

4. Get the login link with the following command:

```bash
docker logs REK 2>&1 | grep -o 'http://127.0.0.1:8888.*' | tail -n1
```

Example output:

```
http://127.0.0.1:8888/?token=2bb816a4bc36a4bdbf64e0c9a89f336ae5404a01d15e442c
```

5. Prepare the conda environment:

```bash
docker exec REK bash -c "
    conda env create --name rs-class-env -f /home/jovyan/REK/environment.yml;
    python -m ipykernel install --user --name=rs-class-env"
```

6. You can start/stop the container whenever you want:

```bash
docker stop REK
docker start REK
```

If you want to start from scratch, you can remove the container:

```bash
docker stop REK
docker rm REK
```

Now you are ready to work!
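
Optionally, you can verify that the kernel was registered. This check is not part of the original instructions; it assumes the default layout of the `jupyter/minimal-notebook` image and the kernel name used in step 5:

```bash
# List the kernels visible to Jupyter inside the container;
# rs-class-env should appear after completing step 5.
docker exec REK jupyter kernelspec list
```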
16103
data/hotel_data/hotel_data_interactions_df.csv
Normal file
File diff suppressed because it is too large
17251
data/hotel_data/hotel_data_original.csv
Normal file
File diff suppressed because it is too large
16103
data/hotel_data/hotel_data_preprocessed.csv
Normal file
File diff suppressed because it is too large
0
data_preprocessing/__init__.py
Normal file
278
data_preprocessing/data_preprocessing_toolkit.py
Normal file
@ -0,0 +1,278 @@
|
|||||||
|
# Load libraries ---------------------------------------------
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from dateutil.easter import easter
|
||||||
|
from data_preprocessing.dataset_specification import DatasetSpecification
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class DataPreprocessingToolkit(object):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
dataset_specification = DatasetSpecification()
|
||||||
|
|
||||||
|
self.sum_columns = dataset_specification.get_sum_columns()
|
||||||
|
self.mean_columns = dataset_specification.get_mean_columns()
|
||||||
|
self.mode_columns = dataset_specification.get_mode_columns()
|
||||||
|
self.first_columns = dataset_specification.get_first_columns()
|
||||||
|
|
||||||
|
self.nights_buckets = dataset_specification.get_nights_buckets()
|
||||||
|
self.npeople_buckets = dataset_specification.get_npeople_buckets()
|
||||||
|
self.room_segment_buckets = dataset_specification.get_room_segment_buckets()
|
||||||
|
|
||||||
|
self.arrival_terms = dataset_specification.get_arrival_terms()
|
||||||
|
|
||||||
|
self.item_features_columns = dataset_specification.get_items_df_feature_columns()
|
||||||
|
|
||||||
|
# #########################
|
||||||
|
# Entire datasets functions
|
||||||
|
# #########################
|
||||||
|
|
||||||
|
def fix_date_to(self, df):
|
||||||
|
df.loc[:, "date_to"] = df["date_to"].apply(lambda x: x + timedelta(days=1))
|
||||||
|
return df
|
||||||
|
|
||||||
|
def add_length_of_stay(self, df):
|
||||||
|
df.loc[:, "length_of_stay"] = (df["date_to"] - df["date_from"]).dt.days
|
||||||
|
return df
|
||||||
|
|
||||||
|
def add_book_to_arrival(self, df):
|
||||||
|
df.loc[:, "book_to_arrival"] = (df["date_from"] - df["booking_date"]).dt.days
|
||||||
|
return df
|
||||||
|
|
||||||
|
def add_nrooms(self, df):
|
||||||
|
df.loc[:, "n_rooms"] = 1
|
||||||
|
return df
|
||||||
|
|
||||||
|
def add_weekend_stay(self, df):
|
||||||
|
s = df["date_from"].dt.dayofweek
|
||||||
|
e = df["date_to"].dt.dayofweek
|
||||||
|
dt = (df["date_to"] - df["date_from"]).dt.days
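# Added comment (an interpretation of the condition below, not from the original authors):
# the flag marks stays that appear to include a Friday or Saturday night, i.e. arrival on
# Friday/Saturday, departure on Saturday/Sunday, a stay wrapping past the weekend, or any
# stay of 6 or more nights.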
|
||||||
|
df.loc[:, "weekend_stay"] = (((s >= 4) & (s != 6)) | (e >= 5) | ((e < s) & (s != 6)) | (dt >= 6))
|
||||||
|
df.loc[:, "weekend_stay"] = df["weekend_stay"].replace({True: 'True', False: 'False'})
|
||||||
|
return df
|
||||||
|
|
||||||
|
def add_night_price(self, df):
|
||||||
|
df.loc[:, "night_price"] = np.round(df["accomodation_price"] / df["length_of_stay"] / df["n_rooms"], 2)
|
||||||
|
return df
|
||||||
|
|
||||||
|
def clip_book_to_arrival(self, df):
|
||||||
|
df.loc[:, "book_to_arrival"] = np.maximum(df["book_to_arrival"], 0)
|
||||||
|
return df
|
||||||
|
|
||||||
|
def sum_npeople(self, df):
|
||||||
|
df.loc[:, "n_people"] = np.maximum(df["n_people"] + df["n_children_1"] + df["n_children_2"] + df["n_children_3"], 1)
|
||||||
|
return df
|
||||||
|
|
||||||
|
def filter_out_company_clients(self, df):
|
||||||
|
df = df.loc[df["is_company"] == 0]
|
||||||
|
return df
|
||||||
|
|
||||||
|
def filter_out_long_stays(self, df):
|
||||||
|
df = df.loc[df["length_of_stay"] <= 21]
|
||||||
|
return df
|
||||||
|
|
||||||
|
def leave_one_from_group_reservations(self, df):
|
||||||
|
unique_group_rows = []
|
||||||
|
|
||||||
|
df.loc[:, "group_id"] = df["group_id"].fillna(-1)
|
||||||
|
|
||||||
|
group_ids = []
|
||||||
|
for idx, row in df.iterrows():
|
||||||
|
if row["group_id"] != -1:
|
||||||
|
if row["group_id"] not in group_ids:
|
||||||
|
unique_group_rows.append(row)
|
||||||
|
group_ids.append(row["group_id"])
|
||||||
|
else:
|
||||||
|
unique_group_rows.append(row)
|
||||||
|
|
||||||
|
cleaned_dataset = pd.DataFrame(unique_group_rows, columns=df.columns)
|
||||||
|
|
||||||
|
return cleaned_dataset
|
||||||
|
|
||||||
|
def aggregate_group_reservations(self, df):
|
||||||
|
non_group_reservations = df.loc[df["group_id"] == "",
|
||||||
|
self.sum_columns + self.mean_columns + self.mode_columns + self.first_columns]
|
||||||
|
group_reservations = df.loc[df["group_id"] != ""]
|
||||||
|
|
||||||
|
agg_datasets = [group_reservations.loc[:, ["group_id"] + self.sum_columns].groupby("group_id").sum(),
|
||||||
|
group_reservations.loc[:, ["group_id"] + self.mean_columns].groupby("group_id").mean(),
|
||||||
|
group_reservations.loc[:, ["group_id"] + self.mode_columns].groupby("group_id").agg(lambda x: x.value_counts().index[0]),
|
||||||
|
group_reservations.loc[:, ["group_id"] + self.first_columns].groupby("group_id").first()]
|
||||||
|
|
||||||
|
group_reservations = agg_datasets[0]
|
||||||
|
for i in range(1, len(agg_datasets)):
|
||||||
|
group_reservations = group_reservations.merge(agg_datasets[i], on="group_id")
|
||||||
|
|
||||||
|
group_reservations = group_reservations.reset_index(drop=True)
|
||||||
|
|
||||||
|
df = pd.concat([non_group_reservations, group_reservations])
|
||||||
|
|
||||||
|
return df
|
||||||
|
|
||||||
|
def leave_only_ota(self, df):
|
||||||
|
df = df.loc[df.loc[:, "Source"].apply(lambda x: "booking" in x.lower() or "expedia" in x.lower())]
|
||||||
|
return df
|
||||||
|
|
||||||
|
def map_date_to_term_datasets(self, df):
|
||||||
|
df.loc[:, "date_from"] = df["date_from"].astype(str).apply(lambda x: x[:10])
|
||||||
|
df.loc[:, 'term'] = df['date_from'].apply(lambda x: self.map_date_to_term(x))
|
||||||
|
return df
|
||||||
|
|
||||||
|
def map_length_of_stay_to_nights_buckets(self, df):
|
||||||
|
df.loc[:, 'length_of_stay_bucket'] = df['length_of_stay'].apply(lambda x: self.map_value_to_bucket(x, self.nights_buckets))
|
||||||
|
return df
|
||||||
|
|
||||||
|
def map_night_price_to_room_segment_buckets(self, df):
|
||||||
|
night_prices = df.loc[df['accomodation_price'] > 1]\
|
||||||
|
.groupby('room_group_id')['night_price'].mean().reset_index()
|
||||||
|
night_prices.columns = ['room_group_id', 'room_night_price']
|
||||||
|
df = pd.merge(df, night_prices, on=['room_group_id'], how='left')
|
||||||
|
df.loc[df['room_night_price'].isnull(), 'room_night_price'] = 0.0
|
||||||
|
df.loc[:, 'room_segment'] = df['room_night_price'].apply(
|
||||||
|
lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
|
||||||
|
df = df.drop(columns=['room_night_price'])
|
||||||
|
return df
|
||||||
|
|
||||||
|
# def map_night_price_to_room_segment_buckets(self, df):
|
||||||
|
# night_prices = df.loc[df['accomodation_price'] > 1]\
|
||||||
|
# .groupby(['term', 'room_group_id'])['night_price'].mean().reset_index()
|
||||||
|
# night_prices.columns = ['term', 'room_group_id', 'termnight_price']
|
||||||
|
# df = pd.merge(df, night_prices, on=['term', 'room_group_id'], how='left')
|
||||||
|
# df.loc[:, 'room_segment'] = df['termnight_price'].apply(
|
||||||
|
# lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
|
||||||
|
# df = df.drop(columns=['termnight_price'])
|
||||||
|
# return df
|
||||||
|
|
||||||
|
def map_npeople_to_npeople_buckets(self, df):
|
||||||
|
df.loc[:, 'n_people_bucket'] = df['n_people'].apply(lambda x: self.map_value_to_bucket(x, self.npeople_buckets))
|
||||||
|
return df
|
||||||
|
|
||||||
|
def map_item_to_item_id(self, df):
|
||||||
|
df.loc[:, 'item'] = df[self.item_features_columns].astype(str).agg(' '.join, axis=1)
|
||||||
|
|
||||||
|
ids = df['item'].unique().tolist()
|
||||||
|
mapping = {ids[i]: i for i in range(len(ids))}
|
||||||
|
|
||||||
|
df['item_id'] = df['item'].apply(lambda x: mapping[x])
|
||||||
|
|
||||||
|
return df
|
||||||
|
|
||||||
|
def add_interaction_id(self, df):
|
||||||
|
df.loc[:, 'interaction_id'] = range(df.shape[0])
|
||||||
|
return df
|
||||||
|
|
||||||
|
# ################
|
||||||
|
# Column functions
|
||||||
|
# ################
|
||||||
|
|
||||||
|
def bundle_period(self, diff):
|
||||||
|
diff = float(diff)
|
||||||
|
if int(diff) < 0:
|
||||||
|
return "<0"
|
||||||
|
elif int(diff) <= 7:
|
||||||
|
return diff
|
||||||
|
elif 7 < int(diff) <= 14:
|
||||||
|
return "<14"
|
||||||
|
elif 14 < int(diff) <= 30:
|
||||||
|
return "<30"
|
||||||
|
elif 30 < int(diff) <= 60:
|
||||||
|
return "<60"
|
||||||
|
elif 60 < int(diff) <= 180:
|
||||||
|
return "<180"
|
||||||
|
elif int(diff) > 180:
|
||||||
|
return ">180"
|
||||||
|
|
||||||
|
def bundle_price(self, price):
|
||||||
|
mod = 300.0
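# Added comment: rounds the price to the nearest multiple of `mod` (300 here);
# worked example for illustration: 440 -> 300 and 460 -> 600.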
|
||||||
|
return int((price + mod / 2) / mod) * mod
|
||||||
|
|
||||||
|
def map_date_to_season(self, date):
|
||||||
|
day = int(date[8:10])
|
||||||
|
month = int(date[5:7])
|
||||||
|
if (month == 12 and day >= 21) or (month == 1) or (month == 2) or (month == 3 and day <= 19):
|
||||||
|
return "Winter"
|
||||||
|
if (month == 3 and day >= 20) or (month == 4) or (month == 5) or (month == 6 and day <= 20):
|
||||||
|
return "Spring"
|
||||||
|
if (month == 6 and day >= 21) or (month == 7) or (month == 8) or (month == 9 and day <= 22):
|
||||||
|
return "Summer"
|
||||||
|
if (month == 9 and day >= 23) or (month == 10) or (month == 11) or (month == 12 and day <= 20):
|
||||||
|
return "Autumn"
|
||||||
|
|
||||||
|
def map_value_to_bucket(self, value, buckets):
|
||||||
|
if value == "":
|
||||||
|
return str(buckets[0]).replace(", ", "-")
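# Added example for illustration:
# map_value_to_bucket(5, [[0, 1], [2, 3], [4, 7], [8, np.inf]]) returns "[4-7]",
# i.e. the matched bucket rendered as a string with ", " replaced by "-".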
|
||||||
|
for bucket in buckets:
|
||||||
|
if bucket[0] <= value <= bucket[1]:
|
||||||
|
return str(bucket).replace(", ", "-")
|
||||||
|
|
||||||
|
def map_date_to_term(self, date):
|
||||||
|
|
||||||
|
m = int(date[5:7])
|
||||||
|
d = int(date[8:10])
|
||||||
|
term = None
|
||||||
|
|
||||||
|
for arrival_term in self.arrival_terms:
|
||||||
|
if arrival_term == "Easter":
|
||||||
|
year = int(date[:4])
|
||||||
|
easter_date = easter(year)
|
||||||
|
easter_start = easter_date + timedelta(days=-4)
|
||||||
|
easter_end = easter_date + timedelta(days=1)
|
||||||
|
esm = easter_start.month
|
||||||
|
esd = easter_start.day
|
||||||
|
eem = easter_end.month
|
||||||
|
eed = easter_end.day
|
||||||
|
if ((m > esm) or (m == esm and d >= esd)) and ((m < eem) or (m == eem and d <= eed)):
|
||||||
|
term = arrival_term
|
||||||
|
break
|
||||||
|
|
||||||
|
elif arrival_term == "NewYear":
|
||||||
|
sm = self.arrival_terms[arrival_term][0]["start"]["m"]
|
||||||
|
sd = self.arrival_terms[arrival_term][0]["start"]["d"]
|
||||||
|
em = self.arrival_terms[arrival_term][0]["end"]["m"]
|
||||||
|
ed = self.arrival_terms[arrival_term][0]["end"]["d"]
|
||||||
|
if ((m > sm) or (m == sm and d >= sd)) or ((m < em) or (m == em and d <= ed)):
|
||||||
|
term = arrival_term
|
||||||
|
break
|
||||||
|
|
||||||
|
else:
|
||||||
|
is_match = False
|
||||||
|
|
||||||
|
for i in range(len(self.arrival_terms[arrival_term])):
|
||||||
|
sm = self.arrival_terms[arrival_term][i]["start"]["m"]
|
||||||
|
sd = self.arrival_terms[arrival_term][i]["start"]["d"]
|
||||||
|
em = self.arrival_terms[arrival_term][i]["end"]["m"]
|
||||||
|
ed = self.arrival_terms[arrival_term][i]["end"]["d"]
|
||||||
|
if ((m > sm) or (m == sm and d >= sd)) and ((m < em) or (m == em and d <= ed)):
|
||||||
|
term = arrival_term
|
||||||
|
is_match = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if is_match:
|
||||||
|
break
|
||||||
|
|
||||||
|
return term
|
||||||
|
|
||||||
|
def map_dates_to_terms(self, dates):
|
||||||
|
|
||||||
|
terms = []
|
||||||
|
for date in dates:
|
||||||
|
term = self.map_date_to_term(date)
|
||||||
|
terms.append(term)
|
||||||
|
|
||||||
|
return terms
|
||||||
|
|
||||||
|
def filter_out_historical_dates(self, date_list):
|
||||||
|
"""
|
||||||
|
Filters out past dates from a list of dates.
|
||||||
|
"""
|
||||||
|
future_dates = []
|
||||||
|
|
||||||
|
for date in date_list:
|
||||||
|
if date >= datetime.now():
|
||||||
|
future_dates.append(date.strftime("%Y-%m-%d"))
|
||||||
|
|
||||||
|
return future_dates
|
88
data_preprocessing/dataset_specification.py
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
# Load libraries ---------------------------------------------
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetSpecification(object):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ################
|
||||||
|
# Original dataset functions
|
||||||
|
# ################
|
||||||
|
|
||||||
|
def get_sum_columns(self):
|
||||||
|
return ["n_people", "n_children_1", "n_children_2", "n_children_3", "accomodation_price", "meal_price",
|
||||||
|
"service_price", "paid", "n_rooms"]
|
||||||
|
|
||||||
|
def get_mean_columns(self):
|
||||||
|
return ['discount']
|
||||||
|
|
||||||
|
def get_mode_columns(self):
|
||||||
|
return ["room_id", "room_group_id", "date_from", "date_to", "booking_date", "rate_plan",
|
||||||
|
"length_of_stay", "book_to_arrival", "weekend_stay"]
|
||||||
|
|
||||||
|
def get_first_columns(self):
|
||||||
|
return ["user_id", "client_id", "client_name", "email", "phone", "is_company"]
|
||||||
|
|
||||||
|
def get_id_columns(self):
|
||||||
|
return ["client_id", "client_name", "email", "phone"]
|
||||||
|
|
||||||
|
# ################
|
||||||
|
# Output dataset functions
|
||||||
|
# ################
|
||||||
|
|
||||||
|
def get_people_df_id_columns(self):
|
||||||
|
return ['user_id']
|
||||||
|
|
||||||
|
def get_people_df_feature_columns(self):
|
||||||
|
return []
|
||||||
|
|
||||||
|
def get_items_df_id_columns(self):
|
||||||
|
return ['item_id']
|
||||||
|
|
||||||
|
def get_items_df_feature_columns(self):
|
||||||
|
return ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']
|
||||||
|
|
||||||
|
def get_purchases_df_id_columns(self):
|
||||||
|
return ['user_id', 'item_id']
|
||||||
|
|
||||||
|
def get_purchases_df_feature_columns(self):
|
||||||
|
return []
|
||||||
|
|
||||||
|
# ################
|
||||||
|
# Mapping functions
|
||||||
|
# ################
|
||||||
|
|
||||||
|
def get_nights_buckets(self):
|
||||||
|
return [[0, 1], [2, 3], [4, 7], [8, np.inf]]
|
||||||
|
|
||||||
|
def get_npeople_buckets(self):
|
||||||
|
return [[1, 1], [2, 2], [3, 4], [5, np.inf]]
|
||||||
|
|
||||||
|
def get_room_segment_buckets(self):
|
||||||
|
return [[0, 160], [160, 260], [260, 360], [360, 500], [500, 900], [900, np.inf]]
|
||||||
|
|
||||||
|
def get_book_to_arrival_buckets(self):
|
||||||
|
return [[0, 0], [1, 2], [3, 4], [5, 7], [8, 14], [15, 30], [31, 60], [61, 90], [91, 180], [181, np.inf]]
|
||||||
|
|
||||||
|
def get_arrival_terms(self):
|
||||||
|
arrival_terms = {"Easter": [{"start": {"m": np.nan, "d": np.nan}, "end": {"m": np.nan, "d": np.nan}}],
|
||||||
|
# Treated with priority
|
||||||
|
"Christmas": [{"start": {"m": 12, "d": 22}, "end": {"m": 12, "d": 27}}],
|
||||||
|
"NewYear": [{"start": {"m": 12, "d": 28}, "end": {"m": 1, "d": 4}}],
|
||||||
|
"WinterVacation": [{"start": {"m": 1, "d": 5}, "end": {"m": 2, "d": 29}}],
|
||||||
|
"OffSeason": [
|
||||||
|
{"start": {"m": 3, "d": 1}, "end": {"m": 4, "d": 27}},
|
||||||
|
{"start": {"m": 5, "d": 6}, "end": {"m": 6, "d": 20}},
|
||||||
|
{"start": {"m": 9, "d": 26}, "end": {"m": 12, "d": 21}}],
|
||||||
|
"MayLongWeekend": [{"start": {"m": 4, "d": 28}, "end": {"m": 5, "d": 5}}],
|
||||||
|
"LowSeason": [
|
||||||
|
{"start": {"m": 6, "d": 21}, "end": {"m": 7, "d": 10}},
|
||||||
|
{"start": {"m": 8, "d": 23}, "end": {"m": 9, "d": 25}}],
|
||||||
|
"HighSeason": [{"start": {"m": 7, "d": 11}, "end": {"m": 8, "d": 22}}]}
|
||||||
|
return arrival_terms
|
77
data_preprocessing/people_identifier.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
# Load libraries ---------------------------------------------
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class PeopleIdentifier(object):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.id_column_names = []
|
||||||
|
self.pid_cname = ""
|
||||||
|
self.next_available_pid = 0
|
||||||
|
self.cid_to_pid = {} # {"col1": {cid1: pid1, cid2: pid2}, "col2":...}
|
||||||
|
self.pid_to_cid = {} # {pid1: {"col1": set(cid1, cid2, ...), "col2": set(...), ...}, pid2: ...}
|
||||||
|
self.data = None
|
||||||
|
|
||||||
|
def add_pid(self, data, id_column_names, pid_cname):
|
||||||
|
self.id_column_names = id_column_names
|
||||||
|
self.pid_cname = pid_cname
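# Added comment: this performs a greedy identity resolution. Reservations that share a value
# in any of the id columns receive the same person id (pid); when a reservation links several
# existing pids, they are merged into the smallest pid via merge_pids below.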
|
||||||
|
|
||||||
|
for cid_cname in id_column_names:
|
||||||
|
self.cid_to_pid[cid_cname] = {}
|
||||||
|
|
||||||
|
for idx, reservation in data.iterrows():
|
||||||
|
pids = set()
|
||||||
|
for cid_cname in id_column_names:
|
||||||
|
if reservation[cid_cname] in self.cid_to_pid[cid_cname]:
|
||||||
|
pids.add(self.cid_to_pid[cid_cname][reservation[cid_cname]])
|
||||||
|
# print(cid_cname, reservation[cid_cname], self.cid_to_pid[cid_cname][reservation[cid_cname]])
|
||||||
|
|
||||||
|
if len(pids) > 0:
|
||||||
|
min_pid = min(pids)
|
||||||
|
|
||||||
|
self.set_pid(min_pid, reservation)
|
||||||
|
|
||||||
|
# Merge pids connected through this node
|
||||||
|
|
||||||
|
if len(pids) > 1:
|
||||||
|
pids.remove(min_pid)
|
||||||
|
self.merge_pids(pids, min_pid)
|
||||||
|
|
||||||
|
# print("Chosen pid: {}".format(min_pid))
|
||||||
|
else:
|
||||||
|
new_pid = self.next_available_pid
|
||||||
|
self.next_available_pid += 1
|
||||||
|
|
||||||
|
self.set_pid(new_pid, reservation)
|
||||||
|
# print("Chosen pid: {}".format(new_pid))
|
||||||
|
|
||||||
|
# print("=======")
|
||||||
|
# print(self.pid_to_cid)
|
||||||
|
# print("=======")
|
||||||
|
|
||||||
|
data_pid = data.copy()
|
||||||
|
data_pid.loc[:, pid_cname] = data_pid.loc[:, id_column_names[0]].apply(lambda x: self.cid_to_pid[id_column_names[0]][x])
|
||||||
|
self.data = data_pid
|
||||||
|
|
||||||
|
return data_pid
|
||||||
|
|
||||||
|
def set_pid(self, pid, reservation):
|
||||||
|
for cid_cname in self.id_column_names:
|
||||||
|
if reservation[cid_cname] != "":
|
||||||
|
self.cid_to_pid[cid_cname][reservation[cid_cname]] = pid
|
||||||
|
if pid in self.pid_to_cid:
|
||||||
|
for cid_cname in self.id_column_names:
|
||||||
|
self.pid_to_cid[pid][cid_cname] |= {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
|
||||||
|
else:
|
||||||
|
self.pid_to_cid[pid] = {cid_cname: {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
|
||||||
|
for cid_cname in self.id_column_names}
|
||||||
|
|
||||||
|
def merge_pids(self, pids_from, pid_to):
|
||||||
|
# print("Merge pids", pids_from, pid_to, self.pid_to_cid)
|
||||||
|
for pid_from in pids_from:
|
||||||
|
for cid_cname in self.id_column_names:
|
||||||
|
for cid in self.pid_to_cid[pid_from][cid_cname]:
|
||||||
|
self.cid_to_pid[cid_cname][cid] = pid_to
|
||||||
|
self.pid_to_cid[pid_to][cid_cname] |= self.pid_to_cid[pid_from][cid_cname]
|
||||||
|
self.pid_to_cid.pop(pid_from)
|
16
environment.yml
Normal file
@ -0,0 +1,16 @@
name: rs-class-env
channels:
  - defaults
dependencies:
  - pip=21.0.1
  - python=3.8.8
  - numpy==1.20.1
  - matplotlib==3.3.2
  - ipykernel==5.5.0
  - pandas==1.2.3
  - hyperopt==0.2.5
  - seaborn==0.11.1
  - pip:
    - sklearn==0.0
    - torch==1.8.0
    - livelossplot==0.5.4
0
evaluation_and_testing/__init__.py
Normal file
89
evaluation_and_testing/evaluation_measures.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
# Load libraries ---------------------------------------------
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def rmse(r_pred, r_real):
|
||||||
|
return np.sqrt(np.sum(np.power(r_pred - r_real, 2)) / len(r_pred))
|
||||||
|
|
||||||
|
|
||||||
|
def mape(r_pred, r_real):
|
||||||
|
return 1 / len(r_pred) * np.sum(np.abs(r_pred - r_real) / np.abs(r_real))
|
||||||
|
|
||||||
|
|
||||||
|
def tre(r_pred, r_real):
|
||||||
|
return np.sum(np.abs(r_pred - r_real)) / np.sum(np.abs(r_real))
|
||||||
|
|
||||||
|
|
||||||
|
def hr(recommendations, real_interactions, n=1):
|
||||||
|
"""
|
||||||
|
Assumes recommendations are ordered by user_id and then by score.
|
||||||
|
|
||||||
|
:param pd.DataFrame recommendations:
|
||||||
|
:param pd.DataFrame real_interactions:
|
||||||
|
:param int n:
|
||||||
|
"""
|
||||||
|
# Transform real_interactions to a dict for a large speed-up
|
||||||
|
rui = defaultdict(lambda: 0)
|
||||||
|
|
||||||
|
for idx, row in real_interactions.iterrows():
|
||||||
|
rui[(row['user_id'], row['item_id'])] = 1
|
||||||
|
|
||||||
|
result = 0.0
|
||||||
|
|
||||||
|
previous_user_id = -1
|
||||||
|
rank = 0
|
||||||
|
for idx, row in recommendations.iterrows():
|
||||||
|
if previous_user_id == row['user_id']:
|
||||||
|
rank += 1
|
||||||
|
else:
|
||||||
|
rank = 1
|
||||||
|
|
||||||
|
if rank <= n:
|
||||||
|
result += rui[(row['user_id'], row['item_id'])]
|
||||||
|
|
||||||
|
previous_user_id = row['user_id']
|
||||||
|
|
||||||
|
if len(recommendations['user_id'].unique()) > 0:
|
||||||
|
result /= len(recommendations['user_id'].unique())
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def ndcg(recommendations, real_interactions, n=1):
|
||||||
|
"""
|
||||||
|
Assumes recommendations are ordered by user_id and then by score.
|
||||||
|
|
||||||
|
:param pd.DataFrame recommendations:
|
||||||
|
:param pd.DataFrame real_interactions:
|
||||||
|
:param int n:
|
||||||
|
"""
|
||||||
|
# Transform real_interactions to a dict for a large speed-up
|
||||||
|
rui = defaultdict(lambda: 0)
|
||||||
|
|
||||||
|
for idx, row in real_interactions.iterrows():
|
||||||
|
rui[(row['user_id'], row['item_id'])] = 1
|
||||||
|
|
||||||
|
result = 0.0
|
||||||
|
|
||||||
|
previous_user_id = -1
|
||||||
|
rank = 0
|
||||||
|
for idx, row in recommendations.iterrows():
|
||||||
|
if previous_user_id == row['user_id']:
|
||||||
|
rank += 1
|
||||||
|
else:
|
||||||
|
rank = 1
|
||||||
|
|
||||||
|
if rank <= n:
|
||||||
|
result += rui[(row['user_id'], row['item_id'])] / np.log2(1 + rank)
|
||||||
|
|
||||||
|
previous_user_id = row['user_id']
|
||||||
|
|
||||||
|
if len(recommendations['user_id'].unique()) > 0:
|
||||||
|
result /= len(recommendations['user_id'].unique())
|
||||||
|
|
||||||
|
return result
|
209
evaluation_and_testing/testing.py
Normal file
@ -0,0 +1,209 @@
|
|||||||
|
# Load libraries ---------------------------------------------
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import KFold
|
||||||
|
|
||||||
|
from evaluation_and_testing.evaluation_measures import rmse
|
||||||
|
from evaluation_and_testing.evaluation_measures import mape
|
||||||
|
from evaluation_and_testing.evaluation_measures import tre
|
||||||
|
from evaluation_and_testing.evaluation_measures import hr
|
||||||
|
from evaluation_and_testing.evaluation_measures import ndcg
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_train_test_split_explicit(recommender, interactions_df, items_df, seed=6789):
|
||||||
|
rng = np.random.RandomState(seed=seed)
|
||||||
|
|
||||||
|
if isinstance(interactions_df, dict):
|
||||||
|
# If interactions_df is a dict with already split data, use the split
|
||||||
|
interactions_df_train = interactions_df['train']
|
||||||
|
interactions_df_test = interactions_df['test']
|
||||||
|
else:
|
||||||
|
# Otherwise split the dataset into train and test
|
||||||
|
|
||||||
|
shuffle = np.arange(len(interactions_df))
|
||||||
|
rng.shuffle(shuffle)
|
||||||
|
shuffle = list(shuffle)
|
||||||
|
|
||||||
|
train_test_split = 0.8
|
||||||
|
split_index = int(len(interactions_df) * train_test_split)
|
||||||
|
|
||||||
|
interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
|
||||||
|
interactions_df_test = interactions_df.iloc[shuffle[split_index:]]
|
||||||
|
|
||||||
|
# Train the recommender
|
||||||
|
|
||||||
|
recommender.fit(interactions_df_train, None, items_df)
|
||||||
|
|
||||||
|
# Gather predictions
|
||||||
|
|
||||||
|
r_pred = []
|
||||||
|
|
||||||
|
for idx, row in interactions_df_test.iterrows():
|
||||||
|
users_df = pd.DataFrame([row['user_id']], columns=['user_id'])
|
||||||
|
eval_items_df = pd.DataFrame([row['item_id']], columns=['item_id'])
|
||||||
|
eval_items_df = pd.merge(eval_items_df, items_df, on='item_id')
|
||||||
|
recommendations = recommender.recommend(users_df, eval_items_df, n_recommendations=1)
|
||||||
|
|
||||||
|
r_pred.append(recommendations.iloc[0]['score'])
|
||||||
|
|
||||||
|
# Gather real ratings
|
||||||
|
|
||||||
|
r_real = np.array(interactions_df_test['rating'].tolist())
|
||||||
|
|
||||||
|
# Return evaluation metrics
|
||||||
|
|
||||||
|
return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_train_test_split_implicit(recommender, interactions_df, items_df, seed=6789):
|
||||||
|
# Evaluate the recommender on an implicit-feedback train/test split (HR@n and NDCG@n)
|
||||||
|
rng = np.random.RandomState(seed=seed)
|
||||||
|
|
||||||
|
if isinstance(interactions_df, dict):
|
||||||
|
# If interactions_df is a dict with already split data, use the split
|
||||||
|
interactions_df_train = interactions_df['train']
|
||||||
|
interactions_df_test = interactions_df['test']
|
||||||
|
else:
|
||||||
|
# Otherwise split the dataset into train and test
|
||||||
|
|
||||||
|
shuffle = np.arange(len(interactions_df))
|
||||||
|
rng.shuffle(shuffle)
|
||||||
|
shuffle = list(shuffle)
|
||||||
|
|
||||||
|
train_test_split = 0.8
|
||||||
|
split_index = int(len(interactions_df) * train_test_split)
|
||||||
|
|
||||||
|
interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
|
||||||
|
interactions_df_test = interactions_df.iloc[shuffle[split_index:]]
|
||||||
|
|
||||||
|
hr_1 = []
|
||||||
|
hr_3 = []
|
||||||
|
hr_5 = []
|
||||||
|
hr_10 = []
|
||||||
|
ndcg_1 = []
|
||||||
|
ndcg_3 = []
|
||||||
|
ndcg_5 = []
|
||||||
|
ndcg_10 = []
|
||||||
|
|
||||||
|
# Train the recommender
|
||||||
|
|
||||||
|
recommender.fit(interactions_df_train, None, items_df)
|
||||||
|
|
||||||
|
# Make recommendations for each user in the test set and calculate the metric
|
||||||
|
# against all items of that user in the test set
|
||||||
|
|
||||||
|
test_user_interactions = interactions_df_test.groupby(by='user_id')
|
||||||
|
|
||||||
|
for user_id, user_interactions in test_user_interactions:
|
||||||
|
|
||||||
|
recommendations = recommender.recommend(pd.DataFrame([user_id], columns=['user_id']),
|
||||||
|
items_df, n_recommendations=10)
|
||||||
|
|
||||||
|
hr_1.append(hr(recommendations, user_interactions, n=1))
|
||||||
|
hr_3.append(hr(recommendations, user_interactions, n=3))
|
||||||
|
hr_5.append(hr(recommendations, user_interactions, n=5))
|
||||||
|
hr_10.append(hr(recommendations, user_interactions, n=10))
|
||||||
|
ndcg_1.append(ndcg(recommendations, user_interactions, n=1))
|
||||||
|
ndcg_3.append(ndcg(recommendations, user_interactions, n=3))
|
||||||
|
ndcg_5.append(ndcg(recommendations, user_interactions, n=5))
|
||||||
|
ndcg_10.append(ndcg(recommendations, user_interactions, n=10))
|
||||||
|
|
||||||
|
hr_1 = np.mean(hr_1)
|
||||||
|
hr_3 = np.mean(hr_3)
|
||||||
|
hr_5 = np.mean(hr_5)
|
||||||
|
hr_10 = np.mean(hr_10)
|
||||||
|
ndcg_1 = np.mean(ndcg_1)
|
||||||
|
ndcg_3 = np.mean(ndcg_3)
|
||||||
|
ndcg_5 = np.mean(ndcg_5)
|
||||||
|
ndcg_10 = np.mean(ndcg_10)
|
||||||
|
|
||||||
|
return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_leave_one_out_explicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
|
||||||
|
rng = np.random.RandomState(seed=seed)
|
||||||
|
|
||||||
|
# Prepare splits of the datasets
|
||||||
|
kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)
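# Added comment: with n_splits equal to the number of interactions, each fold holds out exactly
# one interaction, i.e. this is leave-one-out evaluation (capped at max_evals folds below).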
|
||||||
|
|
||||||
|
# For each split of the dataset train the recommender, generate recommendations and evaluate
|
||||||
|
|
||||||
|
r_pred = []
|
||||||
|
r_real = []
|
||||||
|
n_eval = 1
|
||||||
|
for train_index, test_index in kf.split(interactions_df.index):
|
||||||
|
interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
|
||||||
|
interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]
|
||||||
|
|
||||||
|
recommender.fit(interactions_df_train, None, items_df)
|
||||||
|
recommendations = recommender.recommend(
|
||||||
|
interactions_df_test.loc[:, ['user_id']],
|
||||||
|
items_df.loc[items_df['item_id'] == interactions_df_test.iloc[0]['item_id']])
|
||||||
|
|
||||||
|
r_pred.append(recommendations.iloc[0]['score'])
|
||||||
|
r_real.append(interactions_df_test.iloc[0]['rating'])
|
||||||
|
|
||||||
|
if n_eval == max_evals:
|
||||||
|
break
|
||||||
|
n_eval += 1
|
||||||
|
|
||||||
|
r_pred = np.array(r_pred)
|
||||||
|
r_real = np.array(r_real)
|
||||||
|
|
||||||
|
# Return evaluation metrics
|
||||||
|
|
||||||
|
return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_leave_one_out_implicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
|
||||||
|
rng = np.random.RandomState(seed=seed)
|
||||||
|
|
||||||
|
# Prepare splits of the datasets
|
||||||
|
kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)
|
||||||
|
|
||||||
|
hr_1 = []
|
||||||
|
hr_3 = []
|
||||||
|
hr_5 = []
|
||||||
|
hr_10 = []
|
||||||
|
ndcg_1 = []
|
||||||
|
ndcg_3 = []
|
||||||
|
ndcg_5 = []
|
||||||
|
ndcg_10 = []
|
||||||
|
|
||||||
|
# For each split of the dataset train the recommender, generate recommendations and evaluate
|
||||||
|
|
||||||
|
n_eval = 1
|
||||||
|
for train_index, test_index in kf.split(interactions_df.index):
|
||||||
|
interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
|
||||||
|
interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]
|
||||||
|
|
||||||
|
recommender.fit(interactions_df_train, None, items_df)
|
||||||
|
recommendations = recommender.recommend(
|
||||||
|
interactions_df_test.loc[:, ['user_id']], items_df, n_recommendations=10)
|
||||||
|
|
||||||
|
hr_1.append(hr(recommendations, interactions_df_test, n=1))
|
||||||
|
hr_3.append(hr(recommendations, interactions_df_test, n=3))
|
||||||
|
hr_5.append(hr(recommendations, interactions_df_test, n=5))
|
||||||
|
hr_10.append(hr(recommendations, interactions_df_test, n=10))
|
||||||
|
ndcg_1.append(ndcg(recommendations, interactions_df_test, n=1))
|
||||||
|
ndcg_3.append(ndcg(recommendations, interactions_df_test, n=3))
|
||||||
|
ndcg_5.append(ndcg(recommendations, interactions_df_test, n=5))
|
||||||
|
ndcg_10.append(ndcg(recommendations, interactions_df_test, n=10))
|
||||||
|
|
||||||
|
if n_eval == max_evals:
|
||||||
|
break
|
||||||
|
n_eval += 1
|
||||||
|
|
||||||
|
hr_1 = np.mean(hr_1)
|
||||||
|
hr_3 = np.mean(hr_3)
|
||||||
|
hr_5 = np.mean(hr_5)
|
||||||
|
hr_10 = np.mean(hr_10)
|
||||||
|
ndcg_1 = np.mean(ndcg_1)
|
||||||
|
ndcg_3 = np.mean(ndcg_3)
|
||||||
|
ndcg_5 = np.mean(ndcg_5)
|
||||||
|
ndcg_10 = np.mean(ndcg_10)
|
||||||
|
|
||||||
|
return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10
|
14586
project_1_data_preparation.html
Normal file
File diff suppressed because one or more lines are too long
2186
project_1_data_preparation.ipynb
Normal file
File diff suppressed because it is too large
1873
project_2_recommender_and_evaluation-0_116.ipynb
Normal file
File diff suppressed because one or more lines are too long
1687
project_2_recommender_and_evaluation-Copy1.ipynb
Normal file
File diff suppressed because it is too large
1979
project_2_recommender_and_evaluation-Copy2.ipynb
Normal file
File diff suppressed because one or more lines are too long
1890
project_2_recommender_and_evaluation.ipynb
Normal file
File diff suppressed because one or more lines are too long
0
recommenders/__init__.py
Normal file
231
recommenders/amazon_recommender.py
Normal file
@ -0,0 +1,231 @@
|
|||||||
|
# Load libraries ---------------------------------------------
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import scipy.special as scisp
|
||||||
|
|
||||||
|
from recommenders.recommender import Recommender
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class AmazonRecommender(Recommender):
|
||||||
|
"""
|
||||||
|
Basic item-to-item collaborative filtering algorithm used at Amazon.com, as described in:
- Linden G., Smith B., York J., Amazon.com Recommendations: Item-to-Item Collaborative Filtering, IEEE Internet Computing, 2003,
- Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||||
|
self.interactions_df = None
|
||||||
|
self.item_id_mapping = None
|
||||||
|
self.user_id_mapping = None
|
||||||
|
self.item_id_reverse_mapping = None
|
||||||
|
self.user_id_reverse_mapping = None
|
||||||
|
self.e_xy = None
|
||||||
|
self.n_xy = None
|
||||||
|
self.scores = None
|
||||||
|
self.most_popular_items = None
|
||||||
|
self.should_recommend_already_bought = False
|
||||||
|
|
||||||
|
def initialize(self, **params):
|
||||||
|
if 'should_recommend_already_bought' in params:
|
||||||
|
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||||
|
|
||||||
|
def fit(self, interactions_df, users_df, items_df):
|
||||||
|
"""
|
||||||
|
Training of the recommender.
|
||||||
|
|
||||||
|
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||||
|
defined by user_id, item_id and features of the interaction.
|
||||||
|
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||||
|
user_id and the user feature columns.
|
||||||
|
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||||
|
by item_id and the item feature columns.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Shift item ids and user ids so that they are consecutive
|
||||||
|
|
||||||
|
unique_item_ids = interactions_df['item_id'].unique()
|
||||||
|
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||||
|
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||||
|
unique_user_ids = interactions_df['user_id'].unique()
|
||||||
|
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||||
|
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||||
|
|
||||||
|
interactions_df = interactions_df.copy()
|
||||||
|
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||||
|
|
||||||
|
# Get the number of items and users
|
||||||
|
|
||||||
|
self.interactions_df = interactions_df
|
||||||
|
n_items = np.max(interactions_df['item_id']) + 1
|
||||||
|
n_users = np.max(interactions_df['user_id']) + 1
|
||||||
|
|
||||||
|
# Get maximal number of interactions
|
||||||
|
|
||||||
|
n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
|
||||||
|
# Unnecessary, but added for readability
|
||||||
|
n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
|
||||||
|
max_interactions = n_user_interactions['n_items'].max()
|
||||||
|
|
||||||
|
# Calculate P_Y's
|
||||||
|
|
||||||
|
n_interactions = len(interactions_df)
|
||||||
|
p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
|
||||||
|
p_y = p_y.rename(columns={'user_id': 'P_Y'})
|
||||||
|
p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
|
||||||
|
p_y = dict(zip(p_y['item_id'], p_y['P_Y']))
|
||||||
|
|
||||||
|
# Get the series of all items
|
||||||
|
|
||||||
|
# items = list(range(n_items))
|
||||||
|
items = interactions_df['item_id'].unique()
|
||||||
|
|
||||||
|
# For every X calculate the E[Y|X]
|
||||||
|
|
||||||
|
e_xy = np.zeros(shape=(n_items, n_items))
|
||||||
|
e_xy[:][:] = -1e100
|
||||||
|
|
||||||
|
p_y_powers = {}
|
||||||
|
for y in items:
|
||||||
|
p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
|
||||||
|
|
||||||
|
# In the next version calculate all alpha_k first (this works well with parallelization)
|
||||||
|
|
||||||
|
for x in items:
|
||||||
|
# Get users who bought X
|
||||||
|
c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()
|
||||||
|
|
||||||
|
# Get users who bought only X
|
||||||
|
c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
|
||||||
|
c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))
|
||||||
|
|
||||||
|
# Calculate the number of non-X interactions for each user who bought X
|
||||||
|
# Include users with zero non-X interactions
|
||||||
|
n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
|
||||||
|
n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
|
||||||
|
# Unnecessary, but added for readability
|
||||||
|
n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})
|
||||||
|
|
||||||
|
zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x) # Remove
|
||||||
|
n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
|
||||||
|
|
||||||
|
n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]
|
||||||
|
|
||||||
|
# Calculate the expected numbers of Y products bought by clients who bought X
|
||||||
|
alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
|
||||||
|
for abs_c in n_non_x_interactions["n_items"]])
|
||||||
|
for k in range(1, max_interactions + 1)])
|
||||||
|
|
||||||
|
for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y
|
||||||
|
if y != x:
|
||||||
|
e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
|
||||||
|
else:
|
||||||
|
e_xy[x][y] = n_users * p_y[x]
|
||||||
|
|
||||||
|
self.e_xy = e_xy
|
||||||
|
|
||||||
|
# Calculate the number of users who bought both X and Y
|
||||||
|
|
||||||
|
# Simple and slow method (commented out)
|
||||||
|
|
||||||
|
# n_xy = np.zeros(shape=(n_items, n_items))
|
||||||
|
|
||||||
|
# for x in items:
|
||||||
|
# for y in items:
|
||||||
|
# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
|
||||||
|
# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
|
||||||
|
# users_x_and_y = users_x & users_y
|
||||||
|
# n_xy[x][y] = len(users_x_and_y)
|
||||||
|
|
||||||
|
# Optimized method (can be further optimized by using sparse matrices)
|
||||||
|
|
||||||
|
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||||
|
r = np.zeros(shape=(n_users, n_items))
|
||||||
|
for idx, interaction in interactions_df.iterrows():
|
||||||
|
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||||
|
|
||||||
|
# Get the number of users who bought both X and Y
|
||||||
|
|
||||||
|
n_xy = np.matmul(r.T, r)
|
||||||
|
|
||||||
|
self.n_xy = n_xy
|
||||||
|
|
||||||
|
self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)
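# Added comment: this is a standardized deviation between the observed and expected number of
# clients who bought both X and Y, (N_XY - E[N_XY]) / sqrt(E[N_XY]) - a chi-square-like score;
# larger values mean Y co-occurs with X more often than independence would predict.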
|
||||||
|
|
||||||
|
# Find the most popular items for the cold start problem
|
||||||
|
|
||||||
|
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||||
|
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||||
|
self.most_popular_items = offers_count.index
|
||||||
|
|
||||||
|
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||||
|
"""
|
||||||
|
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||||
|
top n_recommendations for each user.
|
||||||
|
|
||||||
|
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||||
|
recommendations should be generated.
|
||||||
|
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||||
|
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||||
|
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||||
|
for each user.
|
||||||
|
:rtype: pd.DataFrame
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Clean previous recommendations (iloc could be used alternatively)
|
||||||
|
self.recommender_df = self.recommender_df[:0]
|
||||||
|
|
||||||
|
# Handle users not in the training data
|
||||||
|
|
||||||
|
# Map item ids
|
||||||
|
|
||||||
|
items_df = items_df.copy()
|
||||||
|
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||||
|
|
||||||
|
# Generate recommendations
|
||||||
|
|
||||||
|
for idx, user in users_df.iterrows():
|
||||||
|
recommendations = []
|
||||||
|
|
||||||
|
user_id = user['user_id']
|
||||||
|
|
||||||
|
if user_id in self.user_id_mapping:
|
||||||
|
mapped_user_id = self.user_id_mapping[user_id]
|
||||||
|
|
||||||
|
x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||||
|
final_scores = np.sum(self.scores[x_list], axis=0)
|
||||||
|
|
||||||
|
# Choose n recommendations based on highest scores
|
||||||
|
if not self.should_recommend_already_bought:
|
||||||
|
final_scores[x_list] = -1e100
|
||||||
|
|
||||||
|
chosen_ids = np.argsort(-final_scores)[:n_recommendations]
|
||||||
|
|
||||||
|
for item_id in chosen_ids:
|
||||||
|
recommendations.append(
|
||||||
|
{
|
||||||
|
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||||
|
'item_id': self.item_id_reverse_mapping[item_id],
|
||||||
|
'score': final_scores[item_id]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else: # For new users recommend most popular items
|
||||||
|
for i in range(n_recommendations):
|
||||||
|
recommendations.append(
|
||||||
|
{
|
||||||
|
'user_id': user['user_id'],
|
||||||
|
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||||
|
'score': 1.0
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
user_recommendations = pd.DataFrame(recommendations)
|
||||||
|
|
||||||
|
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||||
|
|
||||||
|
return self.recommender_df
|
233
recommenders/nearest_neighbors_recommender.py
Normal file
@ -0,0 +1,233 @@
|
|||||||
|
# Load libraries ---------------------------------------------
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from recommenders.recommender import Recommender
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class NearestNeighborsRecommender(Recommender):
|
||||||
|
"""
|
||||||
|
Nearest neighbors recommender supporting user-based or item-based collaborative filtering.
|
||||||
|
|
||||||
|
Possible similarity measures:
|
||||||
|
- 'cosine',
|
||||||
|
- 'pearson'.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||||
|
self.interactions_df = None
|
||||||
|
self.item_id_mapping = None
|
||||||
|
self.user_id_mapping = None
|
||||||
|
self.item_id_reverse_mapping = None
|
||||||
|
self.user_id_reverse_mapping = None
|
||||||
|
self.r = None
|
||||||
|
self.similarities = None
|
||||||
|
self.most_popular_items = None
|
||||||
|
|
||||||
|
self.collaboration_type = 'user'
|
||||||
|
self.similarity_measure = 'cosine'
|
||||||
|
self.n_neighbors = 10
|
||||||
|
self.should_recommend_already_bought = False
|
||||||
|
|
||||||
|
def initialize(self, **params):
|
||||||
|
if 'n_neighbors' in params:
|
||||||
|
self.n_neighbors = params['n_neighbors']
|
||||||
|
if 'should_recommend_already_bought' in params:
|
||||||
|
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||||
|
|
||||||
|
def fit(self, interactions_df, users_df, items_df):
|
||||||
|
"""
|
||||||
|
Training of the recommender.
|
||||||
|
|
||||||
|
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||||
|
defined by user_id, item_id and features of the interaction.
|
||||||
|
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||||
|
user_id and the user feature columns.
|
||||||
|
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||||
|
by item_id and the item feature columns.
|
||||||
|
"""
|
||||||
|
|
||||||
|
del users_df, items_df
|
||||||
|
|
||||||
|
# Shift item ids and user ids so that they are consecutive
|
||||||
|
|
||||||
|
unique_item_ids = interactions_df['item_id'].unique()
|
||||||
|
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||||
|
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||||
|
unique_user_ids = interactions_df['user_id'].unique()
|
||||||
|
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||||
|
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||||
|
|
||||||
|
interactions_df = interactions_df.copy()
|
||||||
|
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||||
|
|
||||||
|
# Get the number of items and users
|
||||||
|
|
||||||
|
self.interactions_df = interactions_df
|
||||||
|
n_items = np.max(interactions_df['item_id']) + 1
|
||||||
|
n_users = np.max(interactions_df['user_id']) + 1
|
||||||
|
|
||||||
|
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||||
|
r = np.zeros(shape=(n_users, n_items))
|
||||||
|
for idx, interaction in interactions_df.iterrows():
|
||||||
|
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||||
|
|
||||||
|
if self.collaboration_type == 'item':
|
||||||
|
r = r.T
|
||||||
|
|
||||||
|
self.r = r
|
||||||
|
|
||||||
|
# Calculate all similarities
|
||||||
|
|
||||||
|
similarities = None
|
||||||
|
if self.similarity_measure == 'cosine':
|
||||||
|
n_uv = np.matmul(r, r.T)
|
||||||
|
norms = np.sqrt(np.diag(n_uv))
|
||||||
|
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
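# Added comment: n_uv holds dot products of the 0/1 interaction vectors and its diagonal holds
# their squared norms, so this computes plain cosine similarity between rows of r
# (users, or items when collaboration_type == 'item').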
|
||||||
|
elif self.similarity_measure == 'pearson':
|
||||||
|
r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)
|
||||||
|
n_uv = np.matmul(r_shifted, r_shifted.T)
|
||||||
|
norms = np.sqrt(np.diag(n_uv))
|
||||||
|
norms[norms == 0] = 0.000001
|
||||||
|
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
|
||||||
|
|
||||||
|
np.fill_diagonal(similarities, -1000)
|
||||||
|
|
||||||
|
self.similarities = similarities
|
||||||
|
|
||||||
|
# Find the most popular items for the cold start problem
|
||||||
|
|
||||||
|
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||||
|
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||||
|
self.most_popular_items = offers_count.index
|
||||||
|
|
||||||
|
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||||
|
"""
|
||||||
|
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||||
|
top n_recommendations for each user.
|
||||||
|
|
||||||
|
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||||
|
recommendations should be generated.
|
||||||
|
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||||
|
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||||
|
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||||
|
for each user.
|
||||||
|
:rtype: pd.DataFrame
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Clean previous recommendations (iloc could be used alternatively)
|
||||||
|
self.recommender_df = self.recommender_df[:0]
|
||||||
|
|
||||||
|
# Handle users not in the training data
|
||||||
|
|
||||||
|
# Map item ids
|
||||||
|
|
||||||
|
items_df = items_df.copy()
|
||||||
|
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
|
||||||
|
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||||
|
|
||||||
|
# Generate recommendations
|
||||||
|
|
||||||
|
for idx, user in users_df.iterrows():
|
||||||
|
recommendations = []
|
||||||
|
|
||||||
|
user_id = user['user_id']
|
||||||
|
|
||||||
|
if user_id in self.user_id_mapping:
|
||||||
|
chosen_ids = []
|
||||||
|
scores = []
|
||||||
|
mapped_user_id = self.user_id_mapping[user_id]
|
||||||
|
|
||||||
|
if self.collaboration_type == 'user':
|
||||||
|
neighbor_ids = np.argsort(-self.similarities[mapped_user_id])[:self.n_neighbors]
|
||||||
|
user_similarities = self.similarities[mapped_user_id][neighbor_ids]
|
||||||
|
|
||||||
|
item_ids = items_df['item_id'].tolist()
|
||||||
|
|
||||||
|
v_i = self.r[neighbor_ids][:, item_ids]
|
||||||
|
|
||||||
|
scores = np.matmul(user_similarities, v_i) / np.sum(user_similarities)
|
||||||
|
|
||||||
|
# Choose n recommendations based on highest scores
|
||||||
|
if not self.should_recommend_already_bought:
|
||||||
|
x_list = self.interactions_df.loc[
|
||||||
|
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||||
|
scores[x_list] = -1e100
|
||||||
|
|
||||||
|
chosen_ids = np.argsort(-scores)[:n_recommendations]
|
||||||
|
|
||||||
|
elif self.collaboration_type == 'item':
|
||||||
|
x_list = self.interactions_df.loc[
|
||||||
|
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||||
|
scores = np.sum(self.similarities[x_list], axis=0)
|
||||||
|
|
||||||
|
# Choose n recommendations based on highest scores
|
||||||
|
if not self.should_recommend_already_bought:
|
||||||
|
scores[x_list] = -1e100
|
||||||
|
|
||||||
|
chosen_ids = np.argsort(-scores)[:n_recommendations]
|
||||||
|
|
||||||
|
for item_id in chosen_ids:
|
||||||
|
recommendations.append(
|
||||||
|
{
|
||||||
|
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||||
|
'item_id': self.item_id_reverse_mapping[item_id],
|
||||||
|
'score': scores[item_id]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else: # For new users recommend most popular items
|
||||||
|
for i in range(n_recommendations):
|
||||||
|
recommendations.append(
|
||||||
|
{
|
||||||
|
'user_id': user['user_id'],
|
||||||
|
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||||
|
'score': 1.0
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
user_recommendations = pd.DataFrame(recommendations)
|
||||||
|
|
||||||
|
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||||
|
|
||||||
|
return self.recommender_df
|
||||||
|
|
||||||
|
|
||||||
|
class UserBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.collaboration_type = 'user'
|
||||||
|
self.similarity_measure = 'cosine'
|
||||||
|
|
||||||
|
|
||||||
|
class UserBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.collaboration_type = 'user'
|
||||||
|
self.similarity_measure = 'pearson'
|
||||||
|
|
||||||
|
|
||||||
|
class ItemBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.collaboration_type = 'item'
|
||||||
|
self.similarity_measure = 'cosine'
|
||||||
|
|
||||||
|
|
||||||
|
class ItemBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.collaboration_type = 'item'
|
||||||
|
self.similarity_measure = 'pearson'
|
305
recommenders/netflix_recommender.py
Normal file
@ -0,0 +1,305 @@
|
|||||||
|
# Load libraries ---------------------------------------------
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import scipy.special as scisp
|
||||||
|
from livelossplot import PlotLosses
|
||||||
|
from collections import defaultdict, deque
|
||||||
|
|
||||||
|
from recommenders.recommender import Recommender
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class NetflixRecommender(Recommender):
|
||||||
|
"""
|
||||||
|
Collaborative filtering based on matrix factorization, with one of the following optimizers:
|
||||||
|
- Stochastic Gradient Descent (SGD),
|
||||||
|
- Mini-Batch Gradient Descent (MBGD),
|
||||||
|
- Alternating Least Squares (ALS).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):
|
||||||
|
super().__init__()
|
||||||
|
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||||
|
self.interactions_df = None
|
||||||
|
self.item_id_mapping = None
|
||||||
|
self.user_id_mapping = None
|
||||||
|
self.item_id_reverse_mapping = None
|
||||||
|
self.user_id_reverse_mapping = None
|
||||||
|
self.r = None
|
||||||
|
self.most_popular_items = None
|
||||||
|
|
||||||
|
self.n_neg_per_pos = n_neg_per_pos
|
||||||
|
if 'optimizer' in params:
|
||||||
|
self.optimizer = params['optimizer']
|
||||||
|
else:
|
||||||
|
self.optimizer = 'SGD'
|
||||||
|
if 'n_epochs' in params: # number of epochs (each epoch goes through the entire training set)
|
||||||
|
self.n_epochs = params['n_epochs']
|
||||||
|
else:
|
||||||
|
self.n_epochs = 10
|
||||||
|
if 'lr' in params: # learning rate
|
||||||
|
self.lr = params['lr']
|
||||||
|
else:
|
||||||
|
self.lr = 0.01
|
||||||
|
if 'reg_l' in params: # regularization coefficient
|
||||||
|
self.reg_l = params['reg_l']
|
||||||
|
else:
|
||||||
|
self.reg_l = 0.1
|
||||||
|
if 'embedding_dim' in params:
|
||||||
|
self.embedding_dim = params['embedding_dim']
|
||||||
|
else:
|
||||||
|
self.embedding_dim = 8
|
||||||
|
|
||||||
|
self.user_repr = None
|
||||||
|
self.item_repr = None
|
||||||
|
|
||||||
|
if 'should_recommend_already_bought' in params:
|
||||||
|
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||||
|
else:
|
||||||
|
self.should_recommend_already_bought = False
|
||||||
|
|
||||||
|
self.validation_set_size = 0.2
|
||||||
|
|
||||||
|
self.seed = seed
|
||||||
|
self.rng = np.random.RandomState(seed=seed)
|
||||||
|
|
||||||
|
self.print_type = print_type
|
||||||
|
|
||||||
|
def fit(self, interactions_df, users_df, items_df):
|
||||||
|
"""
|
||||||
|
Training of the recommender.
|
||||||
|
|
||||||
|
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||||
|
defined by user_id, item_id and features of the interaction.
|
||||||
|
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||||
|
user_id and the user feature columns.
|
||||||
|
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||||
|
by item_id and the item feature columns.
|
||||||
|
"""
|
||||||
|
|
||||||
|
del users_df, items_df
|
||||||
|
|
||||||
|
# Shift item ids and user ids so that they are consecutive
|
||||||
|
|
||||||
|
unique_item_ids = interactions_df['item_id'].unique()
|
||||||
|
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||||
|
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||||
|
unique_user_ids = interactions_df['user_id'].unique()
|
||||||
|
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||||
|
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||||
|
|
||||||
|
interactions_df = interactions_df.copy()
|
||||||
|
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||||
|
|
||||||
|
# Get the number of items and users
|
||||||
|
|
||||||
|
self.interactions_df = interactions_df
|
||||||
|
n_users = np.max(interactions_df['user_id']) + 1
|
||||||
|
n_items = np.max(interactions_df['item_id']) + 1
|
||||||
|
|
||||||
|
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||||
|
r = np.zeros(shape=(n_users, n_items))
|
||||||
|
for idx, interaction in interactions_df.iterrows():
|
||||||
|
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||||
|
|
||||||
|
self.r = r
|
||||||
|
|
||||||
|
# Generate negative interactions
|
||||||
|
negative_interactions = []
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
while i < self.n_neg_per_pos * len(interactions_df):
|
||||||
|
sample_size = 1000
|
||||||
|
user_ids = self.rng.choice(np.arange(n_users), size=sample_size)
|
||||||
|
item_ids = self.rng.choice(np.arange(n_items), size=sample_size)
|
||||||
|
|
||||||
|
j = 0
|
||||||
|
while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):
|
||||||
|
if r[user_ids[j]][item_ids[j]] == 0:
|
||||||
|
negative_interactions.append([user_ids[j], item_ids[j], 0])
|
||||||
|
i += 1
|
||||||
|
j += 1
|
||||||
|
|
||||||
|
interactions_df = pd.concat(
|
||||||
|
[interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])
|
||||||
|
|
||||||
|
# Initialize user and item embeddings as random vectors (from Gaussian distribution)
|
||||||
|
|
||||||
|
self.user_repr = self.rng.normal(0, 1, size=(r.shape[0], self.embedding_dim))
|
||||||
|
self.item_repr = self.rng.normal(0, 1, size=(r.shape[1], self.embedding_dim))
|
||||||
|
|
||||||
|
# Initialize losses and loss visualization
|
||||||
|
|
||||||
|
if self.print_type is not None and self.print_type == 'live':
|
||||||
|
liveloss = PlotLosses()
|
||||||
|
|
||||||
|
training_losses = deque(maxlen=50)
|
||||||
|
training_avg_losses = []
|
||||||
|
training_epoch_losses = []
|
||||||
|
validation_losses = deque(maxlen=50)
|
||||||
|
validation_avg_losses = []
|
||||||
|
validation_epoch_losses = []
|
||||||
|
last_training_total_loss = 0.0
|
||||||
|
last_validation_total_loss = 0.0
|
||||||
|
|
||||||
|
# Split the data
|
||||||
|
|
||||||
|
interaction_ids = self.rng.permutation(len(interactions_df))
|
||||||
|
train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))
|
||||||
|
training_ids = interaction_ids[:train_validation_slice_idx]
|
||||||
|
validation_ids = interaction_ids[train_validation_slice_idx:]
|
||||||
|
|
||||||
|
# Train the model
|
||||||
|
|
||||||
|
for epoch in range(self.n_epochs):
|
||||||
|
if self.print_type is not None and self.print_type == 'live':
|
||||||
|
logs = {}
|
||||||
|
|
||||||
|
# Train
|
||||||
|
|
||||||
|
training_losses.clear()
|
||||||
|
training_total_loss = 0.0
|
||||||
|
batch_idx = 0
|
||||||
|
for idx in training_ids:
|
||||||
|
user_id = int(interactions_df.iloc[idx]['user_id'])
|
||||||
|
item_id = int(interactions_df.iloc[idx]['item_id'])
|
||||||
|
|
||||||
|
e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
|
||||||
|
self.user_repr[user_id] = self.user_repr[user_id] \
|
||||||
|
+ self.lr * (e_ui * self.item_repr[item_id] - self.reg_l * self.user_repr[user_id])
|
||||||
|
self.item_repr[item_id] = self.item_repr[item_id] \
|
||||||
|
+ self.lr * (e_ui * self.user_repr[user_id] - self.reg_l * self.item_repr[item_id])
|
||||||
|
|
||||||
|
loss = e_ui**2
|
||||||
|
training_total_loss += loss
|
||||||
|
|
||||||
|
if self.print_type is not None and self.print_type == 'text':
|
||||||
|
print("\rEpoch: {}\tBatch: {}\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}".format(
|
||||||
|
epoch, batch_idx, last_training_total_loss, last_validation_total_loss, loss), end="")
|
||||||
|
|
||||||
|
batch_idx += 1
|
||||||
|
|
||||||
|
training_losses.append(loss)
|
||||||
|
training_avg_losses.append(np.mean(training_losses))
|
||||||
|
|
||||||
|
# Validate
|
||||||
|
|
||||||
|
validation_losses.clear()
|
||||||
|
validation_total_loss = 0.0
|
||||||
|
for idx in validation_ids:
|
||||||
|
user_id = int(interactions_df.iloc[idx]['user_id'])
|
||||||
|
item_id = int(interactions_df.iloc[idx]['item_id'])
|
||||||
|
|
||||||
|
e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
|
||||||
|
|
||||||
|
loss = e_ui**2
|
||||||
|
validation_total_loss += loss
|
||||||
|
|
||||||
|
validation_losses.append(loss)
|
||||||
|
validation_avg_losses.append(np.mean(validation_losses))
|
||||||
|
|
||||||
|
# Save and print epoch losses
|
||||||
|
|
||||||
|
training_last_avg_loss = training_total_loss / len(training_ids)
|
||||||
|
training_epoch_losses.append(training_last_avg_loss)
|
||||||
|
validation_last_avg_loss = validation_total_loss / len(validation_ids)
|
||||||
|
validation_epoch_losses.append(validation_last_avg_loss)
|
||||||
|
|
||||||
|
if self.print_type is not None and self.print_type == 'live' and epoch >= 3:
|
||||||
|
# A bound on epoch prevents showing extremely high losses in the first epochs
|
||||||
|
# noinspection PyUnboundLocalVariable
|
||||||
|
logs['loss'] = training_last_avg_loss
|
||||||
|
logs['val_loss'] = validation_last_avg_loss
|
||||||
|
# noinspection PyUnboundLocalVariable
|
||||||
|
liveloss.update(logs)
|
||||||
|
liveloss.send()
|
||||||
|
|
||||||
|
# Find the most popular items for the cold start problem
|
||||||
|
|
||||||
|
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||||
|
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||||
|
self.most_popular_items = offers_count.index
|
||||||
|
|
||||||
|
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||||
|
"""
|
||||||
|
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||||
|
top n_recommendations for each user.
|
||||||
|
|
||||||
|
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||||
|
recommendations should be generated.
|
||||||
|
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||||
|
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||||
|
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||||
|
for each user.
|
||||||
|
:rtype: pd.DataFrame
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Clean previous recommendations (iloc could be used alternatively)
|
||||||
|
self.recommender_df = self.recommender_df[:0]
|
||||||
|
|
||||||
|
# Handle users not in the training data
|
||||||
|
|
||||||
|
# Map item ids
|
||||||
|
|
||||||
|
items_df = items_df.copy()
|
||||||
|
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
|
||||||
|
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||||
|
|
||||||
|
# Generate recommendations
|
||||||
|
|
||||||
|
for idx, user in users_df.iterrows():
|
||||||
|
recommendations = []
|
||||||
|
|
||||||
|
user_id = user['user_id']
|
||||||
|
|
||||||
|
if user_id in self.user_id_mapping:
|
||||||
|
mapped_user_id = self.user_id_mapping[user_id]
|
||||||
|
|
||||||
|
ids_list = items_df['item_id'].tolist()
|
||||||
|
id_to_pos = np.array([0]*len(ids_list))
|
||||||
|
for k in range(len(ids_list)):
|
||||||
|
id_to_pos[ids_list[k]] = k
|
||||||
|
scores = np.matmul(self.user_repr[mapped_user_id].reshape(1, -1),
|
||||||
|
self.item_repr[ids_list].T).flatten()
|
||||||
|
|
||||||
|
# Choose n recommendations based on highest scores
|
||||||
|
if not self.should_recommend_already_bought:
|
||||||
|
x_list = self.interactions_df.loc[
|
||||||
|
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||||
|
scores[id_to_pos[x_list]] = -1e100
|
||||||
|
|
||||||
|
chosen_pos = np.argsort(-scores)[:n_recommendations]
|
||||||
|
|
||||||
|
for item_pos in chosen_pos:
|
||||||
|
recommendations.append(
|
||||||
|
{
|
||||||
|
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||||
|
'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],
|
||||||
|
'score': scores[item_pos]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else: # For new users recommend most popular items
|
||||||
|
for i in range(n_recommendations):
|
||||||
|
recommendations.append(
|
||||||
|
{
|
||||||
|
'user_id': user['user_id'],
|
||||||
|
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||||
|
'score': 1.0
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
user_recommendations = pd.DataFrame(recommendations)
|
||||||
|
|
||||||
|
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||||
|
|
||||||
|
return self.recommender_df
|
||||||
|
|
||||||
|
def get_user_repr(self, user_id):
|
||||||
|
mapped_user_id = self.user_id_mapping[user_id]
|
||||||
|
return self.user_repr[mapped_user_id]
|
||||||
|
|
||||||
|
def get_item_repr(self, item_id):
|
||||||
|
mapped_item_id = self.item_id_mapping[item_id]
|
||||||
|
return self.item_repr[mapped_item_id]
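The per-interaction update in fit() above is the standard regularized matrix-factorization step: the prediction for (user, item) is the dot product of their embeddings, the error is e_ui = r_ui − u_u·v_i, and each embedding moves by lr · (e_ui · other_vector − reg_l · own_vector). A hypothetical training run is sketched below (illustrative only; it reuses the toy DataFrames from the sketch after the nearest-neighbours classes and the constructor defaults shown in __init__).

```python
# Illustrative sketch - hyperparameters passed through **params as in __init__ above
import numpy as np

netflix_recommender = NetflixRecommender(seed=6789, n_neg_per_pos=5, print_type=None,
                                          n_epochs=10, lr=0.01, reg_l=0.1, embedding_dim=8)
netflix_recommender.fit(interactions_df, users_df, items_df)
recs = netflix_recommender.recommend(users_df, items_df, n_recommendations=2)

# A (user, item) score is simply the dot product of the learned embeddings
u = netflix_recommender.get_user_repr(users_df['user_id'].iloc[0])
v = netflix_recommender.get_item_repr(items_df['item_id'].iloc[0])
print(np.dot(u, v))
```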
52
recommenders/recommender.py
Normal file
@ -0,0 +1,52 @@
# Load libraries ---------------------------------------------

import pandas as pd

# ------------------------------------------------------------


class Recommender(object):
    """
    Base recommender class.
    """

    def __init__(self):
        """
        Initialize base recommender params and variables.

        :param int seed: Seed for the random number generator.
        """
        pass

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        pass

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])

        for ix, user in users_df.iterrows():
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': [-1] * n_recommendations,
                                                 'score': [3.0] * n_recommendations})

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations
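The base class doubles as a trivial baseline (it returns item_id -1 with a constant score for every user), so a concrete recommender only has to override fit() and recommend(). Below is a minimal sketch of such a subclass, assuming the same DataFrame conventions; the class itself is hypothetical and not part of this commit.

```python
# Illustrative sketch - a hypothetical most-popular-items recommender built on the base class
import pandas as pd

from recommenders.recommender import Recommender


class MostPopularRecommender(Recommender):

    def __init__(self):
        super().__init__()
        self.most_popular_items = None

    def fit(self, interactions_df, users_df, items_df):
        # Rank items by the number of recorded interactions
        counts = interactions_df.groupby('item_id')['user_id'].count()
        self.most_popular_items = counts.sort_values(ascending=False).index.tolist()

    def recommend(self, users_df, items_df, n_recommendations=1):
        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        for _, user in users_df.iterrows():
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': self.most_popular_items[:n_recommendations],
                                                 'score': [1.0] * n_recommendations})
            recommendations = pd.concat([recommendations, user_recommendations])
        return recommendations
```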
102
recommenders/tfidf_recommender.py
Normal file
@ -0,0 +1,102 @@
# Load libraries ---------------------------------------------

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

from recommenders.recommender import Recommender

# ------------------------------------------------------------


class TFIDFRecommender(Recommender):
    """
    Recommender based on the TF-IDF method.
    """

    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        super().__init__()
        self.tfidf_scores = None

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id
            and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id
            and the item feature columns.
        """

        self.tfidf_scores = defaultdict(lambda: 0.0)

        # Prepare the corpus for tfidf calculation

        interactions_df = pd.merge(interactions_df, items_df, on='item_id')
        user_genres = interactions_df.loc[:, ['user_id', 'genres']]
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("-", "_", regex=False)
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace(" ", "_", regex=False)
        user_genres = user_genres.groupby('user_id').aggregate(lambda x: "|".join(x))
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("|", " ", regex=False)
        user_ids = user_genres.index.tolist()
        genres_corpus = user_genres['genres'].tolist()

        # Calculate tf-idf scores

        vectorizer = TfidfVectorizer()
        tfidf_scores = vectorizer.fit_transform(genres_corpus)

        # Transform results into a dict {(user_id, genre): score}

        for u in range(tfidf_scores.shape[0]):
            for g in range(tfidf_scores.shape[1]):
                self.tfidf_scores[(user_ids[u], vectorizer.get_feature_names()[g])] = tfidf_scores[u, g]

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations
            should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])

        # Transform genres to a unified form used by the vectorizer

        items_df = items_df.copy()
        items_df.loc[:, 'genres'] = items_df['genres'].str.replace("-", "_", regex=False)
        items_df.loc[:, 'genres'] = items_df['genres'].str.replace(" ", "_", regex=False)
        items_df.loc[:, 'genres'] = items_df['genres'].str.lower()
        items_df.loc[:, 'genres'] = items_df['genres'].str.split("|")

        # Score items

        for uix, user in users_df.iterrows():
            items = []
            for iix, item in items_df.iterrows():
                score = 0.0
                for genre in item['genres']:
                    score += self.tfidf_scores[(user['user_id'], genre)]
                score /= len(item['genres'])
                items.append((item['item_id'], score))

            items = sorted(items, key=lambda x: x[1], reverse=True)
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': [item[0] for item in items][:n_recommendations],
                                                 'score': [item[1] for item in items][:n_recommendations]})

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations
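A minimal usage sketch of TFIDFRecommender (illustrative only, not part of this commit): the only hard requirement visible in the code is that items_df carries a pipe-separated genres column, which the toy DataFrames below assume.

```python
# Illustrative sketch - tiny hand-made DataFrames matching the columns TFIDFRecommender expects
import pandas as pd

items_df = pd.DataFrame({'item_id': [0, 1, 2],
                         'genres': ['Action|Sci-Fi', 'Comedy', 'Action|Comedy']})
interactions_df = pd.DataFrame({'user_id': [10, 10, 20],
                                'item_id': [0, 2, 1]})
users_df = pd.DataFrame({'user_id': [10, 20]})

tfidf_recommender = TFIDFRecommender()
tfidf_recommender.fit(interactions_df, users_df, items_df)

# Each item is scored by the mean TF-IDF weight of its genres in the user's profile
print(tfidf_recommender.recommend(users_df, items_df, n_recommendations=2))
```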