Add recommender with HR10 0.116

Aleksander Piotrowski 2021-06-28 20:18:14 +02:00
commit 4cf2994aca
25 changed files with 75392 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
.ipynb_checkpoints
__pycache__

52
README.md Normal file

@@ -0,0 +1,52 @@
# Recommender Systems class - Project 2
## Preparing your system
1. Install [Docker](https://docs.docker.com/engine/install/).
2. Fork this repository to your GitHub account.
3. Run the Jupyter Docker image:
```bash
docker run \
-d -p 8888:8888 \
-v DIRECTORY:/home/jovyan/REK \
--name REK \
jupyter/minimal-notebook
```
Remember to change **DIRECTORY** to the directory where all the files can be found. You can use `$(pwd)` if your current directory is the proper one.
4. Get the login link with the following command:
```bash
docker logs REK 2>&1 | grep -o 'http://127.0.0.1:8888.*' | tail -n1
```
Example output:
```
http://127.0.0.1:8888/?token=2bb816a4bc36a4bdbf64e0c9a89f336ae5404a01d15e442c
```
5. Prepare conda environment:
```bash
docker exec REK bash -c "
conda env create --name rs-class-env -f /home/jovyan/REK/environment.yml;
python -m ipykernel install --user --name=rs-class-env"
```
6. You can start/stop the container whenever you want:
```bash
docker stop REK
docker start REK
```
If you want to start from scratch, you can remove the container:
```bash
docker stop REK
docker rm REK
```
Now you are ready to work!

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,278 @@
# Load libraries ---------------------------------------------
from datetime import datetime, timedelta
from dateutil.easter import easter
from data_preprocessing.dataset_specification import DatasetSpecification
import pandas as pd
import numpy as np
# ------------------------------------------------------------
class DataPreprocessingToolkit(object):
def __init__(self):
dataset_specification = DatasetSpecification()
self.sum_columns = dataset_specification.get_sum_columns()
self.mean_columns = dataset_specification.get_mean_columns()
self.mode_columns = dataset_specification.get_mode_columns()
self.first_columns = dataset_specification.get_first_columns()
self.nights_buckets = dataset_specification.get_nights_buckets()
self.npeople_buckets = dataset_specification.get_npeople_buckets()
self.room_segment_buckets = dataset_specification.get_room_segment_buckets()
self.arrival_terms = dataset_specification.get_arrival_terms()
self.item_features_columns = dataset_specification.get_items_df_feature_columns()
# #########################
# Entire datasets functions
# #########################
def fix_date_to(self, df):
df.loc[:, "date_to"] = df["date_to"].apply(lambda x: x + timedelta(days=1))
return df
def add_length_of_stay(self, df):
df.loc[:, "length_of_stay"] = (df["date_to"] - df["date_from"]).dt.days
return df
def add_book_to_arrival(self, df):
df.loc[:, "book_to_arrival"] = (df["date_from"] - df["booking_date"]).dt.days
return df
def add_nrooms(self, df):
df.loc[:, "n_rooms"] = 1
return df
def add_weekend_stay(self, df):
s = df["date_from"].dt.dayofweek
e = df["date_to"].dt.dayofweek
dt = (df["date_to"] - df["date_from"]).dt.days
df.loc[:, "weekend_stay"] = (((s >= 4) & (s != 6)) | (e >= 5) | ((e < s) & (s != 6)) | (dt >= 6))
df.loc[:, "weekend_stay"] = df["weekend_stay"].replace({True: 'True', False: 'False'})
return df
def add_night_price(self, df):
df.loc[:, "night_price"] = np.round(df["accomodation_price"] / df["length_of_stay"] / df["n_rooms"], 2)
return df
def clip_book_to_arrival(self, df):
df.loc[:, "book_to_arrival"] = np.maximum(df["book_to_arrival"], 0)
return df
def sum_npeople(self, df):
df.loc[:, "n_people"] = np.maximum(df["n_people"] + df["n_children_1"] + df["n_children_2"] + df["n_children_3"], 1)
return df
def filter_out_company_clients(self, df):
df = df.loc[df["is_company"] == 0]
return df
def filter_out_long_stays(self, df):
df = df.loc[df["length_of_stay"] <= 21]
return df
def leave_one_from_group_reservations(self, df):
unique_group_rows = []
df.loc[:, "group_id"] = df["group_id"].fillna(-1)
group_ids = []
for idx, row in df.iterrows():
if row["group_id"] != -1:
if row["group_id"] not in group_ids:
unique_group_rows.append(row)
group_ids.append(row["group_id"])
else:
unique_group_rows.append(row)
        cleaned_dataset = pd.DataFrame(unique_group_rows, columns=df.columns)
        return cleaned_dataset
def aggregate_group_reservations(self, df):
non_group_reservations = df.loc[df["group_id"] == "",
self.sum_columns + self.mean_columns + self.mode_columns + self.first_columns]
group_reservations = df.loc[df["group_id"] != ""]
agg_datasets = [group_reservations.loc[:, ["group_id"] + self.sum_columns].groupby("group_id").sum(),
group_reservations.loc[:, ["group_id"] + self.mean_columns].groupby("group_id").mean(),
group_reservations.loc[:, ["group_id"] + self.mode_columns].groupby("group_id").agg(lambda x: x.value_counts().index[0]),
group_reservations.loc[:, ["group_id"] + self.first_columns].groupby("group_id").first()]
group_reservations = agg_datasets[0]
for i in range(1, len(agg_datasets)):
group_reservations = group_reservations.merge(agg_datasets[i], on="group_id")
group_reservations = group_reservations.reset_index(drop=True)
df = pd.concat([non_group_reservations, group_reservations])
return df
def leave_only_ota(self, df):
df = df.loc[df.loc[:, "Source"].apply(lambda x: "booking" in x.lower() or "expedia" in x.lower())]
return df
def map_date_to_term_datasets(self, df):
df.loc[:, "date_from"] = df["date_from"].astype(str).apply(lambda x: x[:10])
df.loc[:, 'term'] = df['date_from'].apply(lambda x: self.map_date_to_term(x))
return df
def map_length_of_stay_to_nights_buckets(self, df):
df.loc[:, 'length_of_stay_bucket'] = df['length_of_stay'].apply(lambda x: self.map_value_to_bucket(x, self.nights_buckets))
return df
def map_night_price_to_room_segment_buckets(self, df):
night_prices = df.loc[df['accomodation_price'] > 1]\
.groupby('room_group_id')['night_price'].mean().reset_index()
night_prices.columns = ['room_group_id', 'room_night_price']
df = pd.merge(df, night_prices, on=['room_group_id'], how='left')
df.loc[df['room_night_price'].isnull(), 'room_night_price'] = 0.0
df.loc[:, 'room_segment'] = df['room_night_price'].apply(
lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
df = df.drop(columns=['room_night_price'])
return df
# def map_night_price_to_room_segment_buckets(self, df):
# night_prices = df.loc[df['accomodation_price'] > 1]\
# .groupby(['term', 'room_group_id'])['night_price'].mean().reset_index()
# night_prices.columns = ['term', 'room_group_id', 'termnight_price']
# df = pd.merge(df, night_prices, on=['term', 'room_group_id'], how='left')
# df.loc[:, 'room_segment'] = df['termnight_price'].apply(
# lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
# df = df.drop(columns=['termnight_price'])
# return df
def map_npeople_to_npeople_buckets(self, df):
df.loc[:, 'n_people_bucket'] = df['n_people'].apply(lambda x: self.map_value_to_bucket(x, self.npeople_buckets))
return df
def map_item_to_item_id(self, df):
df.loc[:, 'item'] = df[self.item_features_columns].astype(str).agg(' '.join, axis=1)
ids = df['item'].unique().tolist()
mapping = {ids[i]: i for i in range(len(ids))}
df['item_id'] = df['item'].apply(lambda x: mapping[x])
return df
def add_interaction_id(self, df):
df.loc[:, 'interaction_id'] = range(df.shape[0])
return df
# ################
# Column functions
# ################
def bundle_period(self, diff):
diff = float(diff)
if int(diff) < 0:
return "<0"
elif int(diff) <= 7:
return diff
elif 7 < int(diff) <= 14:
return "<14"
elif 14 < int(diff) <= 30:
return "<30"
elif 30 < int(diff) <= 60:
return "<60"
elif 60 < int(diff) <= 180:
return "<180"
elif int(diff) > 180:
return ">180"
def bundle_price(self, price):
mod = 300.0
return int((price + mod / 2) / mod) * mod
def map_date_to_season(self, date):
day = int(date[8:10])
month = int(date[5:7])
if (month == 12 and day >= 21) or (month == 1) or (month == 2) or (month == 3 and day <= 19):
return "Winter"
if (month == 3 and day >= 20) or (month == 4) or (month == 5) or (month == 6 and day <= 20):
return "Spring"
if (month == 6 and day >= 21) or (month == 7) or (month == 8) or (month == 9 and day <= 22):
return "Summer"
if (month == 9 and day >= 23) or (month == 10) or (month == 11) or (month == 12 and day <= 20):
return "Autumn"
def map_value_to_bucket(self, value, buckets):
if value == "":
return str(buckets[0]).replace(", ", "-")
for bucket in buckets:
if bucket[0] <= value <= bucket[1]:
return str(bucket).replace(", ", "-")
def map_date_to_term(self, date):
m = int(date[5:7])
d = int(date[8:10])
term = None
for arrival_term in self.arrival_terms:
if arrival_term == "Easter":
year = int(date[:4])
easter_date = easter(year)
easter_start = easter_date + timedelta(days=-4)
easter_end = easter_date + timedelta(days=1)
esm = easter_start.month
esd = easter_start.day
eem = easter_end.month
eed = easter_end.day
if ((m > esm) or (m == esm and d >= esd)) and ((m < eem) or (m == eem and d <= eed)):
term = arrival_term
break
elif arrival_term == "NewYear":
sm = self.arrival_terms[arrival_term][0]["start"]["m"]
sd = self.arrival_terms[arrival_term][0]["start"]["d"]
em = self.arrival_terms[arrival_term][0]["end"]["m"]
ed = self.arrival_terms[arrival_term][0]["end"]["d"]
if ((m > sm) or (m == sm and d >= sd)) or ((m < em) or (m == em and d <= ed)):
term = arrival_term
break
else:
is_match = False
for i in range(len(self.arrival_terms[arrival_term])):
sm = self.arrival_terms[arrival_term][i]["start"]["m"]
sd = self.arrival_terms[arrival_term][i]["start"]["d"]
em = self.arrival_terms[arrival_term][i]["end"]["m"]
ed = self.arrival_terms[arrival_term][i]["end"]["d"]
if ((m > sm) or (m == sm and d >= sd)) and ((m < em) or (m == em and d <= ed)):
term = arrival_term
is_match = True
break
if is_match:
break
return term
def map_dates_to_terms(self, dates):
terms = []
for date in dates:
term = self.map_date_to_term(date)
terms.append(term)
return terms
def filter_out_historical_dates(self, date_list):
"""
Filters out past dates from a list of dates.
"""
future_dates = []
for date in date_list:
if date >= datetime.now():
future_dates.append(date.strftime("%Y-%m-%d"))
return future_dates


@@ -0,0 +1,88 @@
# Load libraries ---------------------------------------------
from collections import defaultdict
import numpy as np
# ------------------------------------------------------------
class DatasetSpecification(object):
def __init__(self):
pass
# ################
# Original dataset functions
# ################
def get_sum_columns(self):
return ["n_people", "n_children_1", "n_children_2", "n_children_3", "accomodation_price", "meal_price",
"service_price", "paid", "n_rooms"]
def get_mean_columns(self):
return ['discount']
def get_mode_columns(self):
return ["room_id", "room_group_id", "date_from", "date_to", "booking_date", "rate_plan",
"length_of_stay", "book_to_arrival", "weekend_stay"]
def get_first_columns(self):
return ["user_id", "client_id", "client_name", "email", "phone", "is_company"]
def get_id_columns(self):
return ["client_id", "client_name", "email", "phone"]
# ################
# Output dataset functions
# ################
def get_people_df_id_columns(self):
return ['user_id']
def get_people_df_feature_columns(self):
return []
def get_items_df_id_columns(self):
return ['item_id']
def get_items_df_feature_columns(self):
return ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']
def get_purchases_df_id_columns(self):
return ['user_id', 'item_id']
def get_purchases_df_feature_columns(self):
return []
# ################
# Mapping functions
# ################
def get_nights_buckets(self):
return [[0, 1], [2, 3], [4, 7], [8, np.inf]]
def get_npeople_buckets(self):
return [[1, 1], [2, 2], [3, 4], [5, np.inf]]
def get_room_segment_buckets(self):
return [[0, 160], [160, 260], [260, 360], [360, 500], [500, 900], [900, np.inf]]
def get_book_to_arrival_buckets(self):
return [[0, 0], [1, 2], [3, 4], [5, 7], [8, 14], [15, 30], [31, 60], [61, 90], [91, 180], [181, np.inf]]
def get_arrival_terms(self):
arrival_terms = {"Easter": [{"start": {"m": np.nan, "d": np.nan}, "end": {"m": np.nan, "d": np.nan}}],
# Treated with priority
"Christmas": [{"start": {"m": 12, "d": 22}, "end": {"m": 12, "d": 27}}],
"NewYear": [{"start": {"m": 12, "d": 28}, "end": {"m": 1, "d": 4}}],
"WinterVacation": [{"start": {"m": 1, "d": 5}, "end": {"m": 2, "d": 29}}],
"OffSeason": [
{"start": {"m": 3, "d": 1}, "end": {"m": 4, "d": 27}},
{"start": {"m": 5, "d": 6}, "end": {"m": 6, "d": 20}},
{"start": {"m": 9, "d": 26}, "end": {"m": 12, "d": 21}}],
"MayLongWeekend": [{"start": {"m": 4, "d": 28}, "end": {"m": 5, "d": 5}}],
"LowSeason": [
{"start": {"m": 6, "d": 21}, "end": {"m": 7, "d": 10}},
{"start": {"m": 8, "d": 23}, "end": {"m": 9, "d": 25}}],
"HighSeason": [{"start": {"m": 7, "d": 11}, "end": {"m": 8, "d": 22}}]}
return arrival_terms


@@ -0,0 +1,77 @@
# Load libraries ---------------------------------------------
# ------------------------------------------------------------
class PeopleIdentifier(object):
def __init__(self):
self.id_column_names = []
self.pid_cname = ""
self.next_available_pid = 0
self.cid_to_pid = {} # {"col1": {cid1: pid1, cid2: pid2}, "col2":...}
self.pid_to_cid = {} # {pid1: {"col1": set(cid1, cid2, ...), "col2": set(...), ...}, pid2: ...}
self.data = None
def add_pid(self, data, id_column_names, pid_cname):
self.id_column_names = id_column_names
self.pid_cname = pid_cname
for cid_cname in id_column_names:
self.cid_to_pid[cid_cname] = {}
for idx, reservation in data.iterrows():
pids = set()
for cid_cname in id_column_names:
if reservation[cid_cname] in self.cid_to_pid[cid_cname]:
pids.add(self.cid_to_pid[cid_cname][reservation[cid_cname]])
# print(cid_cname, reservation[cid_cname], self.cid_to_pid[cid_cname][reservation[cid_cname]])
if len(pids) > 0:
min_pid = min(pids)
self.set_pid(min_pid, reservation)
# Merge pids connected through this node
if len(pids) > 1:
pids.remove(min_pid)
self.merge_pids(pids, min_pid)
# print("Chosen pid: {}".format(min_pid))
else:
new_pid = self.next_available_pid
self.next_available_pid += 1
self.set_pid(new_pid, reservation)
# print("Chosen pid: {}".format(new_pid))
# print("=======")
# print(self.pid_to_cid)
# print("=======")
data_pid = data.copy()
data_pid.loc[:, pid_cname] = data_pid.loc[:, id_column_names[0]].apply(lambda x: self.cid_to_pid[id_column_names[0]][x])
self.data = data_pid
return data_pid
def set_pid(self, pid, reservation):
for cid_cname in self.id_column_names:
if reservation[cid_cname] != "":
self.cid_to_pid[cid_cname][reservation[cid_cname]] = pid
if pid in self.pid_to_cid:
for cid_cname in self.id_column_names:
self.pid_to_cid[pid][cid_cname] |= {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
else:
self.pid_to_cid[pid] = {cid_cname: {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
for cid_cname in self.id_column_names}
def merge_pids(self, pids_from, pid_to):
# print("Merge pids", pids_from, pid_to, self.pid_to_cid)
for pid_from in pids_from:
for cid_cname in self.id_column_names:
for cid in self.pid_to_cid[pid_from][cid_cname]:
self.cid_to_pid[cid_cname][cid] = pid_to
self.pid_to_cid[pid_to][cid_cname] |= self.pid_to_cid[pid_from][cid_cname]
self.pid_to_cid.pop(pid_from)
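
A minimal sketch of how `add_pid` resolves a single person id across several identifier columns, on made-up data (the module path in the import is an assumption): the second row shares an e-mail with the first and the third shares a phone number with the second, so all three reservations collapse to one id.

```python
import pandas as pd

from data_preprocessing.people_identifier import PeopleIdentifier  # assumed module path

reservations = pd.DataFrame({
    "client_id": ["c1", "c2", "c3"],
    "email":     ["a@example.com", "a@example.com", ""],
    "phone":     ["111", "222", "222"],
})

identifier = PeopleIdentifier()
with_pid = identifier.add_pid(reservations, id_column_names=["client_id", "email", "phone"], pid_cname="user_id")
print(with_pid["user_id"].tolist())  # [0, 0, 0] - one person id for all three reservations
```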

16
environment.yml Normal file

@@ -0,0 +1,16 @@
name: rs-class-env
channels:
- defaults
dependencies:
- pip=21.0.1
- python=3.8.8
- numpy==1.20.1
- matplotlib==3.3.2
- ipykernel==5.5.0
- pandas==1.2.3
- hyperopt==0.2.5
- seaborn==0.11.1
- pip:
- sklearn==0.0
- torch==1.8.0
- livelossplot==0.5.4


@@ -0,0 +1,89 @@
# Load libraries ---------------------------------------------
import numpy as np
import pandas as pd
from collections import defaultdict
# ------------------------------------------------------------
def rmse(r_pred, r_real):
return np.sqrt(np.sum(np.power(r_pred - r_real, 2)) / len(r_pred))
def mape(r_pred, r_real):
return 1 / len(r_pred) * np.sum(np.abs(r_pred - r_real) / np.abs(r_real))
def tre(r_pred, r_real):
return np.sum(np.abs(r_pred - r_real)) / np.sum(np.abs(r_real))
def hr(recommendations, real_interactions, n=1):
"""
Assumes recommendations are ordered by user_id and then by score.
:param pd.DataFrame recommendations:
:param pd.DataFrame real_interactions:
:param int n:
"""
# Transform real_interactions to a dict for a large speed-up
rui = defaultdict(lambda: 0)
for idx, row in real_interactions.iterrows():
rui[(row['user_id'], row['item_id'])] = 1
result = 0.0
previous_user_id = -1
rank = 0
for idx, row in recommendations.iterrows():
if previous_user_id == row['user_id']:
rank += 1
else:
rank = 1
if rank <= n:
result += rui[(row['user_id'], row['item_id'])]
previous_user_id = row['user_id']
if len(recommendations['user_id'].unique()) > 0:
result /= len(recommendations['user_id'].unique())
return result
def ndcg(recommendations, real_interactions, n=1):
"""
Assumes recommendations are ordered by user_id and then by score.
:param pd.DataFrame recommendations:
:param pd.DataFrame real_interactions:
:param int n:
"""
# Transform real_interactions to a dict for a large speed-up
rui = defaultdict(lambda: 0)
for idx, row in real_interactions.iterrows():
rui[(row['user_id'], row['item_id'])] = 1
result = 0.0
previous_user_id = -1
rank = 0
for idx, row in recommendations.iterrows():
if previous_user_id == row['user_id']:
rank += 1
else:
rank = 1
if rank <= n:
result += rui[(row['user_id'], row['item_id'])] / np.log2(1 + rank)
previous_user_id = row['user_id']
if len(recommendations['user_id'].unique()) > 0:
result /= len(recommendations['user_id'].unique())
return result
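
A small worked example of the two ranking measures above: a single user with three ordered recommendations whose only real interaction is the item ranked second, so the hit is missed at n=1 and found at n=3 (the import path is the same one used by the evaluation functions in this project).

```python
import pandas as pd

from evaluation_and_testing.evaluation_measures import hr, ndcg

recommendations = pd.DataFrame({
    "user_id": [1, 1, 1],
    "item_id": [10, 20, 30],     # ordered by score, best first
    "score":   [0.9, 0.5, 0.1],
})
real_interactions = pd.DataFrame({"user_id": [1], "item_id": [20]})

print(hr(recommendations, real_interactions, n=1))    # 0.0 - the hit is below the top 1
print(hr(recommendations, real_interactions, n=3))    # 1.0 - the hit is within the top 3
print(ndcg(recommendations, real_interactions, n=3))  # 1 / log2(3), about 0.63 - hit at rank 2
```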


@@ -0,0 +1,209 @@
# Load libraries ---------------------------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from evaluation_and_testing.evaluation_measures import rmse
from evaluation_and_testing.evaluation_measures import mape
from evaluation_and_testing.evaluation_measures import tre
from evaluation_and_testing.evaluation_measures import hr
from evaluation_and_testing.evaluation_measures import ndcg
# ------------------------------------------------------------
def evaluate_train_test_split_explicit(recommender, interactions_df, items_df, seed=6789):
rng = np.random.RandomState(seed=seed)
if isinstance(interactions_df, dict):
# If interactions_df is a dict with already split data, use the split
interactions_df_train = interactions_df['train']
interactions_df_test = interactions_df['test']
else:
# Otherwise split the dataset into train and test
shuffle = np.arange(len(interactions_df))
rng.shuffle(shuffle)
shuffle = list(shuffle)
train_test_split = 0.8
split_index = int(len(interactions_df) * train_test_split)
interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
interactions_df_test = interactions_df.iloc[shuffle[split_index:]]
# Train the recommender
recommender.fit(interactions_df_train, None, items_df)
# Gather predictions
r_pred = []
for idx, row in interactions_df_test.iterrows():
users_df = pd.DataFrame([row['user_id']], columns=['user_id'])
eval_items_df = pd.DataFrame([row['item_id']], columns=['item_id'])
eval_items_df = pd.merge(eval_items_df, items_df, on='item_id')
recommendations = recommender.recommend(users_df, eval_items_df, n_recommendations=1)
r_pred.append(recommendations.iloc[0]['score'])
# Gather real ratings
r_real = np.array(interactions_df_test['rating'].tolist())
# Return evaluation metrics
return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)
def evaluate_train_test_split_implicit(recommender, interactions_df, items_df, seed=6789):
# Write your code here
rng = np.random.RandomState(seed=seed)
if isinstance(interactions_df, dict):
# If interactions_df is a dict with already split data, use the split
interactions_df_train = interactions_df['train']
interactions_df_test = interactions_df['test']
else:
# Otherwise split the dataset into train and test
shuffle = np.arange(len(interactions_df))
rng.shuffle(shuffle)
shuffle = list(shuffle)
train_test_split = 0.8
split_index = int(len(interactions_df) * train_test_split)
interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
interactions_df_test = interactions_df.iloc[shuffle[split_index:]]
hr_1 = []
hr_3 = []
hr_5 = []
hr_10 = []
ndcg_1 = []
ndcg_3 = []
ndcg_5 = []
ndcg_10 = []
# Train the recommender
recommender.fit(interactions_df_train, None, items_df)
# Make recommendations for each user in the test set and calculate the metric
# against all items of that user in the test set
test_user_interactions = interactions_df_test.groupby(by='user_id')
for user_id, user_interactions in test_user_interactions:
recommendations = recommender.recommend(pd.DataFrame([user_id], columns=['user_id']),
items_df, n_recommendations=10)
hr_1.append(hr(recommendations, user_interactions, n=1))
hr_3.append(hr(recommendations, user_interactions, n=3))
hr_5.append(hr(recommendations, user_interactions, n=5))
hr_10.append(hr(recommendations, user_interactions, n=10))
ndcg_1.append(ndcg(recommendations, user_interactions, n=1))
ndcg_3.append(ndcg(recommendations, user_interactions, n=3))
ndcg_5.append(ndcg(recommendations, user_interactions, n=5))
ndcg_10.append(ndcg(recommendations, user_interactions, n=10))
hr_1 = np.mean(hr_1)
hr_3 = np.mean(hr_3)
hr_5 = np.mean(hr_5)
hr_10 = np.mean(hr_10)
ndcg_1 = np.mean(ndcg_1)
ndcg_3 = np.mean(ndcg_3)
ndcg_5 = np.mean(ndcg_5)
ndcg_10 = np.mean(ndcg_10)
return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10
def evaluate_leave_one_out_explicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
rng = np.random.RandomState(seed=seed)
# Prepare splits of the datasets
kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)
# For each split of the dataset train the recommender, generate recommendations and evaluate
r_pred = []
r_real = []
n_eval = 1
for train_index, test_index in kf.split(interactions_df.index):
interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]
recommender.fit(interactions_df_train, None, items_df)
recommendations = recommender.recommend(
interactions_df_test.loc[:, ['user_id']],
items_df.loc[items_df['item_id'] == interactions_df_test.iloc[0]['item_id']])
r_pred.append(recommendations.iloc[0]['score'])
r_real.append(interactions_df_test.iloc[0]['rating'])
if n_eval == max_evals:
break
n_eval += 1
r_pred = np.array(r_pred)
r_real = np.array(r_real)
# Return evaluation metrics
return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)
def evaluate_leave_one_out_implicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
rng = np.random.RandomState(seed=seed)
# Prepare splits of the datasets
kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)
hr_1 = []
hr_3 = []
hr_5 = []
hr_10 = []
ndcg_1 = []
ndcg_3 = []
ndcg_5 = []
ndcg_10 = []
# For each split of the dataset train the recommender, generate recommendations and evaluate
n_eval = 1
for train_index, test_index in kf.split(interactions_df.index):
interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]
recommender.fit(interactions_df_train, None, items_df)
recommendations = recommender.recommend(
interactions_df_test.loc[:, ['user_id']], items_df, n_recommendations=10)
hr_1.append(hr(recommendations, interactions_df_test, n=1))
hr_3.append(hr(recommendations, interactions_df_test, n=3))
hr_5.append(hr(recommendations, interactions_df_test, n=5))
hr_10.append(hr(recommendations, interactions_df_test, n=10))
ndcg_1.append(ndcg(recommendations, interactions_df_test, n=1))
ndcg_3.append(ndcg(recommendations, interactions_df_test, n=3))
ndcg_5.append(ndcg(recommendations, interactions_df_test, n=5))
ndcg_10.append(ndcg(recommendations, interactions_df_test, n=10))
if n_eval == max_evals:
break
n_eval += 1
hr_1 = np.mean(hr_1)
hr_3 = np.mean(hr_3)
hr_5 = np.mean(hr_5)
hr_10 = np.mean(hr_10)
ndcg_1 = np.mean(ndcg_1)
ndcg_3 = np.mean(ndcg_3)
ndcg_5 = np.mean(ndcg_5)
ndcg_10 = np.mean(ndcg_10)
return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10
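
A minimal sketch of how the implicit train/test evaluation above might be called on synthetic interactions; `evaluate_train_test_split_implicit` is the function defined above, while the recommender import path is an assumption.

```python
import pandas as pd

from recommenders.amazon_recommender import AmazonRecommender  # assumed module path

# Five users, ten items, every item bought by exactly two users
interactions_df = pd.DataFrame({
    "user_id": [1] * 4 + [2] * 4 + [3] * 4 + [4] * 4 + [5] * 4,
    "item_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
})
items_df = pd.DataFrame({"item_id": list(range(1, 11))})

recommender = AmazonRecommender()
hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10 = evaluate_train_test_split_implicit(
    recommender, interactions_df, items_df, seed=6789)
print("HR@10 = {:.3f}, NDCG@10 = {:.3f}".format(hr_10, ndcg_10))
```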

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

0
recommenders/__init__.py Normal file

@@ -0,0 +1,231 @@
# Load libraries ---------------------------------------------
import pandas as pd
import numpy as np
import scipy.special as scisp
from recommenders.recommender import Recommender
# ------------------------------------------------------------
class AmazonRecommender(Recommender):
"""
    Basic item-to-item collaborative filtering algorithm used at Amazon.com, as described in:
    - Linden G., Smith B., York J., Amazon.com Recommendations: Item-to-Item Collaborative Filtering,
      IEEE Internet Computing, 2003,
    - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.
"""
def __init__(self):
super().__init__()
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
self.interactions_df = None
self.item_id_mapping = None
self.user_id_mapping = None
self.item_id_reverse_mapping = None
self.user_id_reverse_mapping = None
self.e_xy = None
self.n_xy = None
self.scores = None
self.most_popular_items = None
self.should_recommend_already_bought = False
def initialize(self, **params):
if 'should_recommend_already_bought' in params:
self.should_recommend_already_bought = params['should_recommend_already_bought']
def fit(self, interactions_df, users_df, items_df):
"""
Training of the recommender.
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
defined by user_id, item_id and features of the interaction.
:param pd.DataFrame users_df: DataFrame with users and their features defined by
user_id and the user feature columns.
:param pd.DataFrame items_df: DataFrame with items and their features defined
by item_id and the item feature columns.
"""
# Shift item ids and user ids so that they are consecutive
unique_item_ids = interactions_df['item_id'].unique()
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
interactions_df = interactions_df.copy()
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
# Get the number of items and users
self.interactions_df = interactions_df
n_items = np.max(interactions_df['item_id']) + 1
n_users = np.max(interactions_df['user_id']) + 1
# Get maximal number of interactions
n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
# Unnecessary, but added for readability
n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
max_interactions = n_user_interactions['n_items'].max()
# Calculate P_Y's
n_interactions = len(interactions_df)
p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
p_y = p_y.rename(columns={'user_id': 'P_Y'})
p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
p_y = dict(zip(p_y['item_id'], p_y['P_Y']))
# Get the series of all items
# items = list(range(n_items))
items = interactions_df['item_id'].unique()
# For every X calculate the E[Y|X]
e_xy = np.zeros(shape=(n_items, n_items))
e_xy[:][:] = -1e100
p_y_powers = {}
for y in items:
p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
# In the next version calculate all alpha_k first (this works well with parallelization)
for x in items:
# Get users who bought X
c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()
# Get users who bought only X
c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))
# Calculate the number of non-X interactions for each user who bought X
# Include users with zero non-X interactions
n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
# Unnecessary, but added for readability
n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})
zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x) # Remove
n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]
# Calculate the expected numbers of Y products bought by clients who bought X
alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
for abs_c in n_non_x_interactions["n_items"]])
for k in range(1, max_interactions + 1)])
for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y
if y != x:
e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
else:
e_xy[x][y] = n_users * p_y[x]
self.e_xy = e_xy
# Calculate the number of users who bought both X and Y
# Simple and slow method (commented out)
# n_xy = np.zeros(shape=(n_items, n_items))
# for x in items:
# for y in items:
# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
# users_x_and_y = users_x & users_y
# n_xy[x][y] = len(users_x_and_y)
# Optimized method (can be further optimized by using sparse matrices)
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
# Get the number of users who bought both X and Y
n_xy = np.matmul(r.T, r)
self.n_xy = n_xy
self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)
# Find the most popular items for the cold start problem
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
offers_count = offers_count.sort_values('user_id', ascending=False)
self.most_popular_items = offers_count.index
def recommend(self, users_df, items_df, n_recommendations=1):
"""
Serving of recommendations. Scores items in items_df for each user in users_df and returns
top n_recommendations for each user.
:param pd.DataFrame users_df: DataFrame with users and their features for which
recommendations should be generated.
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
:param int n_recommendations: Number of recommendations to be returned for each user.
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
for each user.
:rtype: pd.DataFrame
"""
# Clean previous recommendations (iloc could be used alternatively)
self.recommender_df = self.recommender_df[:0]
# Handle users not in the training data
# Map item ids
items_df = items_df.copy()
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
# Generate recommendations
for idx, user in users_df.iterrows():
recommendations = []
user_id = user['user_id']
if user_id in self.user_id_mapping:
mapped_user_id = self.user_id_mapping[user_id]
x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
final_scores = np.sum(self.scores[x_list], axis=0)
# Choose n recommendations based on highest scores
if not self.should_recommend_already_bought:
final_scores[x_list] = -1e100
chosen_ids = np.argsort(-final_scores)[:n_recommendations]
for item_id in chosen_ids:
recommendations.append(
{
'user_id': self.user_id_reverse_mapping[mapped_user_id],
'item_id': self.item_id_reverse_mapping[item_id],
'score': final_scores[item_id]
}
)
else: # For new users recommend most popular items
for i in range(n_recommendations):
recommendations.append(
{
'user_id': user['user_id'],
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
'score': 1.0
}
)
user_recommendations = pd.DataFrame(recommendations)
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
return self.recommender_df
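
A minimal sketch of direct use of the recommender above on synthetic implicit interactions: fit on a few purchases, then ask for one recommendation for a known user and for a cold-start user (id 99), who falls back to the most popular items (the import path is an assumption).

```python
import pandas as pd

from recommenders.amazon_recommender import AmazonRecommender  # assumed module path

interactions_df = pd.DataFrame({
    "user_id": [1, 1, 2, 2, 3],
    "item_id": [10, 20, 20, 30, 30],
})
items_df = pd.DataFrame({"item_id": [10, 20, 30]})

recommender = AmazonRecommender()
recommender.initialize(should_recommend_already_bought=False)
recommender.fit(interactions_df, None, items_df)
print(recommender.recommend(pd.DataFrame({"user_id": [1, 99]}), items_df, n_recommendations=1))
```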


@@ -0,0 +1,233 @@
# Load libraries ---------------------------------------------
import pandas as pd
import numpy as np
from recommenders.recommender import Recommender
# ------------------------------------------------------------
class NearestNeighborsRecommender(Recommender):
"""
    Nearest neighbors recommender supporting user-based and item-based collaborative filtering.
Possible similarity measures:
- 'cosine',
- 'pearson'.
"""
def __init__(self):
super().__init__()
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
self.interactions_df = None
self.item_id_mapping = None
self.user_id_mapping = None
self.item_id_reverse_mapping = None
self.user_id_reverse_mapping = None
self.r = None
self.similarities = None
self.most_popular_items = None
self.collaboration_type = 'user'
self.similarity_measure = 'cosine'
self.n_neighbors = 10
self.should_recommend_already_bought = False
def initialize(self, **params):
if 'n_neighbors' in params:
self.n_neighbors = params['n_neighbors']
if 'should_recommend_already_bought' in params:
self.should_recommend_already_bought = params['should_recommend_already_bought']
def fit(self, interactions_df, users_df, items_df):
"""
Training of the recommender.
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
defined by user_id, item_id and features of the interaction.
:param pd.DataFrame users_df: DataFrame with users and their features defined by
user_id and the user feature columns.
:param pd.DataFrame items_df: DataFrame with items and their features defined
by item_id and the item feature columns.
"""
del users_df, items_df
# Shift item ids and user ids so that they are consecutive
unique_item_ids = interactions_df['item_id'].unique()
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
interactions_df = interactions_df.copy()
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
# Get the number of items and users
self.interactions_df = interactions_df
n_items = np.max(interactions_df['item_id']) + 1
n_users = np.max(interactions_df['user_id']) + 1
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
if self.collaboration_type == 'item':
r = r.T
self.r = r
# Calculate all similarities
similarities = None
if self.similarity_measure == 'cosine':
n_uv = np.matmul(r, r.T)
norms = np.sqrt(np.diag(n_uv))
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
elif self.similarity_measure == 'pearson':
r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)
n_uv = np.matmul(r_shifted, r_shifted.T)
norms = np.sqrt(np.diag(n_uv))
norms[norms == 0] = 0.000001
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
np.fill_diagonal(similarities, -1000)
self.similarities = similarities
# Find the most popular items for the cold start problem
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
offers_count = offers_count.sort_values('user_id', ascending=False)
self.most_popular_items = offers_count.index
def recommend(self, users_df, items_df, n_recommendations=1):
"""
Serving of recommendations. Scores items in items_df for each user in users_df and returns
top n_recommendations for each user.
:param pd.DataFrame users_df: DataFrame with users and their features for which
recommendations should be generated.
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
:param int n_recommendations: Number of recommendations to be returned for each user.
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
for each user.
:rtype: pd.DataFrame
"""
# Clean previous recommendations (iloc could be used alternatively)
self.recommender_df = self.recommender_df[:0]
# Handle users not in the training data
# Map item ids
items_df = items_df.copy()
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
# Generate recommendations
for idx, user in users_df.iterrows():
recommendations = []
user_id = user['user_id']
if user_id in self.user_id_mapping:
chosen_ids = []
scores = []
mapped_user_id = self.user_id_mapping[user_id]
if self.collaboration_type == 'user':
neighbor_ids = np.argsort(-self.similarities[mapped_user_id])[:self.n_neighbors]
user_similarities = self.similarities[mapped_user_id][neighbor_ids]
item_ids = items_df['item_id'].tolist()
v_i = self.r[neighbor_ids][:, item_ids]
scores = np.matmul(user_similarities, v_i) / np.sum(user_similarities)
# Choose n recommendations based on highest scores
if not self.should_recommend_already_bought:
x_list = self.interactions_df.loc[
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
scores[x_list] = -1e100
chosen_ids = np.argsort(-scores)[:n_recommendations]
elif self.collaboration_type == 'item':
x_list = self.interactions_df.loc[
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
scores = np.sum(self.similarities[x_list], axis=0)
# Choose n recommendations based on highest scores
if not self.should_recommend_already_bought:
scores[x_list] = -1e100
chosen_ids = np.argsort(-scores)[:n_recommendations]
for item_id in chosen_ids:
recommendations.append(
{
'user_id': self.user_id_reverse_mapping[mapped_user_id],
'item_id': self.item_id_reverse_mapping[item_id],
'score': scores[item_id]
}
)
else: # For new users recommend most popular items
for i in range(n_recommendations):
recommendations.append(
{
'user_id': user['user_id'],
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
'score': 1.0
}
)
user_recommendations = pd.DataFrame(recommendations)
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
return self.recommender_df
class UserBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
def __init__(self):
super().__init__()
self.collaboration_type = 'user'
self.similarity_measure = 'cosine'
class UserBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
def __init__(self):
super().__init__()
self.collaboration_type = 'user'
self.similarity_measure = 'pearson'
class ItemBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
def __init__(self):
super().__init__()
self.collaboration_type = 'item'
self.similarity_measure = 'cosine'
class ItemBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
def __init__(self):
super().__init__()
self.collaboration_type = 'item'
self.similarity_measure = 'pearson'
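
A minimal sketch with synthetic data: the subclasses above only preset the collaboration type and similarity measure, while `n_neighbors` and `should_recommend_already_bought` are passed through `initialize` (the import path is an assumption).

```python
import pandas as pd

from recommenders.nearest_neighbors_recommender import UserBasedCosineNearestNeighborsRecommender  # assumed module path

interactions_df = pd.DataFrame({
    "user_id": [1, 1, 2, 2, 3, 3],
    "item_id": [10, 20, 10, 30, 20, 30],
})
items_df = pd.DataFrame({"item_id": [10, 20, 30]})

recommender = UserBasedCosineNearestNeighborsRecommender()
recommender.initialize(n_neighbors=2, should_recommend_already_bought=False)
recommender.fit(interactions_df, None, items_df)
print(recommender.recommend(pd.DataFrame({"user_id": [1]}), items_df, n_recommendations=1))
```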


@@ -0,0 +1,305 @@
# Load libraries ---------------------------------------------
import pandas as pd
import numpy as np
import scipy.special as scisp
from livelossplot import PlotLosses
from collections import defaultdict, deque
from recommenders.recommender import Recommender
# ------------------------------------------------------------
class NetflixRecommender(Recommender):
"""
Collaborative filtering based on matrix factorization with the following choice of an optimizer:
- Stochastic Gradient Descent (SGD),
- Mini-Batch Gradient Descent (MBGD),
- Alternating Least Squares (ALS).
"""
def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):
super().__init__()
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
self.interactions_df = None
self.item_id_mapping = None
self.user_id_mapping = None
self.item_id_reverse_mapping = None
self.user_id_reverse_mapping = None
self.r = None
self.most_popular_items = None
self.n_neg_per_pos = n_neg_per_pos
if 'optimizer' in params:
self.optimizer = params['optimizer']
else:
self.optimizer = 'SGD'
if 'n_epochs' in params: # number of epochs (each epoch goes through the entire training set)
self.n_epochs = params['n_epochs']
else:
self.n_epochs = 10
if 'lr' in params: # learning rate
self.lr = params['lr']
else:
self.lr = 0.01
if 'reg_l' in params: # regularization coefficient
self.reg_l = params['reg_l']
else:
self.reg_l = 0.1
if 'embedding_dim' in params:
self.embedding_dim = params['embedding_dim']
else:
self.embedding_dim = 8
self.user_repr = None
self.item_repr = None
if 'should_recommend_already_bought' in params:
self.should_recommend_already_bought = params['should_recommend_already_bought']
else:
self.should_recommend_already_bought = False
self.validation_set_size = 0.2
self.seed = seed
self.rng = np.random.RandomState(seed=seed)
self.print_type = print_type
def fit(self, interactions_df, users_df, items_df):
"""
Training of the recommender.
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
defined by user_id, item_id and features of the interaction.
:param pd.DataFrame users_df: DataFrame with users and their features defined by
user_id and the user feature columns.
:param pd.DataFrame items_df: DataFrame with items and their features defined
by item_id and the item feature columns.
"""
del users_df, items_df
# Shift item ids and user ids so that they are consecutive
unique_item_ids = interactions_df['item_id'].unique()
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
interactions_df = interactions_df.copy()
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
# Get the number of items and users
self.interactions_df = interactions_df
n_users = np.max(interactions_df['user_id']) + 1
n_items = np.max(interactions_df['item_id']) + 1
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
self.r = r
# Generate negative interactions
negative_interactions = []
i = 0
while i < self.n_neg_per_pos * len(interactions_df):
sample_size = 1000
user_ids = self.rng.choice(np.arange(n_users), size=sample_size)
item_ids = self.rng.choice(np.arange(n_items), size=sample_size)
j = 0
while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):
if r[user_ids[j]][item_ids[j]] == 0:
negative_interactions.append([user_ids[j], item_ids[j], 0])
i += 1
j += 1
interactions_df = pd.concat(
[interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])
# Initialize user and item embeddings as random vectors (from Gaussian distribution)
self.user_repr = self.rng.normal(0, 1, size=(r.shape[0], self.embedding_dim))
self.item_repr = self.rng.normal(0, 1, size=(r.shape[1], self.embedding_dim))
# Initialize losses and loss visualization
if self.print_type is not None and self.print_type == 'live':
liveloss = PlotLosses()
training_losses = deque(maxlen=50)
training_avg_losses = []
training_epoch_losses = []
validation_losses = deque(maxlen=50)
validation_avg_losses = []
validation_epoch_losses = []
last_training_total_loss = 0.0
last_validation_total_loss = 0.0
# Split the data
interaction_ids = self.rng.permutation(len(interactions_df))
train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))
training_ids = interaction_ids[:train_validation_slice_idx]
validation_ids = interaction_ids[train_validation_slice_idx:]
# Train the model
for epoch in range(self.n_epochs):
if self.print_type is not None and self.print_type == 'live':
logs = {}
# Train
training_losses.clear()
training_total_loss = 0.0
batch_idx = 0
for idx in training_ids:
user_id = int(interactions_df.iloc[idx]['user_id'])
item_id = int(interactions_df.iloc[idx]['item_id'])
e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
self.user_repr[user_id] = self.user_repr[user_id] \
+ self.lr * (e_ui * self.item_repr[item_id] - self.reg_l * self.user_repr[user_id])
self.item_repr[item_id] = self.item_repr[item_id] \
+ self.lr * (e_ui * self.user_repr[user_id] - self.reg_l * self.item_repr[item_id])
loss = e_ui**2
training_total_loss += loss
if self.print_type is not None and self.print_type == 'text':
print("\rEpoch: {}\tBatch: {}\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}".format(
epoch, batch_idx, last_training_total_loss, last_validation_total_loss, loss), end="")
batch_idx += 1
training_losses.append(loss)
training_avg_losses.append(np.mean(training_losses))
# Validate
validation_losses.clear()
validation_total_loss = 0.0
for idx in validation_ids:
user_id = int(interactions_df.iloc[idx]['user_id'])
item_id = int(interactions_df.iloc[idx]['item_id'])
e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
loss = e_ui**2
validation_total_loss += loss
validation_losses.append(loss)
validation_avg_losses.append(np.mean(validation_losses))
# Save and print epoch losses
training_last_avg_loss = training_total_loss / len(training_ids)
training_epoch_losses.append(training_last_avg_loss)
validation_last_avg_loss = validation_total_loss / len(validation_ids)
validation_epoch_losses.append(validation_last_avg_loss)
if self.print_type is not None and self.print_type == 'live' and epoch >= 3:
# A bound on epoch prevents showing extremely high losses in the first epochs
# noinspection PyUnboundLocalVariable
logs['loss'] = training_last_avg_loss
logs['val_loss'] = validation_last_avg_loss
# noinspection PyUnboundLocalVariable
liveloss.update(logs)
liveloss.send()
# Find the most popular items for the cold start problem
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
offers_count = offers_count.sort_values('user_id', ascending=False)
self.most_popular_items = offers_count.index
def recommend(self, users_df, items_df, n_recommendations=1):
"""
Serving of recommendations. Scores items in items_df for each user in users_df and returns
top n_recommendations for each user.
:param pd.DataFrame users_df: DataFrame with users and their features for which
recommendations should be generated.
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
:param int n_recommendations: Number of recommendations to be returned for each user.
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
for each user.
:rtype: pd.DataFrame
"""
# Clean previous recommendations (iloc could be used alternatively)
self.recommender_df = self.recommender_df[:0]
# Handle users not in the training data
# Map item ids
items_df = items_df.copy()
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
# Generate recommendations
for idx, user in users_df.iterrows():
recommendations = []
user_id = user['user_id']
if user_id in self.user_id_mapping:
mapped_user_id = self.user_id_mapping[user_id]
ids_list = items_df['item_id'].tolist()
id_to_pos = np.array([0]*len(ids_list))
for k in range(len(ids_list)):
id_to_pos[ids_list[k]] = k
scores = np.matmul(self.user_repr[mapped_user_id].reshape(1, -1),
self.item_repr[ids_list].T).flatten()
# Choose n recommendations based on highest scores
if not self.should_recommend_already_bought:
x_list = self.interactions_df.loc[
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
scores[id_to_pos[x_list]] = -1e100
chosen_pos = np.argsort(-scores)[:n_recommendations]
for item_pos in chosen_pos:
recommendations.append(
{
'user_id': self.user_id_reverse_mapping[mapped_user_id],
'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],
'score': scores[item_pos]
}
)
else: # For new users recommend most popular items
for i in range(n_recommendations):
recommendations.append(
{
'user_id': user['user_id'],
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
'score': 1.0
}
)
user_recommendations = pd.DataFrame(recommendations)
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
return self.recommender_df
def get_user_repr(self, user_id):
mapped_user_id = self.user_id_mapping[user_id]
return self.user_repr[mapped_user_id]
def get_item_repr(self, item_id):
mapped_item_id = self.item_id_mapping[item_id]
return self.item_repr[mapped_item_id]
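
A minimal sketch with synthetic data: the training hyperparameters of the matrix factorization recommender above are passed as keyword arguments to the constructor, and the learned embeddings can be inspected afterwards (the import path is an assumption).

```python
import pandas as pd

from recommenders.netflix_recommender import NetflixRecommender  # assumed module path

interactions_df = pd.DataFrame({
    "user_id": [1, 1, 2, 2, 3, 3],
    "item_id": [10, 20, 10, 30, 20, 30],
})
items_df = pd.DataFrame({"item_id": [10, 20, 30]})

recommender = NetflixRecommender(seed=6789, n_neg_per_pos=3, print_type=None,
                                 n_epochs=5, lr=0.01, reg_l=0.1, embedding_dim=4)
recommender.fit(interactions_df, None, items_df)
print(recommender.recommend(pd.DataFrame({"user_id": [1]}), items_df, n_recommendations=1))
print(recommender.get_user_repr(1))  # learned embedding of user 1
```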


@@ -0,0 +1,52 @@
# Load libraries ---------------------------------------------
import pandas as pd
# ------------------------------------------------------------
class Recommender(object):
"""
Base recommender class.
"""
def __init__(self):
"""
Initialize base recommender params and variables.
"""
pass
def fit(self, interactions_df, users_df, items_df):
"""
Training of the recommender.
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
defined by user_id, item_id and features of the interaction.
:param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
:param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
"""
pass
def recommend(self, users_df, items_df, n_recommendations=1):
"""
Serving of recommendations. Scores items in items_df for each user in users_df and returns
top n_recommendations for each user.
:param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
:param int n_recommendations: Number of recommendations to be returned for each user.
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
for each user.
:rtype: pd.DataFrame
"""
recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
for ix, user in users_df.iterrows():
user_recommendations = pd.DataFrame({'user_id': user['user_id'],
'item_id': [-1] * n_recommendations,
'score': [3.0] * n_recommendations})
recommendations = pd.concat([recommendations, user_recommendations])
return recommendations


@@ -0,0 +1,102 @@
# Load libraries ---------------------------------------------
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from recommenders.recommender import Recommender
# ------------------------------------------------------------
class TFIDFRecommender(Recommender):
"""
Recommender based on the TF-IDF method.
"""
def __init__(self):
"""
Initialize base recommender params and variables.
"""
super().__init__()
self.tfidf_scores = None
def fit(self, interactions_df, users_df, items_df):
"""
Training of the recommender.
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
defined by user_id, item_id and features of the interaction.
:param pd.DataFrame users_df: DataFrame with users and their features defined by user_id
and the user feature columns.
:param pd.DataFrame items_df: DataFrame with items and their features defined by item_id
and the item feature columns.
"""
self.tfidf_scores = defaultdict(lambda: 0.0)
# Prepare the corpus for tfidf calculation
interactions_df = pd.merge(interactions_df, items_df, on='item_id')
user_genres = interactions_df.loc[:, ['user_id', 'genres']]
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("-", "_", regex=False)
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace(" ", "_", regex=False)
user_genres = user_genres.groupby('user_id').aggregate(lambda x: "|".join(x))
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("|", " ", regex=False)
user_ids = user_genres.index.tolist()
genres_corpus = user_genres['genres'].tolist()
# Calculate tf-idf scores
vectorizer = TfidfVectorizer()
tfidf_scores = vectorizer.fit_transform(genres_corpus)
# Transform results into a dict {(user_id, genre): score}
for u in range(tfidf_scores.shape[0]):
for g in range(tfidf_scores.shape[1]):
self.tfidf_scores[(user_ids[u], vectorizer.get_feature_names()[g])] = tfidf_scores[u, g]
def recommend(self, users_df, items_df, n_recommendations=1):
"""
Serving of recommendations. Scores items in items_df for each user in users_df and returns
top n_recommendations for each user.
:param pd.DataFrame users_df: DataFrame with users and their features for which recommendations
should be generated.
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
:param int n_recommendations: Number of recommendations to be returned for each user.
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
for each user.
:rtype: pd.DataFrame
"""
recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
# Transform genres to a unified form used by the vectorizer
items_df = items_df.copy()
items_df.loc[:, 'genres'] = items_df['genres'].str.replace("-", "_", regex=False)
items_df.loc[:, 'genres'] = items_df['genres'].str.replace(" ", "_", regex=False)
items_df.loc[:, 'genres'] = items_df['genres'].str.lower()
items_df.loc[:, 'genres'] = items_df['genres'].str.split("|")
# Score items
for uix, user in users_df.iterrows():
items = []
for iix, item in items_df.iterrows():
score = 0.0
for genre in item['genres']:
score += self.tfidf_scores[(user['user_id'], genre)]
score /= len(item['genres'])
items.append((item['item_id'], score))
items = sorted(items, key=lambda x: x[1], reverse=True)
user_recommendations = pd.DataFrame({'user_id': user['user_id'],
'item_id': [item[0] for item in items][:n_recommendations],
'score': [item[1] for item in items][:n_recommendations]})
recommendations = pd.concat([recommendations, user_recommendations])
return recommendations
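
A minimal sketch with synthetic movie-style data: unlike the hotel recommenders above, this one expects an items frame with a `genres` column of '|'-separated genre names (the import path is an assumption).

```python
import pandas as pd

from recommenders.tfidf_recommender import TFIDFRecommender  # assumed module path

interactions_df = pd.DataFrame({"user_id": [1, 1, 2], "item_id": [10, 20, 30]})
items_df = pd.DataFrame({
    "item_id": [10, 20, 30, 40],
    "genres": ["Comedy|Romance", "Comedy", "Horror", "Romance|Drama"],
})

recommender = TFIDFRecommender()
recommender.fit(interactions_df, None, items_df)
print(recommender.recommend(pd.DataFrame({"user_id": [1]}), items_df, n_recommendations=2))
```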