Add recommender with HR10 0.116 (commit 4cf2994aca)

.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
.ipynb_checkpoints
__pycache__
README.md (new file, 52 lines)
@@ -0,0 +1,52 @@
# Recommender Systems class - Project 2

## Preparing your system

1. Install [Docker](https://docs.docker.com/engine/install/).

2. Fork this repository to your GitHub account.

3. Run the Jupyter Docker image:

```bash
docker run \
    -d -p 8888:8888 \
    -v DIRECTORY:/home/jovyan/REK \
    --name REK \
    jupyter/minimal-notebook
```

Remember to change **DIRECTORY** to the directory where all the files can be found. You can use `$(pwd)` if your current directory is the right one.

4. Get the login link with the following command:

```bash
docker logs REK 2>&1 | grep -o 'http://127.0.0.1:8888.*' | tail -n1
```

Example output:

```
http://127.0.0.1:8888/?token=2bb816a4bc36a4bdbf64e0c9a89f336ae5404a01d15e442c
```

5. Prepare the conda environment:

```bash
docker exec REK bash -c "
    conda env create --name rs-class-env -f /home/jovyan/REK/environment.yml;
    python -m ipykernel install --user --name=rs-class-env"
```

6. You can start/stop the container whenever you want:

```bash
docker stop REK
docker start REK
```

If you want to start from scratch, you can remove the container:

```bash
docker stop REK
docker rm REK
```

Now you are ready to work!
data/hotel_data/hotel_data_interactions_df.csv (new file, 16103 lines; diff suppressed because it is too large)
data/hotel_data/hotel_data_original.csv (new file, 17251 lines; diff suppressed because it is too large)
data/hotel_data/hotel_data_preprocessed.csv (new file, 16103 lines; diff suppressed because it is too large)
data_preprocessing/__init__.py (new file, empty)

data_preprocessing/data_preprocessing_toolkit.py (new file, 278 lines)
@@ -0,0 +1,278 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from dateutil.easter import easter
|
||||
from data_preprocessing.dataset_specification import DatasetSpecification
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class DataPreprocessingToolkit(object):
|
||||
|
||||
def __init__(self):
|
||||
dataset_specification = DatasetSpecification()
|
||||
|
||||
self.sum_columns = dataset_specification.get_sum_columns()
|
||||
self.mean_columns = dataset_specification.get_mean_columns()
|
||||
self.mode_columns = dataset_specification.get_mode_columns()
|
||||
self.first_columns = dataset_specification.get_first_columns()
|
||||
|
||||
self.nights_buckets = dataset_specification.get_nights_buckets()
|
||||
self.npeople_buckets = dataset_specification.get_npeople_buckets()
|
||||
self.room_segment_buckets = dataset_specification.get_room_segment_buckets()
|
||||
|
||||
self.arrival_terms = dataset_specification.get_arrival_terms()
|
||||
|
||||
self.item_features_columns = dataset_specification.get_items_df_feature_columns()
|
||||
|
||||
# #########################
|
||||
# Entire datasets functions
|
||||
# #########################
|
||||
|
||||
def fix_date_to(self, df):
|
||||
df.loc[:, "date_to"] = df["date_to"].apply(lambda x: x + timedelta(days=1))
|
||||
return df
|
||||
|
||||
def add_length_of_stay(self, df):
|
||||
df.loc[:, "length_of_stay"] = (df["date_to"] - df["date_from"]).dt.days
|
||||
return df
|
||||
|
||||
def add_book_to_arrival(self, df):
|
||||
df.loc[:, "book_to_arrival"] = (df["date_from"] - df["booking_date"]).dt.days
|
||||
return df
|
||||
|
||||
def add_nrooms(self, df):
|
||||
df.loc[:, "n_rooms"] = 1
|
||||
return df
|
||||
|
||||
def add_weekend_stay(self, df):
|
||||
s = df["date_from"].dt.dayofweek
|
||||
e = df["date_to"].dt.dayofweek
|
||||
dt = (df["date_to"] - df["date_from"]).dt.days
|
||||
df.loc[:, "weekend_stay"] = (((s >= 4) & (s != 6)) | (e >= 5) | ((e < s) & (s != 6)) | (dt >= 6))
|
||||
df.loc[:, "weekend_stay"] = df["weekend_stay"].replace({True: 'True', False: 'False'})
|
||||
return df
|
||||
|
||||
def add_night_price(self, df):
|
||||
df.loc[:, "night_price"] = np.round(df["accomodation_price"] / df["length_of_stay"] / df["n_rooms"], 2)
|
||||
return df
|
||||
|
||||
def clip_book_to_arrival(self, df):
|
||||
df.loc[:, "book_to_arrival"] = np.maximum(df["book_to_arrival"], 0)
|
||||
return df
|
||||
|
||||
def sum_npeople(self, df):
|
||||
df.loc[:, "n_people"] = np.maximum(df["n_people"] + df["n_children_1"] + df["n_children_2"] + df["n_children_3"], 1)
|
||||
return df
|
||||
|
||||
def filter_out_company_clients(self, df):
|
||||
df = df.loc[df["is_company"] == 0]
|
||||
return df
|
||||
|
||||
def filter_out_long_stays(self, df):
|
||||
df = df.loc[df["length_of_stay"] <= 21]
|
||||
return df
|
||||
|
||||
def leave_one_from_group_reservations(self, df):
    unique_group_rows = []

    df.loc[:, "group_id"] = df["group_id"].fillna(-1)

    group_ids = []
    for idx, row in df.iterrows():
        if row["group_id"] != -1:
            if row["group_id"] not in group_ids:
                unique_group_rows.append(row)
                group_ids.append(row["group_id"])
        else:
            unique_group_rows.append(row)

    # Keep one reservation per group id plus all non-group reservations
    cleaned_dataset = pd.DataFrame(unique_group_rows, columns=df.columns)

    return cleaned_dataset
|
||||
|
||||
def aggregate_group_reservations(self, df):
|
||||
non_group_reservations = df.loc[df["group_id"] == "",
|
||||
self.sum_columns + self.mean_columns + self.mode_columns + self.first_columns]
|
||||
group_reservations = df.loc[df["group_id"] != ""]
|
||||
|
||||
agg_datasets = [group_reservations.loc[:, ["group_id"] + self.sum_columns].groupby("group_id").sum(),
|
||||
group_reservations.loc[:, ["group_id"] + self.mean_columns].groupby("group_id").mean(),
|
||||
group_reservations.loc[:, ["group_id"] + self.mode_columns].groupby("group_id").agg(lambda x: x.value_counts().index[0]),
|
||||
group_reservations.loc[:, ["group_id"] + self.first_columns].groupby("group_id").first()]
|
||||
|
||||
group_reservations = agg_datasets[0]
|
||||
for i in range(1, len(agg_datasets)):
|
||||
group_reservations = group_reservations.merge(agg_datasets[i], on="group_id")
|
||||
|
||||
group_reservations = group_reservations.reset_index(drop=True)
|
||||
|
||||
df = pd.concat([non_group_reservations, group_reservations])
|
||||
|
||||
return df
|
||||
|
||||
def leave_only_ota(self, df):
|
||||
df = df.loc[df.loc[:, "Source"].apply(lambda x: "booking" in x.lower() or "expedia" in x.lower())]
|
||||
return df
|
||||
|
||||
def map_date_to_term_datasets(self, df):
|
||||
df.loc[:, "date_from"] = df["date_from"].astype(str).apply(lambda x: x[:10])
|
||||
df.loc[:, 'term'] = df['date_from'].apply(lambda x: self.map_date_to_term(x))
|
||||
return df
|
||||
|
||||
def map_length_of_stay_to_nights_buckets(self, df):
|
||||
df.loc[:, 'length_of_stay_bucket'] = df['length_of_stay'].apply(lambda x: self.map_value_to_bucket(x, self.nights_buckets))
|
||||
return df
|
||||
|
||||
def map_night_price_to_room_segment_buckets(self, df):
|
||||
night_prices = df.loc[df['accomodation_price'] > 1]\
|
||||
.groupby('room_group_id')['night_price'].mean().reset_index()
|
||||
night_prices.columns = ['room_group_id', 'room_night_price']
|
||||
df = pd.merge(df, night_prices, on=['room_group_id'], how='left')
|
||||
df.loc[df['room_night_price'].isnull(), 'room_night_price'] = 0.0
|
||||
df.loc[:, 'room_segment'] = df['room_night_price'].apply(
|
||||
lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
|
||||
df = df.drop(columns=['room_night_price'])
|
||||
return df
|
||||
|
||||
# def map_night_price_to_room_segment_buckets(self, df):
|
||||
# night_prices = df.loc[df['accomodation_price'] > 1]\
|
||||
# .groupby(['term', 'room_group_id'])['night_price'].mean().reset_index()
|
||||
# night_prices.columns = ['term', 'room_group_id', 'termnight_price']
|
||||
# df = pd.merge(df, night_prices, on=['term', 'room_group_id'], how='left')
|
||||
# df.loc[:, 'room_segment'] = df['termnight_price'].apply(
|
||||
# lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
|
||||
# df = df.drop(columns=['termnight_price'])
|
||||
# return df
|
||||
|
||||
def map_npeople_to_npeople_buckets(self, df):
|
||||
df.loc[:, 'n_people_bucket'] = df['n_people'].apply(lambda x: self.map_value_to_bucket(x, self.npeople_buckets))
|
||||
return df
|
||||
|
||||
def map_item_to_item_id(self, df):
|
||||
df.loc[:, 'item'] = df[self.item_features_columns].astype(str).agg(' '.join, axis=1)
|
||||
|
||||
ids = df['item'].unique().tolist()
|
||||
mapping = {ids[i]: i for i in range(len(ids))}
|
||||
|
||||
df['item_id'] = df['item'].apply(lambda x: mapping[x])
|
||||
|
||||
return df
|
||||
|
||||
def add_interaction_id(self, df):
|
||||
df.loc[:, 'interaction_id'] = range(df.shape[0])
|
||||
return df
|
||||
|
||||
# ################
|
||||
# Column functions
|
||||
# ################
|
||||
|
||||
def bundle_period(self, diff):
|
||||
diff = float(diff)
|
||||
if int(diff) < 0:
|
||||
return "<0"
|
||||
elif int(diff) <= 7:
|
||||
return diff
|
||||
elif 7 < int(diff) <= 14:
|
||||
return "<14"
|
||||
elif 14 < int(diff) <= 30:
|
||||
return "<30"
|
||||
elif 30 < int(diff) <= 60:
|
||||
return "<60"
|
||||
elif 60 < int(diff) <= 180:
|
||||
return "<180"
|
||||
elif int(diff) > 180:
|
||||
return ">180"
|
||||
|
||||
def bundle_price(self, price):
|
||||
mod = 300.0
|
||||
return int((price + mod / 2) / mod) * mod
|
||||
|
||||
def map_date_to_season(self, date):
|
||||
day = int(date[8:10])
|
||||
month = int(date[5:7])
|
||||
if (month == 12 and day >= 21) or (month == 1) or (month == 2) or (month == 3 and day <= 19):
|
||||
return "Winter"
|
||||
if (month == 3 and day >= 20) or (month == 4) or (month == 5) or (month == 6 and day <= 20):
|
||||
return "Spring"
|
||||
if (month == 6 and day >= 21) or (month == 7) or (month == 8) or (month == 9 and day <= 22):
|
||||
return "Summer"
|
||||
if (month == 9 and day >= 23) or (month == 10) or (month == 11) or (month == 12 and day <= 20):
|
||||
return "Autumn"
|
||||
|
||||
def map_value_to_bucket(self, value, buckets):
|
||||
if value == "":
|
||||
return str(buckets[0]).replace(", ", "-")
|
||||
for bucket in buckets:
|
||||
if bucket[0] <= value <= bucket[1]:
|
||||
return str(bucket).replace(", ", "-")
|
||||
|
||||
def map_date_to_term(self, date):
|
||||
|
||||
m = int(date[5:7])
|
||||
d = int(date[8:10])
|
||||
term = None
|
||||
|
||||
for arrival_term in self.arrival_terms:
|
||||
if arrival_term == "Easter":
|
||||
year = int(date[:4])
|
||||
easter_date = easter(year)
|
||||
easter_start = easter_date + timedelta(days=-4)
|
||||
easter_end = easter_date + timedelta(days=1)
|
||||
esm = easter_start.month
|
||||
esd = easter_start.day
|
||||
eem = easter_end.month
|
||||
eed = easter_end.day
|
||||
if ((m > esm) or (m == esm and d >= esd)) and ((m < eem) or (m == eem and d <= eed)):
|
||||
term = arrival_term
|
||||
break
|
||||
|
||||
elif arrival_term == "NewYear":
|
||||
sm = self.arrival_terms[arrival_term][0]["start"]["m"]
|
||||
sd = self.arrival_terms[arrival_term][0]["start"]["d"]
|
||||
em = self.arrival_terms[arrival_term][0]["end"]["m"]
|
||||
ed = self.arrival_terms[arrival_term][0]["end"]["d"]
|
||||
if ((m > sm) or (m == sm and d >= sd)) or ((m < em) or (m == em and d <= ed)):
|
||||
term = arrival_term
|
||||
break
|
||||
|
||||
else:
|
||||
is_match = False
|
||||
|
||||
for i in range(len(self.arrival_terms[arrival_term])):
|
||||
sm = self.arrival_terms[arrival_term][i]["start"]["m"]
|
||||
sd = self.arrival_terms[arrival_term][i]["start"]["d"]
|
||||
em = self.arrival_terms[arrival_term][i]["end"]["m"]
|
||||
ed = self.arrival_terms[arrival_term][i]["end"]["d"]
|
||||
if ((m > sm) or (m == sm and d >= sd)) and ((m < em) or (m == em and d <= ed)):
|
||||
term = arrival_term
|
||||
is_match = True
|
||||
break
|
||||
|
||||
if is_match:
|
||||
break
|
||||
|
||||
return term
|
||||
|
||||
def map_dates_to_terms(self, dates):
|
||||
|
||||
terms = []
|
||||
for date in dates:
|
||||
term = self.map_date_to_term(date)
|
||||
terms.append(term)
|
||||
|
||||
return terms
|
||||
|
||||
def filter_out_historical_dates(self, date_list):
|
||||
"""
|
||||
Filters out past dates from a list of dates.
|
||||
"""
|
||||
future_dates = []
|
||||
|
||||
for date in date_list:
|
||||
if date >= datetime.now():
|
||||
future_dates.append(date.strftime("%Y-%m-%d"))
|
||||
|
||||
return future_dates
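The preprocessing methods above all take and return a DataFrame, so they are meant to be chained. A minimal usage sketch on a toy reservations frame (the column names, including the "accomodation_price" spelling, follow the committed code; the values and the assumption that the repository root is on the Python path are mine):

```python
import pandas as pd

from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit

# Toy reservations with the columns the toolkit expects.
raw_df = pd.DataFrame({
    "date_from": pd.to_datetime(["2021-07-02", "2021-07-09"]),
    "date_to": pd.to_datetime(["2021-07-04", "2021-07-10"]),
    "booking_date": pd.to_datetime(["2021-06-20", "2021-07-01"]),
    "accomodation_price": [720.0, 350.0],
})

dpt = DataPreprocessingToolkit()
df = dpt.fix_date_to(raw_df)       # make the checkout date exclusive (+1 day)
df = dpt.add_length_of_stay(df)    # 3 and 2 nights
df = dpt.add_book_to_arrival(df)
df = dpt.add_nrooms(df)
df = dpt.add_weekend_stay(df)
df = dpt.add_night_price(df)       # accomodation_price / length_of_stay / n_rooms

print(df[["length_of_stay", "book_to_arrival", "weekend_stay", "night_price"]])
```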
data_preprocessing/dataset_specification.py (new file, 88 lines)
@@ -0,0 +1,88 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class DatasetSpecification(object):
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
# ################
|
||||
# Original dataset functions
|
||||
# ################
|
||||
|
||||
def get_sum_columns(self):
|
||||
return ["n_people", "n_children_1", "n_children_2", "n_children_3", "accomodation_price", "meal_price",
|
||||
"service_price", "paid", "n_rooms"]
|
||||
|
||||
def get_mean_columns(self):
|
||||
return ['discount']
|
||||
|
||||
def get_mode_columns(self):
|
||||
return ["room_id", "room_group_id", "date_from", "date_to", "booking_date", "rate_plan",
|
||||
"length_of_stay", "book_to_arrival", "weekend_stay"]
|
||||
|
||||
def get_first_columns(self):
|
||||
return ["user_id", "client_id", "client_name", "email", "phone", "is_company"]
|
||||
|
||||
def get_id_columns(self):
|
||||
return ["client_id", "client_name", "email", "phone"]
|
||||
|
||||
# ################
|
||||
# Output dataset functions
|
||||
# ################
|
||||
|
||||
def get_people_df_id_columns(self):
|
||||
return ['user_id']
|
||||
|
||||
def get_people_df_feature_columns(self):
|
||||
return []
|
||||
|
||||
def get_items_df_id_columns(self):
|
||||
return ['item_id']
|
||||
|
||||
def get_items_df_feature_columns(self):
|
||||
return ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']
|
||||
|
||||
def get_purchases_df_id_columns(self):
|
||||
return ['user_id', 'item_id']
|
||||
|
||||
def get_purchases_df_feature_columns(self):
|
||||
return []
|
||||
|
||||
# ################
|
||||
# Mapping functions
|
||||
# ################
|
||||
|
||||
def get_nights_buckets(self):
|
||||
return [[0, 1], [2, 3], [4, 7], [8, np.inf]]
|
||||
|
||||
def get_npeople_buckets(self):
|
||||
return [[1, 1], [2, 2], [3, 4], [5, np.inf]]
|
||||
|
||||
def get_room_segment_buckets(self):
|
||||
return [[0, 160], [160, 260], [260, 360], [360, 500], [500, 900], [900, np.inf]]
|
||||
|
||||
def get_book_to_arrival_buckets(self):
|
||||
return [[0, 0], [1, 2], [3, 4], [5, 7], [8, 14], [15, 30], [31, 60], [61, 90], [91, 180], [181, np.inf]]
|
||||
|
||||
def get_arrival_terms(self):
|
||||
arrival_terms = {"Easter": [{"start": {"m": np.nan, "d": np.nan}, "end": {"m": np.nan, "d": np.nan}}],
|
||||
# Treated with priority
|
||||
"Christmas": [{"start": {"m": 12, "d": 22}, "end": {"m": 12, "d": 27}}],
|
||||
"NewYear": [{"start": {"m": 12, "d": 28}, "end": {"m": 1, "d": 4}}],
|
||||
"WinterVacation": [{"start": {"m": 1, "d": 5}, "end": {"m": 2, "d": 29}}],
|
||||
"OffSeason": [
|
||||
{"start": {"m": 3, "d": 1}, "end": {"m": 4, "d": 27}},
|
||||
{"start": {"m": 5, "d": 6}, "end": {"m": 6, "d": 20}},
|
||||
{"start": {"m": 9, "d": 26}, "end": {"m": 12, "d": 21}}],
|
||||
"MayLongWeekend": [{"start": {"m": 4, "d": 28}, "end": {"m": 5, "d": 5}}],
|
||||
"LowSeason": [
|
||||
{"start": {"m": 6, "d": 21}, "end": {"m": 7, "d": 10}},
|
||||
{"start": {"m": 8, "d": 23}, "end": {"m": 9, "d": 25}}],
|
||||
"HighSeason": [{"start": {"m": 7, "d": 11}, "end": {"m": 8, "d": 22}}]}
|
||||
return arrival_terms
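The bucket definitions above pair with DataPreprocessingToolkit.map_value_to_bucket. A small illustration with toy values (assuming the repository root is on the Python path):

```python
from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit
from data_preprocessing.dataset_specification import DatasetSpecification

spec = DatasetSpecification()
dpt = DataPreprocessingToolkit()

# Buckets are rendered as strings, e.g. [3, 4] becomes "[3-4]".
print(dpt.map_value_to_bucket(3, spec.get_npeople_buckets()))           # [3-4]
print(dpt.map_value_to_bucket(275.0, spec.get_room_segment_buckets()))  # [260-360]
```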
data_preprocessing/people_identifier.py (new file, 77 lines)
@@ -0,0 +1,77 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class PeopleIdentifier(object):
|
||||
|
||||
def __init__(self):
|
||||
self.id_column_names = []
|
||||
self.pid_cname = ""
|
||||
self.next_available_pid = 0
|
||||
self.cid_to_pid = {} # {"col1": {cid1: pid1, cid2: pid2}, "col2":...}
|
||||
self.pid_to_cid = {} # {pid1: {"col1": set(cid1, cid2, ...), "col2": set(...), ...}, pid2: ...}
|
||||
self.data = None
|
||||
|
||||
def add_pid(self, data, id_column_names, pid_cname):
|
||||
self.id_column_names = id_column_names
|
||||
self.pid_cname = pid_cname
|
||||
|
||||
for cid_cname in id_column_names:
|
||||
self.cid_to_pid[cid_cname] = {}
|
||||
|
||||
for idx, reservation in data.iterrows():
|
||||
pids = set()
|
||||
for cid_cname in id_column_names:
|
||||
if reservation[cid_cname] in self.cid_to_pid[cid_cname]:
|
||||
pids.add(self.cid_to_pid[cid_cname][reservation[cid_cname]])
|
||||
# print(cid_cname, reservation[cid_cname], self.cid_to_pid[cid_cname][reservation[cid_cname]])
|
||||
|
||||
if len(pids) > 0:
|
||||
min_pid = min(pids)
|
||||
|
||||
self.set_pid(min_pid, reservation)
|
||||
|
||||
# Merge pids connected through this node
|
||||
|
||||
if len(pids) > 1:
|
||||
pids.remove(min_pid)
|
||||
self.merge_pids(pids, min_pid)
|
||||
|
||||
# print("Chosen pid: {}".format(min_pid))
|
||||
else:
|
||||
new_pid = self.next_available_pid
|
||||
self.next_available_pid += 1
|
||||
|
||||
self.set_pid(new_pid, reservation)
|
||||
# print("Chosen pid: {}".format(new_pid))
|
||||
|
||||
# print("=======")
|
||||
# print(self.pid_to_cid)
|
||||
# print("=======")
|
||||
|
||||
data_pid = data.copy()
|
||||
data_pid.loc[:, pid_cname] = data_pid.loc[:, id_column_names[0]].apply(lambda x: self.cid_to_pid[id_column_names[0]][x])
|
||||
self.data = data_pid
|
||||
|
||||
return data_pid
|
||||
|
||||
def set_pid(self, pid, reservation):
|
||||
for cid_cname in self.id_column_names:
|
||||
if reservation[cid_cname] != "":
|
||||
self.cid_to_pid[cid_cname][reservation[cid_cname]] = pid
|
||||
if pid in self.pid_to_cid:
|
||||
for cid_cname in self.id_column_names:
|
||||
self.pid_to_cid[pid][cid_cname] |= {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
|
||||
else:
|
||||
self.pid_to_cid[pid] = {cid_cname: {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
|
||||
for cid_cname in self.id_column_names}
|
||||
|
||||
def merge_pids(self, pids_from, pid_to):
|
||||
# print("Merge pids", pids_from, pid_to, self.pid_to_cid)
|
||||
for pid_from in pids_from:
|
||||
for cid_cname in self.id_column_names:
|
||||
for cid in self.pid_to_cid[pid_from][cid_cname]:
|
||||
self.cid_to_pid[cid_cname][cid] = pid_to
|
||||
self.pid_to_cid[pid_to][cid_cname] |= self.pid_to_cid[pid_from][cid_cname]
|
||||
self.pid_to_cid.pop(pid_from)
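A short sketch of how add_pid resolves reservations to person ids via shared identifier values (toy data; the e-mail and phone values are made up, the column names only illustrate the id_column_names argument):

```python
import pandas as pd

from data_preprocessing.people_identifier import PeopleIdentifier

reservations = pd.DataFrame({
    'email': ['a@example.com', 'a@example.com', 'b@example.com'],
    'phone': ['111', '', '222'],
})

pi = PeopleIdentifier()
with_pid = pi.add_pid(reservations, ['email', 'phone'], 'user_id')

# The first two rows share an e-mail address, so they get the same person id.
print(with_pid['user_id'].tolist())  # [0, 0, 1]
```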
environment.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: rs-class-env
channels:
  - defaults
dependencies:
  - pip=21.0.1
  - python=3.8.8
  - numpy==1.20.1
  - matplotlib==3.3.2
  - ipykernel==5.5.0
  - pandas==1.2.3
  - hyperopt==0.2.5
  - seaborn==0.11.1
  - pip:
    - sklearn==0.0
    - torch==1.8.0
    - livelossplot==0.5.4
evaluation_and_testing/__init__.py (new file, empty)

evaluation_and_testing/evaluation_measures.py (new file, 89 lines)
@@ -0,0 +1,89 @@
# Load libraries ---------------------------------------------

import numpy as np
import pandas as pd
from collections import defaultdict

# ------------------------------------------------------------


def rmse(r_pred, r_real):
    return np.sqrt(np.sum(np.power(r_pred - r_real, 2)) / len(r_pred))


def mape(r_pred, r_real):
    return 1 / len(r_pred) * np.sum(np.abs(r_pred - r_real) / np.abs(r_real))


def tre(r_pred, r_real):
    return np.sum(np.abs(r_pred - r_real)) / np.sum(np.abs(r_real))


def hr(recommendations, real_interactions, n=1):
    """
    Assumes recommendations are ordered by user_id and then by score.

    :param pd.DataFrame recommendations:
    :param pd.DataFrame real_interactions:
    :param int n:
    """
    # Transform real_interactions to a dict for a large speed-up
    rui = defaultdict(lambda: 0)

    for idx, row in real_interactions.iterrows():
        rui[(row['user_id'], row['item_id'])] = 1

    result = 0.0

    previous_user_id = -1
    rank = 0
    for idx, row in recommendations.iterrows():
        if previous_user_id == row['user_id']:
            rank += 1
        else:
            rank = 1

        if rank <= n:
            result += rui[(row['user_id'], row['item_id'])]

        previous_user_id = row['user_id']

    if len(recommendations['user_id'].unique()) > 0:
        result /= len(recommendations['user_id'].unique())

    return result


def ndcg(recommendations, real_interactions, n=1):
    """
    Assumes recommendations are ordered by user_id and then by score.

    :param pd.DataFrame recommendations:
    :param pd.DataFrame real_interactions:
    :param int n:
    """
    # Transform real_interactions to a dict for a large speed-up
    rui = defaultdict(lambda: 0)

    for idx, row in real_interactions.iterrows():
        rui[(row['user_id'], row['item_id'])] = 1

    result = 0.0

    previous_user_id = -1
    rank = 0
    for idx, row in recommendations.iterrows():
        if previous_user_id == row['user_id']:
            rank += 1
        else:
            rank = 1

        if rank <= n:
            result += rui[(row['user_id'], row['item_id'])] / np.log2(1 + rank)

        previous_user_id = row['user_id']

    if len(recommendations['user_id'].unique()) > 0:
        result /= len(recommendations['user_id'].unique())

    return result
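A small worked example of the two ranking metrics (toy data; as the docstrings state, the recommendations must already be sorted by user_id and, within a user, by descending score):

```python
import pandas as pd

from evaluation_and_testing.evaluation_measures import hr, ndcg

# Two users, two recommendations each, ordered by user_id and then by score.
recommendations = pd.DataFrame({
    'user_id': [1, 1, 2, 2],
    'item_id': [10, 11, 10, 12],
    'score':   [0.9, 0.5, 0.8, 0.6],
})
real_interactions = pd.DataFrame({'user_id': [1, 2], 'item_id': [11, 12]})

print(hr(recommendations, real_interactions, n=2))    # 1.0, both users hit within the top 2
print(ndcg(recommendations, real_interactions, n=2))  # ~0.63, both hits are at rank 2 and discounted by 1/log2(3)
```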
evaluation_and_testing/testing.py (new file, 209 lines)
@@ -0,0 +1,209 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
from evaluation_and_testing.evaluation_measures import rmse
|
||||
from evaluation_and_testing.evaluation_measures import mape
|
||||
from evaluation_and_testing.evaluation_measures import tre
|
||||
from evaluation_and_testing.evaluation_measures import hr
|
||||
from evaluation_and_testing.evaluation_measures import ndcg
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
def evaluate_train_test_split_explicit(recommender, interactions_df, items_df, seed=6789):
|
||||
rng = np.random.RandomState(seed=seed)
|
||||
|
||||
if isinstance(interactions_df, dict):
|
||||
# If interactions_df is a dict with already split data, use the split
|
||||
interactions_df_train = interactions_df['train']
|
||||
interactions_df_test = interactions_df['test']
|
||||
else:
|
||||
# Otherwise split the dataset into train and test
|
||||
|
||||
shuffle = np.arange(len(interactions_df))
|
||||
rng.shuffle(shuffle)
|
||||
shuffle = list(shuffle)
|
||||
|
||||
train_test_split = 0.8
|
||||
split_index = int(len(interactions_df) * train_test_split)
|
||||
|
||||
interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
|
||||
interactions_df_test = interactions_df.iloc[shuffle[split_index:]]
|
||||
|
||||
# Train the recommender
|
||||
|
||||
recommender.fit(interactions_df_train, None, items_df)
|
||||
|
||||
# Gather predictions
|
||||
|
||||
r_pred = []
|
||||
|
||||
for idx, row in interactions_df_test.iterrows():
|
||||
users_df = pd.DataFrame([row['user_id']], columns=['user_id'])
|
||||
eval_items_df = pd.DataFrame([row['item_id']], columns=['item_id'])
|
||||
eval_items_df = pd.merge(eval_items_df, items_df, on='item_id')
|
||||
recommendations = recommender.recommend(users_df, eval_items_df, n_recommendations=1)
|
||||
|
||||
r_pred.append(recommendations.iloc[0]['score'])
|
||||
|
||||
# Gather real ratings
|
||||
|
||||
r_real = np.array(interactions_df_test['rating'].tolist())
|
||||
|
||||
# Return evaluation metrics
|
||||
|
||||
return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)
|
||||
|
||||
|
||||
def evaluate_train_test_split_implicit(recommender, interactions_df, items_df, seed=6789):
|
||||
# Write your code here
|
||||
rng = np.random.RandomState(seed=seed)
|
||||
|
||||
if isinstance(interactions_df, dict):
|
||||
# If interactions_df is a dict with already split data, use the split
|
||||
interactions_df_train = interactions_df['train']
|
||||
interactions_df_test = interactions_df['test']
|
||||
else:
|
||||
# Otherwise split the dataset into train and test
|
||||
|
||||
shuffle = np.arange(len(interactions_df))
|
||||
rng.shuffle(shuffle)
|
||||
shuffle = list(shuffle)
|
||||
|
||||
train_test_split = 0.8
|
||||
split_index = int(len(interactions_df) * train_test_split)
|
||||
|
||||
interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
|
||||
interactions_df_test = interactions_df.iloc[shuffle[split_index:]]
|
||||
|
||||
hr_1 = []
|
||||
hr_3 = []
|
||||
hr_5 = []
|
||||
hr_10 = []
|
||||
ndcg_1 = []
|
||||
ndcg_3 = []
|
||||
ndcg_5 = []
|
||||
ndcg_10 = []
|
||||
|
||||
# Train the recommender
|
||||
|
||||
recommender.fit(interactions_df_train, None, items_df)
|
||||
|
||||
# Make recommendations for each user in the test set and calculate the metric
|
||||
# against all items of that user in the test set
|
||||
|
||||
test_user_interactions = interactions_df_test.groupby(by='user_id')
|
||||
|
||||
for user_id, user_interactions in test_user_interactions:
|
||||
|
||||
recommendations = recommender.recommend(pd.DataFrame([user_id], columns=['user_id']),
|
||||
items_df, n_recommendations=10)
|
||||
|
||||
hr_1.append(hr(recommendations, user_interactions, n=1))
|
||||
hr_3.append(hr(recommendations, user_interactions, n=3))
|
||||
hr_5.append(hr(recommendations, user_interactions, n=5))
|
||||
hr_10.append(hr(recommendations, user_interactions, n=10))
|
||||
ndcg_1.append(ndcg(recommendations, user_interactions, n=1))
|
||||
ndcg_3.append(ndcg(recommendations, user_interactions, n=3))
|
||||
ndcg_5.append(ndcg(recommendations, user_interactions, n=5))
|
||||
ndcg_10.append(ndcg(recommendations, user_interactions, n=10))
|
||||
|
||||
hr_1 = np.mean(hr_1)
|
||||
hr_3 = np.mean(hr_3)
|
||||
hr_5 = np.mean(hr_5)
|
||||
hr_10 = np.mean(hr_10)
|
||||
ndcg_1 = np.mean(ndcg_1)
|
||||
ndcg_3 = np.mean(ndcg_3)
|
||||
ndcg_5 = np.mean(ndcg_5)
|
||||
ndcg_10 = np.mean(ndcg_10)
|
||||
|
||||
return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10
|
||||
|
||||
|
||||
def evaluate_leave_one_out_explicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
|
||||
rng = np.random.RandomState(seed=seed)
|
||||
|
||||
# Prepare splits of the datasets
|
||||
kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)
|
||||
|
||||
# For each split of the dataset train the recommender, generate recommendations and evaluate
|
||||
|
||||
r_pred = []
|
||||
r_real = []
|
||||
n_eval = 1
|
||||
for train_index, test_index in kf.split(interactions_df.index):
|
||||
interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
|
||||
interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]
|
||||
|
||||
recommender.fit(interactions_df_train, None, items_df)
|
||||
recommendations = recommender.recommend(
|
||||
interactions_df_test.loc[:, ['user_id']],
|
||||
items_df.loc[items_df['item_id'] == interactions_df_test.iloc[0]['item_id']])
|
||||
|
||||
r_pred.append(recommendations.iloc[0]['score'])
|
||||
r_real.append(interactions_df_test.iloc[0]['rating'])
|
||||
|
||||
if n_eval == max_evals:
|
||||
break
|
||||
n_eval += 1
|
||||
|
||||
r_pred = np.array(r_pred)
|
||||
r_real = np.array(r_real)
|
||||
|
||||
# Return evaluation metrics
|
||||
|
||||
return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)
|
||||
|
||||
|
||||
def evaluate_leave_one_out_implicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
|
||||
rng = np.random.RandomState(seed=seed)
|
||||
|
||||
# Prepare splits of the datasets
|
||||
kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)
|
||||
|
||||
hr_1 = []
|
||||
hr_3 = []
|
||||
hr_5 = []
|
||||
hr_10 = []
|
||||
ndcg_1 = []
|
||||
ndcg_3 = []
|
||||
ndcg_5 = []
|
||||
ndcg_10 = []
|
||||
|
||||
# For each split of the dataset train the recommender, generate recommendations and evaluate
|
||||
|
||||
n_eval = 1
|
||||
for train_index, test_index in kf.split(interactions_df.index):
|
||||
interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
|
||||
interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]
|
||||
|
||||
recommender.fit(interactions_df_train, None, items_df)
|
||||
recommendations = recommender.recommend(
|
||||
interactions_df_test.loc[:, ['user_id']], items_df, n_recommendations=10)
|
||||
|
||||
hr_1.append(hr(recommendations, interactions_df_test, n=1))
|
||||
hr_3.append(hr(recommendations, interactions_df_test, n=3))
|
||||
hr_5.append(hr(recommendations, interactions_df_test, n=5))
|
||||
hr_10.append(hr(recommendations, interactions_df_test, n=10))
|
||||
ndcg_1.append(ndcg(recommendations, interactions_df_test, n=1))
|
||||
ndcg_3.append(ndcg(recommendations, interactions_df_test, n=3))
|
||||
ndcg_5.append(ndcg(recommendations, interactions_df_test, n=5))
|
||||
ndcg_10.append(ndcg(recommendations, interactions_df_test, n=10))
|
||||
|
||||
if n_eval == max_evals:
|
||||
break
|
||||
n_eval += 1
|
||||
|
||||
hr_1 = np.mean(hr_1)
|
||||
hr_3 = np.mean(hr_3)
|
||||
hr_5 = np.mean(hr_5)
|
||||
hr_10 = np.mean(hr_10)
|
||||
ndcg_1 = np.mean(ndcg_1)
|
||||
ndcg_3 = np.mean(ndcg_3)
|
||||
ndcg_5 = np.mean(ndcg_5)
|
||||
ndcg_10 = np.mean(ndcg_10)
|
||||
|
||||
return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10
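The HR@10 of 0.116 in the commit message presumably comes from running one of these evaluation routines on the hotel interactions data. A sketch of such a run, assuming the committed interactions CSV contains user_id, item_id and the item feature columns from DatasetSpecification (an assumption about the file's schema, not something visible in this diff):

```python
import pandas as pd

from evaluation_and_testing.testing import evaluate_train_test_split_implicit
from recommenders.amazon_recommender import AmazonRecommender

# Assumed schema: user_id, item_id plus the item feature columns.
interactions_df = pd.read_csv('data/hotel_data/hotel_data_interactions_df.csv')
items_df = interactions_df[['item_id', 'term', 'length_of_stay_bucket', 'rate_plan',
                            'room_segment', 'n_people_bucket', 'weekend_stay']].drop_duplicates()

results = evaluate_train_test_split_implicit(AmazonRecommender(), interactions_df, items_df)
print(dict(zip(['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'], results)))
```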
project_1_data_preparation.html (new file, 14586 lines; diff suppressed because one or more lines are too long)
project_1_data_preparation.ipynb (new file, 2186 lines; diff suppressed because it is too large)
project_2_recommender_and_evaluation-0_116.ipynb (new file, 1873 lines; diff suppressed because one or more lines are too long)
project_2_recommender_and_evaluation-Copy1.ipynb (new file, 1687 lines; diff suppressed because it is too large)
project_2_recommender_and_evaluation-Copy2.ipynb (new file, 1979 lines; diff suppressed because one or more lines are too long)
project_2_recommender_and_evaluation.ipynb (new file, 1890 lines; diff suppressed because one or more lines are too long)
recommenders/__init__.py (new file, empty)

recommenders/amazon_recommender.py (new file, 231 lines)
@@ -0,0 +1,231 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import scipy.special as scisp
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class AmazonRecommender(Recommender):
|
||||
"""
|
||||
Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:
|
||||
- Linden G., Smith B., York J., Amazon.com Recommendations: Item-to-Item Collaborative Filtering,
|
||||
IEEE Internet Computing, 2003,
|
||||
- Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
self.interactions_df = None
|
||||
self.item_id_mapping = None
|
||||
self.user_id_mapping = None
|
||||
self.item_id_reverse_mapping = None
|
||||
self.user_id_reverse_mapping = None
|
||||
self.e_xy = None
|
||||
self.n_xy = None
|
||||
self.scores = None
|
||||
self.most_popular_items = None
|
||||
self.should_recommend_already_bought = False
|
||||
|
||||
def initialize(self, **params):
|
||||
if 'should_recommend_already_bought' in params:
|
||||
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||
user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||
by item_id and the item feature columns.
|
||||
"""
|
||||
|
||||
# Shift item ids and user ids so that they are consecutive
|
||||
|
||||
unique_item_ids = interactions_df['item_id'].unique()
|
||||
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||
unique_user_ids = interactions_df['user_id'].unique()
|
||||
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||
|
||||
interactions_df = interactions_df.copy()
|
||||
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||
|
||||
# Get the number of items and users
|
||||
|
||||
self.interactions_df = interactions_df
|
||||
n_items = np.max(interactions_df['item_id']) + 1
|
||||
n_users = np.max(interactions_df['user_id']) + 1
|
||||
|
||||
# Get maximal number of interactions
|
||||
|
||||
n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
|
||||
# Unnecessary, but added for readability
|
||||
n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
|
||||
max_interactions = n_user_interactions['n_items'].max()
|
||||
|
||||
# Calculate P_Y's
|
||||
|
||||
n_interactions = len(interactions_df)
|
||||
p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
|
||||
p_y = p_y.rename(columns={'user_id': 'P_Y'})
|
||||
p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
|
||||
p_y = dict(zip(p_y['item_id'], p_y['P_Y']))
|
||||
|
||||
# Get the series of all items
|
||||
|
||||
# items = list(range(n_items))
|
||||
items = interactions_df['item_id'].unique()
|
||||
|
||||
# For every X calculate the E[Y|X]
|
||||
|
||||
e_xy = np.zeros(shape=(n_items, n_items))
|
||||
e_xy[:][:] = -1e100
|
||||
|
||||
p_y_powers = {}
|
||||
for y in items:
|
||||
p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
|
||||
|
||||
# In the next version calculate all alpha_k first (this works well with parallelization)
|
||||
|
||||
for x in items:
|
||||
# Get users who bought X
|
||||
c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()
|
||||
|
||||
# Get users who bought only X
|
||||
c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
|
||||
c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))
|
||||
|
||||
# Calculate the number of non-X interactions for each user who bought X
|
||||
# Include users with zero non-X interactions
|
||||
n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
|
||||
n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
|
||||
# Unnecessary, but added for readability
|
||||
n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})
|
||||
|
||||
zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x) # Remove
|
||||
n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
|
||||
|
||||
n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]
|
||||
|
||||
# Calculate the expected numbers of Y products bought by clients who bought X
|
||||
alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
|
||||
for abs_c in n_non_x_interactions["n_items"]])
|
||||
for k in range(1, max_interactions + 1)])
|
||||
|
||||
for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y
|
||||
if y != x:
|
||||
e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
|
||||
else:
|
||||
e_xy[x][y] = n_users * p_y[x]
|
||||
|
||||
self.e_xy = e_xy
|
||||
|
||||
# Calculate the number of users who bought both X and Y
|
||||
|
||||
# Simple and slow method (commented out)
|
||||
|
||||
# n_xy = np.zeros(shape=(n_items, n_items))
|
||||
|
||||
# for x in items:
|
||||
# for y in items:
|
||||
# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
|
||||
# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
|
||||
# users_x_and_y = users_x & users_y
|
||||
# n_xy[x][y] = len(users_x_and_y)
|
||||
|
||||
# Optimized method (can be further optimized by using sparse matrices)
|
||||
|
||||
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||
r = np.zeros(shape=(n_users, n_items))
|
||||
for idx, interaction in interactions_df.iterrows():
|
||||
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||
|
||||
# Get the number of users who bought both X and Y
|
||||
|
||||
n_xy = np.matmul(r.T, r)
|
||||
|
||||
self.n_xy = n_xy
|
||||
|
||||
self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)
|
||||
|
||||
# Find the most popular items for the cold start problem
|
||||
|
||||
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||
self.most_popular_items = offers_count.index
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||
recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
# Clean previous recommendations (iloc could be used alternatively)
|
||||
self.recommender_df = self.recommender_df[:0]
|
||||
|
||||
# Handle users not in the training data
|
||||
|
||||
# Map item ids
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||
|
||||
# Generate recommendations
|
||||
|
||||
for idx, user in users_df.iterrows():
|
||||
recommendations = []
|
||||
|
||||
user_id = user['user_id']
|
||||
|
||||
if user_id in self.user_id_mapping:
|
||||
mapped_user_id = self.user_id_mapping[user_id]
|
||||
|
||||
x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
final_scores = np.sum(self.scores[x_list], axis=0)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
final_scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-final_scores)[:n_recommendations]
|
||||
|
||||
for item_id in chosen_ids:
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||
'item_id': self.item_id_reverse_mapping[item_id],
|
||||
'score': final_scores[item_id]
|
||||
}
|
||||
)
|
||||
else: # For new users recommend most popular items
|
||||
for i in range(n_recommendations):
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': user['user_id'],
|
||||
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||
'score': 1.0
|
||||
}
|
||||
)
|
||||
|
||||
user_recommendations = pd.DataFrame(recommendations)
|
||||
|
||||
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||
|
||||
return self.recommender_df
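A minimal fit/recommend round trip on toy interactions (the data is made up; the recommender only needs user_id and item_id columns, and users_df is unused here):

```python
import pandas as pd

from recommenders.amazon_recommender import AmazonRecommender

interactions = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3],
    'item_id': [10, 11, 10, 12, 11],
})
items = pd.DataFrame({'item_id': [10, 11, 12]})

recommender = AmazonRecommender()
recommender.fit(interactions, None, items)

# Top-2 items for user 3, scored by how much more often they co-occur
# with item 11 than chance would predict.
print(recommender.recommend(pd.DataFrame({'user_id': [3]}), items, n_recommendations=2))
```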
recommenders/nearest_neighbors_recommender.py (new file, 233 lines)
@@ -0,0 +1,233 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class NearestNeighborsRecommender(Recommender):
|
||||
"""
|
||||
Nearest neighbors recommender allowing to do user-based or item-based collaborative filtering.
|
||||
|
||||
Possible similarity measures:
|
||||
- 'cosine',
|
||||
- 'pearson'.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
self.interactions_df = None
|
||||
self.item_id_mapping = None
|
||||
self.user_id_mapping = None
|
||||
self.item_id_reverse_mapping = None
|
||||
self.user_id_reverse_mapping = None
|
||||
self.r = None
|
||||
self.similarities = None
|
||||
self.most_popular_items = None
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'cosine'
|
||||
self.n_neighbors = 10
|
||||
self.should_recommend_already_bought = False
|
||||
|
||||
def initialize(self, **params):
|
||||
if 'n_neighbors' in params:
|
||||
self.n_neighbors = params['n_neighbors']
|
||||
if 'should_recommend_already_bought' in params:
|
||||
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||
user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||
by item_id and the item feature columns.
|
||||
"""
|
||||
|
||||
del users_df, items_df
|
||||
|
||||
# Shift item ids and user ids so that they are consecutive
|
||||
|
||||
unique_item_ids = interactions_df['item_id'].unique()
|
||||
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||
unique_user_ids = interactions_df['user_id'].unique()
|
||||
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||
|
||||
interactions_df = interactions_df.copy()
|
||||
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||
|
||||
# Get the number of items and users
|
||||
|
||||
self.interactions_df = interactions_df
|
||||
n_items = np.max(interactions_df['item_id']) + 1
|
||||
n_users = np.max(interactions_df['user_id']) + 1
|
||||
|
||||
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||
r = np.zeros(shape=(n_users, n_items))
|
||||
for idx, interaction in interactions_df.iterrows():
|
||||
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||
|
||||
if self.collaboration_type == 'item':
|
||||
r = r.T
|
||||
|
||||
self.r = r
|
||||
|
||||
# Calculate all similarities
|
||||
|
||||
similarities = None
|
||||
if self.similarity_measure == 'cosine':
|
||||
n_uv = np.matmul(r, r.T)
|
||||
norms = np.sqrt(np.diag(n_uv))
|
||||
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
|
||||
elif self.similarity_measure == 'pearson':
|
||||
r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)
|
||||
n_uv = np.matmul(r_shifted, r_shifted.T)
|
||||
norms = np.sqrt(np.diag(n_uv))
|
||||
norms[norms == 0] = 0.000001
|
||||
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
|
||||
|
||||
np.fill_diagonal(similarities, -1000)
|
||||
|
||||
self.similarities = similarities
|
||||
|
||||
# Find the most popular items for the cold start problem
|
||||
|
||||
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||
self.most_popular_items = offers_count.index
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||
recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
# Clean previous recommendations (iloc could be used alternatively)
|
||||
self.recommender_df = self.recommender_df[:0]
|
||||
|
||||
# Handle users not in the training data
|
||||
|
||||
# Map item ids
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
|
||||
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||
|
||||
# Generate recommendations
|
||||
|
||||
for idx, user in users_df.iterrows():
|
||||
recommendations = []
|
||||
|
||||
user_id = user['user_id']
|
||||
|
||||
if user_id in self.user_id_mapping:
|
||||
chosen_ids = []
|
||||
scores = []
|
||||
mapped_user_id = self.user_id_mapping[user_id]
|
||||
|
||||
if self.collaboration_type == 'user':
|
||||
neighbor_ids = np.argsort(-self.similarities[mapped_user_id])[:self.n_neighbors]
|
||||
user_similarities = self.similarities[mapped_user_id][neighbor_ids]
|
||||
|
||||
item_ids = items_df['item_id'].tolist()
|
||||
|
||||
v_i = self.r[neighbor_ids][:, item_ids]
|
||||
|
||||
scores = np.matmul(user_similarities, v_i) / np.sum(user_similarities)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
x_list = self.interactions_df.loc[
|
||||
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-scores)[:n_recommendations]
|
||||
|
||||
elif self.collaboration_type == 'item':
|
||||
x_list = self.interactions_df.loc[
|
||||
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
scores = np.sum(self.similarities[x_list], axis=0)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-scores)[:n_recommendations]
|
||||
|
||||
for item_id in chosen_ids:
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||
'item_id': self.item_id_reverse_mapping[item_id],
|
||||
'score': scores[item_id]
|
||||
}
|
||||
)
|
||||
else: # For new users recommend most popular items
|
||||
for i in range(n_recommendations):
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': user['user_id'],
|
||||
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||
'score': 1.0
|
||||
}
|
||||
)
|
||||
|
||||
user_recommendations = pd.DataFrame(recommendations)
|
||||
|
||||
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||
|
||||
return self.recommender_df
|
||||
|
||||
|
||||
class UserBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'cosine'
|
||||
|
||||
|
||||
class UserBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'pearson'
|
||||
|
||||
|
||||
class ItemBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'item'
|
||||
self.similarity_measure = 'cosine'
|
||||
|
||||
|
||||
class ItemBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'item'
|
||||
self.similarity_measure = 'pearson'
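A usage sketch for one of the preconfigured variants on the same toy interactions as above (made-up data; n_neighbors is set explicitly via initialize):

```python
import pandas as pd

from recommenders.nearest_neighbors_recommender import UserBasedCosineNearestNeighborsRecommender

interactions = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3],
    'item_id': [10, 11, 10, 12, 11],
})
items = pd.DataFrame({'item_id': [10, 11, 12]})

recommender = UserBasedCosineNearestNeighborsRecommender()
recommender.initialize(n_neighbors=2)
recommender.fit(interactions, None, items)

# User 3 only shares item 11 with user 1, so item 10 (user 1's other item)
# ranks first; item 12 follows with a zero score.
print(recommender.recommend(pd.DataFrame({'user_id': [3]}), items, n_recommendations=2))
```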
recommenders/netflix_recommender.py (new file, 305 lines)
@@ -0,0 +1,305 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import scipy.special as scisp
|
||||
from livelossplot import PlotLosses
|
||||
from collections import defaultdict, deque
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class NetflixRecommender(Recommender):
|
||||
"""
|
||||
Collaborative filtering based on matrix factorization with the following choice of an optimizer:
|
||||
- Stochastic Gradient Descent (SGD),
|
||||
- Mini-Batch Gradient Descent (MBGD),
|
||||
- Alternating Least Squares (ALS).
|
||||
"""
|
||||
|
||||
def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):
|
||||
super().__init__()
|
||||
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
self.interactions_df = None
|
||||
self.item_id_mapping = None
|
||||
self.user_id_mapping = None
|
||||
self.item_id_reverse_mapping = None
|
||||
self.user_id_reverse_mapping = None
|
||||
self.r = None
|
||||
self.most_popular_items = None
|
||||
|
||||
self.n_neg_per_pos = n_neg_per_pos
|
||||
if 'optimizer' in params:
|
||||
self.optimizer = params['optimizer']
|
||||
else:
|
||||
self.optimizer = 'SGD'
|
||||
if 'n_epochs' in params: # number of epochs (each epoch goes through the entire training set)
|
||||
self.n_epochs = params['n_epochs']
|
||||
else:
|
||||
self.n_epochs = 10
|
||||
if 'lr' in params: # learning rate
|
||||
self.lr = params['lr']
|
||||
else:
|
||||
self.lr = 0.01
|
||||
if 'reg_l' in params: # regularization coefficient
|
||||
self.reg_l = params['reg_l']
|
||||
else:
|
||||
self.reg_l = 0.1
|
||||
if 'embedding_dim' in params:
|
||||
self.embedding_dim = params['embedding_dim']
|
||||
else:
|
||||
self.embedding_dim = 8
|
||||
|
||||
self.user_repr = None
|
||||
self.item_repr = None
|
||||
|
||||
if 'should_recommend_already_bought' in params:
|
||||
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||
else:
|
||||
self.should_recommend_already_bought = False
|
||||
|
||||
self.validation_set_size = 0.2
|
||||
|
||||
self.seed = seed
|
||||
self.rng = np.random.RandomState(seed=seed)
|
||||
|
||||
self.print_type = print_type
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||
user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||
by item_id and the item feature columns.
|
||||
"""
|
||||
|
||||
del users_df, items_df
|
||||
|
||||
# Shift item ids and user ids so that they are consecutive
|
||||
|
||||
unique_item_ids = interactions_df['item_id'].unique()
|
||||
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||
unique_user_ids = interactions_df['user_id'].unique()
|
||||
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||
|
||||
interactions_df = interactions_df.copy()
|
||||
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||
|
||||
# Get the number of items and users
|
||||
|
||||
self.interactions_df = interactions_df
|
||||
n_users = np.max(interactions_df['user_id']) + 1
|
||||
n_items = np.max(interactions_df['item_id']) + 1
|
||||
|
||||
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||
r = np.zeros(shape=(n_users, n_items))
|
||||
for idx, interaction in interactions_df.iterrows():
|
||||
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||
|
||||
self.r = r
|
||||
|
||||
# Generate negative interactions
|
||||
negative_interactions = []
|
||||
|
||||
i = 0
|
||||
while i < self.n_neg_per_pos * len(interactions_df):
|
||||
sample_size = 1000
|
||||
user_ids = self.rng.choice(np.arange(n_users), size=sample_size)
|
||||
item_ids = self.rng.choice(np.arange(n_items), size=sample_size)
|
||||
|
||||
j = 0
|
||||
while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):
|
||||
if r[user_ids[j]][item_ids[j]] == 0:
|
||||
negative_interactions.append([user_ids[j], item_ids[j], 0])
|
||||
i += 1
|
||||
j += 1
|
||||
|
||||
interactions_df = pd.concat(
|
||||
[interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])

        # Initialize user and item embeddings as random vectors (from Gaussian distribution)

        self.user_repr = self.rng.normal(0, 1, size=(r.shape[0], self.embedding_dim))
        self.item_repr = self.rng.normal(0, 1, size=(r.shape[1], self.embedding_dim))

        # Initialize losses and loss visualization

        if self.print_type is not None and self.print_type == 'live':
            liveloss = PlotLosses()

        training_losses = deque(maxlen=50)
        training_avg_losses = []
        training_epoch_losses = []
        validation_losses = deque(maxlen=50)
        validation_avg_losses = []
        validation_epoch_losses = []
        last_training_total_loss = 0.0
        last_validation_total_loss = 0.0

        # Split the data

        interaction_ids = self.rng.permutation(len(interactions_df))
        train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))
        training_ids = interaction_ids[:train_validation_slice_idx]
        validation_ids = interaction_ids[train_validation_slice_idx:]

        # Train the model

        for epoch in range(self.n_epochs):
            if self.print_type is not None and self.print_type == 'live':
                logs = {}

            # Train

            training_losses.clear()
            training_total_loss = 0.0
            batch_idx = 0
            for idx in training_ids:
                user_id = int(interactions_df.iloc[idx]['user_id'])
                item_id = int(interactions_df.iloc[idx]['item_id'])

                e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
                self.user_repr[user_id] = self.user_repr[user_id] \
                    + self.lr * (e_ui * self.item_repr[item_id] - self.reg_l * self.user_repr[user_id])
                self.item_repr[item_id] = self.item_repr[item_id] \
                    + self.lr * (e_ui * self.user_repr[user_id] - self.reg_l * self.item_repr[item_id])
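                # The two updates above are an SGD step on the regularized squared error
                #     (r_ui - u_i . v_j)^2 + reg_l * (||u_i||^2 + ||v_j||^2),
                # with the constant factor from the gradient absorbed into the learning rate.
                # Note that the item step reuses the just-updated user vector (a sequential-update variant).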

                loss = e_ui**2
                training_total_loss += loss

                if self.print_type is not None and self.print_type == 'text':
                    print("\rEpoch: {}\tBatch: {}\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}".format(
                        epoch, batch_idx, last_training_total_loss, last_validation_total_loss, loss), end="")

                batch_idx += 1

                training_losses.append(loss)
                training_avg_losses.append(np.mean(training_losses))

            # Validate

            validation_losses.clear()
            validation_total_loss = 0.0
            for idx in validation_ids:
                user_id = int(interactions_df.iloc[idx]['user_id'])
                item_id = int(interactions_df.iloc[idx]['item_id'])

                e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])

                loss = e_ui**2
                validation_total_loss += loss

                validation_losses.append(loss)
                validation_avg_losses.append(np.mean(validation_losses))

            # Save and print epoch losses

            training_last_avg_loss = training_total_loss / len(training_ids)
            training_epoch_losses.append(training_last_avg_loss)
            validation_last_avg_loss = validation_total_loss / len(validation_ids)
            validation_epoch_losses.append(validation_last_avg_loss)

            # Remember this epoch's averages so the progress printout in the next epoch is meaningful
            last_training_total_loss = training_last_avg_loss
            last_validation_total_loss = validation_last_avg_loss

            if self.print_type is not None and self.print_type == 'live' and epoch >= 3:
                # A bound on epoch prevents showing extremely high losses in the first epochs
                # noinspection PyUnboundLocalVariable
                logs['loss'] = training_last_avg_loss
                logs['val_loss'] = validation_last_avg_loss
                # noinspection PyUnboundLocalVariable
                liveloss.update(logs)
                liveloss.send()

        # Find the most popular items for the cold start problem

        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index
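        # These are the mapped (consecutive) item ids sorted by interaction count;
        # recommend() falls back to them for users unseen during training.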

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]

        # Handle users not in the training data

        # Map item ids

        items_df = items_df.copy()
        items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
        items_df.replace({'item_id': self.item_id_mapping}, inplace=True)

        # Generate recommendations

        for idx, user in users_df.iterrows():
            recommendations = []

            user_id = user['user_id']

            if user_id in self.user_id_mapping:
                mapped_user_id = self.user_id_mapping[user_id]

                ids_list = items_df['item_id'].tolist()
                id_to_pos = np.array([0] * len(ids_list))
                for k in range(len(ids_list)):
                    id_to_pos[ids_list[k]] = k
                scores = np.matmul(self.user_repr[mapped_user_id].reshape(1, -1),
                                   self.item_repr[ids_list].T).flatten()

                # Choose n recommendations based on highest scores
                if not self.should_recommend_already_bought:
                    x_list = self.interactions_df.loc[
                        self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                    scores[id_to_pos[x_list]] = -1e100
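                    # Setting a very low score masks items the user has already interacted with,
                    # so they cannot appear among the top-n recommendations.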

                chosen_pos = np.argsort(-scores)[:n_recommendations]

                for item_pos in chosen_pos:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],
                            'score': scores[item_pos]
                        }
                    )
            else:  # For new users recommend most popular items
                for i in range(n_recommendations):
                    recommendations.append(
                        {
                            'user_id': user['user_id'],
                            'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
                            'score': 1.0
                        }
                    )

            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df

    def get_user_repr(self, user_id):
        mapped_user_id = self.user_id_mapping[user_id]
        return self.user_repr[mapped_user_id]

    def get_item_repr(self, item_id):
        mapped_item_id = self.item_id_mapping[item_id]
        return self.item_repr[mapped_item_id]
52
recommenders/recommender.py
Normal file
52
recommenders/recommender.py
Normal file
@ -0,0 +1,52 @@
|
||||
# Load libraries ---------------------------------------------

import pandas as pd

# ------------------------------------------------------------


class Recommender(object):
    """
    Base recommender class.
    """

    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        pass

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        pass

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])

        for ix, user in users_df.iterrows():
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': [-1] * n_recommendations,
                                                 'score': [3.0] * n_recommendations})

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations
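
For orientation, here is a minimal sketch of how the fit/recommend contract above can be exercised. The `MostPopularRecommender` subclass is hypothetical and only illustrates the expected inputs and outputs; it is not part of this commit:

```python
import pandas as pd

from recommenders.recommender import Recommender


class MostPopularRecommender(Recommender):
    """Toy subclass: recommends the globally most frequently interacted items to every user."""

    def __init__(self):
        super().__init__()
        self.most_popular_items = []

    def fit(self, interactions_df, users_df, items_df):
        # Count interactions per item and keep the item ids sorted by popularity
        counts = interactions_df.groupby('item_id')['user_id'].count()
        self.most_popular_items = counts.sort_values(ascending=False).index.tolist()

    def recommend(self, users_df, items_df, n_recommendations=1):
        # Return the same top items for every user, in the user_id/item_id/score format
        top_items = self.most_popular_items[:n_recommendations]
        frames = []
        for _, user in users_df.iterrows():
            frames.append(pd.DataFrame({'user_id': user['user_id'],
                                        'item_id': top_items,
                                        'score': [1.0] * len(top_items)}))
        return pd.concat(frames)
```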
102
recommenders/tfidf_recommender.py
Normal file
102
recommenders/tfidf_recommender.py
Normal file
@ -0,0 +1,102 @@
|
||||
# Load libraries ---------------------------------------------

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

from recommenders.recommender import Recommender

# ------------------------------------------------------------


class TFIDFRecommender(Recommender):
    """
    Recommender based on the TF-IDF method.
    """

    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        super().__init__()
        self.tfidf_scores = None

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id
            and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id
            and the item feature columns.
        """

        self.tfidf_scores = defaultdict(lambda: 0.0)

        # Prepare the corpus for tfidf calculation

        interactions_df = pd.merge(interactions_df, items_df, on='item_id')
        user_genres = interactions_df.loc[:, ['user_id', 'genres']]
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("-", "_", regex=False)
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace(" ", "_", regex=False)
        user_genres = user_genres.groupby('user_id').aggregate(lambda x: "|".join(x))
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("|", " ", regex=False)
        user_ids = user_genres.index.tolist()
        genres_corpus = user_genres['genres'].tolist()

        # Calculate tf-idf scores

        vectorizer = TfidfVectorizer()
        tfidf_scores = vectorizer.fit_transform(genres_corpus)

        # Transform results into a dict {(user_id, genre): score}

        for u in range(tfidf_scores.shape[0]):
            for g in range(tfidf_scores.shape[1]):
                self.tfidf_scores[(user_ids[u], vectorizer.get_feature_names()[g])] = tfidf_scores[u, g]

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations
            should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])

        # Transform genres to a unified form used by the vectorizer

        items_df = items_df.copy()
        items_df.loc[:, 'genres'] = items_df['genres'].str.replace("-", "_", regex=False)
        items_df.loc[:, 'genres'] = items_df['genres'].str.replace(" ", "_", regex=False)
        items_df.loc[:, 'genres'] = items_df['genres'].str.lower()
        items_df.loc[:, 'genres'] = items_df['genres'].str.split("|")

        # Score items

        for uix, user in users_df.iterrows():
            items = []
            for iix, item in items_df.iterrows():
                score = 0.0
                for genre in item['genres']:
                    score += self.tfidf_scores[(user['user_id'], genre)]
                score /= len(item['genres'])
                items.append((item['item_id'], score))

            items = sorted(items, key=lambda x: x[1], reverse=True)
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': [item[0] for item in items][:n_recommendations],
                                                 'score': [item[1] for item in items][:n_recommendations]})

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations
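
A quick smoke test of `TFIDFRecommender` on hypothetical toy data. It assumes a scikit-learn version where `TfidfVectorizer.get_feature_names()` still exists (newer releases renamed it to `get_feature_names_out()`), and the column names simply follow the docstrings above:

```python
import pandas as pd

from recommenders.tfidf_recommender import TFIDFRecommender

# Hypothetical toy data: items carry a pipe-separated 'genres' column,
# interactions record which items each user has seen.
items_df = pd.DataFrame({'item_id': [0, 1, 2],
                         'genres': ['Comedy|Romance', 'Action|Sci-Fi', 'Comedy|Action']})
interactions_df = pd.DataFrame({'user_id': [10, 10, 20],
                                'item_id': [0, 2, 1]})
users_df = pd.DataFrame({'user_id': [10, 20]})

recommender = TFIDFRecommender()
recommender.fit(interactions_df, users_df, items_df)

# Two recommendations per user, scored by the user's TF-IDF affinity to the item's genres
print(recommender.recommend(users_df, items_df, n_recommendations=2))
```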