REK-proj-2/recommenders/netflix_recommender.py
2021-06-28 20:18:14 +02:00

306 lines
12 KiB
Python

# Load libraries ---------------------------------------------
import pandas as pd
import numpy as np
import scipy.special as scisp
from livelossplot import PlotLosses
from collections import defaultdict, deque
from recommenders.recommender import Recommender
# ------------------------------------------------------------
class NetflixRecommender(Recommender):
"""
Collaborative filtering based on matrix factorization with the following choice of an optimizer:
- Stochastic Gradient Descent (SGD),
- Mini-Batch Gradient Descent (MBGD),
- Alternating Least Squares (ALS).
"""
def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):
super().__init__()
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
self.interactions_df = None
self.item_id_mapping = None
self.user_id_mapping = None
self.item_id_reverse_mapping = None
self.user_id_reverse_mapping = None
self.r = None
self.most_popular_items = None
self.n_neg_per_pos = n_neg_per_pos
if 'optimizer' in params:
self.optimizer = params['optimizer']
else:
self.optimizer = 'SGD'
if 'n_epochs' in params: # number of epochs (each epoch goes through the entire training set)
self.n_epochs = params['n_epochs']
else:
self.n_epochs = 10
if 'lr' in params: # learning rate
self.lr = params['lr']
else:
self.lr = 0.01
if 'reg_l' in params: # regularization coefficient
self.reg_l = params['reg_l']
else:
self.reg_l = 0.1
if 'embedding_dim' in params:
self.embedding_dim = params['embedding_dim']
else:
self.embedding_dim = 8
self.user_repr = None
self.item_repr = None
if 'should_recommend_already_bought' in params:
self.should_recommend_already_bought = params['should_recommend_already_bought']
else:
self.should_recommend_already_bought = False
self.validation_set_size = 0.2
self.seed = seed
self.rng = np.random.RandomState(seed=seed)
self.print_type = print_type
def fit(self, interactions_df, users_df, items_df):
"""
Training of the recommender.
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
defined by user_id, item_id and features of the interaction.
:param pd.DataFrame users_df: DataFrame with users and their features defined by
user_id and the user feature columns.
:param pd.DataFrame items_df: DataFrame with items and their features defined
by item_id and the item feature columns.
"""
del users_df, items_df
# Shift item ids and user ids so that they are consecutive
unique_item_ids = interactions_df['item_id'].unique()
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
interactions_df = interactions_df.copy()
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
# Get the number of items and users
self.interactions_df = interactions_df
n_users = np.max(interactions_df['user_id']) + 1
n_items = np.max(interactions_df['item_id']) + 1
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
self.r = r
# Generate negative interactions
negative_interactions = []
i = 0
while i < self.n_neg_per_pos * len(interactions_df):
sample_size = 1000
user_ids = self.rng.choice(np.arange(n_users), size=sample_size)
item_ids = self.rng.choice(np.arange(n_items), size=sample_size)
j = 0
while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):
if r[user_ids[j]][item_ids[j]] == 0:
negative_interactions.append([user_ids[j], item_ids[j], 0])
i += 1
j += 1
interactions_df = pd.concat(
[interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])
# Initialize user and item embeddings as random vectors (from Gaussian distribution)
self.user_repr = self.rng.normal(0, 1, size=(r.shape[0], self.embedding_dim))
self.item_repr = self.rng.normal(0, 1, size=(r.shape[1], self.embedding_dim))
# Initialize losses and loss visualization
if self.print_type is not None and self.print_type == 'live':
liveloss = PlotLosses()
training_losses = deque(maxlen=50)
training_avg_losses = []
training_epoch_losses = []
validation_losses = deque(maxlen=50)
validation_avg_losses = []
validation_epoch_losses = []
last_training_total_loss = 0.0
last_validation_total_loss = 0.0
# Split the data
interaction_ids = self.rng.permutation(len(interactions_df))
train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))
training_ids = interaction_ids[:train_validation_slice_idx]
validation_ids = interaction_ids[train_validation_slice_idx:]
# Train the model
for epoch in range(self.n_epochs):
if self.print_type is not None and self.print_type == 'live':
logs = {}
# Train
training_losses.clear()
training_total_loss = 0.0
batch_idx = 0
for idx in training_ids:
user_id = int(interactions_df.iloc[idx]['user_id'])
item_id = int(interactions_df.iloc[idx]['item_id'])
e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
self.user_repr[user_id] = self.user_repr[user_id] \
+ self.lr * (e_ui * self.item_repr[item_id] - self.reg_l * self.user_repr[user_id])
self.item_repr[item_id] = self.item_repr[item_id] \
+ self.lr * (e_ui * self.user_repr[user_id] - self.reg_l * self.item_repr[item_id])
loss = e_ui**2
training_total_loss += loss
if self.print_type is not None and self.print_type == 'text':
print("\rEpoch: {}\tBatch: {}\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}".format(
epoch, batch_idx, last_training_total_loss, last_validation_total_loss, loss), end="")
batch_idx += 1
training_losses.append(loss)
training_avg_losses.append(np.mean(training_losses))
# Validate
validation_losses.clear()
validation_total_loss = 0.0
for idx in validation_ids:
user_id = int(interactions_df.iloc[idx]['user_id'])
item_id = int(interactions_df.iloc[idx]['item_id'])
e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
loss = e_ui**2
validation_total_loss += loss
validation_losses.append(loss)
validation_avg_losses.append(np.mean(validation_losses))
# Save and print epoch losses
training_last_avg_loss = training_total_loss / len(training_ids)
training_epoch_losses.append(training_last_avg_loss)
validation_last_avg_loss = validation_total_loss / len(validation_ids)
validation_epoch_losses.append(validation_last_avg_loss)
if self.print_type is not None and self.print_type == 'live' and epoch >= 3:
# A bound on epoch prevents showing extremely high losses in the first epochs
# noinspection PyUnboundLocalVariable
logs['loss'] = training_last_avg_loss
logs['val_loss'] = validation_last_avg_loss
# noinspection PyUnboundLocalVariable
liveloss.update(logs)
liveloss.send()
# Find the most popular items for the cold start problem
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
offers_count = offers_count.sort_values('user_id', ascending=False)
self.most_popular_items = offers_count.index
def recommend(self, users_df, items_df, n_recommendations=1):
"""
Serving of recommendations. Scores items in items_df for each user in users_df and returns
top n_recommendations for each user.
:param pd.DataFrame users_df: DataFrame with users and their features for which
recommendations should be generated.
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
:param int n_recommendations: Number of recommendations to be returned for each user.
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
for each user.
:rtype: pd.DataFrame
"""
# Clean previous recommendations (iloc could be used alternatively)
self.recommender_df = self.recommender_df[:0]
# Handle users not in the training data
# Map item ids
items_df = items_df.copy()
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
# Generate recommendations
for idx, user in users_df.iterrows():
recommendations = []
user_id = user['user_id']
if user_id in self.user_id_mapping:
mapped_user_id = self.user_id_mapping[user_id]
ids_list = items_df['item_id'].tolist()
id_to_pos = np.array([0]*len(ids_list))
for k in range(len(ids_list)):
id_to_pos[ids_list[k]] = k
scores = np.matmul(self.user_repr[mapped_user_id].reshape(1, -1),
self.item_repr[ids_list].T).flatten()
# Choose n recommendations based on highest scores
if not self.should_recommend_already_bought:
x_list = self.interactions_df.loc[
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
scores[id_to_pos[x_list]] = -1e100
chosen_pos = np.argsort(-scores)[:n_recommendations]
for item_pos in chosen_pos:
recommendations.append(
{
'user_id': self.user_id_reverse_mapping[mapped_user_id],
'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],
'score': scores[item_pos]
}
)
else: # For new users recommend most popular items
for i in range(n_recommendations):
recommendations.append(
{
'user_id': user['user_id'],
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
'score': 1.0
}
)
user_recommendations = pd.DataFrame(recommendations)
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
return self.recommender_df
def get_user_repr(self, user_id):
mapped_user_id = self.user_id_mapping[user_id]
return self.user_repr[mapped_user_id]
def get_item_repr(self, item_id):
mapped_item_id = self.item_id_mapping[item_id]
return self.item_repr[mapped_item_id]