REK-proj-2/recommenders/tfidf_recommender.py
2021-06-28 20:18:14 +02:00

103 lines
4.3 KiB
Python

# Load libraries ---------------------------------------------
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from recommenders.recommender import Recommender
# ------------------------------------------------------------
class TFIDFRecommender(Recommender):
    """
    Recommender based on the TF-IDF method.

    Each user is treated as a "document" made of the genres of all items the user
    interacted with. TF-IDF weights of genres within those documents are stored per
    (user_id, genre) pair and an item is scored for a user as the mean weight of the
    item's genres.
    """

    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        super().__init__()
        # Mapping {(user_id, genre): tf-idf score}; created by fit().
        self.tfidf_scores = None

    @staticmethod
    def _unify_genres(genres):
        """
        Turn multi-word genre names into single tokens.

        :param pd.Series genres: Series of '|'-separated genre strings.
        :return: Series in which hyphens and spaces are replaced with underscores.
        :rtype: pd.Series
        """
        return (genres.str.replace("-", "_", regex=False)
                      .str.replace(" ", "_", regex=False))

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id
            and the user feature columns. Not used by this recommender.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id
            and the item feature columns. Must contain a 'genres' column with '|'-separated genres.
        """
        # defaultdict(float) defaults to 0.0 exactly like a `lambda: 0.0` factory,
        # but keeps the fitted recommender picklable.
        self.tfidf_scores = defaultdict(float)

        # Prepare the corpus for tfidf calculation: one space-separated document per user.
        # Rows without genres cannot contribute any token, so they are dropped.
        merged = pd.merge(interactions_df, items_df, on='item_id')
        user_genres = merged[['user_id', 'genres']].dropna(subset=['genres'])
        if user_genres.empty:
            # Nothing to learn from; every lookup will fall back to 0.0.
            return

        documents = (self._unify_genres(user_genres['genres'])
                     .str.replace("|", " ", regex=False)
                     .groupby(user_genres['user_id'])
                     .agg(" ".join))
        user_ids = documents.index.tolist()
        genres_corpus = documents.tolist()

        # Calculate tf-idf scores (the vectorizer lowercases the genre tokens).
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(genres_corpus)

        # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
        # on old versions that do not have get_feature_names_out() yet.
        if hasattr(vectorizer, 'get_feature_names_out'):
            genre_names = vectorizer.get_feature_names_out()
        else:
            genre_names = vectorizer.get_feature_names()

        # Transform results into a dict {(user_id, genre): score}. Only non-zero
        # entries are stored; missing pairs are served as 0.0 by the defaultdict.
        nonzero = tfidf_matrix.tocoo()
        for u, g, score in zip(nonzero.row, nonzero.col, nonzero.data):
            self.tfidf_scores[(user_ids[u], genre_names[g])] = float(score)

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations
            should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """
        # Transform genres to a unified form used by the vectorizer. This is done once,
        # outside the per-user loop, and without touching the caller's items_df.
        genre_lists = (self._unify_genres(items_df['genres'])
                       .str.lower()
                       .str.split("|"))
        catalogue = list(zip(items_df['item_id'].tolist(), genre_lists.tolist()))

        tfidf_scores = self.tfidf_scores
        per_user = []

        # Score items
        for user_id in users_df['user_id'].tolist():
            scored = []
            for item_id, genres in catalogue:
                if isinstance(genres, list) and genres:
                    # .get() avoids inserting a new key into the defaultdict on every miss.
                    total = sum((tfidf_scores.get((user_id, genre), 0.0) for genre in genres), 0.0)
                    score = total / len(genres)
                else:
                    # Missing genres (NaN) give no evidence for the item.
                    score = 0.0
                scored.append((item_id, score))

            # Stable sort: items with equal scores keep their items_df order.
            scored.sort(key=lambda pair: pair[1], reverse=True)
            top = scored[:n_recommendations]

            per_user.append(pd.DataFrame({'user_id': user_id,
                                          'item_id': [item_id for item_id, _ in top],
                                          'score': [score for _, score in top]}))

        if not per_user:
            return pd.DataFrame(columns=['user_id', 'item_id', 'score'])

        # Single concat instead of growing a DataFrame inside the loop; the index is
        # left as produced per user (0..k-1 repeated), as before.
        return pd.concat(per_user)