103 lines
4.3 KiB
Python
103 lines
4.3 KiB
Python
# Load libraries ---------------------------------------------
|
|
|
|
import pandas as pd
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from collections import defaultdict
|
|
|
|
from recommenders.recommender import Recommender
|
|
|
|
# ------------------------------------------------------------
|
|
|
|
|
|
class TFIDFRecommender(Recommender):
    """
    Recommender based on the TF-IDF method.

    In fit() every user becomes one "document" consisting of the genres of all items the user
    interacted with; TF-IDF scores are computed over these documents. In recommend() an item is
    scored for a user with the mean of the user's TF-IDF scores over the item's genres.
    """

    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        super().__init__()
        # Mapping {(user_id, genre): tf-idf score}; stays None until fit() is called.
        self.tfidf_scores = None

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id
            and the user feature columns. Not used by this recommender.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id
            and the item feature columns. Must contain a 'genres' column with "|"-separated genres.
        """

        # defaultdict(float) yields 0.0 for unknown keys like the previous lambda did,
        # but (unlike a lambda default factory) keeps the fitted model picklable.
        self.tfidf_scores = defaultdict(float)

        # Prepare the corpus for tfidf calculation

        merged = pd.merge(interactions_df, items_df, on='item_id')
        # Rows without genres cannot contribute to a user's document.
        merged = merged.dropna(subset=['genres'])
        if merged.empty:
            # Nothing to learn from; the vectorizer would fail on an empty vocabulary.
            return

        # Multi-word genres are glued with "_" so that each genre is a single token,
        # then the "|" separator is turned into whitespace for the vectorizer.
        normalized_genres = (
            merged['genres']
            .str.replace("-", "_", regex=False)
            .str.replace(" ", "_", regex=False)
            .str.replace("|", " ", regex=False)
        )
        user_documents = normalized_genres.groupby(merged['user_id']).agg(
            lambda genres: " ".join(genres))
        user_ids = user_documents.index.tolist()
        genres_corpus = user_documents.tolist()

        # Calculate tf-idf scores

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(genres_corpus)

        # Resolve the vocabulary once. get_feature_names() was removed from newer
        # scikit-learn releases in favour of get_feature_names_out(); support both.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
        else:
            feature_names = list(vectorizer.get_feature_names())

        # Transform results into a dict {(user_id, genre): score}
        # Only entries stored in the sparse matrix are visited; every other
        # (user_id, genre) pair is implicitly 0.0 through the defaultdict.

        stored = tfidf_matrix.tocoo()
        for row, col, value in zip(stored.row, stored.col, stored.data):
            self.tfidf_scores[(user_ids[row], feature_names[col])] = value

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations
            should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        :raises RuntimeError: If the recommender has not been fitted yet.
        """

        if self.tfidf_scores is None:
            raise RuntimeError("TFIDFRecommender.fit() must be called before recommend().")
        scores = self.tfidf_scores

        # Transform genres to a unified form used by the vectorizer
        # This is done once for all users and on a derived Series, so the caller's
        # items_df is never modified.
        # NOTE(review): fit() relies on TfidfVectorizer's default tokenizer, which strips
        # punctuation and drops one-character tokens; genres affected by that will not
        # match the keys looked up here and contribute 0.0 - confirm this is acceptable.

        genre_lists = (
            items_df['genres']
            .str.replace("-", "_", regex=False)
            .str.replace(" ", "_", regex=False)
            .str.lower()
            .str.split("|")
        )
        # Missing genres come back as NaN instead of a list; treat them as "no genres".
        item_genres = [genres if isinstance(genres, list) else [] for genres in genre_lists]
        item_ids = items_df['item_id'].tolist()

        # Score items

        per_user_frames = []
        # Iterate the id column directly: iterrows() may upcast ids when other columns are floats.
        for user_id in users_df['user_id'].tolist():
            scored_items = []
            for item_id, genres in zip(item_ids, item_genres):
                total = 0.0
                for genre in genres:
                    # .get() instead of [] so that serving never adds keys to the fitted model.
                    total += scores.get((user_id, genre), 0.0)
                score = total / len(genres) if genres else 0.0
                scored_items.append((item_id, score))

            # Stable sort: items with equal scores keep their order from items_df.
            scored_items.sort(key=lambda pair: pair[1], reverse=True)
            top_items = scored_items[:n_recommendations]

            per_user_frames.append(pd.DataFrame({
                'user_id': user_id,
                'item_id': [item_id for item_id, _ in top_items],
                'score': [score for _, score in top_items],
            }))

        if not per_user_frames:
            return pd.DataFrame(columns=['user_id', 'item_id', 'score'])

        # A single concat avoids quadratic copying; the per-user index is kept as before.
        return pd.concat(per_user_frames)
|