REK-proj-1/class_6_collaborative_filtering.ipynb

%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict

# Fix the dying kernel problem (only a problem in some installations; remove these lines if everything works without them)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

Load data

ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = [1, 318, 1193, 1208, 1214, 1721, 2959, 3578, 4306, 109487]

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

display(HTML(ml_movies_df.head(10).to_html()))

print("Number of interactions left: {}".format(len(ml_ratings_df)))
item_id title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
277 318 Shawshank Redemption, The (1994) Crime|Drama
896 1193 One Flew Over the Cuckoo's Nest (1975) Drama
909 1208 Apocalypse Now (1979) Action|Drama|War
915 1214 Alien (1979) Horror|Sci-Fi
1291 1721 Titanic (1997) Drama|Romance
2226 2959 Fight Club (1999) Action|Crime|Drama|Thriller
2674 3578 Gladiator (2000) Action|Adventure|Drama
3194 4306 Shrek (2001) Adventure|Animation|Children|Comedy|Fantasy|Romance
8376 109487 Interstellar (2014) Sci-Fi|IMAX
Number of interactions left: 1689

Shift item ids and user ids so that they are consecutive

interactions_df = ml_ratings_df.copy()

unique_item_ids = interactions_df['item_id'].unique()
item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)

display(HTML(interactions_df.head(10).to_html()))
user_id item_id rating timestamp
0 0 0 4.0 964982703
72 0 1 4.0 964983250
75 0 2 4.0 964981855
192 0 3 5.0 964983282
219 0 4 5.0 964980668
232 1 5 3.0 1445714835
235 1 4 4.0 1445714885
255 1 6 3.0 1445715145
458 2 3 2.0 945078528
516 3 0 4.0 847434962
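
The reverse mappings translate the consecutive internal ids back into the original MovieLens ids, which is needed whenever recommendations are reported to the outside world. A quick round-trip sketch (the concrete id printed below depends on this run's data):

# Internal item id 0 corresponds to the first item id encountered in the ratings
original_id = item_id_reverse_mapping[0]
print(original_id)
# Mapping the original id forward again recovers the internal id
print(item_id_mapping[original_id])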

Get the number of items and users

n_items = np.max(interactions_df['item_id']) + 1
n_users = np.max(interactions_df['user_id']) + 1

print("n_items={}\nn_users={}".format(n_items, n_users))
n_items=10
n_users=521

Get the user-item interaction matrix

# mapping to int is necessary because of how iterrows works
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
    r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
    
print(r[:10, :10])
[[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
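
The iterrows loop is easy to read but slow on large frames. A vectorized sketch using numpy integer-array indexing (assuming interactions_df, n_users and n_items as defined above) builds the same matrix in one shot:

# Set all (user_id, item_id) cells at once via fancy indexing
r_alt = np.zeros(shape=(n_users, n_items))
r_alt[interactions_df['user_id'].values, interactions_df['item_id'].values] = 1
print(np.array_equal(r, r_alt))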

Calculate cosine similarities of users

$$ \text{Sim}(\vec{u}, \vec{v}) = \text{Cos}(\vec{u}, \vec{v}) = \frac{\vec{u} \cdot \vec{v}}{\lVert u \rVert \lVert v \rVert} = \frac{\sum_{i = 1}^n u_i v_i}{\sqrt{\sum_{i = 1}^n u_i^2} \sqrt{\sum_{i = 1}^n v_i^2}} $$

For binary interaction vectors cosine similarity ranges from 0 to 1: a value of 1 means that both vectors are identical, while 0 means that they have no 1's in common.

def cosine(u, v):
    return np.sum(u * v) / np.sqrt(np.sum(u * u) * np.sum(v * v))

print(cosine(np.array([1, 0, 1, 0]), np.array([1, 0, 0, 0])))
print(cosine(np.array([1, 0, 1, 0]), np.array([1, 0, 1, 0])))
print(cosine(np.array([1, 0, 1, 0]), np.array([0, 1, 0, 1])))

0.7071067811865475
1.0
0.0
print("Cosine similarity between user 0 and 1")
print(r[0])
print(r[1])
print(cosine(r[0], r[1]))
Cosine similarity between user 0 and 1
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
0.2581988897471611
print("Cosine similarity between user 0 and 5")
print(r[0])
print(r[5])
print(cosine(r[0], r[5]))
Cosine similarity between user 0 and 5
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
0.6708203932499369

Calculate Pearson similarities of users

$$ \text{Sim}(\vec{u}, \vec{v}) = \text{Pearson}(\vec{u}, \vec{v}) = \frac{\sum_{i = 1}^n (u_i - \bar{u}) (v_i - \bar{v})}{\sqrt{\sum_{i = 1}^n (u_i - \bar{u})^2} \sqrt{\sum_{i = 1}^n (v_i - \bar{v})^2}} $$

Correlation ranges from -1 to 1: a correlation of 1 means the vectors are perfectly positively correlated (for binary vectors, identical), while -1 means they are exact opposites.

def pearson(u, v):
    # len(u) * std(u) * std(v) equals the product of the centered vectors' norms
    return np.sum((u - np.mean(u)) * (v - np.mean(v))) / (len(u) * np.std(u) * np.std(v))

print(pearson(np.array([1, 0, 1, 0]), np.array([1, 0, 0, 0])))
print(pearson(np.array([1, 0, 1, 0]), np.array([1, 0, 1, 0])))
print(pearson(np.array([1, 0, 1, 0]), np.array([0, 1, 0, 1])))
0.5773502691896258
1.0
-1.0
print("Pearson similarity between user 0 and 1")
print(r[0])
print(r[1])
print(pearson(r[0], r[1]))
Pearson similarity between user 0 and 1
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
-0.2182178902359924
print("Pearson similarity between user 0 and 5")
print(r[0])
print(r[5])
print(pearson(r[0], r[5]))
Pearson similarity between user 0 and 5
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
0.40824829046386296

All cosine similarities

n_uv = np.matmul(r, r.T)

norms = np.sqrt(np.diag(n_uv))

cos_sim = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]

print("Scalar products")
print(n_uv[:10, :10])
print()

print("Norms")
print(np.around(norms[:10], 3))
print()

print("Cosine similarities")
print(np.around(cos_sim[:10, :10], 3))
Scalar products
[[5. 1. 1. 1. 0. 3. 0. 2. 0. 0.]
 [1. 3. 0. 1. 1. 1. 1. 2. 1. 0.]
 [1. 0. 1. 0. 0. 0. 0. 1. 0. 0.]
 [1. 1. 0. 2. 1. 1. 1. 0. 1. 0.]
 [0. 1. 0. 1. 1. 0. 1. 0. 1. 0.]
 [3. 1. 0. 1. 0. 4. 0. 2. 0. 0.]
 [0. 1. 0. 1. 1. 0. 1. 0. 1. 0.]
 [2. 2. 1. 0. 0. 2. 0. 4. 0. 0.]
 [0. 1. 0. 1. 1. 0. 1. 0. 2. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]

Norms
[2.236 1.732 1.    1.414 1.    2.    1.    2.    1.414 1.   ]

Cosine similarities
[[1.    0.258 0.447 0.316 0.    0.671 0.    0.447 0.    0.   ]
 [0.258 1.    0.    0.408 0.577 0.289 0.577 0.577 0.408 0.   ]
 [0.447 0.    1.    0.    0.    0.    0.    0.5   0.    0.   ]
 [0.316 0.408 0.    1.    0.707 0.354 0.707 0.    0.5   0.   ]
 [0.    0.577 0.    0.707 1.    0.    1.    0.    0.707 0.   ]
 [0.671 0.289 0.    0.354 0.    1.    0.    0.5   0.    0.   ]
 [0.    0.577 0.    0.707 1.    0.    1.    0.    0.707 0.   ]
 [0.447 0.577 0.5   0.    0.    0.5   0.    1.    0.    0.   ]
 [0.    0.408 0.    0.5   0.707 0.    0.707 0.    1.    0.707]
 [0.    0.    0.    0.    0.    0.    0.    0.    0.707 1.   ]]
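
As a sanity check (a sketch reusing the cosine function defined earlier), individual entries of the matrix should agree with the pairwise computation:

print(np.isclose(cos_sim[0, 1], cosine(r[0], r[1])))
print(np.isclose(cos_sim[0, 5], cosine(r[0], r[5])))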

All Pearson similarities

r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)

n_uv = np.matmul(r_shifted, r_shifted.T)

norms = np.sqrt(np.diag(n_uv))

norms[norms == 0] = 0.000001  # avoid division by zero for users with constant rows

pearson_sim = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]

print("Scalar products")
print(np.around(n_uv[:10, :10], 3))
print()

print("Norms")
print(np.around(norms[:10], 3))
print()

print("Pearson similarities")
print(np.around(pearson_sim[:10, :10], 3))
Scalar products
[[ 2.5 -0.5  0.5 -0.  -0.5  1.  -0.5 -0.  -1.  -0.5]
 [-0.5  2.1 -0.3  0.4  0.7 -0.2  0.7  0.8  0.4 -0.3]
 [ 0.5 -0.3  0.9 -0.2 -0.1 -0.4 -0.1  0.6 -0.2 -0.1]
 [-0.   0.4 -0.2  1.6  0.8  0.2  0.8 -0.8  0.6 -0.2]
 [-0.5  0.7 -0.1  0.8  0.9 -0.4  0.9 -0.4  0.8 -0.1]
 [ 1.  -0.2 -0.4  0.2 -0.4  2.4 -0.4  0.4 -0.8 -0.4]
 [-0.5  0.7 -0.1  0.8  0.9 -0.4  0.9 -0.4  0.8 -0.1]
 [-0.   0.8  0.6 -0.8 -0.4  0.4 -0.4  2.4 -0.8 -0.4]
 [-1.   0.4 -0.2  0.6  0.8 -0.8  0.8 -0.8  1.6  0.8]
 [-0.5 -0.3 -0.1 -0.2 -0.1 -0.4 -0.1 -0.4  0.8  0.9]]

Norms
[1.581 1.449 0.949 1.265 0.949 1.549 0.949 1.549 1.265 0.949]

Pearson similarities
[[ 1.    -0.218  0.333 -0.    -0.333  0.408 -0.333 -0.    -0.5   -0.333]
 [-0.218  1.    -0.218  0.218  0.509 -0.089  0.509  0.356  0.218 -0.218]
 [ 0.333 -0.218  1.    -0.167 -0.111 -0.272 -0.111  0.408 -0.167 -0.111]
 [-0.     0.218 -0.167  1.     0.667  0.102  0.667 -0.408  0.375 -0.167]
 [-0.333  0.509 -0.111  0.667  1.    -0.272  1.    -0.272  0.667 -0.111]
 [ 0.408 -0.089 -0.272  0.102 -0.272  1.    -0.272  0.167 -0.408 -0.272]
 [-0.333  0.509 -0.111  0.667  1.    -0.272  1.    -0.272  0.667 -0.111]
 [-0.     0.356  0.408 -0.408 -0.272  0.167 -0.272  1.    -0.408 -0.272]
 [-0.5    0.218 -0.167  0.375  0.667 -0.408  0.667 -0.408  1.     0.667]
 [-0.333 -0.218 -0.111 -0.167 -0.111 -0.272 -0.111 -0.272  0.667  1.   ]]
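
The matrix can also be cross-checked against numpy's built-in implementation. Note that np.corrcoef yields NaN for constant rows, whereas the norm clamping above yields 0 there, so those entries are masked out in this sketch:

ref_sim = np.corrcoef(r)  # Pearson correlations between the rows of r
mask = ~np.isnan(ref_sim)  # skip entries involving constant rows
print(np.allclose(pearson_sim[mask], ref_sim[mask]))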

Calculate scores of all items for user 0

Find n closest neighbors

np.fill_diagonal(cos_sim, -1)  # exclude each user from their own neighborhood

user_id = 0
n_neighbors = 10

neighbor_ids = np.argsort(-cos_sim[user_id])[:n_neighbors]

print("Nearest neighbors")
print(neighbor_ids)
print()

print("User {}".format(user_id))
print(r[user_id])
print()
print("User 138")
print(r[138])
print()
print("User 387")
print(r[387])
print()
print("User 240")
print(r[240])
Nearest neighbors
[138 387 240 399 513 285 473 172  24 270]

User 0
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]

User 138
[1. 1. 1. 1. 1. 0. 0. 1. 0. 0.]

User 387
[1. 1. 1. 1. 1. 0. 1. 0. 0. 1.]

User 240
[1. 1. 1. 1. 1. 1. 1. 0. 0. 0.]
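
A full argsort costs O(n log n) in the number of users. For large user bases np.argpartition selects the top-k neighbors in linear time; a sketch (the top-k then still has to be ordered by similarity explicitly):

# Select the n_neighbors highest similarities without sorting all users
top_k = np.argpartition(-cos_sim[user_id], n_neighbors)[:n_neighbors]
# argpartition returns the top-k in arbitrary order, so sort them by similarity
top_k = top_k[np.argsort(-cos_sim[user_id][top_k])]
print(top_k)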

Score all items

$$ \text{score(i)} = \frac{\sum_{v \in N(u)} \text{Sim}(u, v) \cdot v(i)}{\sum_{v \in N(u)} |\text{Sim}(u, v)|} $$

def score(similarities, v_i):
    # These similarities are cosines of non-negative vectors, so they are non-negative
    # and their plain sum equals the sum of absolute values in the formula above
    return np.sum(similarities * v_i) / np.sum(similarities)

item_id = 6

print("Interactions for nearest neighbors")
print(r[neighbor_ids])
print()

similarities = cos_sim[user_id][neighbor_ids]
print("similarities")
print(similarities)
print()

v_i = r[neighbor_ids][:, item_id]
print("v_i")
print(v_i)
print()

print("score for user_id={} and item_id={}".format(user_id, item_id))
print(score(similarities, v_i))
Interactions for nearest neighbors
[[1. 1. 1. 1. 1. 0. 0. 1. 0. 0.]
 [1. 1. 1. 1. 1. 0. 1. 0. 0. 1.]
 [1. 1. 1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 1. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0. 1. 1.]
 [1. 1. 1. 1. 1. 1. 0. 1. 0. 0.]
 [1. 1. 1. 1. 0. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]]

similarities
[0.91287093 0.84515425 0.84515425 0.84515425 0.84515425 0.84515425
 0.8        0.8        0.8        0.8       ]

v_i
[0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]

score for user_id=0 and item_id=6
0.202707883171415
print(r[user_id])

for i in range(10):
    similarities = cos_sim[user_id][neighbor_ids]
    v_i = r[neighbor_ids][:, i]
    print("score for user_id={} and item_id={}".format(user_id, i))
    print(round(score(similarities, v_i), 2))
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
score for user_id=0 and item_id=0
0.81
score for user_id=0 and item_id=1
1.0
score for user_id=0 and item_id=2
1.0
score for user_id=0 and item_id=3
1.0
score for user_id=0 and item_id=4
0.81
score for user_id=0 and item_id=5
0.69
score for user_id=0 and item_id=6
0.2
score for user_id=0 and item_id=7
0.21
score for user_id=0 and item_id=8
0.2
score for user_id=0 and item_id=9
0.2
# The same scoring with a single operation

item_ids = list(range(10))

v_i = r[neighbor_ids][:, item_ids]

scores = np.matmul(similarities, v_i) / np.sum(similarities)

print(scores)
[0.80812224 1.         1.         1.         0.80812224 0.68781735
 0.20270788 0.21082871 0.20270788 0.20270788]

Load a bigger dataset

ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

display(HTML(ml_movies_df.head(10).to_html()))

print("Number of interactions left: {}".format(len(ml_ratings_df)))
item_id title genres
118 145 Bad Boys (1995) Action|Comedy|Crime|Drama|Thriller
143 171 Jeffrey (1995) Comedy|Drama
194 228 Destiny Turns on the Radio (1995) Comedy
199 233 Exotica (1994) Drama
230 267 Major Payne (1995) Comedy
313 355 Flintstones, The (1994) Children|Comedy|Fantasy
379 435 Coneheads (1993) Comedy|Sci-Fi
419 481 Kalifornia (1993) Drama|Thriller
615 780 Independence Day (a.k.a. ID4) (1996) Action|Adventure|Sci-Fi|Thriller
737 959 Of Human Bondage (1934) Drama
Number of interactions left: 1170

Nearest neighbors recommender

from recommenders.recommender import Recommender

class NearestNeighborsRecommender(Recommender):
    """
    Nearest neighbors recommender supporting user-based and item-based collaborative filtering.

    Possible similarity measures:
        - 'cosine',
        - 'pearson'.
    """

    def __init__(self):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.r = None
        self.similarities = None
        self.most_popular_items = None

        self.collaboration_type = 'user'
        self.similarity_measure = 'cosine'
        self.n_neighbors = 10
        self.should_recommend_already_bought = False

    def initialize(self, **params):
        if 'n_neighbors' in params:
            self.n_neighbors = params['n_neighbors']
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns.
        """

        del users_df, items_df

        # Shift item ids and user ids so that they are consecutive

        unique_item_ids = interactions_df['item_id'].unique()
        self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
        self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
        self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

        interactions_df = interactions_df.copy()
        interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)

        # Get the number of items and users

        self.interactions_df = interactions_df
        n_items = np.max(interactions_df['item_id']) + 1
        n_users = np.max(interactions_df['user_id']) + 1

        # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
        r = np.zeros(shape=(n_users, n_items))
        for idx, interaction in interactions_df.iterrows():
            r[int(interaction['user_id'])][int(interaction['item_id'])] = 1

        if self.collaboration_type == 'item':
            r = r.T

        self.r = r

        # Calculate all similarities

        similarities = None
        if self.similarity_measure == 'cosine':
            n_uv = np.matmul(r, r.T)
            norms = np.sqrt(np.diag(n_uv))
            similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
        elif self.similarity_measure == 'pearson':
            r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)
            n_uv = np.matmul(r_shifted, r_shifted.T)
            norms = np.sqrt(np.diag(n_uv))
            norms[norms == 0] = 0.000001  # avoid division by zero for constant rows
            similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]

        np.fill_diagonal(similarities, -1000)  # exclude self-similarity from neighbor selection

        self.similarities = similarities

        # Find the most popular items for the cold start problem

        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]

        # Users not present in the training data are handled in the else branch below (most popular items)

        # Map item ids

        items_df = items_df.copy()
        items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
        items_df.replace({'item_id': self.item_id_mapping}, inplace=True)

        # Generate recommendations

        for idx, user in users_df.iterrows():
            recommendations = []

            user_id = user['user_id']

            if user_id in self.user_id_mapping:
                chosen_ids = []
                scores = []
                mapped_user_id = self.user_id_mapping[user_id]

                if self.collaboration_type == 'user':
                    neighbor_ids = np.argsort(-self.similarities[mapped_user_id])[:self.n_neighbors]
                    user_similarities = self.similarities[mapped_user_id][neighbor_ids]

                    # Sort so that positions in the score vector coincide with mapped item ids
                    # (assumes items_df contains all items seen during training)
                    item_ids = sorted(items_df['item_id'].tolist())

                    v_i = self.r[neighbor_ids][:, item_ids]

                    scores = np.matmul(user_similarities, v_i) / np.sum(user_similarities)

                    # Choose n recommendations based on highest scores
                    if not self.should_recommend_already_bought:
                        x_list = self.interactions_df.loc[
                            self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                        scores[x_list] = -1e100

                    chosen_ids = np.argsort(-scores)[:n_recommendations]

                elif self.collaboration_type == 'item':
                    x_list = self.interactions_df.loc[
                        self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                    scores = np.sum(self.similarities[x_list], axis=0)

                    # Choose n recommendations based on highest scores
                    if not self.should_recommend_already_bought:
                        scores[x_list] = -1e100

                    chosen_ids = np.argsort(-scores)[:n_recommendations]

                for item_id in chosen_ids:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[item_id],
                            'score': scores[item_id]
                        }
                    )
            else:  # For new users recommend most popular items
                for i in range(n_recommendations):
                    recommendations.append(
                        {
                            'user_id': user['user_id'],
                            'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
                            'score': 1.0
                        }
                    )

            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df
    

class UserBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'user'
        self.similarity_measure = 'cosine'
        
        
class UserBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'user'
        self.similarity_measure = 'pearson'
        
        
class ItemBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'item'
        self.similarity_measure = 'cosine'
        

class ItemBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'item'
        self.similarity_measure = 'pearson'
# Quick test of the recommender

nearest_neighbors_recommender = NearestNeighborsRecommender()
nearest_neighbors_recommender.initialize(n_neighbors=20)
nearest_neighbors_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = nearest_neighbors_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))
Recommendations
user_id item_id score title genres
0 1 355 0.955688 Flintstones, The (1994) Children|Comedy|Fantasy
1 1 73323 0.291761 Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009) Action|Crime|Mystery
2 1 8605 0.280261 Taxi 3 (2003) Action|Comedy
3 1 3628 0.059450 Flying Tigers (1942) Action|Drama|Romance|War
4 1 6755 0.059450 Bubba Ho-tep (2002) Comedy|Horror
5 1 3165 0.051167 Boiling Point (1993) Action|Drama
6 1 4031 0.044312 All the Pretty Horses (2000) Drama|Romance|Western
7 1 1914 0.044312 Smoke Signals (1998) Comedy|Drama
8 1 7282 0.000000 Hip Hop Witch, Da (2000) Comedy|Horror|Thriller
9 1 2190 0.000000 Why Do Fools Fall In Love? (1998) Drama
10 4 4031 0.556855 All the Pretty Horses (2000) Drama|Romance|Western
11 4 73323 0.556855 Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009) Action|Crime|Mystery
12 4 355 0.098477 Flintstones, The (1994) Children|Comedy|Fantasy
13 4 2806 0.098477 Teaching Mrs. Tingle (1999) Comedy|Thriller
14 4 5673 0.056855 Punch-Drunk Love (2002) Comedy|Drama|Romance
15 4 3567 0.056855 Bossa Nova (2000) Comedy|Drama|Romance
16 4 145 0.049238 Bad Boys (1995) Action|Comedy|Crime|Drama|Thriller
17 4 3165 0.049238 Boiling Point (1993) Action|Drama
18 4 2275 0.000000 Six-String Samurai (1998) Action|Adventure|Sci-Fi
19 4 4483 0.000000 Caddyshack II (1988) Comedy
20 6 4896 0.653889 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) Adventure|Children|Fantasy
21 6 1914 0.242896 Smoke Signals (1998) Comedy|Drama
22 6 4031 0.045903 All the Pretty Horses (2000) Drama|Romance|Western
23 6 2275 0.045903 Six-String Samurai (1998) Action|Adventure|Sci-Fi
24 6 3567 0.045903 Bossa Nova (2000) Comedy|Drama|Romance
25 6 73323 0.045543 Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009) Action|Crime|Mystery
26 6 1500 0.042938 Grosse Pointe Blank (1997) Comedy|Crime|Romance
27 6 4483 0.000000 Caddyshack II (1988) Comedy
28 6 7282 0.000000 Hip Hop Witch, Da (2000) Comedy|Horror|Thriller
29 6 2190 0.000000 Why Do Fools Fall In Love? (1998) Drama

Training-test split evaluation

from evaluation_and_testing.testing import evaluate_train_test_split_implicit
ub_cos_nn_recommender = UserBasedCosineNearestNeighborsRecommender()
ub_cos_nn_recommender.initialize(n_neighbors=30)

ub_cos_nn_tts_results = [['UserBasedCosineNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ub_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ub_cos_nn_tts_results = pd.DataFrame(
    ub_cos_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ub_cos_nn_tts_results.to_html()))
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:146: RuntimeWarning: invalid value encountered in true_divide
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.103896 0.142857 0.227273 0.409091 0.103896 0.126777 0.161141 0.219215
ub_pearson_nn_recommender = UserBasedPearsonNearestNeighborsRecommender()
ub_pearson_nn_recommender.initialize(n_neighbors=30)

ub_pearson_nn_tts_results = [['UserBasedPearsonNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ub_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ub_pearson_nn_tts_results = pd.DataFrame(
    ub_pearson_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ub_pearson_nn_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedPearsonNearestNeighborsRecommender 0.103896 0.149351 0.227273 0.415584 0.103896 0.130024 0.162161 0.221924
ib_cos_nn_recommender = ItemBasedCosineNearestNeighborsRecommender()
ib_cos_nn_recommender.initialize(n_neighbors=30)

ib_cos_nn_tts_results = [['ItemBasedCosineNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ib_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ib_cos_nn_tts_results = pd.DataFrame(
    ib_cos_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ib_cos_nn_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 ItemBasedCosineNearestNeighborsRecommender 0.25974 0.545455 0.694805 0.954545 0.25974 0.421302 0.482493 0.566174
ib_pearson_nn_recommender = ItemBasedPearsonNearestNeighborsRecommender()
ib_pearson_nn_recommender.initialize(n_neighbors=30)

ib_pearson_nn_tts_results = [['ItemBasedPearsonNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ib_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ib_pearson_nn_tts_results = pd.DataFrame(
    ib_pearson_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ib_pearson_nn_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 ItemBasedPearsonNearestNeighborsRecommender 0.175325 0.350649 0.448052 0.558442 0.175325 0.27744 0.317397 0.352948
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

amazon_tts_results = pd.DataFrame(
    amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.181818 0.311688 0.402597 0.551948 0.181818 0.257806 0.294682 0.34147
from recommenders.tfidf_recommender import TFIDFRecommender

tfidf_recommender = TFIDFRecommender()

tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

tfidf_tts_results = pd.DataFrame(
    tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 TFIDFRecommender 0.025974 0.090909 0.136364 0.318182 0.025974 0.064393 0.083685 0.140799
tts_results = pd.concat([ub_cos_nn_tts_results, ub_pearson_nn_tts_results, ib_cos_nn_tts_results, 
                         ib_pearson_nn_tts_results, amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)
display(HTML(tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.103896 0.142857 0.227273 0.409091 0.103896 0.126777 0.161141 0.219215
1 UserBasedPearsonNearestNeighborsRecommender 0.103896 0.149351 0.227273 0.415584 0.103896 0.130024 0.162161 0.221924
2 ItemBasedCosineNearestNeighborsRecommender 0.259740 0.545455 0.694805 0.954545 0.259740 0.421302 0.482493 0.566174
3 ItemBasedPearsonNearestNeighborsRecommender 0.175325 0.350649 0.448052 0.558442 0.175325 0.277440 0.317397 0.352948
4 AmazonRecommender 0.181818 0.311688 0.402597 0.551948 0.181818 0.257806 0.294682 0.341470
5 TFIDFRecommender 0.025974 0.090909 0.136364 0.318182 0.025974 0.064393 0.083685 0.140799

Leave-one-out evaluation

from evaluation_and_testing.testing import evaluate_leave_one_out_implicit
ub_cos_nn_recommender = UserBasedCosineNearestNeighborsRecommender()
ub_cos_nn_recommender.initialize(n_neighbors=30)

ub_cos_nn_loo_results = [['UserBasedCosineNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ub_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ub_cos_nn_loo_results = pd.DataFrame(
    ub_cos_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ub_cos_nn_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.096667 0.146667 0.186667 0.306667 0.096667 0.124285 0.140782 0.178962
ub_pearson_nn_recommender = UserBasedPearsonNearestNeighborsRecommender()
ub_pearson_nn_recommender.initialize(n_neighbors=30)

ub_pearson_nn_loo_results = [['UserBasedPearsonNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ub_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ub_pearson_nn_loo_results = pd.DataFrame(
    ub_pearson_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ub_pearson_nn_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedPearsonNearestNeighborsRecommender 0.1 0.15 0.18 0.313333 0.1 0.127182 0.139518 0.181748
ib_cos_nn_recommender = ItemBasedCosineNearestNeighborsRecommender()
ib_cos_nn_recommender.initialize(n_neighbors=30)

ib_cos_nn_loo_results = [['ItemBasedCosineNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ib_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ib_cos_nn_loo_results = pd.DataFrame(
    ib_cos_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ib_cos_nn_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 ItemBasedCosineNearestNeighborsRecommender 0.266667 0.42 0.513333 0.65 0.266667 0.357736 0.396033 0.440599
ib_pearson_nn_recommender = ItemBasedPearsonNearestNeighborsRecommender()
ib_pearson_nn_recommender.initialize(n_neighbors=30)

ib_pearson_nn_loo_results = [['ItemBasedPearsonNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ib_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ib_pearson_nn_loo_results = pd.DataFrame(
    ib_pearson_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ib_pearson_nn_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 ItemBasedPearsonNearestNeighborsRecommender 0.173333 0.28 0.336667 0.42 0.173333 0.234522 0.257759 0.284723
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

amazon_loo_results = pd.DataFrame(
    amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.166667 0.256667 0.32 0.426667 0.166667 0.219086 0.245486 0.279978
tfidf_recommender = TFIDFRecommender()

tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

tfidf_loo_results = pd.DataFrame(
    tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 TFIDFRecommender 0.006667 0.053333 0.123333 0.233333 0.006667 0.033491 0.062178 0.096151
loo_results = pd.concat([ub_cos_nn_loo_results, ub_pearson_nn_loo_results, ib_cos_nn_loo_results, 
                         ib_pearson_nn_loo_results, amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)
display(HTML(loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.096667 0.146667 0.186667 0.306667 0.096667 0.124285 0.140782 0.178962
1 UserBasedPearsonNearestNeighborsRecommender 0.100000 0.150000 0.180000 0.313333 0.100000 0.127182 0.139518 0.181748
2 ItemBasedCosineNearestNeighborsRecommender 0.266667 0.420000 0.513333 0.650000 0.266667 0.357736 0.396033 0.440599
3 ItemBasedPearsonNearestNeighborsRecommender 0.173333 0.280000 0.336667 0.420000 0.173333 0.234522 0.257759 0.284723
4 AmazonRecommender 0.166667 0.256667 0.320000 0.426667 0.166667 0.219086 0.245486 0.279978
5 TFIDFRecommender 0.006667 0.053333 0.123333 0.233333 0.006667 0.033491 0.062178 0.096151
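
Under both evaluation schemes the item-based cosine recommender performs best, clearly ahead of the Amazon-style and item-based Pearson recommenders, while the user-based variants only beat the content-based TFIDFRecommender.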

Tasks

Task 1. Add euclidean distance as an eligible similarity measure in the nearest neighbors recommender and compare the results of such a recommender to the other recommenders tested in this notebook.

# Write your code in the original class and tests here
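
One possible starting point (a sketch only, under the assumption that euclidean distance is turned into a similarity via 1 / (1 + distance)): the pairwise distances can be derived from the same scalar product matrix that the cosine branch already computes.

# Hypothetical 'euclidean' branch for the fit method, next to 'cosine' and 'pearson':
# ||u - v||^2 = ||u||^2 - 2 u.v + ||v||^2, taken from the scalar product matrix
n_uv = np.matmul(r, r.T)
sq_norms = np.diag(n_uv)
dist_sq = sq_norms[:, np.newaxis] - 2 * n_uv + sq_norms[np.newaxis, :]
distances = np.sqrt(np.maximum(dist_sq, 0.0))  # clip tiny negatives from rounding
similarities = 1.0 / (1.0 + distances)  # identical vectors get similarity 1
# fit would then continue with np.fill_diagonal(similarities, -1000) as for the other measures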

Task 2. Find the optimal number of neighbors for the User-Based Cosine Nearest Neighbors Recommender for $1 \leq \text{n\_neighbors} \leq 100$ under the train-test split evaluation scheme.

# Write your code here
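
A minimal grid-search sketch (assumes evaluate_train_test_split_implicit as imported above; every evaluated setting retrains and re-evaluates the recommender, so the full sweep takes a while):

nn_results = []
for n_neighbors in range(1, 101):
    recommender = UserBasedCosineNearestNeighborsRecommender()
    recommender.initialize(n_neighbors=n_neighbors)
    metrics = evaluate_train_test_split_implicit(
        recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df)
    nn_results.append([n_neighbors] + list(metrics))

nn_results = pd.DataFrame(
    nn_results, columns=['n_neighbors', 'HR@1', 'HR@3', 'HR@5', 'HR@10',
                         'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
# Pick the setting with the best HR@10 (any other metric could be used instead)
print(nn_results.sort_values('HR@10', ascending=False).head(1))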