REK-proj-1/class_6_collaborative_filtering.ipynb

%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict

# Fix the dying kernel problem (only a problem in some installations; remove these lines if everything works without them)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

Load data

ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = [1, 318, 1193, 1208, 1214, 1721, 2959, 3578, 4306, 109487]

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

display(HTML(ml_movies_df.head(10).to_html()))

print("Number of interactions left: {}".format(len(ml_ratings_df)))
item_id title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
277 318 Shawshank Redemption, The (1994) Crime|Drama
896 1193 One Flew Over the Cuckoo's Nest (1975) Drama
909 1208 Apocalypse Now (1979) Action|Drama|War
915 1214 Alien (1979) Horror|Sci-Fi
1291 1721 Titanic (1997) Drama|Romance
2226 2959 Fight Club (1999) Action|Crime|Drama|Thriller
2674 3578 Gladiator (2000) Action|Adventure|Drama
3194 4306 Shrek (2001) Adventure|Animation|Children|Comedy|Fantasy|Romance
8376 109487 Interstellar (2014) Sci-Fi|IMAX
Number of interactions left: 1689

Shift item ids and user ids so that they are consecutive

interactions_df = ml_ratings_df.copy()

unique_item_ids = interactions_df['item_id'].unique()
item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)

display(HTML(interactions_df.head(10).to_html()))
user_id item_id rating timestamp
0 0 0 4.0 964982703
72 0 1 4.0 964983250
75 0 2 4.0 964981855
192 0 3 5.0 964983282
219 0 4 5.0 964980668
232 1 5 3.0 1445714835
235 1 4 4.0 1445714885
255 1 6 3.0 1445715145
458 2 3 2.0 945078528
516 3 0 4.0 847434962
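
The reverse mappings translate the consecutive internal ids back into the original MovieLens ids, which is needed whenever recommendations are reported to the outside world. A quick round-trip sketch (the concrete id printed below depends on this run's data):

# Internal item id 0 corresponds to the first item id encountered in the ratings
original_id = item_id_reverse_mapping[0]
print(original_id)
# Mapping the original id forward again recovers the internal id
print(item_id_mapping[original_id])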

Get the number of items and users

n_items = np.max(interactions_df['item_id']) + 1
n_users = np.max(interactions_df['user_id']) + 1

print("n_items={}\nn_users={}".format(n_items, n_users))
n_items=10
n_users=521

Get the user-item interaction matrix

# mapping to int is necessary because of how iterrows works
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
    r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
    
print(r[:10, :10])
[[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
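
The iterrows loop is easy to read but slow on large frames. A vectorized sketch using numpy integer-array indexing (assuming interactions_df, n_users and n_items as defined above) builds the same matrix in one shot:

# Set all (user_id, item_id) cells at once via fancy indexing
r_alt = np.zeros(shape=(n_users, n_items))
r_alt[interactions_df['user_id'].values, interactions_df['item_id'].values] = 1
print(np.array_equal(r, r_alt))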

Calculate cosine similarities of users

$$ \text{Sim}(\vec{u}, \vec{v}) = \text{Cos}(\vec{u}, \vec{v}) = \frac{\vec{u} \cdot \vec{v}}{\lVert u \rVert \lVert v \rVert} = \frac{\sum_{i = 1}^n u_i v_i}{\sqrt{\sum_{i = 1}^n u_i^2} \sqrt{\sum_{i = 1}^n v_i^2}} $$

For binary interaction vectors cosine similarity ranges from 0 to 1: a value of 1 means that both vectors are identical, while 0 means that they have no 1's in common.

def cosine(u, v):
    return np.sum(u * v) / np.sqrt(np.sum(u * u) * np.sum(v * v))

print(cosine(np.array([1, 0, 1, 0]), np.array([1, 0, 0, 0])))
print(cosine(np.array([1, 0, 1, 0]), np.array([1, 0, 1, 0])))
print(cosine(np.array([1, 0, 1, 0]), np.array([0, 1, 0, 1])))

0.7071067811865475
1.0
0.0
print("Cosine similarity between user 0 and 1")
print(r[0])
print(r[1])
print(cosine(r[0], r[1]))
Cosine similarity between user 0 and 1
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
0.2581988897471611
print("Cosine similarity between user 0 and 5")
print(r[0])
print(r[5])
print(cosine(r[0], r[5]))
Cosine similarity between user 0 and 5
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
0.6708203932499369

Calculate Pearson similarities of users

$$ \text{Sim}(\vec{u}, \vec{v}) = \text{Pearson}(\vec{u}, \vec{v}) = \frac{\sum_{i = 1}^n (u_i - \bar{u}) (v_i - \bar{v})}{\sqrt{\sum_{i = 1}^n (u_i - \bar{u})^2} \sqrt{\sum_{i = 1}^n (v_i - \bar{v})^2}} $$

Correlation ranges from -1 to 1: a correlation of 1 means the vectors are perfectly positively correlated (for binary vectors, identical), while -1 means they are exact opposites.

def pearson(u, v):
    # len(u) * std(u) * std(v) equals the product of the centered vectors' norms
    return np.sum((u - np.mean(u)) * (v - np.mean(v))) / (len(u) * np.std(u) * np.std(v))

print(pearson(np.array([1, 0, 1, 0]), np.array([1, 0, 0, 0])))
print(pearson(np.array([1, 0, 1, 0]), np.array([1, 0, 1, 0])))
print(pearson(np.array([1, 0, 1, 0]), np.array([0, 1, 0, 1])))
0.5773502691896258
1.0
-1.0
print("Pearson similarity between user 0 and 1")
print(r[0])
print(r[1])
print(pearson(r[0], r[1]))
Pearson similarity between user 0 and 1
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
-0.2182178902359924
print("Pearson similarity between user 0 and 5")
print(r[0])
print(r[5])
print(pearson(r[0], r[5]))
Pearson similarity between user 0 and 5
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[1. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
0.40824829046386296

All cosine similarities

n_uv = np.matmul(r, r.T)

norms = np.sqrt(np.diag(n_uv))

cos_sim = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]

print("Scalar products")
print(n_uv[:10, :10])
print()

print("Norms")
print(np.around(norms[:10], 3))
print()

print("Cosine similarities")
print(np.around(cos_sim[:10, :10], 3))
Scalar products
[[5. 1. 1. 1. 0. 3. 0. 2. 0. 0.]
 [1. 3. 0. 1. 1. 1. 1. 2. 1. 0.]
 [1. 0. 1. 0. 0. 0. 0. 1. 0. 0.]
 [1. 1. 0. 2. 1. 1. 1. 0. 1. 0.]
 [0. 1. 0. 1. 1. 0. 1. 0. 1. 0.]
 [3. 1. 0. 1. 0. 4. 0. 2. 0. 0.]
 [0. 1. 0. 1. 1. 0. 1. 0. 1. 0.]
 [2. 2. 1. 0. 0. 2. 0. 4. 0. 0.]
 [0. 1. 0. 1. 1. 0. 1. 0. 2. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]

Norms
[2.236 1.732 1.    1.414 1.    2.    1.    2.    1.414 1.   ]

Cosine similarities
[[1.    0.258 0.447 0.316 0.    0.671 0.    0.447 0.    0.   ]
 [0.258 1.    0.    0.408 0.577 0.289 0.577 0.577 0.408 0.   ]
 [0.447 0.    1.    0.    0.    0.    0.    0.5   0.    0.   ]
 [0.316 0.408 0.    1.    0.707 0.354 0.707 0.    0.5   0.   ]
 [0.    0.577 0.    0.707 1.    0.    1.    0.    0.707 0.   ]
 [0.671 0.289 0.    0.354 0.    1.    0.    0.5   0.    0.   ]
 [0.    0.577 0.    0.707 1.    0.    1.    0.    0.707 0.   ]
 [0.447 0.577 0.5   0.    0.    0.5   0.    1.    0.    0.   ]
 [0.    0.408 0.    0.5   0.707 0.    0.707 0.    1.    0.707]
 [0.    0.    0.    0.    0.    0.    0.    0.    0.707 1.   ]]
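
As a sanity check (a sketch reusing the cosine function defined earlier), individual entries of the matrix should agree with the pairwise computation:

print(np.isclose(cos_sim[0, 1], cosine(r[0], r[1])))
print(np.isclose(cos_sim[0, 5], cosine(r[0], r[5])))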

All Pearson similarities

r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)

n_uv = np.matmul(r_shifted, r_shifted.T)

norms = np.sqrt(np.diag(n_uv))

norms[norms == 0] = 0.000001  # avoid division by zero for users with constant rows

pearson_sim = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]

print("Scalar products")
print(np.around(n_uv[:10, :10], 3))
print()

print("Norms")
print(np.around(norms[:10], 3))
print()

print("Pearson similarities")
print(np.around(pearson_sim[:10, :10], 3))
Scalar products
[[ 2.5 -0.5  0.5 -0.  -0.5  1.  -0.5 -0.  -1.  -0.5]
 [-0.5  2.1 -0.3  0.4  0.7 -0.2  0.7  0.8  0.4 -0.3]
 [ 0.5 -0.3  0.9 -0.2 -0.1 -0.4 -0.1  0.6 -0.2 -0.1]
 [-0.   0.4 -0.2  1.6  0.8  0.2  0.8 -0.8  0.6 -0.2]
 [-0.5  0.7 -0.1  0.8  0.9 -0.4  0.9 -0.4  0.8 -0.1]
 [ 1.  -0.2 -0.4  0.2 -0.4  2.4 -0.4  0.4 -0.8 -0.4]
 [-0.5  0.7 -0.1  0.8  0.9 -0.4  0.9 -0.4  0.8 -0.1]
 [-0.   0.8  0.6 -0.8 -0.4  0.4 -0.4  2.4 -0.8 -0.4]
 [-1.   0.4 -0.2  0.6  0.8 -0.8  0.8 -0.8  1.6  0.8]
 [-0.5 -0.3 -0.1 -0.2 -0.1 -0.4 -0.1 -0.4  0.8  0.9]]

Norms
[1.581 1.449 0.949 1.265 0.949 1.549 0.949 1.549 1.265 0.949]

Pearson similarities
[[ 1.    -0.218  0.333 -0.    -0.333  0.408 -0.333 -0.    -0.5   -0.333]
 [-0.218  1.    -0.218  0.218  0.509 -0.089  0.509  0.356  0.218 -0.218]
 [ 0.333 -0.218  1.    -0.167 -0.111 -0.272 -0.111  0.408 -0.167 -0.111]
 [-0.     0.218 -0.167  1.     0.667  0.102  0.667 -0.408  0.375 -0.167]
 [-0.333  0.509 -0.111  0.667  1.    -0.272  1.    -0.272  0.667 -0.111]
 [ 0.408 -0.089 -0.272  0.102 -0.272  1.    -0.272  0.167 -0.408 -0.272]
 [-0.333  0.509 -0.111  0.667  1.    -0.272  1.    -0.272  0.667 -0.111]
 [-0.     0.356  0.408 -0.408 -0.272  0.167 -0.272  1.    -0.408 -0.272]
 [-0.5    0.218 -0.167  0.375  0.667 -0.408  0.667 -0.408  1.     0.667]
 [-0.333 -0.218 -0.111 -0.167 -0.111 -0.272 -0.111 -0.272  0.667  1.   ]]
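
The matrix can also be cross-checked against numpy's built-in implementation. Note that np.corrcoef yields NaN for constant rows, whereas the norm clamping above yields 0 there, so those entries are masked out in this sketch:

ref_sim = np.corrcoef(r)  # Pearson correlations between the rows of r
mask = ~np.isnan(ref_sim)  # skip entries involving constant rows
print(np.allclose(pearson_sim[mask], ref_sim[mask]))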

Calculate scores of all items for user 0

Find n closest neighbors

np.fill_diagonal(cos_sim, -1)  # exclude each user from their own neighborhood

user_id = 0
n_neighbors = 10

neighbor_ids = np.argsort(-cos_sim[user_id])[:n_neighbors]

print("Nearest neighbors")
print(neighbor_ids)
print()

print("User {}".format(user_id))
print(r[user_id])
print()
print("User 138")
print(r[138])
print()
print("User 387")
print(r[387])
print()
print("User 240")
print(r[240])
Nearest neighbors
[138 387 240 399 513 285 473 172  24 270]

User 0
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]

User 138
[1. 1. 1. 1. 1. 0. 0. 1. 0. 0.]

User 387
[1. 1. 1. 1. 1. 0. 1. 0. 0. 1.]

User 240
[1. 1. 1. 1. 1. 1. 1. 0. 0. 0.]
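
A full argsort costs O(n log n) in the number of users. For large user bases np.argpartition selects the top-k neighbors in linear time; a sketch (the top-k then still has to be ordered by similarity explicitly):

# Select the n_neighbors highest similarities without sorting all users
top_k = np.argpartition(-cos_sim[user_id], n_neighbors)[:n_neighbors]
# argpartition returns the top-k in arbitrary order, so sort them by similarity
top_k = top_k[np.argsort(-cos_sim[user_id][top_k])]
print(top_k)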

Score all items

$$ \text{score(i)} = \frac{\sum_{v \in N(u)} \text{Sim}(u, v) \cdot v(i)}{\sum_{v \in N(u)} |\text{Sim}(u, v)|} $$

def score(similarities, v_i):
    # These similarities are cosines of non-negative vectors, so they are non-negative
    # and their plain sum equals the sum of absolute values in the formula above
    return np.sum(similarities * v_i) / np.sum(similarities)

item_id = 6

print("Interactions for nearest neighbors")
print(r[neighbor_ids])
print()

similarities = cos_sim[user_id][neighbor_ids]
print("similarities")
print(similarities)
print()

v_i = r[neighbor_ids][:, item_id]
print("v_i")
print(v_i)
print()

print("score for user_id={} and item_id={}".format(user_id, item_id))
print(score(similarities, v_i))
Interactions for nearest neighbors
[[1. 1. 1. 1. 1. 0. 0. 1. 0. 0.]
 [1. 1. 1. 1. 1. 0. 1. 0. 0. 1.]
 [1. 1. 1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 1. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0. 1. 1.]
 [1. 1. 1. 1. 1. 1. 0. 1. 0. 0.]
 [1. 1. 1. 1. 0. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]]

similarities
[0.91287093 0.84515425 0.84515425 0.84515425 0.84515425 0.84515425
 0.8        0.8        0.8        0.8       ]

v_i
[0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]

score for user_id=0 and item_id=6
0.202707883171415
print(r[user_id])

for i in range(10):
    similarities = cos_sim[user_id][neighbor_ids]
    v_i = r[neighbor_ids][:, i]
    print("score for user_id={} and item_id={}".format(user_id, i))
    print(round(score(similarities, v_i), 2))
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
score for user_id=0 and item_id=0
0.81
score for user_id=0 and item_id=1
1.0
score for user_id=0 and item_id=2
1.0
score for user_id=0 and item_id=3
1.0
score for user_id=0 and item_id=4
0.81
score for user_id=0 and item_id=5
0.69
score for user_id=0 and item_id=6
0.2
score for user_id=0 and item_id=7
0.21
score for user_id=0 and item_id=8
0.2
score for user_id=0 and item_id=9
0.2
# The same scoring with a single operation

item_ids = list(range(10))

v_i = r[neighbor_ids][:, item_ids]

scores = np.matmul(similarities, v_i) / np.sum(similarities)

print(scores)
[0.80812224 1.         1.         1.         0.80812224 0.68781735
 0.20270788 0.21082871 0.20270788 0.20270788]

Load a bigger dataset

ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

display(HTML(ml_movies_df.head(10).to_html()))

print("Number of interactions left: {}".format(len(ml_ratings_df)))
item_id title genres
118 145 Bad Boys (1995) Action|Comedy|Crime|Drama|Thriller
143 171 Jeffrey (1995) Comedy|Drama
194 228 Destiny Turns on the Radio (1995) Comedy
199 233 Exotica (1994) Drama
230 267 Major Payne (1995) Comedy
313 355 Flintstones, The (1994) Children|Comedy|Fantasy
379 435 Coneheads (1993) Comedy|Sci-Fi
419 481 Kalifornia (1993) Drama|Thriller
615 780 Independence Day (a.k.a. ID4) (1996) Action|Adventure|Sci-Fi|Thriller
737 959 Of Human Bondage (1934) Drama
Number of interactions left: 1170

Nearest neighbors recommender

from recommenders.recommender import Recommender

class NearestNeighborsRecommender(Recommender):
    """
    Nearest neighbors recommender supporting user-based and item-based collaborative filtering.

    Possible similarity measures:
        - 'cosine',
        - 'pearson'.
    """

    def __init__(self):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.r = None
        self.similarities = None
        self.most_popular_items = None

        self.collaboration_type = 'user'
        self.similarity_measure = 'cosine'
        self.n_neighbors = 10
        self.should_recommend_already_bought = False

    def initialize(self, **params):
        if 'n_neighbors' in params:
            self.n_neighbors = params['n_neighbors']
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns.
        """

        del users_df, items_df

        # Shift item ids and user ids so that they are consecutive

        unique_item_ids = interactions_df['item_id'].unique()
        self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
        self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
        self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

        interactions_df = interactions_df.copy()
        interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)

        # Get the number of items and users

        self.interactions_df = interactions_df
        n_items = np.max(interactions_df['item_id']) + 1
        n_users = np.max(interactions_df['user_id']) + 1

        # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
        r = np.zeros(shape=(n_users, n_items))
        for idx, interaction in interactions_df.iterrows():
            r[int(interaction['user_id'])][int(interaction['item_id'])] = 1

        if self.collaboration_type == 'item':
            r = r.T

        self.r = r

        # Calculate all similarities

        similarities = None
        if self.similarity_measure == 'cosine':
            n_uv = np.matmul(r, r.T)
            norms = np.sqrt(np.diag(n_uv))
            similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
        elif self.similarity_measure == 'pearson':
            r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)
            n_uv = np.matmul(r_shifted, r_shifted.T)
            norms = np.sqrt(np.diag(n_uv))
            norms[norms == 0] = 0.000001  # avoid division by zero for constant rows
            similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]

        np.fill_diagonal(similarities, -1000)  # exclude self-similarity from neighbor selection

        self.similarities = similarities

        # Find the most popular items for the cold start problem

        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]

        # Users not present in the training data are handled in the else branch below (most popular items)

        # Map item ids

        items_df = items_df.copy()
        items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
        items_df.replace({'item_id': self.item_id_mapping}, inplace=True)

        # Generate recommendations

        for idx, user in users_df.iterrows():
            recommendations = []

            user_id = user['user_id']

            if user_id in self.user_id_mapping:
                chosen_ids = []
                scores = []
                mapped_user_id = self.user_id_mapping[user_id]

                if self.collaboration_type == 'user':
                    neighbor_ids = np.argsort(-self.similarities[mapped_user_id])[:self.n_neighbors]
                    user_similarities = self.similarities[mapped_user_id][neighbor_ids]

                    # Sort so that positions in the score vector coincide with mapped item ids
                    # (assumes items_df contains all items seen during training)
                    item_ids = sorted(items_df['item_id'].tolist())

                    v_i = self.r[neighbor_ids][:, item_ids]

                    scores = np.matmul(user_similarities, v_i) / np.sum(user_similarities)

                    # Choose n recommendations based on highest scores
                    if not self.should_recommend_already_bought:
                        x_list = self.interactions_df.loc[
                            self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                        scores[x_list] = -1e100

                    chosen_ids = np.argsort(-scores)[:n_recommendations]

                elif self.collaboration_type == 'item':
                    x_list = self.interactions_df.loc[
                        self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                    scores = np.sum(self.similarities[x_list], axis=0)

                    # Choose n recommendations based on highest scores
                    if not self.should_recommend_already_bought:
                        scores[x_list] = -1e100

                    chosen_ids = np.argsort(-scores)[:n_recommendations]

                for item_id in chosen_ids:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[item_id],
                            'score': scores[item_id]
                        }
                    )
            else:  # For new users recommend most popular items
                for i in range(n_recommendations):
                    recommendations.append(
                        {
                            'user_id': user['user_id'],
                            'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
                            'score': 1.0
                        }
                    )

            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df
    

class UserBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'user'
        self.similarity_measure = 'cosine'
        
        
class UserBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'user'
        self.similarity_measure = 'pearson'
        
        
class ItemBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'item'
        self.similarity_measure = 'cosine'
        

class ItemBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
    
    def __init__(self):
        super().__init__()
        
        self.collaboration_type = 'item'
        self.similarity_measure = 'pearson'
# Quick test of the recommender

nearest_neighbors_recommender = NearestNeighborsRecommender()
nearest_neighbors_recommender.initialize(n_neighbors=20)
nearest_neighbors_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = nearest_neighbors_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))
Recommendations
user_id item_id score title genres
0 1 355 0.955688 Flintstones, The (1994) Children|Comedy|Fantasy
1 1 73323 0.291761 Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009) Action|Crime|Mystery
2 1 8605 0.280261 Taxi 3 (2003) Action|Comedy
3 1 3628 0.059450 Flying Tigers (1942) Action|Drama|Romance|War
4 1 6755 0.059450 Bubba Ho-tep (2002) Comedy|Horror
5 1 3165 0.051167 Boiling Point (1993) Action|Drama
6 1 4031 0.044312 All the Pretty Horses (2000) Drama|Romance|Western
7 1 1914 0.044312 Smoke Signals (1998) Comedy|Drama
8 1 7282 0.000000 Hip Hop Witch, Da (2000) Comedy|Horror|Thriller
9 1 2190 0.000000 Why Do Fools Fall In Love? (1998) Drama
10 4 4031 0.556855 All the Pretty Horses (2000) Drama|Romance|Western
11 4 73323 0.556855 Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009) Action|Crime|Mystery
12 4 355 0.098477 Flintstones, The (1994) Children|Comedy|Fantasy
13 4 2806 0.098477 Teaching Mrs. Tingle (1999) Comedy|Thriller
14 4 5673 0.056855 Punch-Drunk Love (2002) Comedy|Drama|Romance
15 4 3567 0.056855 Bossa Nova (2000) Comedy|Drama|Romance
16 4 145 0.049238 Bad Boys (1995) Action|Comedy|Crime|Drama|Thriller
17 4 3165 0.049238 Boiling Point (1993) Action|Drama
18 4 2275 0.000000 Six-String Samurai (1998) Action|Adventure|Sci-Fi
19 4 4483 0.000000 Caddyshack II (1988) Comedy
20 6 4896 0.653889 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) Adventure|Children|Fantasy
21 6 1914 0.242896 Smoke Signals (1998) Comedy|Drama
22 6 4031 0.045903 All the Pretty Horses (2000) Drama|Romance|Western
23 6 2275 0.045903 Six-String Samurai (1998) Action|Adventure|Sci-Fi
24 6 3567 0.045903 Bossa Nova (2000) Comedy|Drama|Romance
25 6 73323 0.045543 Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009) Action|Crime|Mystery
26 6 1500 0.042938 Grosse Pointe Blank (1997) Comedy|Crime|Romance
27 6 4483 0.000000 Caddyshack II (1988) Comedy
28 6 7282 0.000000 Hip Hop Witch, Da (2000) Comedy|Horror|Thriller
29 6 2190 0.000000 Why Do Fools Fall In Love? (1998) Drama

Training-test split evaluation

from evaluation_and_testing.testing import evaluate_train_test_split_implicit
ub_cos_nn_recommender = UserBasedCosineNearestNeighborsRecommender()
ub_cos_nn_recommender.initialize(n_neighbors=30)

ub_cos_nn_tts_results = [['UserBasedCosineNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ub_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ub_cos_nn_tts_results = pd.DataFrame(
    ub_cos_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ub_cos_nn_tts_results.to_html()))
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:146: RuntimeWarning: invalid value encountered in true_divide
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.103896 0.142857 0.227273 0.409091 0.103896 0.126777 0.161141 0.219215
ub_pearson_nn_recommender = UserBasedPearsonNearestNeighborsRecommender()
ub_pearson_nn_recommender.initialize(n_neighbors=30)

ub_pearson_nn_tts_results = [['UserBasedPearsonNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ub_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ub_pearson_nn_tts_results = pd.DataFrame(
    ub_pearson_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ub_pearson_nn_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedPearsonNearestNeighborsRecommender 0.103896 0.149351 0.227273 0.415584 0.103896 0.130024 0.162161 0.221924
ib_cos_nn_recommender = ItemBasedCosineNearestNeighborsRecommender()
ib_cos_nn_recommender.initialize(n_neighbors=30)

ib_cos_nn_tts_results = [['ItemBasedCosineNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ib_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ib_cos_nn_tts_results = pd.DataFrame(
    ib_cos_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ib_cos_nn_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 ItemBasedCosineNearestNeighborsRecommender 0.25974 0.545455 0.694805 0.954545 0.25974 0.421302 0.482493 0.566174
ib_pearson_nn_recommender = ItemBasedPearsonNearestNeighborsRecommender()
ib_pearson_nn_recommender.initialize(n_neighbors=30)

ib_pearson_nn_tts_results = [['ItemBasedPearsonNearestNeighborsRecommender'] + list(evaluate_train_test_split_implicit(
    ib_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

ib_pearson_nn_tts_results = pd.DataFrame(
    ib_pearson_nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ib_pearson_nn_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 ItemBasedPearsonNearestNeighborsRecommender 0.175325 0.350649 0.448052 0.558442 0.175325 0.27744 0.317397 0.352948
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

amazon_tts_results = pd.DataFrame(
    amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.181818 0.311688 0.402597 0.551948 0.181818 0.257806 0.294682 0.34147
from recommenders.tfidf_recommender import TFIDFRecommender

tfidf_recommender = TFIDFRecommender()

tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

tfidf_tts_results = pd.DataFrame(
    tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 TFIDFRecommender 0.025974 0.090909 0.136364 0.318182 0.025974 0.064393 0.083685 0.140799
tts_results = pd.concat([ub_cos_nn_tts_results, ub_pearson_nn_tts_results, ib_cos_nn_tts_results, 
                         ib_pearson_nn_tts_results, amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)
display(HTML(tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.103896 0.142857 0.227273 0.409091 0.103896 0.126777 0.161141 0.219215
1 UserBasedPearsonNearestNeighborsRecommender 0.103896 0.149351 0.227273 0.415584 0.103896 0.130024 0.162161 0.221924
2 ItemBasedCosineNearestNeighborsRecommender 0.259740 0.545455 0.694805 0.954545 0.259740 0.421302 0.482493 0.566174
3 ItemBasedPearsonNearestNeighborsRecommender 0.175325 0.350649 0.448052 0.558442 0.175325 0.277440 0.317397 0.352948
4 AmazonRecommender 0.181818 0.311688 0.402597 0.551948 0.181818 0.257806 0.294682 0.341470
5 TFIDFRecommender 0.025974 0.090909 0.136364 0.318182 0.025974 0.064393 0.083685 0.140799

Leave-one-out evaluation

from evaluation_and_testing.testing import evaluate_leave_one_out_implicit
ub_cos_nn_recommender = UserBasedCosineNearestNeighborsRecommender()
ub_cos_nn_recommender.initialize(n_neighbors=30)

ub_cos_nn_loo_results = [['UserBasedCosineNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ub_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ub_cos_nn_loo_results = pd.DataFrame(
    ub_cos_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ub_cos_nn_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.096667 0.146667 0.186667 0.306667 0.096667 0.124285 0.140782 0.178962
ub_pearson_nn_recommender = UserBasedPearsonNearestNeighborsRecommender()
ub_pearson_nn_recommender.initialize(n_neighbors=30)

ub_pearson_nn_loo_results = [['UserBasedPearsonNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ub_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ub_pearson_nn_loo_results = pd.DataFrame(
    ub_pearson_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ub_pearson_nn_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedPearsonNearestNeighborsRecommender 0.1 0.15 0.18 0.313333 0.1 0.127182 0.139518 0.181748
ib_cos_nn_recommender = ItemBasedCosineNearestNeighborsRecommender()
ib_cos_nn_recommender.initialize(n_neighbors=30)

ib_cos_nn_loo_results = [['ItemBasedCosineNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ib_cos_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ib_cos_nn_loo_results = pd.DataFrame(
    ib_cos_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ib_cos_nn_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 ItemBasedCosineNearestNeighborsRecommender 0.266667 0.42 0.513333 0.65 0.266667 0.357736 0.396033 0.440599
ib_pearson_nn_recommender = ItemBasedPearsonNearestNeighborsRecommender()
ib_pearson_nn_recommender.initialize(n_neighbors=30)

ib_pearson_nn_loo_results = [['ItemBasedPearsonNearestNeighborsRecommender'] + list(evaluate_leave_one_out_implicit(
    ib_pearson_nn_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

ib_pearson_nn_loo_results = pd.DataFrame(
    ib_pearson_nn_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(ib_pearson_nn_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 ItemBasedPearsonNearestNeighborsRecommender 0.173333 0.28 0.336667 0.42 0.173333 0.234522 0.257759 0.284723
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

amazon_loo_results = pd.DataFrame(
    amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.166667 0.256667 0.32 0.426667 0.166667 0.219086 0.245486 0.279978
tfidf_recommender = TFIDFRecommender()

tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

tfidf_loo_results = pd.DataFrame(
    tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 TFIDFRecommender 0.006667 0.053333 0.123333 0.233333 0.006667 0.033491 0.062178 0.096151
loo_results = pd.concat([ub_cos_nn_loo_results, ub_pearson_nn_loo_results, ib_cos_nn_loo_results, 
                         ib_pearson_nn_loo_results, amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)
display(HTML(loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 UserBasedCosineNearestNeighborsRecommender 0.096667 0.146667 0.186667 0.306667 0.096667 0.124285 0.140782 0.178962
1 UserBasedPearsonNearestNeighborsRecommender 0.100000 0.150000 0.180000 0.313333 0.100000 0.127182 0.139518 0.181748
2 ItemBasedCosineNearestNeighborsRecommender 0.266667 0.420000 0.513333 0.650000 0.266667 0.357736 0.396033 0.440599
3 ItemBasedPearsonNearestNeighborsRecommender 0.173333 0.280000 0.336667 0.420000 0.173333 0.234522 0.257759 0.284723
4 AmazonRecommender 0.166667 0.256667 0.320000 0.426667 0.166667 0.219086 0.245486 0.279978
5 TFIDFRecommender 0.006667 0.053333 0.123333 0.233333 0.006667 0.033491 0.062178 0.096151
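
Under both evaluation schemes the item-based cosine recommender performs best, clearly ahead of the Amazon-style and item-based Pearson recommenders, while the user-based variants only beat the content-based TFIDFRecommender.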

Tasks

Task 1. Add euclidean distance as an eligible similarity measure in the nearest neighbors recommender and compare the results of such a recommender to the other recommenders tested in this notebook.

# Write your code in the original class and tests here
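
One possible starting point (a sketch only, under the assumption that euclidean distance is turned into a similarity via 1 / (1 + distance)): the pairwise distances can be derived from the same scalar product matrix that the cosine branch already computes.

# Hypothetical 'euclidean' branch for the fit method, next to 'cosine' and 'pearson':
# ||u - v||^2 = ||u||^2 - 2 u.v + ||v||^2, taken from the scalar product matrix
n_uv = np.matmul(r, r.T)
sq_norms = np.diag(n_uv)
dist_sq = sq_norms[:, np.newaxis] - 2 * n_uv + sq_norms[np.newaxis, :]
distances = np.sqrt(np.maximum(dist_sq, 0.0))  # clip tiny negatives from rounding
similarities = 1.0 / (1.0 + distances)  # identical vectors get similarity 1
# fit would then continue with np.fill_diagonal(similarities, -1000) as for the other measures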

Task 2. Find the optimal number of neighbors for the User-Based Cosine Nearest Neighbors Recommender for $1 \leq \text{n\_neighbors} \leq 100$ under the train-test split evaluation scheme.

# Write your code here
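
A minimal grid-search sketch (assumes evaluate_train_test_split_implicit as imported above; every evaluated setting retrains and re-evaluates the recommender, so the full sweep takes a while):

nn_results = []
for n_neighbors in range(1, 101):
    recommender = UserBasedCosineNearestNeighborsRecommender()
    recommender.initialize(n_neighbors=n_neighbors)
    metrics = evaluate_train_test_split_implicit(
        recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df)
    nn_results.append([n_neighbors] + list(metrics))

nn_results = pd.DataFrame(
    nn_results, columns=['n_neighbors', 'HR@1', 'HR@3', 'HR@5', 'HR@10',
                         'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
# Pick the setting with the best HR@10 (any other metric could be used instead)
print(nn_results.sort_values('HR@10', ascending=False).head(1))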