# Self made simplified I-KNN

In [1]:
import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random

train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)

In [2]:
class IKNN():
    
    def fit(self, train_ui):
        self.train_ui=train_ui
        
        train_iu=train_ui.transpose()
        norms=np.linalg.norm(train_iu.A, axis=1) # here we compute lenth of each item ratings vector
        norms=np.vectorize(lambda x: max(x,1))(norms[:,None]) # to avoid dividing by zero

        normalized_train_iu=sparse.csr_matrix(train_iu/norms)

        self.similarity_matrix_ii=normalized_train_iu*normalized_train_iu.transpose()
        
        self.estimations=np.array(train_ui*self.similarity_matrix_ii/((train_ui>0)*self.similarity_matrix_ii))
        
    def recommend(self, user_code_id, item_code_id, topK=10):
        
        top_k = defaultdict(list)
        for nb_user, user in enumerate(self.estimations):
            
            user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]
            for item, score in enumerate(user):
                if item not in user_rated and not np.isnan(score):
                    top_k[user_code_id[nb_user]].append((item_code_id[item], score))
        result=[]
        # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)
        for uid, item_scores in top_k.items():
            item_scores.sort(key=lambda x: x[1], reverse=True)
            result.append([uid]+list(chain(*item_scores[:topK])))
        return result
    
    def estimate(self, user_code_id, item_code_id, test_ui):
        result=[]
        for user, item in zip(*test_ui.nonzero()):
            result.append([user_code_id[user], item_code_id[item], 
                           self.estimations[user,item] if not np.isnan(self.estimations[user,item]) else 1])
        return result

In [3]:
# toy example
toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])

toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \
toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)


model=IKNN()
model.fit(toy_train_ui)

print('toy train ui:')
display(toy_train_ui.A)

print('similarity matrix:')
display(model.similarity_matrix_ii.A)

print('estimations matrix:')
display(model.estimations)

model.recommend(toy_user_code_id, toy_item_code_id)

toy train ui:


array([[3, 4, 0, 0, 5, 0, 0, 4],
       [0, 1, 2, 3, 0, 0, 0, 0],
       [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)

similarity matrix:


array([[1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ],
       [0.9701425 , 1.        , 0.24253563, 0.12478355, 0.9701425 ,
        0.        , 0.        , 0.9701425 ],
       [0.        , 0.24253563, 1.        , 0.51449576, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.12478355, 0.51449576, 1.        , 0.        ,
        0.85749293, 0.85749293, 0.        ],
       [1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.        , 0.85749293, 0.        ,
        1.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.85749293, 0.        ,
        1.        , 1.        , 0.        ],
       [1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ]])

estimations matrix:


array([[4.        , 4.        , 4.        , 4.        , 4.        ,
               nan,        nan, 4.        ],
       [1.        , 1.35990333, 2.15478388, 2.53390319, 1.        ,
        3.        , 3.        , 1.        ],
       [       nan, 5.        , 5.        , 4.05248907,        nan,
        3.95012863, 3.95012863,        nan]])

[[0, 20, 4.0, 30, 4.0],
 [10, 50, 3.0, 60, 3.0, 0, 1.0, 40, 1.0, 70, 1.0],
 [20, 10, 5.0, 20, 5.0]]

In [4]:
model=IKNN()
model.fit(train_ui)

top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))

top_n.to_csv('Recommendations generated/ml-100k/Self_IKNN_reco.csv', index=False, header=False)

estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Self_IKNN_estimations.csv', index=False, header=False)

In [5]:
import evaluation_measures as ev
estimations_df=pd.read_csv('Recommendations generated/ml-100k/Self_IKNN_estimations.csv', header=None)
reco=np.loadtxt('Recommendations generated/ml-100k/Self_IKNN_reco.csv', delimiter=',')

ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None),
            estimations_df=estimations_df, 
            reco=reco,
            super_reactions=[4,5])

943it [00:00, 3162.40it/s]


Unnamed: 0,RMSE,MAE,precision,recall,F_1,F_05,precision_super,recall_super,NDCG,mAP,MRR,LAUC,HR,F_2,Whole_average,Reco in test,Test coverage,Shannon,Gini
0,1.018363,0.808793,0.000318,0.000108,0.00014,0.000189,0.0,0.0,0.000214,3.7e-05,0.000368,0.496391,0.003181,0.000118,0.041755,0.392153,0.11544,4.174741,0.965327


In [6]:
import imp
imp.reload(ev)

import evaluation_measures as ev
dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

ev.evaluate_all(test, dir_path, super_reactions)

943it [00:00, 3730.64it/s]
943it [00:00, 3921.13it/s]
943it [00:00, 3732.17it/s]
943it [00:00, 4078.27it/s]
943it [00:00, 2833.82it/s]
943it [00:00, 4027.94it/s]
943it [00:00, 4634.12it/s]
943it [00:00, 4453.36it/s]
943it [00:00, 4301.74it/s]
943it [00:00, 5008.94it/s]
943it [00:00, 3542.76it/s]
943it [00:00, 3280.94it/s]
943it [00:00, 3370.61it/s]
943it [00:00, 4467.43it/s]
943it [00:00, 3794.77it/s]
943it [00:00, 3759.22it/s]
943it [00:00, 4144.81it/s]
943it [00:00, 4232.41it/s]


Unnamed: 0,Model,RMSE,MAE,precision,recall,F_1,F_05,precision_super,recall_super,NDCG,mAP,MRR,LAUC,HR,F_2,Whole_average,Reco in test,Test coverage,Shannon,Gini
0,Self_RP3Beta,3.702928,3.527713,0.322694,0.216069,0.212152,0.247538,0.245279,0.284983,0.388271,0.248239,0.636318,0.605683,0.910923,0.20545,0.376967,0.999788,0.178932,4.549663,0.950182
0,Self_P3,3.702446,3.527273,0.282185,0.192092,0.186749,0.21698,0.204185,0.240096,0.339114,0.204905,0.572157,0.593544,0.875928,0.181702,0.340803,1.0,0.077201,3.875892,0.974947
0,Self_TopPop,2.508258,2.217909,0.188865,0.116919,0.118732,0.141584,0.130472,0.137473,0.214651,0.111707,0.400939,0.555546,0.765642,0.11275,0.249607,1.0,0.038961,3.159079,0.987317
0,Self_SVDBaseline,3.645666,3.480246,0.137858,0.082398,0.084151,0.101063,0.10794,0.109393,0.164477,0.082973,0.342374,0.538097,0.638388,0.07986,0.205748,0.999894,0.279221,5.159076,0.90722
0,Ready_SVD,0.952563,0.750158,0.094486,0.046274,0.051389,0.065625,0.082618,0.07415,0.10932,0.051383,0.240693,0.519849,0.47508,0.046237,0.154759,0.993425,0.206349,4.442996,0.952832
0,Self_SVD,0.91489,0.717962,0.102969,0.042325,0.052022,0.069313,0.093562,0.074994,0.105416,0.050278,0.191533,0.51789,0.462354,0.044591,0.150604,0.867656,0.141414,3.929249,0.971112
0,Ready_Baseline,0.949459,0.752487,0.09141,0.037652,0.04603,0.061286,0.079614,0.056463,0.095957,0.043178,0.198193,0.515501,0.437964,0.039549,0.1419,1.0,0.033911,2.836513,0.991139
0,Self_KNNSurprisetask,0.946255,0.745209,0.083457,0.032848,0.041227,0.055493,0.074785,0.04889,0.089577,0.040902,0.189057,0.513076,0.417815,0.034996,0.135177,0.888547,0.130592,3.611806,0.978659
0,Self_TopRated,2.508258,2.217909,0.079321,0.032667,0.039983,0.05317,0.068884,0.048582,0.070766,0.027602,0.11479,0.512943,0.411453,0.034385,0.124546,1.0,0.024531,2.761238,0.99166
0,Ready_SVDBiased,0.942141,0.74276,0.08123,0.032344,0.040302,0.053932,0.072639,0.051126,0.087552,0.039346,0.191285,0.512818,0.416755,0.034405,0.134478,0.997667,0.165224,4.147579,0.96469


# Ready-made KNNs - Surprise implementation

### I-KNN - basic

In [7]:
import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
algo = sp.KNNBasic(sim_options=sim_options)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNN_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_I-KNN_estimations.csv')

Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...


### U-KNN - basic

In [8]:
import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': True}  # compute similarities between users
algo = sp.KNNBasic(sim_options=sim_options)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_U-KNN_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_U-KNN_estimations.csv')

Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...


### I-KNN - on top baseline

In [9]:
import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
algo = sp.KNNBaseline()

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNNBaseline_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_I-KNNBaseline_estimations.csv')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...


# project task 4:  use a version of your choice of Surprise KNNalgorithm

In [11]:
# read the docs and try to find best parameter configuration (let say in terms of RMSE)
# https://surprise.readthedocs.io/en/stable/knn_inspired.html##surprise.prediction_algorithms.knns.KNNBaseline
# the solution here can be similar to examples above
# please save the output in 'Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv' and
# 'Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv'

## SOLUTION TASK 4

import helpers
import surprise as sp
import imp

imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}

algo = sp.KNNBaseline(sim_options = sim_options)

helpers.ready_made(algo, reco_path = 'Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv',
          estimations_path = 'Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv')

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...


In [12]:
import imp
imp.reload(ev)

import evaluation_measures as ev
dir_path = "Recommendations generated/ml-100k/"
super_reactions = [4, 5]
test = pd.read_csv('./Datasets/ml-100k/test.csv', sep = '\t', header = None)

ev.evaluate_all(test, dir_path, super_reactions)

943it [00:00, 2579.01it/s]
943it [00:00, 2473.28it/s]
943it [00:00, 2787.61it/s]
943it [00:00, 2862.03it/s]
943it [00:00, 2636.14it/s]
943it [00:00, 2764.91it/s]
943it [00:00, 2362.52it/s]
943it [00:00, 2446.87it/s]
943it [00:00, 2961.39it/s]
943it [00:00, 2858.86it/s]
943it [00:00, 2449.24it/s]
943it [00:00, 2748.70it/s]
943it [00:00, 2379.95it/s]
943it [00:00, 2599.51it/s]
943it [00:00, 2705.51it/s]
943it [00:00, 2574.33it/s]
943it [00:00, 2450.80it/s]
943it [00:00, 2242.87it/s]


Unnamed: 0,Model,RMSE,MAE,precision,recall,F_1,F_05,precision_super,recall_super,NDCG,mAP,MRR,LAUC,HR,F_2,Whole_average,Reco in test,Test coverage,Shannon,Gini
0,Self_RP3Beta,3.702928,3.527713,0.322694,0.216069,0.212152,0.247538,0.245279,0.284983,0.388271,0.248239,0.636318,0.605683,0.910923,0.20545,0.376967,0.999788,0.178932,4.549663,0.950182
0,Self_P3,3.702446,3.527273,0.282185,0.192092,0.186749,0.21698,0.204185,0.240096,0.339114,0.204905,0.572157,0.593544,0.875928,0.181702,0.340803,1.0,0.077201,3.875892,0.974947
0,Self_TopPop,2.508258,2.217909,0.188865,0.116919,0.118732,0.141584,0.130472,0.137473,0.214651,0.111707,0.400939,0.555546,0.765642,0.11275,0.249607,1.0,0.038961,3.159079,0.987317
0,Self_SVDBaseline,3.645666,3.480246,0.137858,0.082398,0.084151,0.101063,0.10794,0.109393,0.164477,0.082973,0.342374,0.538097,0.638388,0.07986,0.205748,0.999894,0.279221,5.159076,0.90722
0,Ready_SVD,0.952563,0.750158,0.094486,0.046274,0.051389,0.065625,0.082618,0.07415,0.10932,0.051383,0.240693,0.519849,0.47508,0.046237,0.154759,0.993425,0.206349,4.442996,0.952832
0,Self_SVD,0.91489,0.717962,0.102969,0.042325,0.052022,0.069313,0.093562,0.074994,0.105416,0.050278,0.191533,0.51789,0.462354,0.044591,0.150604,0.867656,0.141414,3.929249,0.971112
0,Ready_Baseline,0.949459,0.752487,0.09141,0.037652,0.04603,0.061286,0.079614,0.056463,0.095957,0.043178,0.198193,0.515501,0.437964,0.039549,0.1419,1.0,0.033911,2.836513,0.991139
0,Self_KNNSurprisetask,0.946255,0.745209,0.083457,0.032848,0.041227,0.055493,0.074785,0.04889,0.089577,0.040902,0.189057,0.513076,0.417815,0.034996,0.135177,0.888547,0.130592,3.611806,0.978659
0,Self_TopRated,2.508258,2.217909,0.079321,0.032667,0.039983,0.05317,0.068884,0.048582,0.070766,0.027602,0.11479,0.512943,0.411453,0.034385,0.124546,1.0,0.024531,2.761238,0.99166
0,Ready_SVDBiased,0.942141,0.74276,0.08123,0.032344,0.040302,0.053932,0.072639,0.051126,0.087552,0.039346,0.191285,0.512818,0.416755,0.034405,0.134478,0.997667,0.165224,4.147579,0.96469
