Self-made simplified I-KNN

import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random

# Load the ml-100k train/test splits: tab-separated files without a header row.
train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
# helpers.data_to_csr is defined outside this file; presumably it returns the
# user-item matrices in CSR form plus code<->id lookup tables for users and
# items -- TODO confirm against helpers.py.
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)

class IKNN():
    """Simplified item-based k-NN recommender built on cosine item similarity.

    Every item with non-zero similarity to a rated item acts as a neighbour;
    there is no cut-off on the number of neighbours.

    Attributes set by ``fit``:
        train_ui: the training matrix exactly as passed to ``fit``.
        similarity_matrix_ii: sparse item-item cosine similarity matrix.
        estimations: dense (users x items) float array of predicted ratings;
            NaN where none of the user's rated items is similar to the item.
    """

    def fit(self, train_ui):
        """Compute item-item similarities and the full estimation matrix.

        Args:
            train_ui: sparse user-item rating matrix. ``recommend`` reads its
                ``indices``/``indptr`` arrays, so it has to be in CSR format.
        """
        self.train_ui = train_ui

        # Float copy of the item-user matrix; copy=True guarantees that the
        # in-place normalisation below never touches the caller's data.
        item_user = sparse.csr_matrix(train_ui.transpose(), dtype=float, copy=True)

        # Euclidean length of every item's rating vector, computed sparsely.
        squared_sums = np.asarray(item_user.multiply(item_user).sum(axis=1)).ravel()
        norms = np.sqrt(squared_sums)
        # Items without ratings have zero length; dividing by 1 leaves their
        # (empty) rows untouched and avoids a division by zero.
        norms[norms == 0] = 1.0

        # Divide each stored value by the norm of the row it belongs to.
        item_user.data /= np.repeat(norms, np.diff(item_user.indptr))

        self.similarity_matrix_ii = item_user @ item_user.transpose()

        # Estimation = similarity-weighted mean of the ratings the user gave:
        # sum(rating * similarity) / sum(similarity over rated items).
        rated_mask = (train_ui > 0).astype(float)
        weighted_sum = (train_ui @ self.similarity_matrix_ii).toarray()
        weight_total = (rated_mask @ self.similarity_matrix_ii).toarray()
        # 0/0 is expected for items unrelated to anything the user rated;
        # those entries become NaN and are handled by recommend/estimate.
        with np.errstate(divide='ignore', invalid='ignore'):
            self.estimations = weighted_sum / weight_total

    def recommend(self, user_code_id, item_code_id, topK=10):
        """Return the ``topK`` best unrated items for every user.

        Args:
            user_code_id: lookup from internal user index to external user id.
            item_code_id: lookup from internal item index to external item id.
            topK: maximum number of items recommended per user.

        Returns:
            A list of rows ``[user, item1, score1, item2, score2, ...]`` sorted
            by descending score (ties keep ascending item-index order). Users
            without any recommendable item are omitted.
        """
        indptr = self.train_ui.indptr
        indices = self.train_ui.indices

        result = []
        for nb_user, scores in enumerate(self.estimations):
            # Candidates: items with a defined score the user has not rated.
            candidate = ~np.isnan(scores)
            candidate[indices[indptr[nb_user]:indptr[nb_user + 1]]] = False
            items = np.flatnonzero(candidate)
            if items.size == 0:
                continue

            # A stable sort on negated scores gives descending order while
            # keeping equal scores in their original item order.
            best = items[np.argsort(-scores[items], kind='stable')[:topK]]

            row = [user_code_id[nb_user]]
            for item in best:
                row.extend((item_code_id[int(item)], scores[item]))
            result.append(row)
        return result

    def estimate(self, user_code_id, item_code_id, test_ui):
        """Return predicted ratings for every stored non-zero entry of ``test_ui``.

        Args:
            user_code_id: lookup from internal user index to external user id.
            item_code_id: lookup from internal item index to external item id.
            test_ui: sparse user-item matrix whose non-zero entries are scored.

        Returns:
            A list of ``[user, item, estimation]`` rows; an undefined (NaN)
            estimation is replaced by 1.
        """
        result = []
        for user, item in zip(*test_ui.nonzero()):
            score = self.estimations[user, item]
            result.append([user_code_id[user], item_code_id[item],
                           1 if np.isnan(score) else score])
        return result

# toy example
# Toy example: a tiny dataset on which every intermediate matrix can be
# checked by hand.
toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])

toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \
toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)


model=IKNN()
model.fit(toy_train_ui)

# .toarray() is used instead of the `.A` shorthand because it works for both
# SciPy sparse matrices and sparse arrays.
print('toy train ui:')
display(toy_train_ui.toarray())

print('similarity matrix:')
display(model.similarity_matrix_ii.toarray())

print('estimations matrix:')
display(model.estimations)

# Bare expression: in a notebook the returned list is rendered as cell output.
model.recommend(toy_user_code_id, toy_item_code_id)

toy train ui:

array([[3, 4, 0, 0, 5, 0, 0, 4],
       [0, 1, 2, 3, 0, 0, 0, 0],
       [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)

similarity matrix:

array([[1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ],
       [0.9701425 , 1.        , 0.24253563, 0.12478355, 0.9701425 ,
        0.        , 0.        , 0.9701425 ],
       [0.        , 0.24253563, 1.        , 0.51449576, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.12478355, 0.51449576, 1.        , 0.        ,
        0.85749293, 0.85749293, 0.        ],
       [1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.        , 0.85749293, 0.        ,
        1.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.85749293, 0.        ,
        1.        , 1.        , 0.        ],
       [1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ]])

estimations matrix:

array([[4.        , 4.        , 4.        , 4.        , 4.        ,
               nan,        nan, 4.        ],
       [1.        , 1.35990333, 2.15478388, 2.53390319, 1.        ,
        3.        , 3.        , 1.        ],
       [       nan, 5.        , 5.        , 4.05248907,        nan,
        3.95012863, 3.95012863,        nan]])

[[0, 20, 4.0, 30, 4.0],
 [10, 50, 3.0, 60, 3.0, 0, 1.0, 40, 1.0, 70, 1.0],
 [20, 10, 5.0, 20, 5.0]]

# Train the self-made I-KNN on the ml-100k training matrix.
model=IKNN()
model.fit(train_ui)

# One row per user: user id followed by up to 10 (item, score) pairs.
top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))

# Written without index or header so the file contains only the numeric rows.
top_n.to_csv('Recommendations generated/ml-100k/Self_IKNN_reco.csv', index=False, header=False)

# One row per non-zero test entry: user id, item id, estimated rating.
estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Self_IKNN_estimations.csv', index=False, header=False)

# Evaluate the Self_IKNN files written by the previous cell.
import evaluation_measures as ev
estimations_df=pd.read_csv('Recommendations generated/ml-100k/Self_IKNN_estimations.csv', header=None)
# NOTE(review): np.loadtxt needs every row to hold the same number of numeric
# fields -- verify this holds if some user gets fewer than topK recommendations.
reco=np.loadtxt('Recommendations generated/ml-100k/Self_IKNN_reco.csv', delimiter=',')

# super_reactions presumably lists the ratings counted as strongly positive
# by the *_super metrics -- confirm in evaluation_measures.
ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None),
            estimations_df=estimations_df, 
            reco=reco,
            super_reactions=[4,5])

943it [00:00, 12078.80it/s]

	RMSE	MAE	precision	recall	F_1	F_05	precision_super	recall_super	NDCG	mAP	MRR	LAUC	HR	F_2	Whole_average	Reco in test	Test coverage	Shannon	Gini
0	1.018363	0.808793	0.000318	0.000108	0.00014	0.000189	0.0	0.0	0.000214	0.000037	0.000368	0.496391	0.003181	0.000118	0.041755	0.392153	0.11544	4.174741	0.965327

# Compare every model whose output files are stored in dir_path.
# importlib replaces the `imp` module, which was removed in Python 3.12.
# The module is imported before it is reloaded so the cell also runs on its own.
import importlib
import evaluation_measures as ev
importlib.reload(ev)  # pick up edits to evaluation_measures.py without restarting

dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

ev.evaluate_all(test, dir_path, super_reactions)

943it [00:00, 12078.88it/s]
943it [00:00, 11925.80it/s]
943it [00:00, 11925.88it/s]
943it [00:00, 11215.86it/s]
943it [00:00, 12561.94it/s]
943it [00:00, 12233.66it/s]
943it [00:00, 12396.66it/s]
943it [00:00, 12235.55it/s]
943it [00:00, 12233.85it/s]
943it [00:00, 12078.62it/s]
943it [00:00, 11925.44it/s]
943it [00:00, 12235.67it/s]
943it [00:00, 12235.44it/s]
943it [00:00, 11776.81it/s]
943it [00:00, 11489.54it/s]
943it [00:00, 11489.44it/s]

Model	RMSE	MAE	precision	recall	F_1	F_05	precision_super	recall_super	NDCG	mAP	MRR	LAUC	HR	F_2	Whole_average	Reco in test	Test coverage	Shannon	Gini
Self_TopPop	2.508258	2.217909	0.188865	0.116919	0.118732	0.141584	0.130472	0.137473	0.214651	0.111707	0.400939	0.555546	0.765642	0.112750	0.249607	1.000000	0.038961	3.159079	0.987317
Self_SVDBaseline	3.642454	3.477190	0.135101	0.078585	0.082031	0.099077	0.107189	0.105339	0.161370	0.081348	0.335256	0.536192	0.635207	0.077167	0.202822	0.999894	0.287157	5.167630	0.906365
Ready_SVD	0.950188	0.749989	0.096288	0.044814	0.052190	0.067156	0.085408	0.070906	0.105477	0.048037	0.225665	0.519108	0.488865	0.046118	0.154169	0.997667	0.213564	4.410143	0.952728
Self_SVD	0.914262	0.717023	0.104666	0.043109	0.052955	0.070403	0.095064	0.074470	0.107580	0.051132	0.198774	0.518287	0.479321	0.045457	0.153435	0.860233	0.140693	3.924150	0.971320
Ready_Baseline	0.949459	0.752487	0.091410	0.037652	0.046030	0.061286	0.079614	0.056463	0.095957	0.043178	0.198193	0.515501	0.437964	0.039549	0.141900	1.000000	0.033911	2.836513	0.991139
Ready_SVDBiased	0.940534	0.742068	0.081866	0.035675	0.041920	0.055096	0.072103	0.051976	0.089568	0.039835	0.195104	0.514486	0.429480	0.036595	0.136976	0.996925	0.165945	4.136236	0.965487
Self_GlobalAvg	1.125760	0.943534	0.061188	0.025968	0.031383	0.041343	0.040558	0.032107	0.067695	0.027470	0.171187	0.509546	0.384942	0.027213	0.118383	1.000000	0.025974	2.711772	0.992003
Ready_Random	1.525730	1.225537	0.045917	0.020462	0.023786	0.031070	0.026931	0.021781	0.051318	0.019634	0.132275	0.506747	0.316013	0.020936	0.101406	0.987275	0.183261	5.096275	0.908275
Ready_I-KNN	1.030386	0.813067	0.026087	0.006908	0.010593	0.016046	0.021137	0.009522	0.024214	0.008958	0.048068	0.499885	0.154825	0.008007	0.069521	0.402333	0.434343	5.133650	0.877999
Self_KNNSurprisetask	0.955921	0.754037	0.004984	0.003225	0.003406	0.003956	0.004506	0.003861	0.006815	0.002906	0.020332	0.497969	0.039236	0.003210	0.049534	0.587699	0.071429	2.699278	0.991353
Ready_I-KNNBaseline	0.935327	0.737424	0.002545	0.000755	0.001105	0.001602	0.002253	0.000930	0.003444	0.001362	0.011760	0.496724	0.021209	0.000862	0.045379	0.482821	0.059885	2.232578	0.994487
Ready_U-KNN	1.023495	0.807913	0.000742	0.000205	0.000305	0.000449	0.000536	0.000198	0.000845	0.000274	0.002744	0.496441	0.007423	0.000235	0.042533	0.602121	0.010823	2.089186	0.995706
Self_TopRated	2.508258	2.217909	0.000954	0.000188	0.000298	0.000481	0.000644	0.000223	0.001043	0.000335	0.003348	0.496433	0.009544	0.000220	0.042809	0.699046	0.005051	1.945910	0.995669
Self_BaselineIU	0.958136	0.754051	0.000954	0.000188	0.000298	0.000481	0.000644	0.000223	0.001043	0.000335	0.003348	0.496433	0.009544	0.000220	0.042809	0.699046	0.005051	1.945910	0.995669
Self_BaselineUI	0.967585	0.762740	0.000954	0.000170	0.000278	0.000463	0.000644	0.000189	0.000752	0.000168	0.001677	0.496424	0.009544	0.000201	0.042622	0.600530	0.005051	1.803126	0.996380
Self_IKNN	1.018363	0.808793	0.000318	0.000108	0.000140	0.000189	0.000000	0.000000	0.000214	0.000037	0.000368	0.496391	0.003181	0.000118	0.041755	0.392153	0.115440	4.174741	0.965327

Ready-made KNNs - Surprise implementation

I-KNN - basic

# Item-based KNNBasic from Surprise with cosine similarity.
# importlib replaces the `imp` module, which was removed in Python 3.12.
import importlib
import helpers
import surprise as sp
importlib.reload(helpers)  # pick up edits to helpers.py without restarting

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
algo = sp.KNNBasic(sim_options=sim_options)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNN_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_I-KNN_estimations.csv')

Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...

U-KNN - basic

# User-based KNNBasic from Surprise with cosine similarity.
# importlib replaces the `imp` module, which was removed in Python 3.12.
import importlib
import helpers
import surprise as sp
importlib.reload(helpers)  # pick up edits to helpers.py without restarting

sim_options = {'name': 'cosine',
              'user_based': True}  # compute similarities between users
algo = sp.KNNBasic(sim_options=sim_options)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_U-KNN_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_U-KNN_estimations.csv')

Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...

I-KNN - on top baseline

# Item-based KNNBaseline from Surprise (k-NN on top of baseline estimates).
# importlib replaces the `imp` module, which was removed in Python 3.12.
import importlib
import helpers
import surprise as sp
importlib.reload(helpers)  # pick up edits to helpers.py without restarting

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
# sim_options has to be passed explicitly: without it KNNBaseline falls back
# to Surprise's default similarity settings instead of item-based cosine.
algo = sp.KNNBaseline(sim_options=sim_options)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNNBaseline_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_I-KNNBaseline_estimations.csv')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...

Project task 4: use a Surprise KNN algorithm of your choice

# Read the docs and try to find the best parameter configuration (let's say in terms of RMSE):
# https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline
# The solution here can be similar to the examples above.
# Please save the output in 'Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv' and
# 'Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv'.
# importlib replaces the `imp` module, which was removed in Python 3.12.
import importlib
import helpers
import surprise as sp
importlib.reload(helpers)  # pick up edits to helpers.py without restarting

sim_options = {'name': 'cosine',
              'user_based': False}

# Different configurations, tested manually.
# Columns: RMSE, MAE, precision, recall, F_1, F_05
#algo = sp.KNNBaseline(sim_options=sim_options)          0.946255  0.745209  0.083457  0.032848  0.041227  0.055493
#algo = sp.KNNBaseline(k=50, sim_options=sim_options)    0.943462  0.743119  0.083881  0.032691  0.041071  0.055468
#algo = sp.KNNBaseline(k=80, sim_options=sim_options)    0.941287  0.741588  0.083775  0.032891  0.041282  0.055585
#algo = sp.KNNWithMeans(sim_options=sim_options)         0.948685  0.744850  0.005620  0.002081  0.002779  0.003794
#algo = sp.KNNWithZScore(sim_options=sim_options)        0.950328  0.745109  0.003924  0.001466  0.001953  0.002652
# Not the lowest RMSE of the runs above (k=80 was slightly lower), but chosen
# as the best KNN variant overall.
algo = sp.KNNBaseline(sim_options=sim_options)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv')

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...

# Re-run the comparison of all models after the task 4 files were written.
# importlib replaces the `imp` module, which was removed in Python 3.12.
# The module is imported before it is reloaded so the cell also runs on its own.
import importlib
import evaluation_measures as ev
importlib.reload(ev)  # pick up edits to evaluation_measures.py without restarting

dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

ev.evaluate_all(test, dir_path, super_reactions)

943it [00:00, 12235.63it/s]
943it [00:00, 12128.25it/s]
943it [00:00, 11489.47it/s]
943it [00:00, 11216.05it/s]
943it [00:00, 11010.63it/s]
943it [00:00, 11489.57it/s]
943it [00:00, 11925.84it/s]
943it [00:00, 12235.67it/s]
943it [00:00, 11485.30it/s]
943it [00:00, 11351.15it/s]
943it [00:00, 11489.71it/s]
943it [00:00, 12396.62it/s]
943it [00:00, 11925.84it/s]
943it [00:00, 11776.81it/s]
943it [00:00, 11925.80it/s]
943it [00:00, 11776.77it/s]

Model	RMSE	MAE	precision	recall	F_1	F_05	precision_super	recall_super	NDCG	mAP	MRR	LAUC	HR	F_2	Whole_average	Reco in test	Test coverage	Shannon	Gini
Self_TopPop	2.508258	2.217909	0.188865	0.116919	0.118732	0.141584	0.130472	0.137473	0.214651	0.111707	0.400939	0.555546	0.765642	0.112750	0.249607	1.000000	0.038961	3.159079	0.987317
Self_SVDBaseline	3.642454	3.477190	0.135101	0.078585	0.082031	0.099077	0.107189	0.105339	0.161370	0.081348	0.335256	0.536192	0.635207	0.077167	0.202822	0.999894	0.287157	5.167630	0.906365
Ready_SVD	0.950188	0.749989	0.096288	0.044814	0.052190	0.067156	0.085408	0.070906	0.105477	0.048037	0.225665	0.519108	0.488865	0.046118	0.154169	0.997667	0.213564	4.410143	0.952728
Self_SVD	0.914262	0.717023	0.104666	0.043109	0.052955	0.070403	0.095064	0.074470	0.107580	0.051132	0.198774	0.518287	0.479321	0.045457	0.153435	0.860233	0.140693	3.924150	0.971320
Ready_Baseline	0.949459	0.752487	0.091410	0.037652	0.046030	0.061286	0.079614	0.056463	0.095957	0.043178	0.198193	0.515501	0.437964	0.039549	0.141900	1.000000	0.033911	2.836513	0.991139
Ready_SVDBiased	0.940534	0.742068	0.081866	0.035675	0.041920	0.055096	0.072103	0.051976	0.089568	0.039835	0.195104	0.514486	0.429480	0.036595	0.136976	0.996925	0.165945	4.136236	0.965487
Self_KNNSurprisetask	0.946255	0.745209	0.083457	0.032848	0.041227	0.055493	0.074785	0.048890	0.089577	0.040902	0.189057	0.513076	0.417815	0.034996	0.135177	0.888547	0.130592	3.611806	0.978659
Self_TopRated	2.508258	2.217909	0.079321	0.032667	0.039983	0.053170	0.068884	0.048582	0.070766	0.027602	0.114790	0.512943	0.411453	0.034385	0.124546	1.000000	0.024531	2.761238	0.991660
Self_GlobalAvg	1.125760	0.943534	0.061188	0.025968	0.031383	0.041343	0.040558	0.032107	0.067695	0.027470	0.171187	0.509546	0.384942	0.027213	0.118383	1.000000	0.025974	2.711772	0.992003
Ready_Random	1.525730	1.225537	0.045917	0.020462	0.023786	0.031070	0.026931	0.021781	0.051318	0.019634	0.132275	0.506747	0.316013	0.020936	0.101406	0.987275	0.183261	5.096275	0.908275
Ready_I-KNN	1.030386	0.813067	0.026087	0.006908	0.010593	0.016046	0.021137	0.009522	0.024214	0.008958	0.048068	0.499885	0.154825	0.008007	0.069521	0.402333	0.434343	5.133650	0.877999
Ready_I-KNNBaseline	0.935327	0.737424	0.002545	0.000755	0.001105	0.001602	0.002253	0.000930	0.003444	0.001362	0.011760	0.496724	0.021209	0.000862	0.045379	0.482821	0.059885	2.232578	0.994487
Ready_U-KNN	1.023495	0.807913	0.000742	0.000205	0.000305	0.000449	0.000536	0.000198	0.000845	0.000274	0.002744	0.496441	0.007423	0.000235	0.042533	0.602121	0.010823	2.089186	0.995706
Self_BaselineIU	0.958136	0.754051	0.000954	0.000188	0.000298	0.000481	0.000644	0.000223	0.001043	0.000335	0.003348	0.496433	0.009544	0.000220	0.042809	0.699046	0.005051	1.945910	0.995669
Self_BaselineUI	0.967585	0.762740	0.000954	0.000170	0.000278	0.000463	0.000644	0.000189	0.000752	0.000168	0.001677	0.496424	0.009544	0.000201	0.042622	0.600530	0.005051	1.803126	0.996380
Self_IKNN	1.018363	0.808793	0.000318	0.000108	0.000140	0.000189	0.000000	0.000000	0.000214	0.000037	0.000368	0.496391	0.003181	0.000118	0.041755	0.392153	0.115440	4.174741	0.965327

57 KiB Raw Permalink Blame History

Self made simplified I-KNN

Ready-made KNNs - Surprise implementation

I-KNN - basic

U-KNN - basic

I-KNN - on top baseline

project task 4: use a version of your choice of Surprise KNNalgorithm

57 KiB

Raw Permalink Blame History