workshops_recommender_systems/P3. k-nearest neighbours.ipynb

Self-made simplified I-KNN

import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random

train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)
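`helpers.data_to_csr` is not included in this notebook; a minimal sketch of what it presumably does (assuming the usual raw-id-to-consecutive-code recoding; the real helper may differ in details) could look like this:

import pandas as pd
import numpy as np
import scipy.sparse as sparse

def data_to_csr_sketch(train_read, test_read):
    # assumption: columns are raw user id, item id, rating, timestamp
    train_read.columns = ['user', 'item', 'rating', 'timestamp']
    test_read.columns = ['user', 'item', 'rating', 'timestamp']

    users = np.unique(pd.concat([train_read['user'], test_read['user']]))
    items = np.unique(pd.concat([train_read['item'], test_read['item']]))

    # two-way mappings between raw ids and consecutive matrix codes
    user_id_code = {uid: code for code, uid in enumerate(users)}
    user_code_id = dict(enumerate(users))
    item_id_code = {iid: code for code, iid in enumerate(items)}
    item_code_id = dict(enumerate(items))

    def to_csr(df):
        rows = df['user'].map(user_id_code)
        cols = df['item'].map(item_id_code)
        return sparse.csr_matrix((df['rating'], (rows, cols)),
                                 shape=(len(users), len(items)))

    return to_csr(train_read), to_csr(test_read), \
           user_code_id, user_id_code, item_code_id, item_id_code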
class IKNN():
    
    def fit(self, train_ui):
        self.train_ui=train_ui
        
        train_iu=train_ui.transpose()
        norms=np.linalg.norm(train_iu.A, axis=1) # here we compute the length of each item's ratings vector
        norms=np.vectorize(lambda x: max(x,1))(norms[:,None]) # clip to 1 to avoid dividing by zero

        normalized_train_iu=sparse.csr_matrix(train_iu/norms)

        self.similarity_matrix_ii=normalized_train_iu*normalized_train_iu.transpose()
        
        self.estimations=np.array(train_ui*self.similarity_matrix_ii/((train_ui>0)*self.similarity_matrix_ii))
        
    def recommend(self, user_code_id, item_code_id, topK=10):
        
        top_k = defaultdict(list)
        for nb_user, user in enumerate(self.estimations):
            
            user_rated=self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]
            for item, score in enumerate(user):
                if item not in user_rated and not np.isnan(score):
                    top_k[user_code_id[nb_user]].append((item_code_id[item], score))
        result=[]
        # Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)
        for uid, item_scores in top_k.items():
            item_scores.sort(key=lambda x: x[1], reverse=True)
            result.append([uid]+list(chain(*item_scores[:topK])))
        return result
    
    def estimate(self, user_code_id, item_code_id, test_ui):
        result=[]
        for user, item in zip(*test_ui.nonzero()):
            result.append([user_code_id[user], item_code_id[item], 
                           self.estimations[user,item] if not np.isnan(self.estimations[user,item]) else 1])
        return result
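The one-liner that computes `self.estimations` in `fit` implements a similarity-weighted average of the user's own ratings. Written out, with $s_{ji}$ the cosine similarity between items $j$ and $i$ from `similarity_matrix_ii`, the predicted rating is

$$\hat{r}_{ui} = \frac{\sum_{j} r_{uj}\, s_{ji}}{\sum_{j:\, r_{uj}>0} s_{ji}},$$

and an entry is NaN exactly when user $u$ has rated no item with non-zero similarity to $i$ (the denominator is then zero); `estimate` falls back to a rating of 1 in that case.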
# toy example
toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])

toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \
toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)


model=IKNN()
model.fit(toy_train_ui)

print('toy train ui:')
display(toy_train_ui.A)

print('similarity matrix:')
display(model.similarity_matrix_ii.A)

print('estimations matrix:')
display(model.estimations)

model.recommend(toy_user_code_id, toy_item_code_id)
toy train ui:
array([[3, 4, 0, 0, 5, 0, 0, 4],
       [0, 1, 2, 3, 0, 0, 0, 0],
       [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)
similarity matrix:
array([[1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ],
       [0.9701425 , 1.        , 0.24253563, 0.12478355, 0.9701425 ,
        0.        , 0.        , 0.9701425 ],
       [0.        , 0.24253563, 1.        , 0.51449576, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.12478355, 0.51449576, 1.        , 0.        ,
        0.85749293, 0.85749293, 0.        ],
       [1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.        , 0.85749293, 0.        ,
        1.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.85749293, 0.        ,
        1.        , 1.        , 0.        ],
       [1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ]])
estimations matrix:
array([[4.        , 4.        , 4.        , 4.        , 4.        ,
               nan,        nan, 4.        ],
       [1.        , 1.35990333, 2.15478388, 2.53390319, 1.        ,
        3.        , 3.        , 1.        ],
       [       nan, 5.        , 5.        , 4.05248907,        nan,
        3.95012863, 3.95012863,        nan]])
[[0, 20, 4.0, 30, 4.0],
 [10, 50, 3.0, 60, 3.0, 0, 1.0, 40, 1.0, 70, 1.0],
 [20, 10, 5.0, 20, 5.0]]
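As a sanity check, a single entry of the estimations matrix can be recomputed by hand from the toy matrices printed above (a minimal sketch; `model` and `toy_train_ui` come from the previous cells):

u, i = 0, 2  # user 0 has not rated item 2 in the toy train set

ratings = toy_train_ui.A[u]                 # user 0's rating row: [3, 4, 0, 0, 5, 0, 0, 4]
sims = model.similarity_matrix_ii.A[:, i]   # cosine similarities of every item to item 2
rated = ratings > 0                         # items user 0 actually rated

manual = (ratings * sims).sum() / sims[rated].sum()
print(manual, model.estimations[u, i])      # both print 4.0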
model=IKNN()
model.fit(train_ui)

top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))

top_n.to_csv('Recommendations generated/ml-100k/Self_IKNN_reco.csv', index=False, header=False)

estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Self_IKNN_estimations.csv', index=False, header=False)
import evaluation_measures as ev
estimations_df=pd.read_csv('Recommendations generated/ml-100k/Self_IKNN_estimations.csv', header=None)
reco=np.loadtxt('Recommendations generated/ml-100k/Self_IKNN_reco.csv', delimiter=',')

ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None),
            estimations_df=estimations_df, 
            reco=reco,
            super_reactions=[4,5])
943it [00:00, 7381.00it/s]
RMSE MAE precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR HR2 Reco in test Test coverage Shannon Gini
0 1.018363 0.808793 0.000318 0.000108 0.00014 0.000189 0.0 0.0 0.000214 0.000037 0.000368 0.496391 0.003181 0.0 0.392153 0.11544 4.174741 0.965327
import imp
imp.reload(ev)

import evaluation_measures as ev
dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

ev.evaluate_all(test, dir_path, super_reactions)
943it [00:00, 6244.78it/s]
943it [00:00, 6960.47it/s]
943it [00:00, 6090.17it/s]
943it [00:00, 6876.64it/s]
943it [00:00, 7185.17it/s]
943it [00:00, 6481.90it/s]
943it [00:00, 4245.42it/s]
943it [00:00, 6388.64it/s]
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR HR2 Reco in test Test coverage Shannon Gini
0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 0.555546 0.765642 0.492047 1.000000 0.038961 3.159079 0.987317
0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 0.515501 0.437964 0.239661 1.000000 0.033911 2.836513 0.991139
0 Self_GlobalAvg 1.125760 0.943534 0.061188 0.025968 0.031383 0.041343 0.040558 0.032107 0.067695 0.027470 0.171187 0.509546 0.384942 0.142100 1.000000 0.025974 2.711772 0.992003
0 Ready_Random 1.517593 1.220181 0.046023 0.019038 0.023118 0.030734 0.029292 0.021639 0.050818 0.019958 0.126646 0.506031 0.305408 0.111347 0.988547 0.174603 5.082383 0.908434
0 Self_TopRated 2.508258 2.217909 0.000954 0.000188 0.000298 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 0.496433 0.009544 0.000000 0.699046 0.005051 1.945910 0.995669
0 Self_BaselineIU 0.958136 0.754051 0.000954 0.000188 0.000298 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 0.496433 0.009544 0.000000 0.699046 0.005051 1.945910 0.995669
0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 0.496424 0.009544 0.000000 0.600530 0.005051 1.803126 0.996380
0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 0.496391 0.003181 0.000000 0.392153 0.115440 4.174741 0.965327

Ready-made KNNs - Surprise implementation

I-KNN - basic

import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
algo = sp.KNNBasic(sim_options=sim_options)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNN_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_I-KNN_estimations.csv')
Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...
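`helpers.ready_made` itself is not shown in this notebook; a minimal sketch of the Surprise workflow it presumably wraps (an assumption - the actual helper may differ in details) could look like this:

import surprise as sp

reader = sp.Reader(line_format='user item rating timestamp', sep='\t')
trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader).build_full_trainset()
testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader).build_full_trainset().build_testset()

algo = sp.KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)

predictions = algo.test(testset)               # rating estimations for the test pairs
anti_testset = trainset.build_anti_testset()   # all (user, item) pairs absent from the train set
candidates = algo.test(anti_testset)           # scores from which the top-N lists are built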

U-KNN - basic

import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': True}  # compute similarities between users
algo = sp.KNNBasic(sim_options=sim_options)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_U-KNN_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_U-KNN_estimations.csv')
Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...

I-KNN - on top of a baseline

import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
# note: sim_options is not passed to the constructor below, so KNNBaseline
# falls back to Surprise's defaults (MSD similarity, user_based=True)
algo = sp.KNNBaseline()

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNNBaseline_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_I-KNNBaseline_estimations.csv')

Project task 4: use a Surprise KNN algorithm of your choice

# read the docs and try to find the best parameter configuration (let's say in terms of RMSE)
# https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline
# the solution here can be similar to examples above
# please save the output in 'Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv' and
# 'Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv'
import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
# note: sim_options is not passed below, so KNNWithMeans uses Surprise's defaults
# (MSD similarity, user-based) - see the "msd" line in the log
algo = sp.KNNWithMeans()

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNNWithMeans_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_I-KNNWithMeans_estimations.csv')
Computing the msd similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...
import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
# note: sim_options is again not passed below, so KNNWithZScore uses Surprise's defaults (MSD, user-based)
algo = sp.KNNWithZScore()

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Ready_I-KNNWithZScore_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Ready_I-KNNWithZScore_estimations.csv')
Computing the msd similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...
import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}  # compute similarities between items
# note: sim_options is not passed to KNNBaseline below either, so Surprise defaults (MSD, user-based) apply

# sweep the neighbourhood size k from 38 to 47
for k in range(38, 48):
    path1 = "Recommendations generated/ml-100k/Self_I-KNNBaseline%d_reco.csv" % (k)
    path2 = "Recommendations generated/ml-100k/Self_I-KNNBaseline%d_estimations.csv" % (k)
    algo = sp.KNNBaseline(k=k)
    helpers.ready_made(algo, reco_path=path1,
          estimations_path=path2)
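An alternative to the manual sweep above is Surprise's built-in grid search (a sketch, assuming the same tab-separated train file; it reports cross-validated RMSE on the training data rather than going through the evaluation pipeline used elsewhere in this notebook, and the parameter grid is only illustrative):

import surprise as sp
from surprise.model_selection import GridSearchCV

reader = sp.Reader(line_format='user item rating timestamp', sep='\t')
data = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)

param_grid = {'k': [30, 40, 50],
              'sim_options': {'name': ['msd', 'cosine', 'pearson_baseline'],
                              'user_based': [False]}}

gs = GridSearchCV(sp.KNNBaseline, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

print(gs.best_score['rmse'])   # best cross-validated RMSE
print(gs.best_params['rmse'])  # the configuration that achieved it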
dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

result = ev.evaluate_all(test, dir_path, super_reactions)
943it [00:00, 6566.70it/s]
943it [00:00, 6053.18it/s]
943it [00:00, 6753.76it/s]
943it [00:00, 6451.06it/s]
943it [00:00, 3763.62it/s]
943it [00:00, 4634.14it/s]
943it [00:00, 6520.99it/s]
943it [00:00, 6061.07it/s]
943it [00:00, 5946.69it/s]
943it [00:00, 6520.59it/s]
943it [00:00, 4047.05it/s]
943it [00:00, 6061.15it/s]
943it [00:00, 6430.82it/s]
943it [00:00, 6519.56it/s]
943it [00:00, 6127.91it/s]
943it [00:00, 6220.07it/s]
943it [00:00, 6731.95it/s]
943it [00:00, 5617.04it/s]
943it [00:00, 5984.37it/s]
943it [00:00, 3923.26it/s]
943it [00:00, 4799.65it/s]
943it [00:00, 6678.60it/s]
943it [00:00, 5984.12it/s]
943it [00:00, 7217.79it/s]
943it [00:00, 4799.62it/s]
943it [00:00, 4799.67it/s]
943it [00:00, 6566.16it/s]
result.sort_values(by='RMSE')
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR HR2 Reco in test Test coverage Shannon Gini
0 Self_SVDBaseline 0.913253 0.719475 0.105090 0.043952 0.053454 0.070803 0.095279 0.073469 0.118152 0.058739 0.244096 0.518714 0.471898 0.279958 0.999682 0.111111 3.572421 0.980655
0 Self_SVD 0.914521 0.717680 0.102757 0.043043 0.052432 0.069515 0.094528 0.075122 0.106751 0.051431 0.198701 0.518248 0.462354 0.255567 0.854931 0.147186 3.888926 0.972044
0 Self_I-KNNBaseline42 0.935028 0.737210 0.002969 0.000980 0.001374 0.001929 0.002682 0.001217 0.004069 0.001677 0.013349 0.496838 0.023330 0.006363 0.481972 0.059163 2.227849 0.994531
0 Self_KNNSurprisetask 0.935028 0.737210 0.002969 0.000980 0.001374 0.001929 0.002682 0.001217 0.004069 0.001677 0.013349 0.496838 0.023330 0.006363 0.481972 0.059163 2.227849 0.994531
0 Self_I-KNNBaseline41 0.935205 0.737439 0.002651 0.000774 0.001138 0.001658 0.002361 0.000959 0.003537 0.001435 0.011494 0.496734 0.021209 0.005302 0.482503 0.057720 2.228123 0.994555
0 Self_I-KNNBaseline43 0.935241 0.737463 0.002863 0.000952 0.001331 0.001862 0.002575 0.001186 0.004014 0.001663 0.013467 0.496824 0.023330 0.005302 0.482609 0.055556 2.225996 0.994623
0 Self_I-KNNBaseline46 0.935244 0.737512 0.003287 0.001096 0.001534 0.002148 0.003004 0.001376 0.004398 0.001856 0.013719 0.496898 0.024390 0.007423 0.482397 0.057720 2.225807 0.994607
0 Self_I-KNNBaseline44 0.935259 0.737530 0.002969 0.000902 0.001305 0.001880 0.002682 0.001129 0.004215 0.001823 0.013977 0.496799 0.023330 0.005302 0.482397 0.057720 2.225495 0.994598
0 Self_I-KNNBaseline45 0.935268 0.737543 0.003075 0.001044 0.001450 0.002016 0.002790 0.001317 0.004287 0.001812 0.014189 0.496871 0.024390 0.005302 0.482609 0.058442 2.225340 0.994599
0 Self_I-KNNBaseline47 0.935295 0.737563 0.003075 0.001044 0.001450 0.002016 0.002790 0.001317 0.004199 0.001735 0.013888 0.496871 0.024390 0.005302 0.482397 0.055556 2.221942 0.994676
0 Self_I-KNNBaseline40 0.935327 0.737424 0.002545 0.000755 0.001105 0.001602 0.002253 0.000930 0.003444 0.001362 0.011760 0.496724 0.021209 0.004242 0.482821 0.059885 2.232578 0.994487
0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 0.001105 0.001602 0.002253 0.000930 0.003444 0.001362 0.011760 0.496724 0.021209 0.004242 0.482821 0.059885 2.232578 0.994487
0 Self_I-KNNBaseline39 0.935520 0.737631 0.002757 0.000856 0.001230 0.001758 0.002468 0.001048 0.003899 0.001620 0.013296 0.496775 0.022269 0.005302 0.483351 0.059885 2.235102 0.994479
0 Self_I-KNNBaseline38 0.935685 0.737828 0.002651 0.000837 0.001197 0.001702 0.002361 0.001020 0.003635 0.001443 0.012589 0.496765 0.022269 0.004242 0.483245 0.059163 2.235851 0.994507
0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 0.515501 0.437964 0.239661 1.000000 0.033911 2.836513 0.991139
0 Ready_I-KNNWithMeans 0.955921 0.754037 0.004984 0.003225 0.003406 0.003956 0.004506 0.003861 0.006815 0.002906 0.020332 0.497969 0.039236 0.007423 0.587699 0.071429 2.699278 0.991353
0 Ready_I-KNNWithZScore 0.957701 0.752387 0.003712 0.001994 0.002380 0.002919 0.003433 0.002401 0.005137 0.002158 0.016458 0.497349 0.027572 0.007423 0.389926 0.067821 2.475747 0.992793
0 Self_BaselineIU 0.958136 0.754051 0.000954 0.000188 0.000298 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 0.496433 0.009544 0.000000 0.699046 0.005051 1.945910 0.995669
0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 0.496424 0.009544 0.000000 0.600530 0.005051 1.803126 0.996380
0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 0.000189 0.000000 0.000000 0.000214 0.000037 0.000368 0.496391 0.003181 0.000000 0.392153 0.115440 4.174741 0.965327
0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 0.000305 0.000449 0.000536 0.000198 0.000845 0.000274 0.002744 0.496441 0.007423 0.000000 0.602121 0.010823 2.089186 0.995706
0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 0.010593 0.016046 0.021137 0.009522 0.024214 0.008958 0.048068 0.499885 0.154825 0.072110 0.402333 0.434343 5.133650 0.877999
0 Self_GlobalAvg 1.125760 0.943534 0.061188 0.025968 0.031383 0.041343 0.040558 0.032107 0.067695 0.027470 0.171187 0.509546 0.384942 0.142100 1.000000 0.025974 2.711772 0.992003
0 Ready_Random 1.517593 1.220181 0.046023 0.019038 0.023118 0.030734 0.029292 0.021639 0.050818 0.019958 0.126646 0.506031 0.305408 0.111347 0.988547 0.174603 5.082383 0.908434
0 Self_TopRated 2.508258 2.217909 0.000954 0.000188 0.000298 0.000481 0.000644 0.000223 0.001043 0.000335 0.003348 0.496433 0.009544 0.000000 0.699046 0.005051 1.945910 0.995669
0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 0.141584 0.130472 0.137473 0.214651 0.111707 0.400939 0.555546 0.765642 0.492047 1.000000 0.038961 3.159079 0.987317
0 Self_P3 3.702446 3.527273 0.282185 0.192092 0.186749 0.216980 0.204185 0.240096 0.339114 0.204905 0.572157 0.593544 0.875928 0.685048 1.000000 0.077201 3.875892 0.974947
import helpers
import surprise as sp
import imp
imp.reload(helpers)

sim_options = {'name': 'cosine',
              'user_based': False}
# note: sim_options is not passed below, so this run uses Surprise's defaults (MSD similarity,
# user-based), as the "msd" line in the log confirms; only the neighbourhood size k is changed
algo = sp.KNNBaseline(k=42)

helpers.ready_made(algo, reco_path='Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv',
          estimations_path='Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv')
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...