Self-made simplified I-KNN

import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random

# Read the ml-100k train/test splits: tab-separated files without a header row.
train_read = pd.read_csv("./Datasets/ml-100k/train.csv", header=None, sep="\t")
test_read = pd.read_csv("./Datasets/ml-100k/test.csv", header=None, sep="\t")

# helpers.data_to_csr returns six values, unpacked here in order.
# NOTE(review): judging by the names these are the train/test user-item
# matrices plus code<->id lookups for users and items -- confirm in helpers.
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = (
    helpers.data_to_csr(train_read, test_read)
)

class IKNN:
    """Simplified item-based k-nearest-neighbours recommender.

    Items are compared with cosine similarity of their rating vectors and a
    user's score for an item is the similarity-weighted average of the ratings
    the user already gave. All items act as neighbours (there is no cut-off
    to the k most similar ones).

    Attributes set by ``fit``:
        train_ui: the training matrix exactly as passed in.
        similarity_matrix_ii: sparse item x item similarity matrix.
        estimations: dense user x item array of predicted scores; ``nan``
            where the weights of the user's rated items sum to zero.
    """

    def fit(self, train_ui):
        """Compute item-item similarities and the full estimation matrix.

        Args:
            train_ui: scipy sparse user x item rating matrix. ``recommend``
                later reads its ``indptr``/``indices``, so it has to be in
                CSR format.
        """
        self.train_ui = train_ui

        # Dense item x user view. ``toarray()`` is the supported spelling of
        # the deprecated ``.A`` shortcut.
        train_iu = train_ui.transpose().toarray()

        # Length of each item's rating vector.
        norms = np.linalg.norm(train_iu, axis=1)
        # Clamp to 1 to avoid dividing by zero for items without ratings.
        # np.maximum keeps the float dtype; np.vectorize(max) would infer an
        # integer dtype (and truncate every norm) whenever the first item has
        # norm 0.
        norms = np.maximum(norms, 1)[:, None]

        normalized_train_iu = sparse.csr_matrix(train_iu / norms)

        # Rows are unit vectors (or all-zero), so the product of the matrix
        # with its transpose holds the pairwise cosine similarities.
        self.similarity_matrix_ii = (
            normalized_train_iu @ normalized_train_iu.transpose()
        )

        # Numerator: ratings weighted by similarity.
        # Denominator: sum of the similarities of the items the user rated.
        rated_mask = (train_ui > 0).astype(np.float64)
        weighted_ratings = (train_ui @ self.similarity_matrix_ii).toarray()
        weight_sums = (rated_mask @ self.similarity_matrix_ii).toarray()

        # A zero weight sum means the item is similar to nothing the user
        # rated: mark it as nan instead of emitting divide warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            self.estimations = np.where(
                weight_sums != 0, weighted_ratings / weight_sums, np.nan
            )

    def recommend(self, user_code_id, item_code_id, topK=10):
        """Return the ``topK`` best-scored unseen items for every user.

        Args:
            user_code_id: mapping from matrix row index to the user id to emit.
            item_code_id: mapping from matrix column index to the item id to emit.
            topK: maximum number of recommendations per user.

        Returns:
            A list of rows in the format
            ``[user, item1, score1, item2, score2, ...]`` ordered by
            descending score. Users without any scorable unseen item are
            left out.
        """
        indptr = self.train_ui.indptr
        indices = self.train_ui.indices

        result = []
        for nb_user, user_scores in enumerate(self.estimations):
            # Set lookup instead of scanning the index array for every item.
            user_rated = set(indices[indptr[nb_user] : indptr[nb_user + 1]].tolist())

            item_scores = [
                (item_code_id[item], score)
                for item, score in enumerate(user_scores)
                if item not in user_rated and not np.isnan(score)
            ]
            if not item_scores:
                continue

            # list.sort is stable, so ties keep ascending item order.
            item_scores.sort(key=lambda pair: pair[1], reverse=True)
            result.append(
                [user_code_id[nb_user]] + list(chain(*item_scores[:topK]))
            )
        return result

    def estimate(self, user_code_id, item_code_id, test_ui):
        """Predict a score for every nonzero (user, item) pair of ``test_ui``.

        Args:
            user_code_id: mapping from matrix row index to the user id to emit.
            item_code_id: mapping from matrix column index to the item id to emit.
            test_ui: sparse user x item matrix whose nonzero entries select
                the pairs to estimate.

        Returns:
            A list of ``[user, item, score]`` rows. Pairs whose estimation is
            ``nan`` get the fallback score ``1``.
        """
        result = []
        for user, item in zip(*test_ui.nonzero()):
            score = self.estimations[user, item]
            result.append(
                [
                    user_code_id[user],
                    item_code_id[item],
                    1 if np.isnan(score) else score,
                ]
            )
        return result

# toy example
# Both toy files are tab-separated without a header row, so the four column
# names are supplied explicitly.
toy_train_read = pd.read_csv(
    "./Datasets/toy-example/train.csv",
    names=["user", "item", "rating", "timestamp"],
    header=None,
    sep="\t",
)
toy_test_read = pd.read_csv(
    "./Datasets/toy-example/test.csv",
    names=["user", "item", "rating", "timestamp"],
    header=None,
    sep="\t",
)

# Same six-value unpacking as for the ml-100k data, with a toy_ prefix.
toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, toy_item_code_id, toy_item_id_code = (
    helpers.data_to_csr(toy_train_read, toy_test_read)
)


# Fit the model on the toy data and show each intermediate matrix.
model = IKNN()
model.fit(toy_train_ui)

# `.toarray()` is the supported way to densify a sparse matrix; the `.A`
# shortcut is deprecated in SciPy.
print("toy train ui:")
display(toy_train_ui.toarray())

print("similarity matrix:")
display(model.similarity_matrix_ii.toarray())

print("estimations matrix:")
display(model.estimations)

# Last expression of the cell: the notebook renders the returned list.
model.recommend(toy_user_code_id, toy_item_code_id)

toy train ui:

array([[3, 4, 0, 0, 5, 0, 0, 4],
       [0, 1, 2, 3, 0, 0, 0, 0],
       [0, 0, 0, 5, 0, 3, 4, 0]])

similarity matrix:

array([[1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ],
       [0.9701425 , 1.        , 0.24253563, 0.12478355, 0.9701425 ,
        0.        , 0.        , 0.9701425 ],
       [0.        , 0.24253563, 1.        , 0.51449576, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.12478355, 0.51449576, 1.        , 0.        ,
        0.85749293, 0.85749293, 0.        ],
       [1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.        , 0.85749293, 0.        ,
        1.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.85749293, 0.        ,
        1.        , 1.        , 0.        ],
       [1.        , 0.9701425 , 0.        , 0.        , 1.        ,
        0.        , 0.        , 1.        ]])

estimations matrix:

array([[4.        , 4.        , 4.        , 4.        , 4.        ,
               nan,        nan, 4.        ],
       [1.        , 1.35990333, 2.15478388, 2.53390319, 1.        ,
        3.        , 3.        , 1.        ],
       [       nan, 5.        , 5.        , 4.05248907,        nan,
        3.95012863, 3.95012863,        nan]])

[[0, 20, 4.0, 30, 4.0],
 [10, 50, 3.0, 60, 3.0, 0, 1.0, 40, 1.0, 70, 1.0],
 [20, 10, 5.0, 20, 5.0]]

# Train on the ml-100k training matrix.
model = IKNN()
model.fit(train_ui)

# Top-10 recommendations per user, written without index or header row.
top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))
top_n.to_csv(
    "Recommendations generated/ml-100k/Self_IKNN_reco.csv",
    header=False,
    index=False,
)

# Score estimates for the (user, item) pairs present in the test matrix.
estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv(
    "Recommendations generated/ml-100k/Self_IKNN_estimations.csv",
    header=False,
    index=False,
)

import evaluation_measures as ev

# Reload the two files written for Self_IKNN and hand them to the evaluator.
reco = np.loadtxt("Recommendations generated/ml-100k/Self_IKNN_reco.csv", delimiter=",")
estimations_df = pd.read_csv(
    "Recommendations generated/ml-100k/Self_IKNN_estimations.csv", header=None
)

ev.evaluate(
    test=pd.read_csv("./Datasets/ml-100k/test.csv", header=None, sep="\t"),
    estimations_df=estimations_df,
    reco=reco,
    super_reactions=[4, 5],
)

943it [00:00, 9004.71it/s]

	RMSE	MAE	precision	recall	F_1	F_05	precision_super	recall_super	NDCG	mAP	MRR	LAUC	HR	Reco in test	Test coverage	Shannon	Gini
0	1.018363	0.808793	0.000318	0.000108	0.00014	0.000189	0.0	0.0	0.000214	0.000037	0.000368	0.496391	0.003181	0.392153	0.11544	4.174741	0.965327

# Evaluate everything stored in the recommendations directory against the
# ml-100k test split.
test = pd.read_csv("./Datasets/ml-100k/test.csv", header=None, sep="\t")
super_reactions = [4, 5]
dir_path = "Recommendations generated/ml-100k/"

ev.evaluate_all(test, dir_path, super_reactions)

943it [00:00, 8517.83it/s]
943it [00:00, 11438.64it/s]
943it [00:00, 11933.36it/s]
943it [00:00, 10307.81it/s]
943it [00:00, 12250.41it/s]
943it [00:00, 12064.07it/s]

Model	RMSE	MAE	precision	recall	F_1	F_05	precision_super	recall_super	NDCG	mAP	MRR	LAUC	HR	Reco in test	Test coverage	Shannon	Gini
Self_TopPop	2.508258	2.217909	0.188865	0.116919	0.118732	0.141584	0.130472	0.137473	0.214651	0.111707	0.400939	0.555546	0.765642	1.000000	0.038961	3.159079	0.987317
Ready_Baseline	0.949459	0.752487	0.091410	0.037652	0.046030	0.061286	0.079614	0.056463	0.095957	0.043178	0.198193	0.515501	0.437964	1.000000	0.033911	2.836513	0.991139
Ready_Random	1.521845	1.225949	0.047190	0.020753	0.024810	0.032269	0.029506	0.023707	0.050075	0.018728	0.121957	0.506893	0.329799	0.986532	0.184704	5.099706	0.907217
Self_TopRated	1.030712	0.820904	0.000954	0.000188	0.000298	0.000481	0.000644	0.000223	0.001043	0.000335	0.003348	0.496433	0.009544	0.699046	0.005051	1.945910	0.995669
Self_BaselineUI	0.967585	0.762740	0.000954	0.000170	0.000278	0.000463	0.000644	0.000189	0.000752	0.000168	0.001677	0.496424	0.009544	0.600530	0.005051	1.803126	0.996380
Self_IKNN	1.018363	0.808793	0.000318	0.000108	0.000140	0.000189	0.000000	0.000000	0.000214	0.000037	0.000368	0.496391	0.003181	0.392153	0.115440	4.174741	0.965327

Ready-made KNNs - Surprise implementation

I-KNN - basic

import helpers
import surprise as sp

# user_based=False: compute cosine similarities between items.
sim_options = dict(name="cosine", user_based=False)
algo = sp.KNNBasic(sim_options=sim_options)

helpers.ready_made(
    algo,
    estimations_path="Recommendations generated/ml-100k/Ready_I-KNN_estimations.csv",
    reco_path="Recommendations generated/ml-100k/Ready_I-KNN_reco.csv",
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...

U-KNN - basic

# user_based=True: compute cosine similarities between users.
sim_options = dict(name="cosine", user_based=True)
algo = sp.KNNBasic(sim_options=sim_options)

helpers.ready_made(
    algo,
    estimations_path="Recommendations generated/ml-100k/Ready_U-KNN_estimations.csv",
    reco_path="Recommendations generated/ml-100k/Ready_U-KNN_reco.csv",
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...

I-KNN - on top baseline

sim_options = {
    "name": "cosine",
    "user_based": False,
}  # compute similarities between items

# Pass the options explicitly: a bare KNNBaseline() ignores the dict above
# and runs with the library's default similarity settings, which is not the
# item-based cosine variant this cell is meant to produce.
algo = sp.KNNBaseline(sim_options=sim_options)

helpers.ready_made(
    algo,
    reco_path="Recommendations generated/ml-100k/Ready_I-KNNBaseline_reco.csv",
    estimations_path="Recommendations generated/ml-100k/Ready_I-KNNBaseline_estimations.csv",
)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Generating predictions...
Generating top N recommendations...
Generating predictions...

# Re-run the evaluation now that the ready-made KNN outputs are in the same
# directory.
test = pd.read_csv("./Datasets/ml-100k/test.csv", header=None, sep="\t")
super_reactions = [4, 5]
dir_path = "Recommendations generated/ml-100k/"

ev.evaluate_all(test, dir_path, super_reactions)

943it [00:00, 11286.27it/s]
943it [00:00, 10874.86it/s]
943it [00:00, 11509.97it/s]
943it [00:00, 11855.81it/s]
943it [00:00, 11574.00it/s]
943it [00:00, 11080.19it/s]
943it [00:00, 11550.84it/s]
943it [00:00, 12148.14it/s]
943it [00:00, 10779.39it/s]

Model	RMSE	MAE	precision	recall	F_1	F_05	precision_super	recall_super	NDCG	mAP	MRR	LAUC	HR	Reco in test	Test coverage	Shannon	Gini
Self_TopPop	2.508258	2.217909	0.188865	0.116919	0.118732	0.141584	0.130472	0.137473	0.214651	0.111707	0.400939	0.555546	0.765642	1.000000	0.038961	3.159079	0.987317
Ready_Baseline	0.949459	0.752487	0.091410	0.037652	0.046030	0.061286	0.079614	0.056463	0.095957	0.043178	0.198193	0.515501	0.437964	1.000000	0.033911	2.836513	0.991139
Ready_Random	1.521845	1.225949	0.047190	0.020753	0.024810	0.032269	0.029506	0.023707	0.050075	0.018728	0.121957	0.506893	0.329799	0.986532	0.184704	5.099706	0.907217
Ready_I-KNN	1.030386	0.813067	0.026087	0.006908	0.010593	0.016046	0.021137	0.009522	0.024214	0.008958	0.048068	0.499885	0.154825	0.402333	0.434343	5.133650	0.877999
Ready_I-KNNBaseline	0.935327	0.737424	0.002545	0.000755	0.001105	0.001602	0.002253	0.000930	0.003444	0.001362	0.011760	0.496724	0.021209	0.482821	0.059885	2.232578	0.994487
Ready_U-KNN	1.023495	0.807913	0.000742	0.000205	0.000305	0.000449	0.000536	0.000198	0.000845	0.000274	0.002744	0.496441	0.007423	0.602121	0.010823	2.089186	0.995706
Self_TopRated	1.030712	0.820904	0.000954	0.000188	0.000298	0.000481	0.000644	0.000223	0.001043	0.000335	0.003348	0.496433	0.009544	0.699046	0.005051	1.945910	0.995669
Self_BaselineUI	0.967585	0.762740	0.000954	0.000170	0.000278	0.000463	0.000644	0.000189	0.000752	0.000168	0.001677	0.496424	0.009544	0.600530	0.005051	1.803126	0.996380
Self_IKNN	1.018363	0.808793	0.000318	0.000108	0.000140	0.000189	0.000000	0.000000	0.000214	0.000037	0.000368	0.496391	0.003181	0.392153	0.115440	4.174741	0.965327

Project task 3: use a Surprise KNN algorithm of your choice

# read the docs and try to find the best parameter configuration (let's say in terms of RMSE)
# https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline
# the solution here can be similar to the examples above
# please save the output in 'Recommendations generated/ml-100k/Self_KNNSurprisetask_reco.csv' and
# 'Recommendations generated/ml-100k/Self_KNNSurprisetask_estimations.csv'

35 KiB Raw Blame History

Self made simplified I-KNN

Ready-made KNNs - Surprise implementation

I-KNN - basic

U-KNN - basic

I-KNN - on top baseline

project task 3: use a version of your choice of Surprise KNNalgorithm

35 KiB

Raw Blame History