WSS-project/P2. Evaluation.ipynb

53 KiB

Prepare test set

import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
from tqdm import tqdm

# Only the test split is loaded here: evaluation never looks at the training data.
test = pd.read_csv("./Datasets/ml-100k/test.csv", sep="\t", header=None)
test.columns = ["user", "item", "rating", "timestamp"]

# Categorical accessors give every distinct user / item id a dense 0-based code.
user_cat = test["user"].astype("category").cat
item_cat = test["item"].astype("category").cat
test["user_code"] = user_cat.codes
test["item_code"] = item_cat.codes

# Lookup tables in both directions: inner code -> original id, original id -> inner code.
user_code_id = dict(enumerate(user_cat.categories))
user_id_code = {user_id: code for code, user_id in user_code_id.items()}
item_code_id = dict(enumerate(item_cat.categories))
item_id_code = {item_id: code for code, item_id in item_code_id.items()}

# Sparse user x item matrix of test ratings, indexed by the inner codes.
test_ui = sparse.csr_matrix((test["rating"], (test["user_code"], test["item_code"])))

Estimations metrics

# Estimated ratings: one (user, item, score) row per estimated pair, no header row.
estimations_df = pd.read_csv(
    "Recommendations generated/ml-100k/Ready_Baseline_estimations.csv", header=None
)
estimations_df.columns = ["user", "item", "score"]

# Translate original ids into the inner codes of the test matrix.
# An id that is absent from the test set raises KeyError.
estimations_df["user_code"] = [
    user_id_code[user_id] for user_id in estimations_df["user"]
]
estimations_df["item_code"] = [
    item_id_code[item_id] for item_id in estimations_df["item"]
]

# Forcing the shape of test_ui keeps both matrices aligned on users and items.
estimations = sparse.csr_matrix(
    (
        estimations_df["score"],
        (estimations_df["user_code"], estimations_df["item_code"]),
    ),
    shape=test_ui.shape,
)
def estimations_metrics(test_ui, estimations):
    """Compute RMSE and MAE of estimated ratings against the test ratings.

    Both arguments are sparse matrices exposing ``.data`` and ``.nnz``.  The
    stored values are compared position by position, so both matrices are
    expected to store the same (user, item) cells in the same order.

    Returns a one-row DataFrame with the columns ``RMSE`` and ``MAE``.
    """
    errors = estimations.data - test_ui.data
    nb_estimations = estimations.nnz

    metrics = [
        ["RMSE", (np.sum(errors ** 2) / nb_estimations) ** (1 / 2)],
        ["MAE", np.sum(abs(errors)) / nb_estimations],
    ]

    # Split the (name, value) pairs into a tuple of names and a tuple of values.
    names, values = zip(*metrics)
    df_result = pd.DataFrame(values).T
    df_result.columns = names
    return df_result
# If this cell raises an error (e.g. on the laboratory machines), the installed pandas
# version may be the cause: try `!pip3 install pandas=='1.0.3'` (or `pip` if you use
# Python 2) and restart the kernel.

# Score the baseline estimations loaded above against the test matrix.
estimations_metrics(test_ui, estimations)
RMSE MAE
0 0.949459 0.752487

Ranking metrics

import numpy as np

# Each row holds a user id followed by alternating (item id, score) columns.
reco = np.loadtxt(
    "Recommendations generated/ml-100k/Ready_Baseline_reco.csv", delimiter=","
)
# Let's ignore scores - they are not used in evaluation:
users = reco[:, :1]
items = reco[:, 1::2]
# Let's use inner ids instead of real ones; ids absent from the test set become -1.
# dict.get (not dict.setdefault) is used so that unknown ids are NOT inserted into
# the id -> code lookup tables as a side effect of this cell.
users = np.vectorize(lambda x: user_id_code.get(x, -1))(users)
items = np.vectorize(lambda x: item_id_code.get(x, -1))(items)
reco = np.concatenate((users, items), axis=1)
reco
array([[663, 475,  62, ..., 472, 269, 503],
       [ 48, 313, 475, ..., 591, 175, 466],
       [351, 313, 475, ..., 591, 175, 466],
       ...,
       [259, 313, 475, ...,  11, 591, 175],
       [ 33, 313, 475, ...,  11, 591, 175],
       [ 77, 313, 475, ...,  11, 591, 175]])
def ranking_metrics(test_ui, reco, super_reactions=None, topK=10):
    """Compute top-K ranking quality metrics averaged over the test users.

    Parameters
    ----------
    test_ui : scipy.sparse.csr_matrix
        User x item matrix of test ratings; rows are inner user codes.
    reco : numpy.ndarray
        Integer array with one row per user.  Column 0 is the inner user code,
        the remaining columns are recommended inner item codes in rank order.
    super_reactions : collection, optional
        Rating values regarded as "super" (e.g. ``[4, 5]``).  ``None`` (the
        default) means that no rating is a super reaction.
    topK : int
        Number of leading recommendations that are evaluated.

    Returns
    -------
    pandas.DataFrame
        One row with the columns precision, recall, F_1, F_05,
        precision_super, recall_super, NDCG, mAP, MRR, LAUC and HR.
        Users without any test rating are skipped.  A metric whose averaging
        population is empty is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    # The progress bar is purely cosmetic, so tqdm is treated as optional.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        progress = iter

    super_values = [] if super_reactions is None else list(super_reactions)

    nb_items = test_ui.shape[1]
    relevant_users = 0
    super_relevant_users = 0
    prec = rec = F_1 = F_05 = 0
    prec_super = rec_super = 0
    ndcg = mAP = MRR = LAUC = HR = 0

    # Positional discounts 1 / log2(rank + 1) and their running sums (ideal DCG).
    cg = 1.0 / np.log2(np.arange(2, topK + 2))
    cg_sum = np.cumsum(cg)

    for nb_user, user in progress(enumerate(reco[:, 0])):
        row_start = test_ui.indptr[user]
        row_end = test_ui.indptr[user + 1]
        u_rated_items = test_ui.indices[row_start:row_end]
        nb_u_rated_items = len(u_rated_items)
        if nb_u_rated_items == 0:
            # skip users with no items in test set
            continue
        relevant_users += 1

        # Reading the row's slice of .data avoids the expensive
        # test_ui[user, item] element access.
        u_super_items = u_rated_items[
            np.isin(test_ui.data[row_start:row_end], super_values)
        ]
        if len(u_super_items) > 0:
            super_relevant_users += 1

        # 0/1 hit indicator per rank; ranks beyond the available
        # recommendations stay 0.
        top_items = reco[nb_user, 1 : topK + 1]
        user_successes = np.zeros(topK)
        user_successes[: len(top_items)] = np.isin(top_items, u_rated_items)
        nb_user_successes = int(user_successes.sum())
        # Super items are a subset of the rated items, so one membership test is enough.
        nb_user_super_successes = int(np.isin(top_items, u_super_items).sum())

        prec_u = nb_user_successes / topK
        prec += prec_u

        rec_u = nb_user_successes / nb_u_rated_items
        rec += rec_u

        F_1 += 2 * (prec_u * rec_u) / (prec_u + rec_u) if prec_u + rec_u > 0 else 0
        F_05 += (
            (0.5 ** 2 + 1) * (prec_u * rec_u) / (0.5 ** 2 * prec_u + rec_u)
            if prec_u + rec_u > 0
            else 0
        )

        prec_super += nb_user_super_successes / topK
        # max(..., 1) yields 0 for users without super items
        rec_super += nb_user_super_successes / max(len(u_super_items), 1)

        ndcg += np.dot(user_successes, cg) / cg_sum[min(topK, nb_u_rated_items) - 1]

        cumsum_successes = np.cumsum(user_successes)
        mAP += np.dot(
            cumsum_successes / np.arange(1, topK + 1), user_successes
        ) / min(topK, nb_u_rated_items)

        hit_positions = user_successes.nonzero()[0]
        MRR += 1 / (hit_positions[0] + 1) if hit_positions.size > 0 else 0

        LAUC += (
            np.dot(cumsum_successes, 1 - user_successes)
            + (nb_user_successes + nb_u_rated_items)
            / 2
            * ((nb_items - nb_u_rated_items) - (topK - nb_user_successes))
        ) / ((nb_items - nb_u_rated_items) * nb_u_rated_items)

        HR += nb_user_successes > 0

    def _mean(total, count):
        # An empty population (e.g. no super reactions configured) gives 0.0.
        return total / count if count else 0.0

    result = [
        ("precision", _mean(prec, relevant_users)),
        ("recall", _mean(rec, relevant_users)),
        ("F_1", _mean(F_1, relevant_users)),
        ("F_05", _mean(F_05, relevant_users)),
        ("precision_super", _mean(prec_super, super_relevant_users)),
        ("recall_super", _mean(rec_super, super_relevant_users)),
        ("NDCG", _mean(ndcg, relevant_users)),
        ("mAP", _mean(mAP, relevant_users)),
        ("MRR", _mean(MRR, relevant_users)),
        ("LAUC", _mean(LAUC, relevant_users)),
        ("HR", _mean(HR, relevant_users)),
    ]

    names, values = zip(*result)
    df_result = pd.DataFrame(values).T
    df_result.columns = names
    return df_result
# Ratings 4 and 5 count as "super" reactions; the top 10 recommendations per user are scored.
ranking_metrics(test_ui, reco, super_reactions=[4, 5], topK=10)
943it [00:00, 9434.06it/s]
precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR
0 0.09141 0.037652 0.04603 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 0.515501 0.437964

Diversity metrics

def diversity_metrics(test_ui, reco, topK=10):
    """Compute coverage and diversity statistics of the top-K recommendations.

    Parameters
    ----------
    test_ui : scipy.sparse.csr_matrix
        User x item matrix of test ratings; its ``indices`` define which
        items occur in the test set.
    reco : numpy.ndarray
        One row per user: column 0 is the user code, the remaining columns
        are recommended inner item codes.  The code -1 marks an item that
        does not occur in the test set.
    topK : int
        Only the first ``topK`` recommendations of every user are counted.

    Returns
    -------
    pandas.DataFrame
        One row with the columns "Reco in test", "Test coverage", "Shannon"
        and "Gini".  Ratios whose denominator would be zero (no
        recommendations, or fewer than two items for Gini) are reported
        as 0.0.
    """
    frequencies = defaultdict(int)

    # every item present in the test set starts with a zero count
    for item in np.unique(test_ui.indices).tolist():
        frequencies[item] = 0

    # count occurrences among the first topK recommendations of each user
    for item in reco[:, 1 : topK + 1].ravel().tolist():
        frequencies[item] += 1

    # recommendations of items unknown to the test set were mapped to -1
    nb_reco_outside_test = frequencies.pop(-1, 0)

    counts = np.array(list(frequencies.values()))

    nb_rec_items = len(counts[counts > 0])
    nb_reco_inside_test = np.sum(counts)
    nb_reco_total = nb_reco_inside_test + nb_reco_outside_test

    # relative frequencies in ascending order, as the Gini formula below expects
    if nb_reco_inside_test > 0:
        shares = np.sort(counts / nb_reco_inside_test)
    else:
        shares = np.zeros(len(counts))

    # log(0) is -inf; replace it by 0 so that never-recommended items
    # contribute nothing to the entropy, and silence the division warning
    with np.errstate(divide="ignore"):
        log_shares = np.nan_to_num(np.log(shares), posinf=0, neginf=0)

    nb_counted_items = len(shares)
    if nb_counted_items > 1:
        gini = np.dot(
            shares, np.arange(1 - nb_counted_items, nb_counted_items, 2)
        ) / (nb_counted_items - 1)
    else:
        gini = 0.0

    result = [
        (
            "Reco in test",
            nb_reco_inside_test / nb_reco_total if nb_reco_total > 0 else 0.0,
        ),
        ("Test coverage", nb_rec_items / test_ui.shape[1]),
        ("Shannon", -np.dot(shares, log_shares)),
        ("Gini", gini),
    ]

    names, values = zip(*result)
    df_result = pd.DataFrame(values).T
    df_result.columns = names
    return df_result
# If this cell raises an error, try `!pip3 install numpy==1.18.4` (or `pip` if you use
# Python 2) and restart the kernel.

# Keep the result in `x` and display it as the cell output.
x = diversity_metrics(test_ui, reco, topK=10)
x
Reco in test Test coverage Shannon Gini
0 1.0 0.033911 2.836513 0.991139

To be used in other notebooks

import evaluation_measures as ev

# Raw estimations (no header row) and raw recommendations of the Ready_Baseline model;
# both are handed to the evaluation_measures module unmodified.
estimations_df = pd.read_csv(
    "Recommendations generated/ml-100k/Ready_Baseline_estimations.csv", header=None
)
reco = np.loadtxt(
    "Recommendations generated/ml-100k/Ready_Baseline_reco.csv", delimiter=","
)

# Ratings 4 and 5 are passed as the "super" reactions.
ev.evaluate(
    test=pd.read_csv("./Datasets/ml-100k/test.csv", sep="\t", header=None),
    estimations_df=estimations_df,
    reco=reco,
    super_reactions=[4, 5],
)
# The original note here said a shorter call is possible because the values above are
# the defaults. It named `ev.evaluate_all(estimations_df, reco)`, whereas the function
# called in this cell is `ev.evaluate`.
# NOTE(review): presumably the short form is `ev.evaluate(estimations_df=..., reco=...)` -
# confirm against evaluation_measures.py.
943it [00:00, 11012.47it/s]
RMSE MAE precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR Reco in test Test coverage Shannon Gini
0 0.949459 0.752487 0.09141 0.037652 0.04603 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 0.515501 0.437964 1.0 0.033911 2.836513 0.991139
# Directory with the generated recommendation / estimation files to evaluate.
dir_path = "Recommendations generated/ml-100k/"
super_reactions = [4, 5]
test = pd.read_csv("./Datasets/ml-100k/test.csv", sep="\t", header=None)

# The result is a DataFrame (it is sliced with .iloc in the following cells).
df = ev.evaluate_all(test, dir_path, super_reactions)
# According to the author, the arguments above are also the defaults of
# ev.evaluate_all, so a bare ev.evaluate_all() call should be equivalent.
943it [00:00, 10346.82it/s]
943it [00:00, 11772.32it/s]
943it [00:00, 10636.62it/s]
943it [00:00, 10767.92it/s]
943it [00:00, 12019.93it/s]
# First nine columns of the comparison table.
df.iloc[:, :9]
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super
0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 0.141584 0.130472 0.137473
0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 0.061286 0.079614 0.056463
0 Ready_Random 1.521845 1.225949 0.047190 0.020753 0.024810 0.032269 0.029506 0.023707
0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 0.000481 0.000644 0.000223
0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 0.000463 0.000644 0.000189
# Column 0 again, followed by every column from position 9 onwards.
df.iloc[:, np.append(0, np.arange(9, df.shape[1]))]
Model NDCG mAP MRR LAUC HR Reco in test Test coverage Shannon Gini
0 Self_TopPop 0.214651 0.111707 0.400939 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317
0 Ready_Baseline 0.095957 0.043178 0.198193 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139
0 Ready_Random 0.050075 0.018728 0.121957 0.506893 0.329799 0.986532 0.184704 5.099706 0.907217
0 Self_TopRated 0.001043 0.000335 0.003348 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669
0 Self_BaselineUI 0.000752 0.000168 0.001677 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380

Check metrics on toy dataset

import helpers

# Evaluate on the tiny toy dataset, where the metrics can be checked by hand.
dir_path = "Recommendations generated/toy-example/"
super_reactions = [4, 5]
test = pd.read_csv("./Datasets/toy-example/test.csv", sep="\t", header=None)

# topK=3: only the first three recommendations per user are evaluated.
display(ev.evaluate_all(test, dir_path, super_reactions, topK=3))
# According to the author, the remaining arguments have defaults in ev.evaluate_all.

# Load the raw toy files so the inputs behind the metrics above can be inspected.
toy_train_read = pd.read_csv(
    "./Datasets/toy-example/train.csv",
    sep="\t",
    header=None,
    names=["user", "item", "rating", "timestamp"],
)
toy_test_read = pd.read_csv(
    "./Datasets/toy-example/test.csv",
    sep="\t",
    header=None,
    names=["user", "item", "rating", "timestamp"],
)
reco = pd.read_csv(
    "Recommendations generated/toy-example/Self_BaselineUI_reco.csv", header=None
)
estimations = pd.read_csv(
    "Recommendations generated/toy-example/Self_BaselineUI_estimations.csv",
    names=["user", "item", "est_score"],
)
# NOTE(review): helpers.data_to_csr is not visible here; judging by the names it returns
# train/test sparse matrices plus id <-> code lookup tables - confirm in helpers.py.
(
    toy_train_ui,
    toy_test_ui,
    toy_user_code_id,
    toy_user_id_code,
    toy_item_code_id,
    toy_item_id_code,
) = helpers.data_to_csr(toy_train_read, toy_test_read)

# Dense views are readable here only because the toy matrices are tiny.
print("Training data:")
display(toy_train_ui.todense())

print("Test data:")
display(toy_test_ui.todense())

print("Recommendations:")
display(reco)

print("Estimations:")
display(estimations)
3it [00:00, 5771.98it/s]
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR Reco in test Test coverage Shannon Gini
0 Self_BaselineUI 1.612452 1.4 0.444444 0.888889 0.555556 0.478632 0.333333 0.75 0.676907 0.574074 0.611111 0.638889 1.0 0.888889 0.8 1.386294 0.25
Training data:
matrix([[3, 4, 0, 0, 5, 0, 0, 4],
        [0, 1, 2, 3, 0, 0, 0, 0],
        [0, 0, 0, 5, 0, 3, 4, 0]])
Test data:
matrix([[0, 0, 0, 0, 0, 0, 3, 0],
        [0, 0, 0, 0, 5, 0, 0, 0],
        [5, 0, 4, 0, 0, 0, 0, 2]])
Recommendations:
0 1 2 3 4 5 6
0 0 30 5.0 20 4.0 60 4.0
1 10 40 3.0 60 2.0 70 2.0
2 20 40 5.0 20 4.0 70 4.0
Estimations:
user item est_score
0 0 60 4.0
1 10 40 3.0
2 20 0 3.0
3 20 20 4.0
4 20 70 4.0

Sample recommendations

train = pd.read_csv(
    "./Datasets/ml-100k/train.csv",
    sep="\t",
    header=None,
    names=["user", "item", "rating", "timestamp"],
)
# Movie metadata, loaded once and reused for both merges below.
items = pd.read_csv("./Datasets/ml-100k/movies.csv")

# Pick a random user that has ratings in the training set.
user = random.choice(list(set(train["user"])))

train_content = pd.merge(train, items, left_on="item", right_on="id")

print("Here is what user rated high:")
display(
    train_content[train_content["user"] == user][
        ["user", "rating", "title", "genres"]
    ].sort_values(by="rating", ascending=False)[:15]
)

reco = np.loadtxt(
    "Recommendations generated/ml-100k/Self_BaselineUI_reco.csv", delimiter=","
)

# Let's ignore scores - they are not used in evaluation:
reco_users = reco[:, :1]
reco_items = reco[:, 1::2]
# Let's put them into one array
reco = np.concatenate((reco_users, reco_items), axis=1)

# Rebuild a long (user, rank, item) table with one row per recommendation;
# rec_nb is the 1-based position of the item in the user's list.
recommended = pd.DataFrame(
    [
        (row[0], rec_nb, entry)
        for row in reco
        for rec_nb, entry in enumerate(row[1:], start=1)
    ],
    columns=["user", "rec_nb", "item"],
)

recommended_content = pd.merge(recommended, items, left_on="item", right_on="id")

print("Here is what we recommend:")
recommended_content[recommended_content["user"] == user][
    ["user", "rec_nb", "title", "genres"]
].sort_values(by="rec_nb")
Here is what user rated high:
user rating title genres
57482 2 5 Emma (1996) Drama, Romance
54506 2 5 Sense and Sensibility (1995) Drama, Romance
40581 2 5 Titanic (1997) Action, Drama, Romance
2949 2 5 Star Wars (1977) Action, Adventure, Romance, Sci-Fi, War
69653 2 5 Wings of the Dove, The (1997) Drama, Romance, Thriller
7906 2 5 As Good As It Gets (1997) Comedy, Drama
69400 2 5 Shall We Dance? (1996) Comedy
14469 2 5 Fargo (1996) Crime, Drama, Thriller
46151 2 5 L.A. Confidential (1997) Crime, Film-Noir, Mystery, Thriller
67293 2 5 Good Will Hunting (1997) Drama
20923 2 5 Secrets & Lies (1996) Drama
52921 2 5 Kolya (1996) Comedy
50103 2 4 Mrs. Brown (Her Majesty, Mrs. Brown) (1997) Drama, Romance
51972 2 4 Mighty Aphrodite (1995) Comedy
515 2 4 Heat (1995) Action, Crime, Thriller
Here is what we recommend:
user rec_nb title genres
1 2.0 1 Great Day in Harlem, A (1994) Documentary
943 2.0 2 Tough and Deadly (1995) Action, Drama, Thriller
1885 2.0 3 Aiqing wansui (1994) Drama
2827 2.0 4 Delta of Venus (1994) Drama
3769 2.0 5 Someone Else's America (1995) Drama
4711 2.0 6 Saint of Fort Washington, The (1993) Drama
5653 2.0 7 Celestial Clockwork (1994) Comedy
6595 2.0 8 Some Mother's Son (1996) Drama
8489 2.0 9 Maya Lin: A Strong Clear Vision (1994) Documentary
7536 2.0 10 Prefontaine (1997) Drama

project task 2: implement some other evaluation measure

# it may be your idea, modification of what we have already implemented
# (for example a Hit2 rate, which would count as a success every user who received at least 2 relevant recommendations)
# or something well-known
# expected output: modification of evaluation_measures.py such that evaluate_all will also display your measure