59 KiB
59 KiB
Prepare test set
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
from tqdm import tqdm
# In evaluation we do not load train set - it is not needed
# The test file is tab-separated and has no header row, so column names are assigned below.
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
test.columns=['user', 'item', 'rating', 'timestamp']
# Category codes of the raw ids are used as row/column indices of the sparse matrix built below.
test['user_code'] = test['user'].astype("category").cat.codes
test['item_code'] = test['item'].astype("category").cat.codes
# Lookup tables in both directions: code -> raw id and raw id -> code.
user_code_id = dict(enumerate(test['user'].astype("category").cat.categories))
user_id_code = dict((v, k) for k, v in user_code_id.items())
item_code_id = dict(enumerate(test['item'].astype("category").cat.categories))
item_id_code = dict((v, k) for k, v in item_code_id.items())
# User x item matrix of test ratings in CSR format.
test_ui = sparse.csr_matrix((test['rating'], (test['user_code'], test['item_code'])))
Estimations metrics
# Estimated scores; the file has no header row, so column names are assigned below.
estimations_df=pd.read_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', header=None)
estimations_df.columns=['user', 'item' ,'score']
# Translate raw ids into the codes built from the test set.
# A raw id that is missing from the lookup tables raises KeyError here.
estimations_df['user_code']=[user_id_code[user] for user in estimations_df['user']]
estimations_df['item_code']=[item_id_code[item] for item in estimations_df['item']]
# The shape is forced to match test_ui so both matrices use the same coordinates.
estimations=sparse.csr_matrix((estimations_df['score'], (estimations_df['user_code'], estimations_df['item_code'])), shape=test_ui.shape)
def estimations_metrics(test_ui, estimations):
    """Compute RMSE and MAE of estimated scores against the test ratings.

    The stored values of both sparse matrices (their ``data`` arrays) are
    compared position by position, so the two matrices are expected to store
    the same (user, item) entries in the same order.

    Parameters
    ----------
    test_ui : sparse matrix
        Test ratings.
    estimations : sparse matrix
        Estimated scores for the same entries as ``test_ui``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``'RMSE'`` and ``'MAE'``. Both values are NaN
        when ``estimations`` stores no entries.

    Raises
    ------
    ValueError
        If the two matrices store a different number of entries.
    """
    actual = test_ui.data
    predicted = estimations.data
    # Without this check numpy would silently broadcast a length-1 array
    # against the other one and report a meaningless error value.
    if predicted.shape != actual.shape:
        raise ValueError(
            'estimations and test_ui must store the same number of entries '
            '(got {} and {})'.format(predicted.size, actual.size))

    nb_estimations = estimations.nnz
    if nb_estimations == 0:
        RMSE = MAE = float('nan')
    else:
        errors = predicted - actual
        RMSE = (np.sum(errors ** 2) / nb_estimations) ** (1 / 2)
        MAE = np.sum(abs(errors)) / nb_estimations

    # Built directly as a single row instead of transposing a zipped list.
    return pd.DataFrame([[RMSE, MAE]], columns=['RMSE', 'MAE'])
# in case of error (in the laboratories) you might have to switch to the other version of pandas
# try !pip3 install pandas=='1.0.3' (or pip if you use python 2) and restart the kernel
# Evaluate the estimations loaded above against the test matrix.
estimations_metrics(test_ui, estimations)
RMSE | MAE | |
---|---|---|
0 | 0.949459 | 0.752487 |
Ranking metrics
import numpy as np
reco = np.loadtxt('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', delimiter=',')
# Column 0 holds the user; after it, items sit in every second column.
# Let's ignore scores - they are not used in evaluation:
users=reco[:,:1]
items=reco[:,1::2]
# Let's use inner ids instead of real ones
# dict.get is used instead of dict.setdefault: setdefault would insert every
# unknown id into the lookup tables with the code -1 as a side effect.
users=np.vectorize(lambda x: user_id_code.get(x, -1))(users)
items=np.vectorize(lambda x: item_id_code.get(x, -1))(items) # maybe items we recommend are not in test set
# Let's put them into one array
reco=np.concatenate((users, items), axis=1)
reco
array([[663, 475, 62, ..., 472, 269, 503], [ 48, 313, 475, ..., 591, 175, 466], [351, 313, 475, ..., 591, 175, 466], ..., [259, 313, 475, ..., 11, 591, 175], [ 33, 313, 475, ..., 11, 591, 175], [ 77, 313, 475, ..., 11, 591, 175]])
def ranking_metrics(test_ui, reco, super_reactions=(), topK=10):
    """Compute top-K ranking metrics of recommendations against the test set.

    Parameters
    ----------
    test_ui : scipy.sparse.csr_matrix
        User x item matrix of test ratings; its ``indptr``, ``indices`` and
        ``data`` arrays are read directly.
    reco : numpy.ndarray
        One row per user. Column 0 is the user's row index in ``test_ui``,
        the following columns are recommended item column indices in ranking
        order. Only the first ``topK`` recommendations are used.
    super_reactions : iterable, optional
        Rating values that mark an item as "super" relevant.
    topK : int, optional
        Length of the evaluated recommendation list.

    Returns
    -------
    pandas.DataFrame
        One row with the columns precision, recall, F_1, F_05,
        precision_super, recall_super, NDCG, mAP, MRR, LAUC and HR.
        Users without any item in the test set are skipped. The ``*_super``
        columns are averaged over users having at least one super item, the
        others over users having at least one test item. A metric whose
        group of users is empty is reported as NaN.
    """
    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is purely cosmetic; run silently without it.
        def tqdm(iterable):
            return iterable

    super_reactions = list(super_reactions)
    nb_items = test_ui.shape[1]
    indptr, indices, ratings = test_ui.indptr, test_ui.indices, test_ui.data

    relevant_users = super_relevant_users = 0
    prec = rec = F_1 = F_05 = prec_super = rec_super = 0.0
    ndcg = mAP = MRR = LAUC = 0.0
    HR = 0

    # Positional discounts and their running sum (ideal DCG for n hits).
    cg = 1.0 / np.log2(np.arange(2, topK + 2))
    cg_sum = np.cumsum(cg)
    ranks = np.arange(1, topK + 1)

    for nb_user, user in tqdm(enumerate(reco[:, 0])):
        user = int(user)
        start, stop = indptr[user], indptr[user + 1]
        u_rated_items = indices[start:stop]
        nb_u_rated_items = len(u_rated_items)
        if nb_u_rated_items == 0:
            # skip users with no items in test set
            continue
        relevant_users += 1

        # Ratings are read from the CSR data slice; accessing
        # test_ui[user, item] element by element would be expensive.
        u_super_items = u_rated_items[np.isin(ratings[start:stop], super_reactions)]
        if len(u_super_items) > 0:
            super_relevant_users += 1

        # evaluation
        recommended = reco[nb_user, 1:topK + 1]
        user_successes = np.zeros(topK)
        user_successes[:len(recommended)] = np.isin(recommended, u_rated_items)
        nb_user_successes = int(user_successes.sum())
        nb_user_super_successes = int(np.isin(recommended, u_super_items).sum())

        prec_u = nb_user_successes / topK
        rec_u = nb_user_successes / nb_u_rated_items
        prec += prec_u
        rec += rec_u
        if prec_u + rec_u > 0:
            F_1 += 2 * (prec_u * rec_u) / (prec_u + rec_u)
            F_05 += (0.5 ** 2 + 1) * (prec_u * rec_u) / (0.5 ** 2 * prec_u + rec_u)

        prec_super += nb_user_super_successes / topK
        # max(..., 1) adds 0 for users without super items
        rec_super += nb_user_super_successes / max(len(u_super_items), 1)

        best_possible_hits = min(topK, nb_u_rated_items)
        ndcg += np.dot(user_successes, cg) / cg_sum[best_possible_hits - 1]

        cumsum_successes = np.cumsum(user_successes)
        mAP += np.dot(cumsum_successes / ranks, user_successes) / best_possible_hits

        hit_positions = np.flatnonzero(user_successes)
        if hit_positions.size > 0:
            MRR += 1 / (hit_positions[0] + 1)

        nb_u_unrated_items = nb_items - nb_u_rated_items
        LAUC += (np.dot(cumsum_successes, 1 - user_successes)
                 + (nb_user_successes + nb_u_rated_items) / 2
                 * (nb_u_unrated_items - (topK - nb_user_successes))
                 ) / (nb_u_unrated_items * nb_u_rated_items)

        HR += nb_user_successes > 0

    result = [
        ('precision', _mean_or_nan(prec, relevant_users)),
        ('recall', _mean_or_nan(rec, relevant_users)),
        ('F_1', _mean_or_nan(F_1, relevant_users)),
        ('F_05', _mean_or_nan(F_05, relevant_users)),
        ('precision_super', _mean_or_nan(prec_super, super_relevant_users)),
        ('recall_super', _mean_or_nan(rec_super, super_relevant_users)),
        ('NDCG', _mean_or_nan(ndcg, relevant_users)),
        ('mAP', _mean_or_nan(mAP, relevant_users)),
        ('MRR', _mean_or_nan(MRR, relevant_users)),
        ('LAUC', _mean_or_nan(LAUC, relevant_users)),
        ('HR', _mean_or_nan(HR, relevant_users)),
    ]
    return pd.DataFrame([[value for _, value in result]],
                        columns=[name for name, _ in result])


def _mean_or_nan(total, nb_users):
    """Return total / nb_users, or NaN when there are no users to average over."""
    return total / nb_users if nb_users > 0 else float('nan')
# Ratings 4 and 5 are passed as the "super" reactions; topK is set to 10.
ranking_metrics(test_ui, reco, super_reactions=[4,5], topK=10)
943it [00:00, 6497.15it/s]
precision | recall | F_1 | F_05 | precision_super | recall_super | NDCG | mAP | MRR | LAUC | HR | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.09141 | 0.037652 | 0.04603 | 0.061286 | 0.079614 | 0.056463 | 0.095957 | 0.043178 | 0.198193 | 0.515501 | 0.437964 |
Diversity metrics
def diversity_metrics(test_ui, reco, topK=10):
    """Compute diversity metrics of the top-K recommendation lists.

    Parameters
    ----------
    test_ui : scipy.sparse.csr_matrix
        User x item matrix of test ratings; the items present in its
        ``indices`` array form the test-set catalogue.
    reco : numpy.ndarray
        One row per user. Column 0 is the user, the following columns are
        recommended item indices; the value -1 marks a recommended item that
        is not in the test set.
    topK : int, optional
        Number of leading recommendations per user taken into account.

    Returns
    -------
    pandas.DataFrame
        One row with the columns:

        * ``'Reco in test'`` - share of recommendations that are not -1,
        * ``'Test coverage'`` - number of distinct recommended items divided
          by the number of columns of ``test_ui``,
        * ``'Shannon'`` - entropy (natural logarithm) of the recommendation
          frequencies,
        * ``'Gini'`` - Gini coefficient of the recommendation frequencies.

        A metric that is undefined for the given input (no recommendations,
        no recommendation inside the test set, fewer than two tracked items)
        is reported as NaN.
    """
    nan = float('nan')

    # Only the first topK recommendations count, as in ranking_metrics.
    recommended = np.asarray(reco)[:, 1:topK + 1].ravel()
    inside_test = recommended != -1
    nb_reco_outside_test = int(np.sum(~inside_test))

    # counting frequencies
    reco_items, reco_counts = np.unique(recommended[inside_test], return_counts=True)
    nb_rec_items = len(reco_items)
    nb_reco_inside_test = int(reco_counts.sum())
    nb_reco = nb_reco_inside_test + nb_reco_outside_test

    # let's assign 0 to all items in test set that were never recommended
    nb_never_recommended = len(np.setdiff1d(test_ui.indices, reco_items))
    counts = np.concatenate((np.zeros(nb_never_recommended), reco_counts))
    nb_tracked_items = len(counts)

    if nb_reco_inside_test > 0:
        frequencies = np.sort(counts / nb_reco_inside_test)
        # Items with zero frequency contribute 0 to the entropy; taking the
        # logarithm only where it is defined avoids -inf and warnings.
        positive = frequencies > 0
        log_frequencies = np.zeros_like(frequencies)
        log_frequencies[positive] = np.log(frequencies[positive])
        shannon = -np.dot(frequencies, log_frequencies)
        if nb_tracked_items > 1:
            weights = np.arange(1 - nb_tracked_items, nb_tracked_items, 2)
            gini = np.dot(frequencies, weights) / (nb_tracked_items - 1)
        else:
            gini = nan
    else:
        shannon = gini = nan

    reco_in_test = nb_reco_inside_test / nb_reco if nb_reco > 0 else nan

    result = [
        ('Reco in test', reco_in_test),
        ('Test coverage', nb_rec_items / test_ui.shape[1]),
        ('Shannon', shannon),
        ('Gini', gini),
    ]
    return pd.DataFrame([[value for _, value in result]],
                        columns=[name for name, _ in result])
# in case of errors try !pip3 install numpy==1.18.4 (or pip if you use python 2) and restart the kernel
import evaluation_measures as ev
# importlib.reload replaces imp.reload: the imp module is deprecated and was removed in Python 3.12
import importlib
importlib.reload(ev)
x=diversity_metrics(test_ui, reco, topK=10)
x
Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|
0 | 1.0 | 0.033911 | 2.836513 | 0.991139 |
To be used in other notebooks
import evaluation_measures as ev
# importlib.reload replaces imp.reload: the imp module is deprecated and was removed in Python 3.12
import importlib
importlib.reload(ev)
estimations_df=pd.read_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', header=None)
reco=np.loadtxt('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', delimiter=',')
ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None),
            estimations_df=estimations_df,
            reco=reco,
            super_reactions=[4,5])
#also you can just type ev.evaluate_all(estimations_df, reco) - I put above values as default
# NOTE(review): the call above is ev.evaluate while the comment names ev.evaluate_all - confirm which one is meant
943it [00:00, 5143.71it/s]
RMSE | MAE | precision | recall | F_1 | F_05 | precision_super | recall_super | NDCG | mAP | MRR | LAUC | HR | Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.949459 | 0.752487 | 0.09141 | 0.037652 | 0.04603 | 0.061286 | 0.079614 | 0.056463 | 0.095957 | 0.043178 | 0.198193 | 0.515501 | 0.437964 | 1.0 | 0.033911 | 2.836513 | 0.991139 |
import evaluation_measures as ev
# importlib.reload replaces imp.reload: the imp module is deprecated and was removed in Python 3.12
import importlib
importlib.reload(ev)
dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
# df is sliced for display in the following cells
df=ev.evaluate_all(test, dir_path, super_reactions)
#also you can just type ev.evaluate_all() - I put above values as default
943it [00:00, 3573.64it/s] 943it [00:00, 5141.54it/s] 943it [00:00, 2827.19it/s] 943it [00:00, 2513.13it/s] 943it [00:00, 3555.67it/s]
# Show the first nine columns of the results table.
df.iloc[:,:9]
Model | RMSE | MAE | precision | recall | F_1 | F_05 | precision_super | recall_super | |
---|---|---|---|---|---|---|---|---|---|
0 | Self_TopPop | 2.508258 | 2.217909 | 0.188865 | 0.116919 | 0.118732 | 0.141584 | 0.130472 | 0.137473 |
0 | Ready_Baseline | 0.949459 | 0.752487 | 0.091410 | 0.037652 | 0.046030 | 0.061286 | 0.079614 | 0.056463 |
0 | Ready_Random | 1.525959 | 1.225122 | 0.047402 | 0.020629 | 0.024471 | 0.032042 | 0.027682 | 0.019353 |
0 | Self_TopRated | 1.030712 | 0.820904 | 0.000954 | 0.000188 | 0.000298 | 0.000481 | 0.000644 | 0.000223 |
0 | Self_BaselineUI | 0.967585 | 0.762740 | 0.000954 | 0.000170 | 0.000278 | 0.000463 | 0.000644 | 0.000189 |
# Show column 0 together with every column from position 9 onwards.
df.iloc[:,np.append(0,np.arange(9, df.shape[1]))]
Model | NDCG | mAP | MRR | LAUC | HR | Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Self_TopPop | 0.214651 | 0.111707 | 0.400939 | 0.555546 | 0.765642 | 1.000000 | 0.038961 | 3.159079 | 0.987317 |
0 | Ready_Baseline | 0.095957 | 0.043178 | 0.198193 | 0.515501 | 0.437964 | 1.000000 | 0.033911 | 2.836513 | 0.991139 |
0 | Ready_Random | 0.051593 | 0.019428 | 0.129062 | 0.506826 | 0.336161 | 0.987593 | 0.175325 | 5.087656 | 0.908118 |
0 | Self_TopRated | 0.001043 | 0.000335 | 0.003348 | 0.496433 | 0.009544 | 0.699046 | 0.005051 | 1.945910 | 0.995669 |
0 | Self_BaselineUI | 0.000752 | 0.000168 | 0.001677 | 0.496424 | 0.009544 | 0.600530 | 0.005051 | 1.803126 | 0.996380 |
Check metrics on toy dataset
import evaluation_measures as ev
# importlib.reload replaces imp.reload: the imp module is deprecated and was removed in Python 3.12
import importlib
import helpers
importlib.reload(ev)

dir_path="Recommendations generated/toy-example/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None)

display(ev.evaluate_all(test, dir_path, super_reactions, topK=3))
#also you can just type ev.evaluate_all() - I put above values as default

# Load the raw toy data so the metrics above can be checked by hand.
toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
reco=pd.read_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', header=None)
estimations=pd.read_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', names=['user', 'item', 'est_score'])
toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \
toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)

print('Training data:')
display(toy_train_ui.todense())

print('Test data:')
display(toy_test_ui.todense())

print('Recommendations:')
display(reco)

print('Estimations:')
display(estimations)
3it [00:00, 1191.68it/s]
Model | RMSE | MAE | precision | recall | F_1 | F_05 | precision_super | recall_super | NDCG | mAP | MRR | LAUC | HR | Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Self_BaselineUI | 1.612452 | 1.4 | 0.444444 | 0.888889 | 0.555556 | 0.478632 | 0.333333 | 0.75 | 0.676907 | 0.574074 | 0.611111 | 0.638889 | 1.0 | 0.888889 | 0.8 | 1.386294 | 0.25 |
Training data:
matrix([[3, 4, 0, 0, 5, 0, 0, 4], [0, 1, 2, 3, 0, 0, 0, 0], [0, 0, 0, 5, 0, 3, 4, 0]])
Test data:
matrix([[0, 0, 0, 0, 0, 0, 3, 0], [0, 0, 0, 0, 5, 0, 0, 0], [5, 0, 4, 0, 0, 0, 0, 2]])
Recommendations:
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | 0 | 30 | 5.0 | 20 | 4.0 | 60 | 4.0 |
1 | 10 | 40 | 3.0 | 60 | 2.0 | 70 | 2.0 |
2 | 20 | 40 | 5.0 | 20 | 4.0 | 70 | 4.0 |
Estimations:
user | item | est_score | |
---|---|---|---|
0 | 0 | 60 | 4.0 |
1 | 10 | 40 | 3.0 |
2 | 20 | 0 | 3.0 |
3 | 20 | 20 | 4.0 |
4 | 20 | 70 | 4.0 |
Sample recommendations
train=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
# The movie catalogue is loaded once and reused for both merges below.
items=pd.read_csv('./Datasets/ml-100k/movies.csv')

# Pick one random user among those present in the training set.
user=random.choice(list(set(train['user'])))

train_content=pd.merge(train, items, left_on='item', right_on='id')

print('Here is what user rated high:')
display(train_content[train_content['user']==user][['user', 'rating', 'title', 'genres']]\
        .sort_values(by='rating', ascending=False)[:15])

reco = np.loadtxt('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', delimiter=',')

# Let's ignore scores - they are not used here.
# np.loadtxt returns floats, so ids are cast back to int
# (assumes user and item ids in the file are whole numbers - TODO confirm).
reco_users=reco[:,:1].astype(int)
reco_items=reco[:,1::2].astype(int)
# Let's put them into one array
reco=np.concatenate((reco_users, reco_items), axis=1)

# Let's rebuild it as a user-item dataframe; rec_nb is the 1-based position in the list
recommended=pd.DataFrame(
    [(row[0], rec_nb, entry) for row in reco for rec_nb, entry in enumerate(row[1:], start=1)],
    columns=['user','rec_nb', 'item'])

recommended_content=pd.merge(recommended, items, left_on='item', right_on='id')

print('Here is what we recommend:')
recommended_content[recommended_content['user']==user][['user', 'rec_nb', 'title', 'genres']].sort_values(by='rec_nb')
Here is what user rated high:
user | rating | title | genres | |
---|---|---|---|---|
50941 | 661 | 5 | It's a Wonderful Life (1946) | Drama |
9531 | 661 | 5 | Wizard of Oz, The (1939) | Adventure, Children's, Drama, Musical |
27182 | 661 | 5 | Empire Strikes Back, The (1980) | Action, Adventure, Drama, Romance, Sci-Fi, War |
23944 | 661 | 5 | Apocalypse Now (1979) | Drama, War |
20285 | 661 | 5 | Return of the Jedi (1983) | Action, Adventure, Romance, Sci-Fi, War |
37504 | 661 | 5 | Aladdin (1992) | Animation, Children's, Comedy, Musical |
68312 | 661 | 5 | Babe (1995) | Children's, Comedy, Drama |
16362 | 661 | 5 | Apollo 13 (1995) | Action, Drama, Thriller |
15168 | 661 | 5 | Indiana Jones and the Last Crusade (1989) | Action, Adventure |
29402 | 661 | 5 | Psycho (1960) | Horror, Romance, Thriller |
40755 | 661 | 5 | Jean de Florette (1986) | Drama |
41950 | 661 | 5 | Die Hard (1988) | Action, Thriller |
58932 | 661 | 5 | Enchanted April (1991) | Drama |
43013 | 661 | 5 | 2001: A Space Odyssey (1968) | Drama, Mystery, Sci-Fi, Thriller |
65664 | 661 | 5 | Star Trek: The Wrath of Khan (1982) | Action, Adventure, Sci-Fi |
Here is what we recommend:
user | rec_nb | title | genres | |
---|---|---|---|---|
659 | 661.0 | 1 | Great Day in Harlem, A (1994) | Documentary |
1601 | 661.0 | 2 | Tough and Deadly (1995) | Action, Drama, Thriller |
2543 | 661.0 | 3 | Aiqing wansui (1994) | Drama |
3485 | 661.0 | 4 | Delta of Venus (1994) | Drama |
4427 | 661.0 | 5 | Someone Else's America (1995) | Drama |
5369 | 661.0 | 6 | Saint of Fort Washington, The (1993) | Drama |
6311 | 661.0 | 7 | Celestial Clockwork (1994) | Comedy |
7253 | 661.0 | 8 | Some Mother's Son (1996) | Drama |
9148 | 661.0 | 9 | Maya Lin: A Strong Clear Vision (1994) | Documentary |
8194 | 661.0 | 10 | Prefontaine (1997) | Drama |
project task 3: implement some other evaluation measure
# it may be your own idea or a modification of what we have already implemented
# (for example a Hit2 rate, which would count as a success users who received at least 2 relevant recommendations)
# or something well-known
# expected output: modification of evaluation_measures.py such that evaluate_all will also display your measure
dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
ev.evaluate_all(test, dir_path, super_reactions)
943it [00:00, 4220.01it/s] 943it [00:00, 3015.35it/s] 943it [00:00, 2308.31it/s] 943it [00:00, 3461.11it/s] 943it [00:00, 3442.41it/s]
Model | RMSE | MAE | precision | recall | F_1 | F_05 | precision_super | recall_super | NDCG | mAP | MRR | LAUC | HR | Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Self_TopPop | 2.508258 | 2.217909 | 0.188865 | 0.116919 | 0.118732 | 0.141584 | 0.130472 | 0.137473 | 0.214651 | 0.111707 | 0.400939 | 0.555546 | 0.765642 | 1.000000 | 0.038961 | 3.159079 | 0.987317 |
0 | Ready_Baseline | 0.949459 | 0.752487 | 0.091410 | 0.037652 | 0.046030 | 0.061286 | 0.079614 | 0.056463 | 0.095957 | 0.043178 | 0.198193 | 0.515501 | 0.437964 | 1.000000 | 0.033911 | 2.836513 | 0.991139 |
0 | Ready_Random | 1.525959 | 1.225122 | 0.047402 | 0.020629 | 0.024471 | 0.032042 | 0.027682 | 0.019353 | 0.051593 | 0.019428 | 0.129062 | 0.506826 | 0.336161 | 0.987593 | 0.175325 | 5.087656 | 0.908118 |
0 | Self_TopRated | 1.030712 | 0.820904 | 0.000954 | 0.000188 | 0.000298 | 0.000481 | 0.000644 | 0.000223 | 0.001043 | 0.000335 | 0.003348 | 0.496433 | 0.009544 | 0.699046 | 0.005051 | 1.945910 | 0.995669 |
0 | Self_BaselineUI | 0.967585 | 0.762740 | 0.000954 | 0.000170 | 0.000278 | 0.000463 | 0.000644 | 0.000189 | 0.000752 | 0.000168 | 0.001677 | 0.496424 | 0.009544 | 0.600530 | 0.005051 | 1.803126 | 0.996380 |