workshops_recommender_systems/evaluation_measures.py

214 lines
8.4 KiB
Python
Raw Normal View History

2020-05-21 13:42:50 +02:00
import os
import sys
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import normalize
from tqdm import tqdm
from datetime import datetime, date
import random
import scipy.sparse as sparse
from os import listdir
from os.path import isfile, join
from collections import defaultdict
def evaluate(test,
             estimations_df,
             reco,
             super_reactions=[4,5],
             topK=10):
    """Score one model against a test set and return every metric in one row.

    Parameters
    ----------
    test : pandas.DataFrame
        Test interactions. Its four columns are renamed to
        ``user, item, rating, timestamp``.
    estimations_df : pandas.DataFrame
        Predicted scores. Its three columns are renamed to
        ``user, item, score``. Every user and item must also occur in
        ``test``; otherwise the id lookup raises ``KeyError``.
    reco : numpy.ndarray
        2-D array with the user id in column 0. Item ids are read from
        columns 1, 3, 5, ... (presumably the columns in between hold the
        scores -- confirm against the code that writes the reco files).
    super_reactions : list
        Ratings regarded as "super" hits, forwarded to ``ranking_metrics``.
        The default list is only read, never mutated.
    topK : int
        Cut-off forwarded to ``ranking_metrics`` and ``diversity_metrics``.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the frames returned by
        ``estimations_metrics``, ``ranking_metrics`` and
        ``diversity_metrics``.
    """
    # Work on copies so the caller's objects are never modified.
    ratings = test.copy()
    predicted = estimations_df.copy()
    raw_reco = reco.copy()

    # --- test set: translate raw ids into contiguous inner codes ---
    ratings.columns = ['user', 'item', 'rating', 'timestamp']
    user_categories = ratings['user'].astype("category")
    item_categories = ratings['item'].astype("category")
    ratings['user_code'] = user_categories.cat.codes
    ratings['item_code'] = item_categories.cat.codes
    user_id_code = {raw_id: code
                    for code, raw_id in enumerate(user_categories.cat.categories)}
    item_id_code = {raw_id: code
                    for code, raw_id in enumerate(item_categories.cat.categories)}
    test_ui = sparse.csr_matrix(
        (ratings['rating'], (ratings['user_code'], ratings['item_code'])))

    # --- estimations: same inner codes, same shape as the test matrix ---
    predicted.columns = ['user', 'item', 'score']
    predicted['user_code'] = [user_id_code[user] for user in predicted['user']]
    predicted['item_code'] = [item_id_code[item] for item in predicted['item']]
    estimations = sparse.csr_matrix(
        (predicted['score'], (predicted['user_code'], predicted['item_code'])),
        shape=test_ui.shape)
    estimation_scores = estimations_metrics(test_ui, estimations)

    # --- recommendations: switch to inner ids ---
    # Users or items that are recommended but absent from the test set are
    # coded as -1.
    to_user_code = np.vectorize(lambda raw_id: user_id_code.get(raw_id, -1))
    to_item_code = np.vectorize(lambda raw_id: item_id_code.get(raw_id, -1))
    inner_reco = np.concatenate(
        (to_user_code(raw_reco[:, :1]), to_item_code(raw_reco[:, 1::2])),
        axis=1)

    ranking_scores = ranking_metrics(
        test_ui, inner_reco, super_reactions=super_reactions, topK=topK)
    diversity_scores = diversity_metrics(test_ui, inner_reco, topK)

    return pd.concat(
        [estimation_scores, ranking_scores, diversity_scores], axis=1)
def ranking_metrics(test_ui, reco, super_reactions=[], topK=10):
    """Compute top-K ranking metrics averaged over the recommended users.

    Parameters
    ----------
    test_ui : scipy.sparse.csr_matrix
        User x item matrix of test ratings (rows are inner user codes,
        columns are inner item codes).
    reco : numpy.ndarray
        Integer array: column 0 holds inner user codes, the following
        columns hold the recommended inner item codes in rank order.
        Negative user codes (users unknown to the test set) are skipped.
    super_reactions : iterable
        Rating values that mark an item as a "super" hit. The default is
        only read, never mutated.
    topK : int
        Number of leading recommendations that are scored per user.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``precision, recall, F_1, F_05,
        precision_super, recall_super, NDCG, mAP, MRR, LAUC, HR``.
        Users without any test item are ignored. The ``*_super`` columns
        are averaged over users that own at least one super item. When an
        average has no user to run over, it is reported as 0 instead of
        raising ``ZeroDivisionError``.
    """
    def _average(total, count):
        # An empty population yields 0, matching the "else 0" convention
        # used for the per-user F-scores below.
        return total / count if count else 0.0

    nb_items = test_ui.shape[1]
    # np.isin needs a sequence; this also accepts sets or tuples.
    super_values = list(super_reactions)

    relevant_users = 0
    super_relevant_users = 0
    prec = rec = F_1 = F_05 = prec_super = rec_super = 0.0
    ndcg = mAP = MRR = LAUC = HR = 0.0

    # Positional discounts 1/log2(rank+1) and their running sum, which is the
    # ideal DCG when the first k positions are all hits.
    cg = 1.0 / np.log2(np.arange(2, topK + 2))
    cg_sum = np.cumsum(cg)
    ranks = np.arange(1, topK + 1)

    for nb_user, user in tqdm(enumerate(reco[:, 0])):
        if user < 0:
            # user is not part of the test set
            continue

        row = slice(test_ui.indptr[user], test_ui.indptr[user + 1])
        u_rated_items = test_ui.indices[row]
        nb_u_rated_items = len(u_rated_items)
        if nb_u_rated_items == 0:
            # Skip users with no items in the test set (a user kept here may
            # still have no super items).
            continue
        relevant_users += 1

        # Read the ratings straight from the CSR data slice: accessing
        # test_ui[user, item] element by element is expensive.
        u_super_items = u_rated_items[np.isin(test_ui.data[row], super_values)]
        if len(u_super_items) > 0:
            super_relevant_users += 1

        # Sets give O(1) membership tests inside the position loop.
        rated_lookup = set(u_rated_items.tolist())
        super_lookup = set(u_super_items.tolist())

        user_successes = np.zeros(topK)
        nb_user_successes = 0
        nb_user_super_successes = 0
        for item_position, item in enumerate(reco[nb_user, 1:topK + 1]):
            if item in rated_lookup:
                user_successes[item_position] = 1
                nb_user_successes += 1
                if item in super_lookup:
                    nb_user_super_successes += 1

        prec_u = nb_user_successes / topK
        rec_u = nb_user_successes / nb_u_rated_items
        prec += prec_u
        rec += rec_u
        if prec_u + rec_u > 0:
            F_1 += 2 * (prec_u * rec_u) / (prec_u + rec_u)
            F_05 += (0.5**2 + 1) * (prec_u * rec_u) / (0.5**2 * prec_u + rec_u)

        prec_super += nb_user_super_successes / topK
        rec_super += nb_user_super_successes / max(len(u_super_items), 1)

        best_possible_hits = min(topK, nb_u_rated_items)
        ndcg += np.dot(user_successes, cg) / cg_sum[best_possible_hits - 1]

        cumsum_successes = np.cumsum(user_successes)
        mAP += np.dot(cumsum_successes / ranks, user_successes) / best_possible_hits

        hit_positions = user_successes.nonzero()[0]
        if hit_positions.size > 0:
            MRR += 1 / (hit_positions[0] + 1)

        LAUC += (np.dot(cumsum_successes, 1 - user_successes) +
                 (nb_user_successes + nb_u_rated_items) / 2 *
                 ((nb_items - nb_u_rated_items) - (topK - nb_user_successes))) / \
                ((nb_items - nb_u_rated_items) * nb_u_rated_items)

        HR += nb_user_successes > 0

    result = [
        ('precision', _average(prec, relevant_users)),
        ('recall', _average(rec, relevant_users)),
        ('F_1', _average(F_1, relevant_users)),
        ('F_05', _average(F_05, relevant_users)),
        ('precision_super', _average(prec_super, super_relevant_users)),
        ('recall_super', _average(rec_super, super_relevant_users)),
        ('NDCG', _average(ndcg, relevant_users)),
        ('mAP', _average(mAP, relevant_users)),
        ('MRR', _average(MRR, relevant_users)),
        ('LAUC', _average(LAUC, relevant_users)),
        ('HR', _average(HR, relevant_users)),
    ]
    names, values = zip(*result)
    return pd.DataFrame([list(values)], columns=list(names))
def estimations_metrics(test_ui, estimations):
    """Compute RMSE and MAE of predicted scores against the test ratings.

    Parameters
    ----------
    test_ui : scipy.sparse.csr_matrix
        User x item matrix of true test ratings.
    estimations : scipy.sparse.csr_matrix
        Predicted scores stored at exactly the same (user, item) positions
        as ``test_ui``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``RMSE`` and ``MAE``. Both are NaN
        when the matrices store no entries.

    Raises
    ------
    ValueError
        If the two matrices do not store the same positions. The errors are
        taken element-wise over the ``data`` arrays, so a mismatched layout
        would pair predictions with the wrong ratings.
    """
    same_layout = (np.array_equal(estimations.indptr, test_ui.indptr)
                   and np.array_equal(estimations.indices, test_ui.indices))
    if not same_layout:
        raise ValueError(
            "estimations and test_ui must store the same (user, item) positions")

    if estimations.nnz == 0:
        # Nothing to compare: report NaN explicitly rather than through a
        # 0/0 runtime warning.
        RMSE = MAE = float('nan')
    else:
        errors = estimations.data - test_ui.data
        RMSE = np.sqrt(np.sum(errors ** 2) / estimations.nnz)
        MAE = np.sum(np.abs(errors)) / estimations.nnz

    return pd.DataFrame([[RMSE, MAE]], columns=['RMSE', 'MAE'])
def diversity_metrics(test_ui, reco, topK=10):
    """Compute diversity statistics of the top-K recommendation lists.

    Parameters
    ----------
    test_ui : scipy.sparse.csr_matrix
        User x item matrix of test ratings; its stored column indices define
        the set of test items.
    reco : numpy.ndarray
        Column 0 holds user codes, the following columns hold recommended
        inner item codes in rank order. Code -1 marks an item that is absent
        from the test set.
    topK : int
        Only the first ``topK`` recommended items of each row are counted,
        the same cut-off ``ranking_metrics`` applies. Results are unchanged
        when ``reco`` has at most ``topK`` item columns.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Reco in test`` (share of
        recommendations that are test items), ``Test coverage`` (share of
        test item columns recommended at least once), ``Shannon`` (entropy
        of the recommendation frequencies) and ``Gini`` (Gini coefficient of
        those frequencies).
    """
    frequencies = defaultdict(int)

    # Every test item starts at zero so that never-recommended items still
    # take part in the distribution.
    for item in np.unique(test_ui.indices):
        frequencies[item] = 0

    # Count how often each item is recommended within the cut-off.
    for item in reco[:, 1:topK + 1].flat:
        frequencies[item] += 1

    # Recommendations of items unknown to the test set are tallied apart.
    nb_reco_outside_test = frequencies.pop(-1, 0)

    frequencies = np.array(list(frequencies.values()))
    nb_rec_items = len(frequencies[frequencies > 0])
    nb_reco_inside_test = np.sum(frequencies)

    frequencies = frequencies / np.sum(frequencies)
    frequencies = np.sort(frequencies)  # ascending order is required by the Gini formula

    # Items with zero frequency give log(0) = -inf; replace it with 0 and
    # silence the division warning so they add nothing to the entropy.
    with np.errstate(divide='ignore'):
        log_frequencies = np.nan_to_num(np.log(frequencies), posinf=0, neginf=0)

    result = [
        ('Reco in test', nb_reco_inside_test / (nb_reco_inside_test + nb_reco_outside_test)),
        ('Test coverage', nb_rec_items / test_ui.shape[1]),
        ('Shannon', -np.dot(frequencies, log_frequencies)),
        ('Gini', np.dot(frequencies, np.arange(1 - len(frequencies), len(frequencies), 2)) / (len(frequencies) - 1)),
    ]
    names, values = zip(*result)
    return pd.DataFrame([list(values)], columns=list(names))
def evaluate_all(test,
                 dir_path="Recommendations generated/ml-100k/",
                 super_reactions=[4,5],
                 topK=10):
    """Evaluate every model whose output files are stored in ``dir_path``.

    A model name is the first two ``_``-separated tokens of a file name.
    For each model the files ``<model>_estimations.csv`` and
    ``<model>_reco.csv`` are loaded and passed to ``evaluate``.

    Parameters
    ----------
    test : pandas.DataFrame
        Test interactions, forwarded unchanged to ``evaluate``.
    dir_path : str
        Directory holding the model files, with or without a trailing
        path separator.
    super_reactions : list
        Forwarded to ``evaluate``. The default is only read, never mutated.
    topK : int
        Forwarded to ``evaluate``.

    Returns
    -------
    pandas.DataFrame
        One row per model with a leading ``Model`` column, sorted by
        ``recall`` in descending order.

    Raises
    ------
    ValueError
        If ``dir_path`` contains no files and hence no model to evaluate.
    """
    # os.path.join keeps this working when dir_path has no trailing slash;
    # sorting makes the evaluation order deterministic.
    models = sorted({'_'.join(f.split('_')[:2])
                     for f in listdir(dir_path)
                     if isfile(join(dir_path, f))})
    if not models:
        raise ValueError("No model files found in {!r}".format(dir_path))

    result = []
    for model in models:
        estimations_df = pd.read_csv(
            join(dir_path, '{}_estimations.csv'.format(model)), header=None)
        reco = np.loadtxt(
            join(dir_path, '{}_reco.csv'.format(model)), delimiter=',')
        to_append = evaluate(test, estimations_df, reco, super_reactions, topK)
        to_append.insert(0, "Model", model)
        result.append(to_append)

    result = pd.concat(result)
    return result.sort_values(by='recall', ascending=False)