216 lines
8.5 KiB
Python
216 lines
8.5 KiB
Python
import os
|
|
import sys
|
|
import numpy as np
|
|
import pandas as pd
|
|
import math
|
|
from sklearn.preprocessing import normalize
|
|
from tqdm import tqdm
|
|
from datetime import datetime, date
|
|
import random
|
|
import scipy.sparse as sparse
|
|
from os import listdir
|
|
from os.path import isfile, join
|
|
from collections import defaultdict
|
|
|
|
|
|
def evaluate(test,
             estimations_df,
             reco,
             super_reactions=(4, 5),
             topK=10):
    """Compute estimation, ranking and diversity metrics for one model.

    Args:
        test: DataFrame with exactly four columns, interpreted positionally
            as (user, item, rating, timestamp). It is copied, not modified.
        estimations_df: DataFrame with exactly three columns, interpreted
            positionally as (user, item, score). It is copied, not modified.
        reco: array whose first column holds user ids; every second column
            starting at column 1 is read as a recommended item id (the
            columns in between are skipped - presumably scores).
        super_reactions: rating values forwarded to ``ranking_metrics``.
        topK: cut-off forwarded to ``ranking_metrics`` and
            ``diversity_metrics``.

    Returns:
        A DataFrame made by concatenating, column-wise, the results of
        ``estimations_metrics``, ``ranking_metrics`` and
        ``diversity_metrics``.

    Raises:
        KeyError: if ``estimations_df`` contains a user or an item that does
            not occur in ``test``.
    """
    estimations_df = estimations_df.copy()
    # np.array copies; atleast_2d keeps a single-row recommendation list
    # (which arrives 1-D from e.g. np.loadtxt) usable with 2-D slicing.
    reco = np.atleast_2d(np.array(reco))
    test_df = test.copy()

    # --- test set: encode raw ids as consecutive integer codes ---
    test_df.columns = ['user', 'item', 'rating', 'timestamp']
    user_categories = test_df['user'].astype("category")
    item_categories = test_df['item'].astype("category")
    test_df['user_code'] = user_categories.cat.codes
    test_df['item_code'] = item_categories.cat.codes

    user_id_code = {raw_id: code
                    for code, raw_id in enumerate(user_categories.cat.categories)}
    item_id_code = {raw_id: code
                    for code, raw_id in enumerate(item_categories.cat.categories)}

    test_ui = sparse.csr_matrix(
        (test_df['rating'], (test_df['user_code'], test_df['item_code'])))

    # --- estimations: same coding, same shape as the test matrix ---
    estimations_df.columns = ['user', 'item', 'score']
    estimations_df['user_code'] = [user_id_code[user]
                                   for user in estimations_df['user']]
    estimations_df['item_code'] = [item_id_code[item]
                                   for item in estimations_df['item']]
    estimations = sparse.csr_matrix(
        (estimations_df['score'],
         (estimations_df['user_code'], estimations_df['item_code'])),
        shape=test_ui.shape)

    estimation_scores = estimations_metrics(test_ui, estimations)

    # --- recommendations: translate raw ids to inner codes ---
    users = reco[:, :1]
    items = reco[:, 1::2]
    # Recommended users/items may be absent from the test set; they get the
    # sentinel code -1. ``otypes`` lets np.vectorize accept empty input and
    # ``.get`` avoids inserting the sentinel into the lookup tables.
    to_user_code = np.vectorize(lambda raw_id: user_id_code.get(raw_id, -1),
                                otypes=[int])
    to_item_code = np.vectorize(lambda raw_id: item_id_code.get(raw_id, -1),
                                otypes=[int])
    reco = np.concatenate((to_user_code(users), to_item_code(items)), axis=1)

    ranking_df = ranking_metrics(test_ui, reco,
                                 super_reactions=super_reactions, topK=topK)
    diversity_df = diversity_metrics(test_ui, reco, topK)

    return pd.concat([estimation_scores, ranking_df, diversity_df], axis=1)
|
|
|
|
|
|
def ranking_metrics(test_ui, reco, super_reactions=(), topK=10):
    """Average top-K ranking metrics over users that have test interactions.

    Args:
        test_ui: CSR user-item matrix of test ratings (``indptr``,
            ``indices``, ``data`` and ``shape`` are read).
        reco: 2-D integer array; column 0 is the user code (row of
            ``test_ui``), columns ``1..topK`` are recommended item codes.
            Codes of -1 never match a test interaction.
        super_reactions: rating values that mark an interaction as "super".
        topK: number of leading recommendations evaluated per user.

    Returns:
        One-row DataFrame with columns precision, recall, F_1, F_05,
        precision_super, recall_super, NDCG, mAP, MRR, LAUC, HR, HR2.
        precision_super / recall_super are averaged over users having at
        least one super item, everything else over users having at least one
        test interaction. When the relevant user count is zero the metric is
        reported as 0.0 instead of raising ZeroDivisionError.
    """
    metric_names = ('precision', 'recall', 'F_1', 'F_05',
                    'precision_super', 'recall_super',
                    'NDCG', 'mAP', 'MRR', 'LAUC', 'HR', 'HR2')
    super_metric_names = ('precision_super', 'recall_super')

    nb_items = test_ui.shape[1]
    super_values = list(super_reactions)  # np.isin needs a sequence, not a set

    # Positional discounts 1/log2(rank+1) and their running sum (ideal DCG).
    discounts = 1.0 / np.log2(np.arange(2, topK + 2))
    ideal_dcg = np.cumsum(discounts)
    ranks = np.arange(1, topK + 1)

    totals = dict.fromkeys(metric_names, 0.0)
    relevant_users = 0
    super_relevant_users = 0

    for nb_user, user in tqdm(enumerate(reco[:, 0])):
        if user < 0:
            # user not present in the test set
            continue
        row_start, row_stop = test_ui.indptr[user], test_ui.indptr[user + 1]
        u_rated_items = test_ui.indices[row_start:row_stop]
        nb_u_rated_items = len(u_rated_items)
        if nb_u_rated_items == 0:
            continue
        relevant_users += 1

        # Read ratings straight from the CSR arrays; test_ui[user, item]
        # lookups would be far more expensive.
        u_ratings = test_ui.data[row_start:row_stop]
        u_super_items = u_rated_items[np.isin(u_ratings, super_values)]
        if len(u_super_items) > 0:
            super_relevant_users += 1

        top_items = reco[nb_user, 1:topK + 1]
        # 1.0 at every evaluated position holding an item the user rated;
        # positions beyond the available recommendations stay 0.
        user_successes = np.zeros(topK)
        user_successes[:len(top_items)] = np.isin(top_items, u_rated_items)
        nb_user_successes = int(user_successes.sum())
        nb_user_super_successes = int(np.isin(top_items, u_super_items).sum())

        prec_u = nb_user_successes / topK
        rec_u = nb_user_successes / nb_u_rated_items
        totals['precision'] += prec_u
        totals['recall'] += rec_u
        if prec_u + rec_u > 0:
            totals['F_1'] += 2 * (prec_u * rec_u) / (prec_u + rec_u)
            totals['F_05'] += ((0.5 ** 2 + 1) * (prec_u * rec_u)
                               / (0.5 ** 2 * prec_u + rec_u))

        totals['precision_super'] += nb_user_super_successes / topK
        totals['recall_super'] += (nb_user_super_successes
                                   / max(len(u_super_items), 1))

        max_hits = min(topK, nb_u_rated_items)
        totals['NDCG'] += (np.dot(user_successes, discounts)
                           / ideal_dcg[max_hits - 1])

        cumsum_successes = np.cumsum(user_successes)
        totals['mAP'] += (np.dot(cumsum_successes / ranks, user_successes)
                          / max_hits)

        hit_positions = np.flatnonzero(user_successes)
        if hit_positions.size > 0:
            totals['MRR'] += 1 / (hit_positions[0] + 1)

        # NOTE: the denominator is zero when the user rated every item of
        # test_ui; that case is left as in the original formula.
        nb_u_unrated_items = nb_items - nb_u_rated_items
        totals['LAUC'] += (
            (np.dot(cumsum_successes, 1 - user_successes)
             + (nb_user_successes + nb_u_rated_items) / 2
             * (nb_u_unrated_items - (topK - nb_user_successes)))
            / (nb_u_unrated_items * nb_u_rated_items))

        totals['HR'] += nb_user_successes > 0
        totals['HR2'] += nb_user_successes > 1

    averages = {}
    for name in metric_names:
        nb_users = (super_relevant_users if name in super_metric_names
                    else relevant_users)
        averages[name] = totals[name] / nb_users if nb_users else 0.0

    return pd.DataFrame([averages], columns=list(metric_names))
|
|
|
|
|
|
def estimations_metrics(test_ui, estimations):
    """Compute RMSE and MAE between estimated and true test ratings.

    Both matrices must store a value for exactly the same (user, item)
    cells. The errors are computed cell by cell over those stored entries.

    Args:
        test_ui: sparse user-item matrix of true ratings.
        estimations: sparse user-item matrix of estimated ratings.

    Returns:
        One-row DataFrame with columns 'RMSE' and 'MAE'. Both are NaN when
        no entries are stored.

    Raises:
        ValueError: if the two matrices do not store the same cells, since
            subtracting their raw ``data`` arrays would then pair unrelated
            ratings and yield meaningless numbers.
    """
    def _stored_entries(matrix):
        # Work on a copy in canonical CSR form (duplicates summed, column
        # indices sorted) so stored entries come out in row-major order.
        canonical = sparse.csr_matrix(matrix, copy=True)
        canonical.sum_duplicates()
        coordinates = canonical.tocoo()
        return coordinates.row, coordinates.col, canonical.data

    true_rows, true_cols, true_values = _stored_entries(test_ui)
    est_rows, est_cols, est_values = _stored_entries(estimations)

    if not (np.array_equal(true_rows, est_rows)
            and np.array_equal(true_cols, est_cols)):
        raise ValueError(
            "estimations and test_ui must store values for exactly the same "
            "(user, item) cells")

    if est_values.size == 0:
        rmse = mae = float('nan')
    else:
        errors = est_values - true_values
        rmse = np.mean(errors ** 2) ** (1 / 2)
        mae = np.mean(np.abs(errors))

    return pd.DataFrame([[rmse, mae]], columns=['RMSE', 'MAE'])
|
|
|
|
def diversity_metrics(test_ui, reco, topK=10):
    """Measure how the top-K recommendations spread over the test items.

    Args:
        test_ui: CSR user-item matrix of test ratings; its ``indices`` give
            the items present in the test set and ``shape[1]`` the number of
            item columns.
        reco: 2-D array; column 0 is the user, columns ``1..topK`` are
            recommended item codes, with -1 marking an item outside the test
            set.
        topK: number of leading recommendations per user taken into account
            (the same cut-off ``ranking_metrics`` uses).

    Returns:
        One-row DataFrame with columns:
        'Reco in test'  - share of considered recommendations whose item
                          code is not -1,
        'Test coverage' - number of distinct recommended items divided by
                          ``test_ui.shape[1]``,
        'Shannon'       - entropy (natural log) of the recommendation shares,
        'Gini'          - Gini index of the recommendation shares.
        A ratio whose denominator is zero is reported as 0.0.
    """
    # Every item of the test set starts with zero recommendations so that
    # never-recommended items still take part in Shannon / Gini.
    frequencies = dict.fromkeys(np.unique(test_ui.indices).tolist(), 0)
    nb_reco_outside_test = 0

    top_items = np.asarray(reco)[:, 1:topK + 1]
    rec_ids, rec_counts = np.unique(top_items, return_counts=True)
    for item, count in zip(rec_ids.tolist(), rec_counts.tolist()):
        if item == -1:
            nb_reco_outside_test = count
        else:
            frequencies[item] = frequencies.get(item, 0) + count

    counts = np.array(list(frequencies.values()), dtype=float)
    nb_rec_items = int(np.count_nonzero(counts))
    nb_reco_inside_test = counts.sum()
    nb_reco = nb_reco_inside_test + nb_reco_outside_test

    if nb_reco_inside_test > 0:
        shares = np.sort(counts / nb_reco_inside_test)
    else:
        shares = np.zeros_like(counts)

    # Items with a zero share contribute nothing to the entropy, so the
    # logarithm is only taken over positive shares.
    positive_shares = shares[shares > 0]
    shannon = -np.sum(positive_shares * np.log(positive_shares))

    nb_shares = len(shares)
    if nb_shares > 1:
        gini = (np.dot(shares, np.arange(1 - nb_shares, nb_shares, 2))
                / (nb_shares - 1))
    else:
        gini = 0.0

    nb_test_columns = test_ui.shape[1]
    metrics = {
        'Reco in test': nb_reco_inside_test / nb_reco if nb_reco > 0 else 0.0,
        'Test coverage': (nb_rec_items / nb_test_columns
                          if nb_test_columns > 0 else 0.0),
        'Shannon': shannon,
        'Gini': gini,
    }
    return pd.DataFrame([metrics], columns=list(metrics))
|
|
|
|
|
|
|
|
def evaluate_all(test,
                 dir_path="Recommendations generated/ml-100k/",
                 super_reactions=(4, 5),
                 topK=10):
    """Evaluate every model whose output files are stored in ``dir_path``.

    A model name is the first two ``_``-separated parts of a file name. For
    each model both ``<model>_estimations.csv`` and ``<model>_reco.csv`` are
    read and passed to ``evaluate``.

    Args:
        test: test set, forwarded unchanged to ``evaluate``.
        dir_path: directory holding the model files; a trailing path
            separator is optional.
        super_reactions: forwarded to ``evaluate``.
        topK: forwarded to ``evaluate``.

    Returns:
        DataFrame with one row per evaluated model, a leading 'Model' column,
        sorted by 'recall' in descending order. An empty DataFrame when the
        directory contains no complete model.
    """
    import warnings

    # join() copes with dir_path given with or without a trailing separator;
    # sorting makes the evaluation order reproducible.
    models = sorted({'_'.join(file_name.split('_')[:2])
                     for file_name in listdir(dir_path)
                     if isfile(join(dir_path, file_name))})

    result = []
    for model in models:
        estimations_path = join(dir_path, '{}_estimations.csv'.format(model))
        reco_path = join(dir_path, '{}_reco.csv'.format(model))
        has_estimations = isfile(estimations_path)
        has_reco = isfile(reco_path)
        if not (has_estimations and has_reco):
            # Unrelated files are skipped silently; a model with only one of
            # its two files is reported so that it does not vanish unnoticed.
            if has_estimations or has_reco:
                warnings.warn(
                    "Skipping model '{}': both '{}' and '{}' are required"
                    .format(model, estimations_path, reco_path))
            continue

        estimations_df = pd.read_csv(estimations_path, header=None)
        # ndmin=2 keeps a file with a single recommendation row 2-D.
        reco = np.loadtxt(reco_path, delimiter=',', ndmin=2)
        to_append = evaluate(test, estimations_df, reco, super_reactions, topK)

        to_append.insert(0, "Model", model)
        result.append(to_append)

    if not result:
        return pd.DataFrame()

    result = pd.concat(result)
    return result.sort_values(by='recall', ascending=False)