import math
import os
import random
import sys
from collections import defaultdict
from datetime import date, datetime
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import scipy.sparse as sparse
from sklearn.preprocessing import normalize
from tqdm import tqdm


def _single_row_frame(named_values):
    """Build a one-row DataFrame whose columns are the given metric names."""
    names = [name for name, _ in named_values]
    values = [value for _, value in named_values]
    return pd.DataFrame([values], columns=names)


def _mean_or_zero(total, count):
    """Return ``total / count``, or 0.0 when there is nothing to average over."""
    return total / count if count else 0.0


def evaluate(test, estimations_df, reco, super_reactions=(4, 5), topK=10):
    """Compute estimation, ranking and diversity metrics for one model.

    Args:
        test: DataFrame with exactly four columns, interpreted in order as
            user, item, rating, timestamp.
        estimations_df: DataFrame with exactly three columns, interpreted in
            order as user, item, score. Every user and item must also occur
            in ``test`` (a ``KeyError`` is raised otherwise).
        reco: 2-D array; column 0 holds user ids and columns 1, 3, 5, ...
            hold recommended item ids (the columns in between are skipped --
            presumably scores).
        super_reactions: rating values counted as "super" reactions.
        topK: number of leading recommendations evaluated per user.

    Returns:
        A one-row DataFrame concatenating the columns produced by
        ``estimations_metrics``, ``ranking_metrics`` and ``diversity_metrics``.
    """
    estimations_df = estimations_df.copy()
    reco = reco.copy()
    test_df = test.copy()

    # Prepare the test set: map raw ids to dense codes and build a
    # user x item rating matrix.
    test_df.columns = ['user', 'item', 'rating', 'timestamp']
    user_categories = test_df['user'].astype("category")
    item_categories = test_df['item'].astype("category")
    test_df['user_code'] = user_categories.cat.codes
    test_df['item_code'] = item_categories.cat.codes
    user_id_code = {
        user_id: code
        for code, user_id in enumerate(user_categories.cat.categories)
    }
    item_id_code = {
        item_id: code
        for code, item_id in enumerate(item_categories.cat.categories)
    }
    test_ui = sparse.csr_matrix(
        (test_df['rating'], (test_df['user_code'], test_df['item_code']))
    )

    # Prepare the estimations with the same codes and the same shape.
    estimations_df.columns = ['user', 'item', 'score']
    estimations_df['user_code'] = [
        user_id_code[user] for user in estimations_df['user']
    ]
    estimations_df['item_code'] = [
        item_id_code[item] for item in estimations_df['item']
    ]
    estimations = sparse.csr_matrix(
        (
            estimations_df['score'],
            (estimations_df['user_code'], estimations_df['item_code']),
        ),
        shape=test_ui.shape,
    )
    estimations_result = estimations_metrics(test_ui, estimations)

    # Prepare the recommendations: use inner codes instead of real ids.
    # Users/items that do not occur in the test set are mapped to -1.
    # ``dict.get`` keeps the lookup tables unmodified, and ``otypes`` lets
    # the conversion work on an empty recommendation array.
    to_user_code = np.vectorize(
        lambda x: user_id_code.get(x, -1), otypes=[np.int64]
    )
    to_item_code = np.vectorize(
        lambda x: item_id_code.get(x, -1), otypes=[np.int64]
    )
    users = to_user_code(reco[:, :1])
    items = to_item_code(reco[:, 1::2])
    reco = np.concatenate((users, items), axis=1)

    ranking_df = ranking_metrics(
        test_ui, reco, super_reactions=super_reactions, topK=topK
    )
    diversity_df = diversity_metrics(test_ui, reco, topK)

    return pd.concat([estimations_result, ranking_df, diversity_df], axis=1)


def ranking_metrics(test_ui, reco, super_reactions=(), topK=10):
    """Compute top-K ranking metrics averaged over users present in the test set.

    Args:
        test_ui: CSR user x item matrix of test ratings.
        reco: 2-D array; column 0 holds user codes (row indices of
            ``test_ui``, negative for unknown users) and the following
            columns hold recommended item codes.
        super_reactions: rating values counted as "super" reactions.
        topK: number of leading recommendations evaluated per user.

    Returns:
        A one-row DataFrame with columns precision, recall, F_1, F_05,
        precision_super, recall_super, NDCG, mAP, MRR, LAUC, HR, HitRate2
        and HitRate3. An average with no contributing user (no relevant
        user, or no user with a super reaction) is reported as 0.0.
    """
    nb_items = test_ui.shape[1]
    super_values = list(super_reactions)

    relevant_users = 0
    super_relevant_users = 0
    prec = rec = F_1 = F_05 = prec_super = rec_super = 0.0
    ndcg = mAP = MRR = LAUC = 0.0
    HR = HitRate2 = HitRate3 = 0

    cg = 1.0 / np.log2(np.arange(2, topK + 2))
    cg_sum = np.cumsum(cg)
    positions = np.arange(1, topK + 1)

    for nb_user, user in tqdm(enumerate(reco[:, 0])):
        user = int(user)
        if user < 0:
            # User unknown to the test set: nothing to evaluate against.
            continue

        start, end = test_ui.indptr[user], test_ui.indptr[user + 1]
        u_rated_items = test_ui.indices[start:end]
        nb_u_rated_items = len(u_rated_items)
        if nb_u_rated_items == 0:
            # Skip users with no items in the test set.
            continue
        relevant_users += 1

        # Read the ratings straight from the CSR data slice; accessing
        # test_ui[user, item] element by element is expensive.
        u_super_items = u_rated_items[
            np.isin(test_ui.data[start:end], super_values)
        ]
        nb_u_super_items = len(u_super_items)
        if nb_u_super_items > 0:
            super_relevant_users += 1

        # Hit indicators for the top-K recommended items (zero-padded when
        # fewer than topK items were recommended).
        top_items = reco[nb_user, 1:topK + 1]
        user_successes = np.zeros(topK)
        user_successes[:len(top_items)] = np.isin(top_items, u_rated_items)
        nb_user_successes = int(user_successes.sum())
        nb_user_super_successes = int(np.isin(top_items, u_super_items).sum())

        prec_u = nb_user_successes / topK
        rec_u = nb_user_successes / nb_u_rated_items
        prec += prec_u
        rec += rec_u

        if prec_u + rec_u > 0:
            F_1 += 2 * (prec_u * rec_u) / (prec_u + rec_u)
            F_05 += (
                (0.5 ** 2 + 1) * (prec_u * rec_u)
                / (0.5 ** 2 * prec_u + rec_u)
            )

        prec_super += nb_user_super_successes / topK
        # max(..., 1) makes the contribution 0 when the user has no super items.
        rec_super += nb_user_super_successes / max(nb_u_super_items, 1)

        best_possible_hits = min(topK, nb_u_rated_items)
        ndcg += np.dot(user_successes, cg) / cg_sum[best_possible_hits - 1]

        cumsum_successes = np.cumsum(user_successes)
        mAP += (
            np.dot(cumsum_successes / positions, user_successes)
            / best_possible_hits
        )

        hit_positions = user_successes.nonzero()[0]
        if hit_positions.size > 0:
            MRR += 1 / (hit_positions[0] + 1)

        nb_u_unrated_items = nb_items - nb_u_rated_items
        LAUC += (
            np.dot(cumsum_successes, 1 - user_successes)
            + (nb_user_successes + nb_u_rated_items) / 2
            * (nb_u_unrated_items - (topK - nb_user_successes))
        ) / (nb_u_unrated_items * nb_u_rated_items)

        HR += nb_user_successes > 0
        HitRate2 += nb_user_successes > 1
        HitRate3 += nb_user_successes > 2

    return _single_row_frame([
        ('precision', _mean_or_zero(prec, relevant_users)),
        ('recall', _mean_or_zero(rec, relevant_users)),
        ('F_1', _mean_or_zero(F_1, relevant_users)),
        ('F_05', _mean_or_zero(F_05, relevant_users)),
        ('precision_super', _mean_or_zero(prec_super, super_relevant_users)),
        ('recall_super', _mean_or_zero(rec_super, super_relevant_users)),
        ('NDCG', _mean_or_zero(ndcg, relevant_users)),
        ('mAP', _mean_or_zero(mAP, relevant_users)),
        ('MRR', _mean_or_zero(MRR, relevant_users)),
        ('LAUC', _mean_or_zero(LAUC, relevant_users)),
        ('HR', _mean_or_zero(HR, relevant_users)),
        ('HitRate2', _mean_or_zero(HitRate2, relevant_users)),
        ('HitRate3', _mean_or_zero(HitRate3, relevant_users)),
    ])


def estimations_metrics(test_ui, estimations):
    """Compute RMSE and MAE between estimated and true test ratings.

    The stored entries of both matrices are compared position by position,
    so both matrices must store the same (user, item) cells in the same
    order.

    Args:
        test_ui: sparse matrix of true ratings.
        estimations: sparse matrix of estimated ratings.

    Returns:
        A one-row DataFrame with columns RMSE and MAE.

    Raises:
        ValueError: if the two matrices store a different number of entries.
    """
    if estimations.data.shape != test_ui.data.shape:
        # Without this check a single-entry matrix would be silently
        # broadcast against the other one.
        raise ValueError(
            "estimations and test_ui must store the same entries "
            "({} vs {} stored values)".format(
                estimations.data.shape[0], test_ui.data.shape[0]
            )
        )

    errors = estimations.data - test_ui.data
    RMSE = np.sqrt(np.sum(errors ** 2) / estimations.nnz)
    MAE = np.sum(np.abs(errors)) / estimations.nnz

    return _single_row_frame([('RMSE', RMSE), ('MAE', MAE)])


def diversity_metrics(test_ui, reco, topK=10):
    """Compute coverage and diversity statistics of the top-K recommendations.

    Args:
        test_ui: CSR user x item matrix of test ratings.
        reco: 2-D array; column 0 holds user codes and the following columns
            hold recommended item codes, with -1 marking items outside the
            test set.
        topK: number of leading recommendations per user taken into account.

    Returns:
        A one-row DataFrame with columns 'Reco in test', 'Test coverage',
        'Shannon' and 'Gini'.
    """
    # Start every item seen in the test set at zero so that items which are
    # never recommended still take part in the distribution.
    frequencies = defaultdict(int)
    for item in set(test_ui.indices):
        frequencies[item] = 0
    for item in reco[:, 1:topK + 1].flat:
        frequencies[item] += 1

    # Code -1 collects all recommendations of items outside the test set.
    nb_reco_outside_test = frequencies.pop(-1, 0)

    frequencies = np.array(list(frequencies.values()))
    nb_rec_items = len(frequencies[frequencies > 0])
    nb_reco_inside_test = np.sum(frequencies)

    frequencies = np.sort(frequencies / np.sum(frequencies))

    with np.errstate(divide='ignore'):
        # Use 0 for items with zero frequency and ignore the division warning.
        log_frequencies = np.nan_to_num(
            np.log(frequencies), posinf=0, neginf=0
        )

    nb_frequencies = len(frequencies)
    gini_weights = np.arange(1 - nb_frequencies, nb_frequencies, 2)

    return _single_row_frame([
        (
            'Reco in test',
            nb_reco_inside_test / (nb_reco_inside_test + nb_reco_outside_test),
        ),
        ('Test coverage', nb_rec_items / test_ui.shape[1]),
        ('Shannon', -np.dot(frequencies, log_frequencies)),
        ('Gini', np.dot(frequencies, gini_weights) / (nb_frequencies - 1)),
    ])


def evaluate_all(test, dir_path="Recommendations generated/ml-100k/",
                 super_reactions=(4, 5), topK=10):
    """Evaluate every model that has files in ``dir_path``.

    A model name is the first two underscore-separated tokens of a file
    name; for each model ``<model>_estimations.csv`` and
    ``<model>_reco.csv`` are read from ``dir_path``.

    Args:
        test: test DataFrame, passed to ``evaluate``.
        dir_path: directory with the model files (a trailing separator is
            optional).
        super_reactions: rating values counted as "super" reactions.
        topK: number of leading recommendations evaluated per user.

    Returns:
        A DataFrame with one row per model, a leading 'Model' column, sorted
        by 'recall' in descending order.
    """
    models = sorted({
        '_'.join(f.split('_')[:2])
        for f in listdir(dir_path)
        if isfile(join(dir_path, f))
    })

    result = []
    for model in models:
        estimations_df = pd.read_csv(
            join(dir_path, '{}_estimations.csv'.format(model)), header=None
        )
        # ndmin=2 keeps a single-row file two-dimensional.
        reco = np.loadtxt(
            join(dir_path, '{}_reco.csv'.format(model)),
            delimiter=',',
            ndmin=2,
        )
        to_append = evaluate(test, estimations_df, reco, super_reactions, topK)
        to_append.insert(0, "Model", model)
        result.append(to_append)

    result = pd.concat(result)
    return result.sort_values(by='recall', ascending=False)