### Prepare test set

In [0]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
from tqdm import tqdm

# In evaluation we do not load train set - it is not needed
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
test.columns=['user', 'item', 'rating', 'timestamp']

test['user_code'] = test['user'].astype("category").cat.codes
test['item_code'] = test['item'].astype("category").cat.codes

user_code_id = dict(enumerate(test['user'].astype("category").cat.categories))
user_id_code = dict((v, k) for k, v in user_code_id.items())
item_code_id = dict(enumerate(test['item'].astype("category").cat.categories))
item_id_code = dict((v, k) for k, v in item_code_id.items())

test_ui = sparse.csr_matrix((test['rating'], (test['user_code'], test['item_code'])))

In [0]:
import os
if not os.path.exists('./Recommendations generated/'):
        os.mkdir('./Recommendations generated/')
        os.mkdir('./Recommendations generated/ml-100k/')
        os.mkdir('./Recommendations generated/toy-example/')

### Estimations metrics

In [0]:
estimations_df=pd.read_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', header=None)
estimations_df.columns=['user', 'item' ,'score']

estimations_df['user_code']=[user_id_code[user] for user in estimations_df['user']]
estimations_df['item_code']=[item_id_code[item] for item in estimations_df['item']]
estimations=sparse.csr_matrix((estimations_df['score'], (estimations_df['user_code'], estimations_df['item_code'])), shape=test_ui.shape)

In [0]:
def estimations_metrics(test_ui, estimations):
    result=[]

    RMSE=(np.sum((estimations.data-test_ui.data)**2)/estimations.nnz)**(1/2)
    result.append(['RMSE', RMSE])

    MAE=np.sum(abs(estimations.data-test_ui.data))/estimations.nnz
    result.append(['MAE', MAE])
    
    df_result=(pd.DataFrame(list(zip(*result))[1])).T
    df_result.columns=list(zip(*result))[0]
    return df_result

In [0]:
# in case of error (in the laboratories) you might have to switch to the other version of pandas
# try !pip3 install pandas=='1.0.3' (or pip if you use python 2) and restart the kernel

estimations_metrics(test_ui, estimations)

Unnamed: 0,RMSE,MAE
0,0.949459,0.752487


### Ranking metrics

In [0]:
import numpy as np
reco = np.loadtxt('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', delimiter=',')
# Let's ignore scores - they are not used in evaluation: 
users=reco[:,:1]
items=reco[:,1::2]
# Let's use inner ids instead of real ones
users=np.vectorize(lambda x: user_id_code.setdefault(x, -1))(users)
items=np.vectorize(lambda x: item_id_code.setdefault(x, -1))(items) # maybe items we recommend are not in test set
# Let's put them into one array
reco=np.concatenate((users, items), axis=1)
reco

array([[663, 475,  62, ..., 472, 269, 503],
       [ 48, 313, 475, ..., 591, 175, 466],
       [351, 313, 475, ..., 591, 175, 466],
       ...,
       [259, 313, 475, ...,  11, 591, 175],
       [ 33, 313, 475, ...,  11, 591, 175],
       [ 77, 313, 475, ...,  11, 591, 175]])

In [0]:
def ranking_metrics(test_ui, reco, super_reactions=[], topK=10):
    
    nb_items=test_ui.shape[1]
    relevant_users, super_relevant_users, prec, rec, F_1, F_05, prec_super, rec_super, ndcg, mAP, MRR, LAUC, HR=\
    0,0,0,0,0,0,0,0,0,0,0,0,0
    
    cg = (1.0 / np.log2(np.arange(2, topK + 2)))
    cg_sum = np.cumsum(cg)
    
    for (nb_user, user) in tqdm(enumerate(reco[:,0])):
        u_rated_items=test_ui.indices[test_ui.indptr[user]:test_ui.indptr[user+1]]
        nb_u_rated_items=len(u_rated_items)
        if nb_u_rated_items>0: # skip users with no items in test set (still possible that there will be no super items)
            relevant_users+=1
            
            u_super_items=u_rated_items[np.vectorize(lambda x: x in super_reactions)\
            (test_ui.data[test_ui.indptr[user]:test_ui.indptr[user+1]])]
            # more natural seems u_super_items=[item for item in u_rated_items if test_ui[user,item] in super_reactions]
            # but accesing test_ui[user,item] is expensive -we should avoid doing it
            if len(u_super_items)>0:
                super_relevant_users+=1
            
            user_successes=np.zeros(topK)
            nb_user_successes=0
            user_super_successes=np.zeros(topK)
            nb_user_super_successes=0
            
            # evaluation
            for (item_position,item) in enumerate(reco[nb_user,1:topK+1]):
                if item in u_rated_items:
                    user_successes[item_position]=1
                    nb_user_successes+=1
                    if item in u_super_items:
                        user_super_successes[item_position]=1
                        nb_user_super_successes+=1
                        
            prec_u=nb_user_successes/topK 
            prec+=prec_u
            
            rec_u=nb_user_successes/nb_u_rated_items
            rec+=rec_u
            
            F_1+=2*(prec_u*rec_u)/(prec_u+rec_u) if prec_u+rec_u>0 else 0
            F_05+=(0.5**2+1)*(prec_u*rec_u)/(0.5**2*prec_u+rec_u) if prec_u+rec_u>0 else 0
            
            prec_super+=nb_user_super_successes/topK
            rec_super+=nb_user_super_successes/max(len(u_super_items),1) # to set 0 if no super items
            ndcg+=np.dot(user_successes,cg)/cg_sum[min(topK, nb_u_rated_items)-1]
            
            cumsum_successes=np.cumsum(user_successes)
            mAP+=np.dot(cumsum_successes/np.arange(1,topK+1), user_successes)/min(topK, nb_u_rated_items)
            MRR+=1/(user_successes.nonzero()[0][0]+1) if user_successes.nonzero()[0].size>0 else 0
            LAUC+=(np.dot(cumsum_successes, 1-user_successes)+\
            (nb_user_successes+nb_u_rated_items)/2*((nb_items-nb_u_rated_items)-(topK-nb_user_successes)))/\
            ((nb_items-nb_u_rated_items)*nb_u_rated_items)
            
            HR+=nb_user_successes>0
            
            
    result=[]
    result.append(('precision', prec/relevant_users))
    result.append(('recall', rec/relevant_users))
    result.append(('F_1', F_1/relevant_users))
    result.append(('F_05', F_05/relevant_users))
    result.append(('precision_super', prec_super/super_relevant_users))
    result.append(('recall_super', rec_super/super_relevant_users))
    result.append(('NDCG', ndcg/relevant_users))
    result.append(('mAP', mAP/relevant_users))
    result.append(('MRR', MRR/relevant_users))
    result.append(('LAUC', LAUC/relevant_users))
    result.append(('HR', HR/relevant_users))

    df_result=(pd.DataFrame(list(zip(*result))[1])).T
    df_result.columns=list(zip(*result))[0]
    return df_result

In [0]:
ranking_metrics(test_ui, reco, super_reactions=[4,5], topK=10)

943it [00:00, 6981.89it/s]


Unnamed: 0,precision,recall,F_1,F_05,precision_super,recall_super,NDCG,mAP,MRR,LAUC,HR
0,0.09141,0.037652,0.04603,0.061286,0.079614,0.056463,0.095957,0.043178,0.198193,0.515501,0.437964


### Diversity metrics

In [0]:
def diversity_metrics(test_ui, reco, topK=10):
    
    frequencies=defaultdict(int)
    
    # let's assign 0 to all items in test set
    for item in list(set(test_ui.indices)):
        frequencies[item]=0
        
    # counting frequencies
    for item in reco[:,1:].flat:
        frequencies[item]+=1
        
    nb_reco_outside_test=frequencies[-1]
    del frequencies[-1]
    
    frequencies=np.array(list(frequencies.values()))
                         
    nb_rec_items=len(frequencies[frequencies>0])
    nb_reco_inside_test=np.sum(frequencies)
                         
    frequencies=frequencies/np.sum(frequencies)
    frequencies=np.sort(frequencies)
    
    with np.errstate(divide='ignore'): # let's put zeros put items with 0 frequency and ignore division warning
        log_frequencies=np.nan_to_num(np.log(frequencies), posinf=0, neginf=0)
                         
    result=[]
    result.append(('Reco in test', nb_reco_inside_test/(nb_reco_inside_test+nb_reco_outside_test)))
    result.append(('Test coverage', nb_rec_items/test_ui.shape[1]))
    result.append(('Shannon', -np.dot(frequencies, log_frequencies)))
    result.append(('Gini', np.dot(frequencies, np.arange(1-len(frequencies), len(frequencies), 2))/(len(frequencies)-1)))
    
    df_result=(pd.DataFrame(list(zip(*result))[1])).T
    df_result.columns=list(zip(*result))[0]
    return df_result

In [0]:
# in case of errors try !pip3 install numpy==1.18.4 (or pip if you use python 2) and restart the kernel

import evaluation_measures as ev
import imp
imp.reload(ev)

x=diversity_metrics(test_ui, reco, topK=10)
x

Unnamed: 0,Reco in test,Test coverage,Shannon,Gini
0,1.0,0.033911,2.836513,0.991139


# To be used in other notebooks

In [0]:
import evaluation_measures as ev
import imp
imp.reload(ev)

estimations_df=pd.read_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', header=None)
reco=np.loadtxt('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', delimiter=',')

ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None),
            estimations_df=estimations_df, 
            reco=reco,
            super_reactions=[4,5])
#also you can just type ev.evaluate_all(estimations_df, reco) - I put above values as default

943it [00:00, 6721.23it/s]


Unnamed: 0,RMSE,MAE,precision,recall,F_1,F_05,precision_super,recall_super,NDCG,mAP,MRR,LAUC,HR,HR2,Reco in test,Test coverage,Shannon,Gini
0,0.949459,0.752487,0.09141,0.037652,0.04603,0.061286,0.079614,0.056463,0.095957,0.043178,0.198193,0.515501,0.437964,0.239661,1.0,0.033911,2.836513,0.991139


In [0]:
import evaluation_measures as ev
import imp
imp.reload(ev)

dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

df=ev.evaluate_all(test, dir_path, super_reactions)
#also you can just type ev.evaluate_all() - I put above values as default

943it [00:00, 7140.87it/s]


In [0]:
df.iloc[:,:9]

Unnamed: 0,Model,RMSE,MAE,precision,recall,F_1,F_05,precision_super,recall_super
0,Ready_Baseline,0.949459,0.752487,0.09141,0.037652,0.04603,0.061286,0.079614,0.056463


In [0]:
df.iloc[:,np.append(0,np.arange(9, df.shape[1]))]

Unnamed: 0,Model,NDCG,mAP,MRR,LAUC,HR,HR2,Reco in test,Test coverage,Shannon,Gini
0,Ready_Baseline,0.095957,0.043178,0.198193,0.515501,0.437964,0.239661,1.0,0.033911,2.836513,0.991139


In [0]:
pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 2.6MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1675370 sha256=b9c28146ba19d464e7357de16052301c4261e1e492c668f6ec5ec59683796e4f
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


# Check metrics on toy dataset

In [0]:
import evaluation_measures as ev
import imp
import helpers
imp.reload(ev)

dir_path="Recommendations generated/toy-example/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None)

display(ev.evaluate_all(test, dir_path, super_reactions, topK=3))
#also you can just type ev.evaluate_all() - I put above values as default

toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
reco=pd.read_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', header=None)
estimations=pd.read_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', names=['user', 'item', 'est_score'])
toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \
toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)

print('Training data:')
display(toy_train_ui.todense())

print('Test data:')
display(toy_test_ui.todense())

print('Recommendations:')
display(reco)

print('Estimations:')
display(estimations)

943it [00:00, 12096.61it/s]


Unnamed: 0,Model,RMSE,MAE,precision,recall,F_1,F_05,precision_super,recall_super,NDCG,mAP,MRR,LAUC,HR,HR2,Reco in test,Test coverage,Shannon,Gini
0,Self_BaselineUI,0.967585,0.76274,0.000353,4.4e-05,7.9e-05,0.000147,0.0,0.0,0.000314,0.000177,0.00053,0.498923,0.00106,0.0,0.60053,0.005051,1.803126,0.99638


Training data:


matrix([[5, 3, 4, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [5, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Test data:


matrix([[0, 0, 0, ..., 0, 0, 0],
        [4, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 5, 0, ..., 0, 0, 0]], dtype=int64)

Recommendations:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,1,814,5.449584,1491,5.232561,1536,5.188667,1306,5.169081,1599,4.989625,1467,4.931335,1080,4.859644,1642,4.857250,1189,4.782040,1500,4.742546
1,2,814,5.580813,1491,5.363789,1536,5.319895,1306,5.300310,1599,5.120854,1467,5.062563,1080,4.990872,1642,4.988479,119,4.921909,1189,4.913268
2,3,814,4.615469,1491,4.398445,1536,4.354551,1306,4.334965,1599,4.155510,1467,4.097219,1080,4.025528,1642,4.023134,119,3.956565,1189,3.947924
3,4,814,6.244539,1491,6.027515,1536,5.983621,1306,5.964035,1599,5.784579,1467,5.726289,1080,5.654598,1642,5.652204,119,5.585635,1189,5.576994
4,5,814,4.738510,1491,4.521486,1536,4.477592,1306,4.458007,1599,4.278551,1467,4.220260,1080,4.148569,1642,4.146176,119,4.079607,1189,4.070965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,939,814,6.236644,1491,6.019620,1536,5.975726,1306,5.956140,1599,5.776685,1467,5.718394,1080,5.646703,1642,5.644310,119,5.577740,1189,5.569099
939,940,814,5.462440,1491,5.245416,1536,5.201522,1306,5.181936,1599,5.002481,1467,4.944190,1080,4.872499,1642,4.870106,119,4.803536,1189,4.794895
940,941,814,5.894539,1491,5.677515,1536,5.633621,1306,5.614035,1599,5.434579,1467,5.376289,1080,5.304598,1642,5.302204,119,5.235635,1189,5.226994
941,942,814,6.164380,1491,5.947356,1536,5.903462,1306,5.883876,1599,5.704421,1467,5.646130,1080,5.574439,1642,5.572046,119,5.505476,1189,5.496835


Estimations:


Unnamed: 0,user,item,est_score
0,1,5,3.457161
1,1,10,3.798540
2,1,25,3.435415
3,1,32,3.732018
4,1,33,3.531991
...,...,...,...
19995,943,928,2.907189
19996,943,1067,3.485929
19997,943,1074,2.861988
19998,943,1188,2.727428


# Sample recommendations

In [0]:
train=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
items=pd.read_csv('./Datasets/ml-100k/movies.csv')

user=random.choice(list(set(train['user'])))

train_content=pd.merge(train, items, left_on='item', right_on='id')

print('Here is what user rated high:')
display(train_content[train_content['user']==user][['user', 'rating', 'title', 'genres']]\
        .sort_values(by='rating', ascending=False)[:15])

reco = np.loadtxt('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', delimiter=',')
items=pd.read_csv('./Datasets/ml-100k/movies.csv')

# Let's ignore scores - they are not used in evaluation: 
reco_users=reco[:,:1]
reco_items=reco[:,1::2]
# Let's put them into one array
reco=np.concatenate((reco_users, reco_items), axis=1)

# Let's rebuild it user-item dataframe
recommended=[]
for row in reco:
    for rec_nb, entry in enumerate(row[1:]):
        recommended.append((row[0], rec_nb+1, entry))
recommended=pd.DataFrame(recommended, columns=['user','rec_nb', 'item'])

recommended_content=pd.merge(recommended, items, left_on='item', right_on='id')

print('Here is what we recommend:')
recommended_content[recommended_content['user']==user][['user', 'rec_nb', 'title', 'genres']].sort_values(by='rec_nb')

Here is what user rated high:


Unnamed: 0,user,rating,title,genres
5529,181,5,Jerry Maguire (1996),"Drama, Romance"
38687,181,4,"First Wives Club, The (1996)",Comedy
19390,181,4,Scream (1996),"Horror, Thriller"
63720,181,4,"Mirror Has Two Faces, The (1996)","Comedy, Romance"
57287,181,4,Evita (1996),"Drama, Musical"
57106,181,4,Up Close and Personal (1996),"Drama, Romance"
24776,181,4,Gattaca (1997),"Drama, Sci-Fi, Thriller"
11778,181,4,Mission: Impossible (1996),"Action, Adventure, Mystery"
68469,181,4,Eye for an Eye (1996),"Drama, Thriller"
25908,181,4,Dead Man Walking (1995),Drama


Here is what we recommend:


Unnamed: 0,user,rec_nb,title,genres
179,181.0,1,"Great Day in Harlem, A (1994)",Documentary
1122,181.0,2,Tough and Deadly (1995),"Action, Drama, Thriller"
2064,181.0,3,Aiqing wansui (1994),Drama
3005,181.0,4,Delta of Venus (1994),Drama
3948,181.0,5,Someone Else's America (1995),Drama
4890,181.0,6,"Saint of Fort Washington, The (1993)",Drama
5831,181.0,7,Celestial Clockwork (1994),Comedy
6774,181.0,8,Some Mother's Son (1996),Drama
8668,181.0,9,Maya Lin: A Strong Clear Vision (1994),Documentary
7714,181.0,10,Prefontaine (1997),Drama


# project task 3: implement some other evaluation measure

In [0]:
# it may be your idea, modification of what we have already implemented 
# (for example Hit2 rate which would count as a success users whoreceived at least 2 relevant recommendations) 
# or something well-known
# expected output: modification of evaluation_measures.py such that evaluate_all will also display your measure

In [0]:
dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

ev.evaluate_all(test, dir_path, super_reactions)

943it [00:00, 6725.92it/s]
943it [00:00, 6395.43it/s]


Unnamed: 0,Model,RMSE,MAE,precision,recall,F_1,F_05,precision_super,recall_super,NDCG,mAP,MRR,LAUC,HR,HR2,Reco in test,Test coverage,Shannon,Gini
0,Ready_Baseline,0.949459,0.752487,0.09141,0.037652,0.04603,0.061286,0.079614,0.056463,0.095957,0.043178,0.198193,0.515501,0.437964,0.239661,1.0,0.033911,2.836513,0.991139
0,Self_BaselineUI,0.967585,0.76274,0.000954,0.00017,0.000278,0.000463,0.000644,0.000189,0.000752,0.000168,0.001677,0.496424,0.009544,0.0,0.60053,0.005051,1.803126,0.99638
