P2. Evaluation

Prepare test set

import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
from tqdm import tqdm

# In evaluation we do not load the train set - it is not needed
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
test.columns=['user', 'item', 'rating', 'timestamp']

test['user_code'] = test['user'].astype("category").cat.codes
test['item_code'] = test['item'].astype("category").cat.codes

user_code_id = dict(enumerate(test['user'].astype("category").cat.categories))
user_id_code = dict((v, k) for k, v in user_code_id.items())
item_code_id = dict(enumerate(test['item'].astype("category").cat.categories))
item_id_code = dict((v, k) for k, v in item_code_id.items())

test_ui = sparse.csr_matrix((test['rating'], (test['user_code'], test['item_code'])))
import os
# makedirs creates missing parents and, with exist_ok=True, does nothing if the directory is already there
# (plain os.mkdir would skip the subdirectories whenever the parent already existed)
os.makedirs('./Recommendations generated/ml-100k/', exist_ok=True)
os.makedirs('./Recommendations generated/toy-example/', exist_ok=True)
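The CSR layout built above is what the metric functions below rely on: for a user code u, test_ui.indptr[u]:test_ui.indptr[u+1] delimits that user's slice of test_ui.indices (item codes) and test_ui.data (ratings). A minimal sketch of reading one user's test ratings, assuming the cells above have been run:

u = 0  # an arbitrary inner user code, for illustration only
start, stop = test_ui.indptr[u], test_ui.indptr[u + 1]
u_items = test_ui.indices[start:stop]   # item codes this user rated in the test set
u_ratings = test_ui.data[start:stop]    # the corresponding ratings
print(user_code_id[u], list(zip(u_items, u_ratings))[:5])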

Estimations metrics

estimations_df=pd.read_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', header=None)
estimations_df.columns=['user', 'item' ,'score']

estimations_df['user_code']=[user_id_code[user] for user in estimations_df['user']]
estimations_df['item_code']=[item_id_code[item] for item in estimations_df['item']]
estimations=sparse.csr_matrix((estimations_df['score'], (estimations_df['user_code'], estimations_df['item_code'])), shape=test_ui.shape)
def estimations_metrics(test_ui, estimations):
    result=[]

    RMSE=(np.sum((estimations.data-test_ui.data)**2)/estimations.nnz)**(1/2)
    result.append(['RMSE', RMSE])

    MAE=np.sum(abs(estimations.data-test_ui.data))/estimations.nnz
    result.append(['MAE', MAE])
    
    df_result=(pd.DataFrame(list(zip(*result))[1])).T
    df_result.columns=list(zip(*result))[0]
    return df_result
# in case of an error (in the laboratories) you might have to switch to another version of pandas:
# try !pip3 install pandas==1.0.3 (or pip, depending on your environment) and restart the kernel
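For reference, the two quantities computed above, written out (a hedged formulation; it holds under the assumption that estimations contains exactly one prediction per test interaction, stored in the same CSR order as test_ui):

RMSE = \sqrt{\frac{1}{|T|} \sum_{(u,i) \in T} (\hat{r}_{ui} - r_{ui})^2}, \qquad MAE = \frac{1}{|T|} \sum_{(u,i) \in T} |\hat{r}_{ui} - r_{ui}|

where T is the set of test interactions, r_{ui} the true rating and \hat{r}_{ui} the predicted score; |T| corresponds to estimations.nnz in the code.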

estimations_metrics(test_ui, estimations)
RMSE MAE
0 0.949459 0.752487

Ranking metrics

import numpy as np
reco = np.loadtxt('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', delimiter=',')
# Let's ignore scores - they are not used in evaluation: 
users=reco[:,:1]
items=reco[:,1::2]
# Let's use inner ids instead of the original ones
users=np.vectorize(lambda x: user_id_code.setdefault(x, -1))(users)
items=np.vectorize(lambda x: item_id_code.setdefault(x, -1))(items) # -1 marks recommended items that do not occur in the test set (setdefault also records them in the dict)
# Let's put them into one array
reco=np.concatenate((users, items), axis=1)
reco
array([[663, 475,  62, ..., 472, 269, 503],
       [ 48, 313, 475, ..., 591, 175, 466],
       [351, 313, 475, ..., 591, 175, 466],
       ...,
       [259, 313, 475, ...,  11, 591, 175],
       [ 33, 313, 475, ...,  11, 591, 175],
       [ 77, 313, 475, ...,  11, 591, 175]])
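Each row of the raw file interleaves items and scores after the user id, i.e. [user, item_1, score_1, item_2, score_2, ...], which is why users comes from column 0 and items from every second column starting at 1. A tiny illustration on a hypothetical one-row example:

row = np.array([663., 475., 4.9, 62., 4.7])  # hypothetical: user 663 with two scored recommendations
row[1::2]  # items:  array([475., 62.])
row[2::2]  # scores: array([4.9, 4.7]) - dropped above, since ranking metrics only need the order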
def ranking_metrics(test_ui, reco, super_reactions=[], topK=10):
    
    nb_items=test_ui.shape[1]
    relevant_users, super_relevant_users, prec, rec, F_1, F_05, prec_super, rec_super, ndcg, mAP, MRR, LAUC, HR=\
    0,0,0,0,0,0,0,0,0,0,0,0,0
    
    cg = (1.0 / np.log2(np.arange(2, topK + 2)))
    cg_sum = np.cumsum(cg)
    
    for (nb_user, user) in tqdm(enumerate(reco[:,0])):
        u_rated_items=test_ui.indices[test_ui.indptr[user]:test_ui.indptr[user+1]]
        nb_u_rated_items=len(u_rated_items)
        if nb_u_rated_items>0: # skip users with no items in test set (still possible that there will be no super items)
            relevant_users+=1
            
            u_super_items=u_rated_items[np.vectorize(lambda x: x in super_reactions)\
            (test_ui.data[test_ui.indptr[user]:test_ui.indptr[user+1]])]
            # a more natural way would be: u_super_items=[item for item in u_rated_items if test_ui[user,item] in super_reactions]
            # but accessing test_ui[user,item] is expensive - we should avoid it
            if len(u_super_items)>0:
                super_relevant_users+=1
            
            user_successes=np.zeros(topK)
            nb_user_successes=0
            user_super_successes=np.zeros(topK)
            nb_user_super_successes=0
            
            # evaluation
            for (item_position,item) in enumerate(reco[nb_user,1:topK+1]):
                if item in u_rated_items:
                    user_successes[item_position]=1
                    nb_user_successes+=1
                    if item in u_super_items:
                        user_super_successes[item_position]=1
                        nb_user_super_successes+=1
                        
            prec_u=nb_user_successes/topK 
            prec+=prec_u
            
            rec_u=nb_user_successes/nb_u_rated_items
            rec+=rec_u
            
            F_1+=2*(prec_u*rec_u)/(prec_u+rec_u) if prec_u+rec_u>0 else 0
            F_05+=(0.5**2+1)*(prec_u*rec_u)/(0.5**2*prec_u+rec_u) if prec_u+rec_u>0 else 0
            
            prec_super+=nb_user_super_successes/topK
            rec_super+=nb_user_super_successes/max(len(u_super_items),1) # max(...,1) avoids division by zero and yields 0 when there are no super items
            ndcg+=np.dot(user_successes,cg)/cg_sum[min(topK, nb_u_rated_items)-1]
            
            cumsum_successes=np.cumsum(user_successes)
            mAP+=np.dot(cumsum_successes/np.arange(1,topK+1), user_successes)/min(topK, nb_u_rated_items)
            MRR+=1/(user_successes.nonzero()[0][0]+1) if user_successes.nonzero()[0].size>0 else 0
            LAUC+=(np.dot(cumsum_successes, 1-user_successes)+\
            (nb_user_successes+nb_u_rated_items)/2*((nb_items-nb_u_rated_items)-(topK-nb_user_successes)))/\
            ((nb_items-nb_u_rated_items)*nb_u_rated_items)
            
            HR+=nb_user_successes>0
            
            
    result=[]
    result.append(('precision', prec/relevant_users))
    result.append(('recall', rec/relevant_users))
    result.append(('F_1', F_1/relevant_users))
    result.append(('F_05', F_05/relevant_users))
    result.append(('precision_super', prec_super/super_relevant_users))
    result.append(('recall_super', rec_super/super_relevant_users))
    result.append(('NDCG', ndcg/relevant_users))
    result.append(('mAP', mAP/relevant_users))
    result.append(('MRR', MRR/relevant_users))
    result.append(('LAUC', LAUC/relevant_users))
    result.append(('HR', HR/relevant_users))

    df_result=(pd.DataFrame(list(zip(*result))[1])).T
    df_result.columns=list(zip(*result))[0]
    return df_result
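Two of the denser accumulators above, written out (a hedged reading of the code, with K = topK, T_u the user's test items and rel_k \in \{0, 1\} marking a hit at position k):

NDCG_u = \frac{\sum_{k=1}^{K} rel_k / \log_2(k+1)}{\sum_{k=1}^{\min(K, |T_u|)} 1 / \log_2(k+1)}, \qquad mAP_u = \frac{1}{\min(K, |T_u|)} \sum_{k=1}^{K} rel_k \cdot \frac{\sum_{j \le k} rel_j}{k}

In the code the NDCG numerator is np.dot(user_successes, cg) and the denominator is cg_sum[min(topK, nb_u_rated_items)-1], i.e. the DCG of an ideal ranking.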
ranking_metrics(test_ui, reco, super_reactions=[4,5], topK=10)
943it [00:00, 6981.89it/s]
precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR
0 0.09141 0.037652 0.04603 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 0.515501 0.437964

Diversity metrics

def diversity_metrics(test_ui, reco, topK=10):
    
    frequencies=defaultdict(int)
    
    # let's assign 0 to all items in test set
    for item in list(set(test_ui.indices)):
        frequencies[item]=0
        
    # counting frequencies
    for item in reco[:,1:].flat:
        frequencies[item]+=1
        
    nb_reco_outside_test=frequencies[-1]
    del frequencies[-1]
    
    frequencies=np.array(list(frequencies.values()))
                         
    nb_rec_items=len(frequencies[frequencies>0])
    nb_reco_inside_test=np.sum(frequencies)
                         
    frequencies=frequencies/np.sum(frequencies)
    frequencies=np.sort(frequencies)
    
    with np.errstate(divide='ignore'): # let's put zeros for items with 0 frequency and ignore the division warning
        log_frequencies=np.nan_to_num(np.log(frequencies), posinf=0, neginf=0)
                         
    result=[]
    result.append(('Reco in test', nb_reco_inside_test/(nb_reco_inside_test+nb_reco_outside_test)))
    result.append(('Test coverage', nb_rec_items/test_ui.shape[1]))
    result.append(('Shannon', -np.dot(frequencies, log_frequencies)))
    result.append(('Gini', np.dot(frequencies, np.arange(1-len(frequencies), len(frequencies), 2))/(len(frequencies)-1)))
    
    df_result=(pd.DataFrame(list(zip(*result))[1])).T
    df_result.columns=list(zip(*result))[0]
    return df_result
# in case of errors try !pip3 install numpy==1.18.4 (or pip, depending on your environment) and restart the kernel
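In formulas (again a hedged reading of the code, with p_1 \le \dots \le p_n the sorted recommendation frequencies over the n items occurring in the test set):

Shannon = -\sum_{i=1}^{n} p_i \ln p_i, \qquad Gini = \frac{\sum_{i=1}^{n} (2i - n - 1) \, p_i}{n - 1}

A Gini close to 1 (as in the output below) means the recommendations concentrate on very few items.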

import evaluation_measures as ev
import importlib
importlib.reload(ev)

x=diversity_metrics(test_ui, reco, topK=10)
x
Reco in test Test coverage Shannon Gini
0 1.0 0.033911 2.836513 0.991139

To be used in other notebooks

import evaluation_measures as ev
import importlib
importlib.reload(ev)

estimations_df=pd.read_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', header=None)
reco=np.loadtxt('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', delimiter=',')

ev.evaluate(test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None),
            estimations_df=estimations_df, 
            reco=reco,
            super_reactions=[4,5])
# you can also just call ev.evaluate_all(estimations_df, reco) - the remaining arguments default to the values above
943it [00:00, 6721.23it/s]
RMSE MAE precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR HR2 Reco in test Test coverage Shannon Gini
0 0.949459 0.752487 0.09141 0.037652 0.04603 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 0.515501 0.437964 0.239661 1.0 0.033911 2.836513 0.991139
import evaluation_measures as ev
import importlib
importlib.reload(ev)

dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

df=ev.evaluate_all(test, dir_path, super_reactions)
# you can also just call ev.evaluate_all() - the arguments default to the values above
943it [00:00, 7140.87it/s]
df.iloc[:,:9]
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super
0 Ready_Baseline 0.949459 0.752487 0.09141 0.037652 0.04603 0.061286 0.079614 0.056463
df.iloc[:,np.append(0,np.arange(9, df.shape[1]))]
Model NDCG mAP MRR LAUC HR HR2 Reco in test Test coverage Shannon Gini
0 Ready_Baseline 0.095957 0.043178 0.198193 0.515501 0.437964 0.239661 1.0 0.033911 2.836513 0.991139
pip install surprise
Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (0.15.1)
Requirement already satisfied: numpy>=1.11.2 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.18.5)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.4.1)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from scikit-surprise->surprise) (1.12.0)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... done
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1675370 sha256=b9c28146ba19d464e7357de16052301c4261e1e492c668f6ec5ec59683796e4f
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1

Check metrics on toy dataset

import evaluation_measures as ev
import importlib
import helpers
importlib.reload(ev)

dir_path="Recommendations generated/toy-example/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None)

display(ev.evaluate_all(test, dir_path, super_reactions, topK=3))
# you can also just call ev.evaluate_all() - the arguments default to the values above

toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
reco=pd.read_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', header=None)
estimations=pd.read_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', names=['user', 'item', 'est_score'])
toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \
toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)

print('Training data:')
display(toy_train_ui.todense())

print('Test data:')
display(toy_test_ui.todense())

print('Recommendations:')
display(reco)

print('Estimations:')
display(estimations)
943it [00:00, 12096.61it/s]
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR HR2 Reco in test Test coverage Shannon Gini
0 Self_BaselineUI 0.967585 0.76274 0.000353 0.000044 0.000079 0.000147 0.0 0.0 0.000314 0.000177 0.00053 0.498923 0.00106 0.0 0.60053 0.005051 1.803126 0.99638
Training data:
matrix([[5, 3, 4, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [5, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
Test data:
matrix([[0, 0, 0, ..., 0, 0, 0],
        [4, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 5, 0, ..., 0, 0, 0]], dtype=int64)
Recommendations:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
0 1 814 5.449584 1491 5.232561 1536 5.188667 1306 5.169081 1599 4.989625 1467 4.931335 1080 4.859644 1642 4.857250 1189 4.782040 1500 4.742546
1 2 814 5.580813 1491 5.363789 1536 5.319895 1306 5.300310 1599 5.120854 1467 5.062563 1080 4.990872 1642 4.988479 119 4.921909 1189 4.913268
2 3 814 4.615469 1491 4.398445 1536 4.354551 1306 4.334965 1599 4.155510 1467 4.097219 1080 4.025528 1642 4.023134 119 3.956565 1189 3.947924
3 4 814 6.244539 1491 6.027515 1536 5.983621 1306 5.964035 1599 5.784579 1467 5.726289 1080 5.654598 1642 5.652204 119 5.585635 1189 5.576994
4 5 814 4.738510 1491 4.521486 1536 4.477592 1306 4.458007 1599 4.278551 1467 4.220260 1080 4.148569 1642 4.146176 119 4.079607 1189 4.070965
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
938 939 814 6.236644 1491 6.019620 1536 5.975726 1306 5.956140 1599 5.776685 1467 5.718394 1080 5.646703 1642 5.644310 119 5.577740 1189 5.569099
939 940 814 5.462440 1491 5.245416 1536 5.201522 1306 5.181936 1599 5.002481 1467 4.944190 1080 4.872499 1642 4.870106 119 4.803536 1189 4.794895
940 941 814 5.894539 1491 5.677515 1536 5.633621 1306 5.614035 1599 5.434579 1467 5.376289 1080 5.304598 1642 5.302204 119 5.235635 1189 5.226994
941 942 814 6.164380 1491 5.947356 1536 5.903462 1306 5.883876 1599 5.704421 1467 5.646130 1080 5.574439 1642 5.572046 119 5.505476 1189 5.496835
942 943 814 5.284783 1491 5.067759 1536 5.023865 1306 5.004279 1599 4.824823 1467 4.766533 1080 4.694842 1642 4.692448 119 4.625879 1189 4.617238

943 rows × 21 columns

Estimations:
user item est_score
0 1 5 3.457161
1 1 10 3.798540
2 1 25 3.435415
3 1 32 3.732018
4 1 33 3.531991
... ... ... ...
19995 943 928 2.907189
19996 943 1067 3.485929
19997 943 1074 2.861988
19998 943 1188 2.727428
19999 943 1228 2.568442

20000 rows × 3 columns
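helpers.data_to_csr itself is not listed in this notebook. A minimal sketch of what a function with this call signature plausibly does, assuming it builds user/item code maps over the union of train and test ids so that both matrices share one shape (names and details are my guess, not the actual helpers source):

def data_to_csr_sketch(train_read, test_read):  # hypothetical stand-in for helpers.data_to_csr
    all_users = pd.concat([train_read['user'], test_read['user']]).unique()
    all_items = pd.concat([train_read['item'], test_read['item']]).unique()
    user_id_code = {uid: code for code, uid in enumerate(all_users)}
    item_id_code = {iid: code for code, iid in enumerate(all_items)}
    user_code_id = {code: uid for uid, code in user_id_code.items()}
    item_code_id = {code: iid for iid, code in item_id_code.items()}
    shape = (len(all_users), len(all_items))
    def to_csr(df):  # one shared shape, so the train and test matrices are comparable
        return sparse.csr_matrix((df['rating'],
                                  (df['user'].map(user_id_code), df['item'].map(item_id_code))),
                                 shape=shape)
    return (to_csr(train_read), to_csr(test_read),
            user_code_id, user_id_code, item_code_id, item_id_code)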

Sample recommendations

train=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
items=pd.read_csv('./Datasets/ml-100k/movies.csv')

user=random.choice(list(set(train['user'])))

train_content=pd.merge(train, items, left_on='item', right_on='id')

print('Here is what user rated high:')
display(train_content[train_content['user']==user][['user', 'rating', 'title', 'genres']]\
        .sort_values(by='rating', ascending=False)[:15])

reco = np.loadtxt('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', delimiter=',')
items=pd.read_csv('./Datasets/ml-100k/movies.csv')

# Let's ignore scores - they are not used in evaluation: 
reco_users=reco[:,:1]
reco_items=reco[:,1::2]
# Let's put them into one array
reco=np.concatenate((reco_users, reco_items), axis=1)

# Let's rebuild it as a user-item dataframe
recommended=[]
for row in reco:
    for rec_nb, entry in enumerate(row[1:]):
        recommended.append((row[0], rec_nb+1, entry))
recommended=pd.DataFrame(recommended, columns=['user','rec_nb', 'item'])

recommended_content=pd.merge(recommended, items, left_on='item', right_on='id')

print('Here is what we recommend:')
recommended_content[recommended_content['user']==user][['user', 'rec_nb', 'title', 'genres']].sort_values(by='rec_nb')
Here is what user rated high:
user rating title genres
5529 181 5 Jerry Maguire (1996) Drama, Romance
38687 181 4 First Wives Club, The (1996) Comedy
19390 181 4 Scream (1996) Horror, Thriller
63720 181 4 Mirror Has Two Faces, The (1996) Comedy, Romance
57287 181 4 Evita (1996) Drama, Musical
57106 181 4 Up Close and Personal (1996) Drama, Romance
24776 181 4 Gattaca (1997) Drama, Sci-Fi, Thriller
11778 181 4 Mission: Impossible (1996) Action, Adventure, Mystery
68469 181 4 Eye for an Eye (1996) Drama, Thriller
25908 181 4 Dead Man Walking (1995) Drama
9811 181 4 Time to Kill, A (1996) Drama
21011 181 4 I Know What You Did Last Summer (1997) Horror, Mystery, Thriller
33572 181 4 Independence Day (ID4) (1996) Action, Sci-Fi, War
44631 181 4 Ransom (1996) Drama, Thriller
46726 181 4 Hunchback of Notre Dame, The (1996) Animation, Children's, Musical
Here is what we recommend:
user rec_nb title genres
179 181.0 1 Great Day in Harlem, A (1994) Documentary
1122 181.0 2 Tough and Deadly (1995) Action, Drama, Thriller
2064 181.0 3 Aiqing wansui (1994) Drama
3005 181.0 4 Delta of Venus (1994) Drama
3948 181.0 5 Someone Else's America (1995) Drama
4890 181.0 6 Saint of Fort Washington, The (1993) Drama
5831 181.0 7 Celestial Clockwork (1994) Comedy
6774 181.0 8 Some Mother's Son (1996) Drama
8668 181.0 9 Maya Lin: A Strong Clear Vision (1994) Documentary
7714 181.0 10 Prefontaine (1997) Drama
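As an aside, the rebuild loop above can be replaced by a vectorized construction (an equivalent, hedged alternative, assuming the [user, item_1, ..., item_K] layout of reco after the concatenate):

n_users, n_recs = reco.shape[0], reco.shape[1] - 1
recommended = pd.DataFrame({
    'user': np.repeat(reco[:, 0], n_recs),                 # each user id repeated K times
    'rec_nb': np.tile(np.arange(1, n_recs + 1), n_users),  # positions 1..K for every user
    'item': reco[:, 1:].ravel(),                           # row-major, so it lines up with the positions
})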

Project task 3: implement some other evaluation measure

# it may be your own idea, a modification of what we have already implemented
# (for example a Hit2 rate, which would count as a success users who received at least 2 relevant recommendations)
# or something well-known
# expected output: a modification of evaluation_measures.py such that evaluate_all also displays your measure
dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

ev.evaluate_all(test, dir_path, super_reactions)
943it [00:00, 6725.92it/s]
943it [00:00, 6395.43it/s]
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super NDCG mAP MRR LAUC HR HR2 Reco in test Test coverage Shannon Gini
0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 0.061286 0.079614 0.056463 0.095957 0.043178 0.198193 0.515501 0.437964 0.239661 1.00000 0.033911 2.836513 0.991139
0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 0.000463 0.000644 0.000189 0.000752 0.000168 0.001677 0.496424 0.009544 0.000000 0.60053 0.005051 1.803126 0.996380
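A minimal sketch of the kind of measure the task asks for, in the style of ranking_metrics above. It computes the Hit2 rate mentioned in the comments: a user counts as a hit only when at least two of their top-K recommendations are relevant. Note that evaluate_all already reports an HR2 column, so this is an illustrative stand-alone helper, not the actual evaluation_measures.py source:

def hit2_rate(test_ui, reco, topK=10):  # hypothetical helper, names are mine
    hits2, relevant_users = 0, 0
    for nb_user, user in enumerate(reco[:, 0].astype(int)):
        u_rated_items = test_ui.indices[test_ui.indptr[user]:test_ui.indptr[user + 1]]
        if len(u_rated_items) > 0:  # same convention as ranking_metrics: skip users without test items
            relevant_users += 1
            nb_successes = sum(item in u_rated_items for item in reco[nb_user, 1:topK + 1])
            hits2 += nb_successes > 1  # success only with at least 2 relevant recommendations
    return hits2 / relevant_users

# usage: hit2_rate(test_ui, reco) with the inner-id reco array built in the Ranking metrics section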