first 2 lessons

This commit is contained in:
Robert 2021-03-20 20:01:22 +01:00
parent 26165e398f
commit 6c9b3de9d6
7 changed files with 5000 additions and 0 deletions

682
P0. Data preparation.ipynb Normal file

File diff suppressed because one or more lines are too long

1269
P1. Baseline.ipynb Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

2745
P2. Evaluation.ipynb Normal file

File diff suppressed because it is too large Load Diff

BIN
P2. Evaluation.pdf Normal file

Binary file not shown.

214
evaluation_measures.py Normal file
View File

@ -0,0 +1,214 @@
import os
import sys
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import normalize
from tqdm import tqdm
from datetime import datetime, date
import random
import scipy.sparse as sparse
from os import listdir
from os.path import isfile, join
from collections import defaultdict
def evaluate(test,
estimations_df,
reco,
super_reactions=[4,5],
topK=10):
estimations_df=estimations_df.copy()
reco=reco.copy()
test_df=test.copy()
# prepare testset
test_df.columns=['user', 'item', 'rating', 'timestamp']
test_df['user_code'] = test_df['user'].astype("category").cat.codes
test_df['item_code'] = test_df['item'].astype("category").cat.codes
user_code_id = dict(enumerate(test_df['user'].astype("category").cat.categories))
user_id_code = dict((v, k) for k, v in user_code_id.items())
item_code_id = dict(enumerate(test_df['item'].astype("category").cat.categories))
item_id_code = dict((v, k) for k, v in item_code_id.items())
test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])))
#prepare estimations
estimations_df.columns=['user', 'item' ,'score']
estimations_df['user_code']=[user_id_code[user] for user in estimations_df['user']]
estimations_df['item_code']=[item_id_code[item] for item in estimations_df['item']]
estimations=sparse.csr_matrix((estimations_df['score'], (estimations_df['user_code'], estimations_df['item_code'])), shape=test_ui.shape)
#compute_estimations
estimations_df=estimations_metrics(test_ui, estimations)
#prepare reco
users=reco[:,:1]
items=reco[:,1::2]
# Let's use inner ids instead of real ones
users=np.vectorize(lambda x: user_id_code.setdefault(x, -1))(users) # maybe users we recommend are not in test set
items=np.vectorize(lambda x: item_id_code.setdefault(x, -1))(items) # maybe items we recommend are not in test set
# Let's put them into one array
reco=np.concatenate((users, items), axis=1)
#compute ranking metrics
ranking_df=ranking_metrics(test_ui, reco, super_reactions=super_reactions, topK=topK)
#compute diversity metrics
diversity_df=diversity_metrics(test_ui, reco, topK)
result=pd.concat([estimations_df, ranking_df, diversity_df], axis=1)
return(result)
def ranking_metrics(test_ui, reco, super_reactions=[], topK=10):
nb_items=test_ui.shape[1]
relevant_users, super_relevant_users, prec, rec, F_1, F_05, prec_super, rec_super, ndcg, mAP, MRR, LAUC, HR=\
0,0,0,0,0,0,0,0,0,0,0,0,0
cg = (1.0 / np.log2(np.arange(2, topK + 2)))
cg_sum = np.cumsum(cg)
for (nb_user, user) in tqdm(enumerate(reco[:,0])):
u_rated_items=test_ui.indices[test_ui.indptr[user]:test_ui.indptr[user+1]]
nb_u_rated_items=len(u_rated_items)
if nb_u_rated_items>0: # skip users with no items in test set (still possible that there will be no super items)
relevant_users+=1
u_super_items=u_rated_items[np.vectorize(lambda x: x in super_reactions)\
(test_ui.data[test_ui.indptr[user]:test_ui.indptr[user+1]])]
# more natural seems u_super_items=[item for item in u_rated_items if test_ui[user,item] in super_reactions]
# but accesing test_ui[user,item] is expensive -we should avoid doing it
if len(u_super_items)>0:
super_relevant_users+=1
user_successes=np.zeros(topK)
nb_user_successes=0
user_super_successes=np.zeros(topK)
nb_user_super_successes=0
# evaluation
for (item_position,item) in enumerate(reco[nb_user,1:topK+1]):
if item in u_rated_items:
user_successes[item_position]=1
nb_user_successes+=1
if item in u_super_items:
user_super_successes[item_position]=1
nb_user_super_successes+=1
prec_u=nb_user_successes/topK
prec+=prec_u
rec_u=nb_user_successes/nb_u_rated_items
rec+=rec_u
F_1+=2*(prec_u*rec_u)/(prec_u+rec_u) if prec_u+rec_u>0 else 0
F_05+=(0.5**2+1)*(prec_u*rec_u)/(0.5**2*prec_u+rec_u) if prec_u+rec_u>0 else 0
prec_super+=nb_user_super_successes/topK
rec_super+=nb_user_super_successes/max(len(u_super_items),1)
ndcg+=np.dot(user_successes,cg)/cg_sum[min(topK, nb_u_rated_items)-1]
cumsum_successes=np.cumsum(user_successes)
mAP+=np.dot(cumsum_successes/np.arange(1,topK+1), user_successes)/min(topK, nb_u_rated_items)
MRR+=1/(user_successes.nonzero()[0][0]+1) if user_successes.nonzero()[0].size>0 else 0
LAUC+=(np.dot(cumsum_successes, 1-user_successes)+\
(nb_user_successes+nb_u_rated_items)/2*((nb_items-nb_u_rated_items)-(topK-nb_user_successes)))/\
((nb_items-nb_u_rated_items)*nb_u_rated_items)
HR+=nb_user_successes>0
result=[]
result.append(('precision', prec/relevant_users))
result.append(('recall', rec/relevant_users))
result.append(('F_1', F_1/relevant_users))
result.append(('F_05', F_05/relevant_users))
result.append(('precision_super', prec_super/super_relevant_users))
result.append(('recall_super', rec_super/super_relevant_users))
result.append(('NDCG', ndcg/relevant_users))
result.append(('mAP', mAP/relevant_users))
result.append(('MRR', MRR/relevant_users))
result.append(('LAUC', LAUC/relevant_users))
result.append(('HR', HR/relevant_users))
df_result=pd.DataFrame()
if len(result)>0:
df_result=(pd.DataFrame(list(zip(*result))[1])).T
df_result.columns=list(zip(*result))[0]
return df_result
def estimations_metrics(test_ui, estimations):
result=[]
RMSE=(np.sum((estimations.data-test_ui.data)**2)/estimations.nnz)**(1/2)
result.append(['RMSE', RMSE])
MAE=np.sum(abs(estimations.data-test_ui.data))/estimations.nnz
result.append(['MAE', MAE])
df_result=pd.DataFrame()
if len(result)>0:
df_result=(pd.DataFrame(list(zip(*result))[1])).T
df_result.columns=list(zip(*result))[0]
return df_result
def diversity_metrics(test_ui, reco, topK=10):
frequencies=defaultdict(int)
for item in list(set(test_ui.indices)):
frequencies[item]=0
for item in reco[:,1:].flat:
frequencies[item]+=1
nb_reco_outside_test=frequencies[-1]
del frequencies[-1]
frequencies=np.array(list(frequencies.values()))
nb_rec_items=len(frequencies[frequencies>0])
nb_reco_inside_test=np.sum(frequencies)
frequencies=frequencies/np.sum(frequencies)
frequencies=np.sort(frequencies)
with np.errstate(divide='ignore'): # let's put zeros we items with 0 frequency and ignore division warning
log_frequencies=np.nan_to_num(np.log(frequencies), posinf=0, neginf=0)
result=[]
result.append(('Reco in test', nb_reco_inside_test/(nb_reco_inside_test+nb_reco_outside_test)))
result.append(('Test coverage', nb_rec_items/test_ui.shape[1]))
result.append(('Shannon', -np.dot(frequencies, log_frequencies)))
result.append(('Gini', np.dot(frequencies, np.arange(1-len(frequencies), len(frequencies), 2))/(len(frequencies)-1)))
df_result=(pd.DataFrame(list(zip(*result))[1])).T
df_result.columns=list(zip(*result))[0]
return df_result
def evaluate_all(test,
dir_path="Recommendations generated/ml-100k/",
super_reactions=[4,5],
topK=10):
models = list(set(['_'.join(f.split('_')[:2]) for f in listdir(dir_path)
if isfile(dir_path+f)]))
result=[]
for model in models:
estimations_df=pd.read_csv('{}{}_estimations.csv'.format(dir_path, model), header=None)
reco=np.loadtxt('{}{}_reco.csv'.format(dir_path, model), delimiter=',')
to_append=evaluate(test, estimations_df, reco, super_reactions, topK)
to_append.insert(0, "Model", model)
result.append(to_append)
result=pd.concat(result)
result=result.sort_values(by='recall', ascending=False)
return result

90
helpers.py Normal file
View File

@ -0,0 +1,90 @@
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import surprise as sp
import time
from collections import defaultdict
from itertools import chain
from six.moves.urllib.request import urlretrieve
import zipfile
import os
def data_to_csr(train_read, test_read):
train_read.columns=['user', 'item', 'rating', 'timestamp']
test_read.columns=['user', 'item', 'rating', 'timestamp']
# Let's build whole dataset
train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)
train_and_test['user_code'] = train_and_test['user'].astype("category").cat.codes
train_and_test['item_code'] = train_and_test['item'].astype("category").cat.codes
user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
user_id_code = dict((v, k) for k, v in user_code_id.items())
item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
item_id_code = dict((v, k) for k, v in item_code_id.items())
train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))
test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))
# Take number of users and items
(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)
# Create sparse csr matrices
train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))
return train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code
def get_top_n(predictions, n=10):
# Here we create a dictionary which items are lists of pairs (item, score)
top_n = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
top_n[uid].append((iid, est))
result=[]
# Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)
for uid, user_ratings in top_n.items():
user_ratings.sort(key=lambda x: x[1], reverse=True)
result.append([uid]+list(chain(*user_ratings[:n])))
return result
def ready_made(algo, reco_path, estimations_path):
reader = sp.Reader(line_format='user item rating timestamp', sep='\t')
trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
trainset = trainset.build_full_trainset() # <class 'surprise.trainset.Trainset'> -> it is needed for using Surprise package
testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
testset = sp.Trainset.build_testset(testset.build_full_trainset())
algo.fit(trainset)
antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set
print('Generating predictions...')
predictions = algo.test(antitrainset)
print('Generating top N recommendations...')
top_n = get_top_n(predictions, n=10)
top_n=pd.DataFrame(top_n)
top_n.to_csv(reco_path, index=False, header=False)
print('Generating predictions...')
predictions = algo.test(testset)
predictions_df=[]
for uid, iid, true_r, est, _ in predictions:
predictions_df.append([uid, iid, est])
predictions_df=pd.DataFrame(predictions_df)
predictions_df.to_csv(estimations_path, index=False, header=False)
def download_movielens_100k_dataset(force=False):
os.makedirs('./Datasets/', exist_ok = True)
if not os.path.isdir('Datasets/ml-100k') or force:
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
tmp_file_path = 'Datasets/ml-100k.zip'
urlretrieve(url, tmp_file_path)
with zipfile.ZipFile(tmp_file_path, 'r') as tmp_zip:
tmp_zip.extractall('Datasets/')
os.remove(tmp_file_path)