Tasks 1, 2, 3, 5
commit f437261019
3
.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
Datasets/
*.csv
__pycache__/
707
.ipynb_checkpoints/P0. Data preparation-checkpoint.ipynb
Normal file
File diff suppressed because one or more lines are too long
1491
.ipynb_checkpoints/P1. Baseline-checkpoint.ipynb
Normal file
File diff suppressed because it is too large
1737
.ipynb_checkpoints/P2. Evaluation-checkpoint.ipynb
Normal file
File diff suppressed because it is too large
1369
.ipynb_checkpoints/P3. k-nearest neighbours-checkpoint.ipynb
Normal file
File diff suppressed because it is too large
1682
.ipynb_checkpoints/P5. Graph-based-checkpoint.ipynb
Normal file
File diff suppressed because one or more lines are too long
707
P0. Data preparation.ipynb
Normal file
File diff suppressed because one or more lines are too long
1491
P1. Baseline.ipynb
Normal file
File diff suppressed because it is too large
1737
P2. Evaluation.ipynb
Normal file
File diff suppressed because it is too large
1488
P3. k-nearest neighbours.ipynb
Normal file
File diff suppressed because it is too large
1689
P5. Graph-based.ipynb
Normal file
File diff suppressed because one or more lines are too long
217
evaluation_measures.py
Normal file
@@ -0,0 +1,217 @@
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from tqdm import tqdm
from os import listdir
from os.path import isfile
from collections import defaultdict


def evaluate(test,
             estimations_df,
             reco,
             super_reactions=[4, 5],
             topK=10):

    estimations_df = estimations_df.copy()
    reco = reco.copy()
    test_df = test.copy()

    # prepare the test set
    test_df.columns = ['user', 'item', 'rating', 'timestamp']
    test_df['user_code'] = test_df['user'].astype("category").cat.codes
    test_df['item_code'] = test_df['item'].astype("category").cat.codes

    user_code_id = dict(enumerate(test_df['user'].astype("category").cat.categories))
    user_id_code = dict((v, k) for k, v in user_code_id.items())
    item_code_id = dict(enumerate(test_df['item'].astype("category").cat.categories))
    item_id_code = dict((v, k) for k, v in item_code_id.items())

    test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])))

    # prepare estimations
    estimations_df.columns = ['user', 'item', 'score']
    estimations_df['user_code'] = [user_id_code[user] for user in estimations_df['user']]
    estimations_df['item_code'] = [item_id_code[item] for item in estimations_df['item']]
    estimations = sparse.csr_matrix((estimations_df['score'], (estimations_df['user_code'], estimations_df['item_code'])), shape=test_ui.shape)

    # compute estimation metrics
    estimations_df = estimations_metrics(test_ui, estimations)

    # prepare reco
    users = reco[:, :1]
    items = reco[:, 1::2]
    # Let's use inner ids instead of real ones
    users = np.vectorize(lambda x: user_id_code.setdefault(x, -1))(users)  # users we recommend to may not be in the test set
    items = np.vectorize(lambda x: item_id_code.setdefault(x, -1))(items)  # items we recommend may not be in the test set
    # Let's put them into one array
    reco = np.concatenate((users, items), axis=1)

    # compute ranking metrics
    ranking_df = ranking_metrics(test_ui, reco, super_reactions=super_reactions, topK=topK)

    # compute diversity metrics
    diversity_df = diversity_metrics(test_ui, reco, topK)

    result = pd.concat([estimations_df, ranking_df, diversity_df], axis=1)

    return result


def ranking_metrics(test_ui, reco, super_reactions=[], topK=10):

    nb_items = test_ui.shape[1]
    relevant_users, super_relevant_users, prec, rec, F_1, F_05, \
        prec_super, rec_super, ndcg, mAP, MRR, LAUC, HR, Hit2Rate = \
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    cg = (1.0 / np.log2(np.arange(2, topK + 2)))
    cg_sum = np.cumsum(cg)

    for (nb_user, user) in tqdm(enumerate(reco[:, 0])):
        u_rated_items = test_ui.indices[test_ui.indptr[user]:test_ui.indptr[user + 1]]
        nb_u_rated_items = len(u_rated_items)
        if nb_u_rated_items > 0:  # skip users with no items in the test set (they may still have no super items)
            relevant_users += 1

            u_super_items = u_rated_items[np.vectorize(lambda x: x in super_reactions)
                                          (test_ui.data[test_ui.indptr[user]:test_ui.indptr[user + 1]])]
            # u_super_items = [item for item in u_rated_items if test_ui[user, item] in super_reactions]
            # would read more naturally, but accessing test_ui[user, item] is expensive, so we avoid it
            if len(u_super_items) > 0:
                super_relevant_users += 1

            user_successes = np.zeros(topK)
            nb_user_successes = 0
            user_super_successes = np.zeros(topK)
            nb_user_super_successes = 0

            # evaluation
            for (item_position, item) in enumerate(reco[nb_user, 1:topK + 1]):
                if item in u_rated_items:
                    user_successes[item_position] = 1
                    nb_user_successes += 1
                    if item in u_super_items:
                        user_super_successes[item_position] = 1
                        nb_user_super_successes += 1

            prec_u = nb_user_successes / topK
            prec += prec_u

            rec_u = nb_user_successes / nb_u_rated_items
            rec += rec_u

            F_1 += 2 * (prec_u * rec_u) / (prec_u + rec_u) if prec_u + rec_u > 0 else 0
            F_05 += (0.5**2 + 1) * (prec_u * rec_u) / (0.5**2 * prec_u + rec_u) if prec_u + rec_u > 0 else 0

            prec_super += nb_user_super_successes / topK
            rec_super += nb_user_super_successes / max(len(u_super_items), 1)
            ndcg += np.dot(user_successes, cg) / cg_sum[min(topK, nb_u_rated_items) - 1]

            cumsum_successes = np.cumsum(user_successes)
            mAP += np.dot(cumsum_successes / np.arange(1, topK + 1), user_successes) / min(topK, nb_u_rated_items)
            MRR += 1 / (user_successes.nonzero()[0][0] + 1) if user_successes.nonzero()[0].size > 0 else 0
            LAUC += (np.dot(cumsum_successes, 1 - user_successes) +
                     (nb_user_successes + nb_u_rated_items) / 2 *
                     ((nb_items - nb_u_rated_items) - (topK - nb_user_successes))) / \
                    ((nb_items - nb_u_rated_items) * nb_u_rated_items)

            HR += nb_user_successes > 0
            Hit2Rate += nb_user_successes > 1

    result = []
    result.append(('precision', prec / relevant_users))
    result.append(('recall', rec / relevant_users))
    result.append(('F_1', F_1 / relevant_users))
    result.append(('F_05', F_05 / relevant_users))
    result.append(('precision_super', prec_super / super_relevant_users))
    result.append(('recall_super', rec_super / super_relevant_users))
    result.append(('NDCG', ndcg / relevant_users))
    result.append(('mAP', mAP / relevant_users))
    result.append(('MRR', MRR / relevant_users))
    result.append(('LAUC', LAUC / relevant_users))
    result.append(('HR', HR / relevant_users))
    result.append(('H2R', Hit2Rate / relevant_users))

    df_result = pd.DataFrame()
    if len(result) > 0:
        df_result = (pd.DataFrame(list(zip(*result))[1])).T
        df_result.columns = list(zip(*result))[0]
    return df_result


def estimations_metrics(test_ui, estimations):
    result = []

    RMSE = (np.sum((estimations.data - test_ui.data)**2) / estimations.nnz)**(1 / 2)
    result.append(['RMSE', RMSE])

    MAE = np.sum(abs(estimations.data - test_ui.data)) / estimations.nnz
    result.append(['MAE', MAE])

    df_result = pd.DataFrame()
    if len(result) > 0:
        df_result = (pd.DataFrame(list(zip(*result))[1])).T
        df_result.columns = list(zip(*result))[0]
    return df_result


def diversity_metrics(test_ui, reco, topK=10):

    frequencies = defaultdict(int)

    for item in list(set(test_ui.indices)):
        frequencies[item] = 0

    for item in reco[:, 1:].flat:
        frequencies[item] += 1

    nb_reco_outside_test = frequencies[-1]
    del frequencies[-1]

    frequencies = np.array(list(frequencies.values()))

    nb_rec_items = len(frequencies[frequencies > 0])
    nb_reco_inside_test = np.sum(frequencies)

    frequencies = frequencies / np.sum(frequencies)
    frequencies = np.sort(frequencies)

    with np.errstate(divide='ignore'):  # let's put zeros for items with 0 frequency and ignore the division warning
        log_frequencies = np.nan_to_num(np.log(frequencies), posinf=0, neginf=0)

    result = []
    result.append(('Reco in test', nb_reco_inside_test / (nb_reco_inside_test + nb_reco_outside_test)))
    result.append(('Test coverage', nb_rec_items / test_ui.shape[1]))
    result.append(('Shannon', -np.dot(frequencies, log_frequencies)))
    result.append(('Gini', np.dot(frequencies, np.arange(1 - len(frequencies), len(frequencies), 2)) / (len(frequencies) - 1)))

    df_result = (pd.DataFrame(list(zip(*result))[1])).T
    df_result.columns = list(zip(*result))[0]
    return df_result


def evaluate_all(test,
                 dir_path="Recommendations generated/ml-100k/",
                 super_reactions=[4, 5],
                 topK=10):

    models = list(set(['_'.join(f.split('_')[:2]) for f in listdir(dir_path)
                       if isfile(dir_path + f)]))
    result = []
    for model in models:
        estimations_df = pd.read_csv('{}{}_estimations.csv'.format(dir_path, model), header=None)
        reco = np.loadtxt('{}{}_reco.csv'.format(dir_path, model), delimiter=',')
        to_append = evaluate(test, estimations_df, reco, super_reactions, topK)

        to_append.insert(0, "Model", model)
        result.append(to_append)
    result = pd.concat(result)
    result = result.sort_values(by='recall', ascending=False)
    return result
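A minimal usage sketch for evaluate_all (hypothetical names: it assumes the tab-separated test set written by the "P0. Data preparation" notebook and recommendation files such as Self_TopPop_reco.csv / Self_TopPop_estimations.csv already present in dir_path):

import pandas as pd
import evaluation_measures as ev

test = pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
results = ev.evaluate_all(test,
                          dir_path="Recommendations generated/ml-100k/",
                          super_reactions=[4, 5],
                          topK=10)
print(results)  # one row of metrics per model, sorted by recall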
90
helpers.py
Normal file
@@ -0,0 +1,90 @@
import pandas as pd
import scipy.sparse as sparse
import surprise as sp
from collections import defaultdict
from itertools import chain
from six.moves.urllib.request import urlretrieve
import zipfile
import os


def data_to_csr(train_read, test_read):
    train_read.columns = ['user', 'item', 'rating', 'timestamp']
    test_read.columns = ['user', 'item', 'rating', 'timestamp']

    # Let's build the whole dataset
    train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)
    train_and_test['user_code'] = train_and_test['user'].astype("category").cat.codes
    train_and_test['item_code'] = train_and_test['item'].astype("category").cat.codes

    user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
    user_id_code = dict((v, k) for k, v in user_code_id.items())
    item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
    item_id_code = dict((v, k) for k, v in item_code_id.items())

    train_df = pd.merge(train_read, train_and_test, on=list(train_read.columns))
    test_df = pd.merge(test_read, train_and_test, on=list(train_read.columns))

    # Take the number of users and items
    (U, I) = (train_and_test['user_code'].max() + 1, train_and_test['item_code'].max() + 1)

    # Create sparse CSR matrices
    train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
    test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))

    return train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code


def get_top_n(predictions, n=10):

    # Here we create a dictionary whose values are lists of (item, score) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    result = []
    # Let's choose the n best items in the format: (user, item1, score1, item2, score2, ...)
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        result.append([uid] + list(chain(*user_ratings[:n])))
    return result


def ready_made(algo, reco_path, estimations_path):
    reader = sp.Reader(line_format='user item rating timestamp', sep='\t')
    trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
    trainset = trainset.build_full_trainset()  # <class 'surprise.trainset.Trainset'> -> needed by the Surprise package

    testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
    testset = sp.Trainset.build_testset(testset.build_full_trainset())

    algo.fit(trainset)

    antitrainset = trainset.build_anti_testset()  # We want to predict ratings of (user, item) pairs which are not in the train set
    print('Generating predictions...')
    predictions = algo.test(antitrainset)
    print('Generating top N recommendations...')
    top_n = get_top_n(predictions, n=10)
    top_n = pd.DataFrame(top_n)
    top_n.to_csv(reco_path, index=False, header=False)

    print('Generating estimations...')
    predictions = algo.test(testset)
    predictions_df = []
    for uid, iid, true_r, est, _ in predictions:
        predictions_df.append([uid, iid, est])
    predictions_df = pd.DataFrame(predictions_df)
    predictions_df.to_csv(estimations_path, index=False, header=False)


def download_movielens_100k_dataset(force=False):
    os.makedirs('./Datasets/', exist_ok=True)
    if not os.path.isdir('Datasets/ml-100k') or force:
        url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
        tmp_file_path = 'Datasets/ml-100k.zip'
        urlretrieve(url, tmp_file_path)

        with zipfile.ZipFile(tmp_file_path, 'r') as tmp_zip:
            tmp_zip.extractall('Datasets/')
        os.remove(tmp_file_path)
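A minimal end-to-end sketch for ready_made (hypothetical paths: it assumes Datasets/ml-100k/train.csv and test.csv exist, e.g. produced by the "P0. Data preparation" notebook, and that the output directory has already been created):

import surprise as sp
import helpers

helpers.download_movielens_100k_dataset()
algo = sp.KNNBasic()  # any Surprise rating-prediction algorithm can be plugged in here
helpers.ready_made(algo,
                   reco_path='Recommendations generated/ml-100k/Ready_KNN_reco.csv',
                   estimations_path='Recommendations generated/ml-100k/Ready_KNN_estimations.csv')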