56 KiB
56 KiB
import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
import time
import matplotlib.pyplot as plt
# Both rating splits are tab-separated files without a header row.
ratings_csv_options = dict(sep='\t', header=None)
train_read = pd.read_csv('./Datasets/ml-100k/train.csv', **ratings_csv_options)
test_read = pd.read_csv('./Datasets/ml-100k/test.csv', **ratings_csv_options)

# The project-local helper returns the train/test interaction matrices together
# with four id<->code lookups.
(train_ui, test_ui,
 user_code_id, user_id_code,
 item_code_id, item_id_code) = helpers.data_to_csr(train_read, test_read)
train_ui
<943x1682 sparse matrix of type '<class 'numpy.longlong'>' with 80000 stored elements in Compressed Sparse Row format>
Let's prepare user and item features
Item features
# u.item is pipe-separated, latin-1 encoded and has no header row.
movies = pd.read_csv(
    './Datasets/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
)
movies.head(3)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2 | GoldenEye (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?GoldenEye%20(... | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 3 | Four Rooms (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Four%20Rooms%... | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 rows × 24 columns
# Cast every column to object so the numeric id column is one-hot encoded too.
movies = movies.astype(object)
# Column 0 (movie id) and column 2 (release date) become 'id_*' / 'date_*' indicators.
id_date = pd.get_dummies(movies[[0, 2]], prefix=['id', 'date'])
id_date.head(3)
id_1 | id_2 | id_3 | id_4 | id_5 | id_6 | id_7 | id_8 | id_9 | id_10 | ... | date_30-Mar-1996 | date_30-May-1997 | date_30-Nov-1996 | date_30-Oct-1995 | date_30-Oct-1996 | date_31-Dec-1997 | date_31-Jan-1997 | date_31-Jul-1996 | date_31-May-1996 | date_4-Feb-1971 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 1922 columns
# u.genre lists one "name|index" pair per line.
genres = pd.read_csv(
    './Datasets/ml-100k/u.genre',
    sep='|',
    header=None,
    encoding='latin-1',
)
genres.head(3)
0 | 1 | |
---|---|---|
0 | unknown | 0 |
1 | Action | 1 |
2 | Adventure | 2 |
# Columns 5..23 of u.item hold the genre flags; label them with the genre names.
genre_flag_columns = np.arange(5, 24)
item_genres = movies[genre_flag_columns]
item_genres.columns = list(genres[0])

# Put the id/date indicators and the genre flags side by side, all as integers.
item_feature_parts = [id_date, item_genres]
item_features_df = pd.concat(item_feature_parts, axis=1).astype(int)
item_features_df
id_1 | id_2 | id_3 | id_4 | id_5 | id_6 | id_7 | id_8 | id_9 | id_10 | ... | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1677 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1678 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
1679 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1680 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1681 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1682 rows × 1941 columns
# Convert the dense item indicator table into a SciPy CSR matrix
# (one row per item, one column per feature).
item_features=sparse.csr_matrix(item_features_df.values)
item_features
<1682x1941 sparse matrix of type '<class 'numpy.longlong'>' with 6256 stored elements in Compressed Sparse Row format>
User features
# u.user is pipe-separated, latin-1 encoded and has no header row.
users = pd.read_csv(
    './Datasets/ml-100k/u.user',
    sep='|',
    encoding='latin-1',
    header=None,
)
users.head(3)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 1 | 24 | M | technician | 85711 |
1 | 2 | 53 | F | other | 94043 |
2 | 3 | 23 | M | writer | 32067 |
# Cast every column to object so the numeric columns (id, age) are one-hot
# encoded as well, not only the string columns.
users = users.astype(object)
# One indicator column per distinct value; the prefixes follow the u.user column order.
user_features_df = pd.get_dummies(users, prefix=['id', 'age', 'sex', 'profesion', 'zip_code'])
# Preview the user feature table built in this cell. The cell originally previewed
# item_features_df here (copy-paste slip), so the recorded output right below
# still shows the item table until the notebook is re-run.
user_features_df[:3]
id_1 | id_2 | id_3 | id_4 | id_5 | id_6 | id_7 | id_8 | id_9 | id_10 | ... | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 rows × 1941 columns
# Show the training interaction matrix again (users x items, CSR).
train_ui
<943x1682 sparse matrix of type '<class 'numpy.longlong'>' with 80000 stored elements in Compressed Sparse Row format>
# Convert the dense user indicator table into a SciPy CSR matrix
# (one row per user, one column per feature).
user_features=sparse.csr_matrix(user_features_df.values)
user_features
<943x1822 sparse matrix of type '<class 'numpy.uint8'>' with 4715 stored elements in Compressed Sparse Row format>
Model
LightFM with user and item features
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
# Train one LightFM model per loss function and report precision@10 on both splits.
# After the loop `model` refers to the last model trained (loss='warp').
feature_kwargs = dict(user_features=user_features, item_features=item_features)
for loss in ('logistic', 'bpr', 'warp'):
    model = LightFM(no_components=10, loss=loss)
    model.fit(train_ui, epochs=30, num_threads=4, **feature_kwargs)
    print(loss)

    train_precision = precision_at_k(
        model, test_interactions=train_ui, k=10, **feature_kwargs
    ).mean()
    print("Train precision: %.2f" % train_precision)

    # train_interactions is supplied so that known training positives are
    # left out of the ranking when scoring the test split.
    test_precision = precision_at_k(
        model, test_interactions=test_ui, train_interactions=train_ui, k=10, **feature_kwargs
    ).mean()
    print("Test precision: %.2f" % test_precision)
logistic Train precision: 0.09 Test precision: 0.03 bpr Train precision: 0.57 Test precision: 0.24 warp Train precision: 0.63 Test precision: 0.34
def top_k_recommendations(model, user_features, item_features, user_code_id, item_code_id, topK=10):
    """Build a top-K recommendation row for every user.

    NOTE: this function reads the module-level ``train_ui`` and ``test_ui``
    matrices; they are not parameters. One row is produced for each row of
    ``test_ui``, and the items stored for that user in ``train_ui`` are pushed
    to the end of the ranking.

    Parameters
    ----------
    model : object with ``predict(user_id, item_ids, user_features=, item_features=)``
        returning a float score array, one entry per item id.
    user_features, item_features : forwarded unchanged to ``model.predict``.
    user_code_id, item_code_id : lookups from matrix row/column index to the
        identifier written to the output.
    topK : int
        Number of (item, score) pairs per user.

    Returns
    -------
    list of lists
        ``[user_id, item_1, score_1, ..., item_K, score_K]`` per user, best first.
    """
    result = []
    # Loop-invariant: every item column of the training matrix is a candidate.
    all_items = np.arange(train_ui.shape[1])
    for user_code in range(test_ui.shape[0]):
        user_rated = train_ui.indices[train_ui.indptr[user_code]:train_ui.indptr[user_code + 1]]
        scores = model.predict(user_code, all_items, user_features=user_features, item_features=item_features)
        scores[user_rated] = -np.inf  # to put rated items at the end of the list
        # Rank once and reuse the ordering for both the item ids and their
        # scores, instead of sorting the score vector a second time.
        best = np.argsort(-scores)[:topK]
        top_items = [item_code_id[item] for item in best]
        top_scores = scores[best]
        result.append([user_code_id[user_code]] + list(chain(*zip(top_items, top_scores))))
    return result
def estimate(model, user_features, item_features, user_code_id, item_code_id, test_ui):
    """Score every (user, item) pair stored in ``test_ui``.

    Parameters
    ----------
    model : object with ``predict(user_ids, item_ids, user_features=, item_features=)``
        accepting two equally long index arrays and returning one score per pair.
    user_features, item_features : forwarded unchanged to ``model.predict``.
    user_code_id, item_code_id : lookups from matrix row/column index to the
        identifier written to the output.
    test_ui : sparse matrix whose non-zero entries are the pairs to score.

    Returns
    -------
    list of ``[user_id, item_id, score]`` rows, in the order returned by
    ``test_ui.nonzero()``. Empty when ``test_ui`` has no non-zero entries.
    """
    users, items = test_ui.nonzero()
    if len(users) == 0:
        # Nothing to score; avoid calling the model with empty index arrays.
        return []
    # One batched call for all pairs instead of one predict() call per pair.
    scores = model.predict(
        users.astype(np.int32, copy=False),
        items.astype(np.int32, copy=False),
        user_features=user_features,
        item_features=item_features,
    )
    return [
        [user_code_id[user], item_code_id[item], score]
        for user, item, score in zip(users, items, scores)
    ]
# Arguments shared by both generators for the model fitted with all features.
lightfm_args = dict(
    model=model,
    user_features=user_features,
    item_features=item_features,
    user_code_id=user_code_id,
    item_code_id=item_code_id,
)

# Top-10 recommendations per user, written without index or header.
top_n = pd.DataFrame(top_k_recommendations(topK=10, **lightfm_args))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFM_reco.csv', index=False, header=False)

# Predicted score for every test pair, written without index or header.
estimations = pd.DataFrame(estimate(test_ui=test_ui, **lightfm_args))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFM_estimations.csv', index=False, header=False)
Pure MF with LightFM
# Keep only the one-hot id indicators, so each user/item is described by its
# identity alone. A prefix test is used rather than a substring test so that an
# attribute column that merely contains 'id_' somewhere in its name can never
# be mistaken for an id indicator.
item_id_columns = [column for column in item_features_df.columns if column.startswith('id_')]
user_id_columns = [column for column in user_features_df.columns if column.startswith('id_')]
item_features_interactions = sparse.csr_matrix(item_features_df[item_id_columns].values)
user_features_interactions = sparse.csr_matrix(user_features_df[user_id_columns].values)

from lightfm import LightFM
model = LightFM(loss='warp')
model.fit(train_ui, user_features=user_features_interactions, item_features=item_features_interactions, epochs=30, num_threads=4)

from lightfm.evaluation import precision_at_k
# Evaluate with the same feature matrices the model was fitted with, as the
# other sections do, instead of relying on the library default used when no
# features are passed.
print("Train precision: %.2f" % precision_at_k(
    model, test_interactions=train_ui,
    user_features=user_features_interactions, item_features=item_features_interactions, k=10).mean())
print("Test precision: %.2f" % precision_at_k(
    model, test_interactions=test_ui, train_interactions=train_ui,
    user_features=user_features_interactions, item_features=item_features_interactions, k=10).mean())
Train precision: 0.63 Test precision: 0.33
# Arguments shared by both generators for the id-only (pure MF) model.
pure_mf_args = dict(
    model=model,
    user_features=user_features_interactions,
    item_features=item_features_interactions,
    user_code_id=user_code_id,
    item_code_id=item_code_id,
)

# Top-10 recommendations per user, written without index or header.
top_n = pd.DataFrame(top_k_recommendations(topK=10, **pure_mf_args))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFMpureMF_reco.csv', index=False, header=False)

# Predicted score for every test pair, written without index or header.
estimations = pd.DataFrame(estimate(test_ui=test_ui, **pure_mf_args))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFMpureMF_estimations.csv', index=False, header=False)
LightFM with user/item attributes only (without treating id as a feature)
# Drop the one-hot id indicators and keep only the attribute columns. A prefix
# test is used rather than a substring test so that an attribute column that
# merely contains 'id_' somewhere in its name is not dropped by accident.
item_attribute_columns = [column for column in item_features_df.columns if not column.startswith('id_')]
user_attribute_columns = [column for column in user_features_df.columns if not column.startswith('id_')]
item_features_only = sparse.csr_matrix(item_features_df[item_attribute_columns].values)
user_features_only = sparse.csr_matrix(user_features_df[user_attribute_columns].values)

from lightfm import LightFM
model = LightFM(loss='warp')
model.fit(train_ui, user_features=user_features_only, item_features=item_features_only, epochs=30, num_threads=4)

from lightfm.evaluation import precision_at_k
print("Train precision: %.2f" % precision_at_k(
    model, test_interactions=train_ui,
    user_features=user_features_only, item_features=item_features_only, k=10).mean())
print("Test precision: %.2f" % precision_at_k(
    model, test_interactions=test_ui, train_interactions=train_ui,
    user_features=user_features_only, item_features=item_features_only, k=10).mean())
Train precision: 0.40 Test precision: 0.16
# Arguments shared by both generators for the attributes-only model.
content_args = dict(
    model=model,
    user_features=user_features_only,
    item_features=item_features_only,
    user_code_id=user_code_id,
    item_code_id=item_code_id,
)

# Top-10 recommendations per user, written without index or header.
top_n = pd.DataFrame(top_k_recommendations(topK=10, **content_args))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFMcontent_reco.csv', index=False, header=False)

# Predicted score for every test pair, written without index or header.
estimations = pd.DataFrame(estimate(test_ui=test_ui, **content_args))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFMcontent_estimations.csv', index=False, header=False)
import evaluation_measures as ev

# Inputs for the project-local evaluation routine.
dir_path = "Recommendations generated/ml-100k/"
super_reactions = [4, 5]
test = pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

df = ev.evaluate_all(test, dir_path, super_reactions)

# The metrics table is shown in two parts: the first nine columns, then
# column 0 again followed by all remaining columns.
leading_columns = df.iloc[:, :9]
trailing_positions = np.append(0, np.arange(9, df.shape[1]))
display(leading_columns)
display(df.iloc[:, trailing_positions])
943it [00:00, 8294.93it/s] 943it [00:00, 7480.38it/s] 943it [00:00, 8182.78it/s] 943it [00:00, 7942.50it/s] 943it [00:00, 7571.16it/s] 943it [00:00, 7715.40it/s] 943it [00:00, 8094.16it/s] 943it [00:00, 9015.90it/s] 943it [00:00, 7848.42it/s] 943it [00:00, 7401.02it/s]
Model | RMSE | MAE | precision | recall | F_1 | F_05 | precision_super | recall_super | |
---|---|---|---|---|---|---|---|---|---|
0 | Ready_LightFMpureMF | 7.953192 | 7.462008 | 0.334464 | 0.219997 | 0.217225 | 0.254981 | 0.233798 | 0.266952 |
0 | Ready_LightFM | 162.707436 | 160.855483 | 0.340827 | 0.217682 | 0.217990 | 0.258010 | 0.243884 | 0.260663 |
0 | Self_P3 | 3.702446 | 3.527273 | 0.282185 | 0.192092 | 0.186749 | 0.216980 | 0.204185 | 0.240096 |
0 | Ready_ImplicitALS | 3.266101 | 3.065824 | 0.255037 | 0.188653 | 0.176852 | 0.201189 | 0.166631 | 0.214925 |
0 | Self_TopPop | 2.508258 | 2.217909 | 0.188865 | 0.116919 | 0.118732 | 0.141584 | 0.130472 | 0.137473 |
0 | Ready_LightFMcontent | 182.471340 | 180.405210 | 0.160339 | 0.101224 | 0.102198 | 0.121074 | 0.102682 | 0.112455 |
0 | Ready_Baseline | 0.949459 | 0.752487 | 0.091410 | 0.037652 | 0.046030 | 0.061286 | 0.079614 | 0.056463 |
0 | Self_GlobalAvg | 1.125760 | 0.943534 | 0.061188 | 0.025968 | 0.031383 | 0.041343 | 0.040558 | 0.032107 |
0 | Ready_Random | 1.514355 | 1.216383 | 0.049735 | 0.022300 | 0.025782 | 0.033598 | 0.028219 | 0.021751 |
0 | Self_BaselineUI | 0.967585 | 0.762740 | 0.000954 | 0.000170 | 0.000278 | 0.000463 | 0.000644 | 0.000189 |
Model | NDCG | mAP | MRR | LAUC | HR | Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Ready_LightFMpureMF | 0.398778 | 0.263058 | 0.629129 | 0.607709 | 0.913043 | 1.000000 | 0.275613 | 5.085818 | 0.913665 |
0 | Ready_LightFM | 0.403850 | 0.268266 | 0.637590 | 0.606568 | 0.898197 | 1.000000 | 0.351371 | 5.366291 | 0.885046 |
0 | Self_P3 | 0.339114 | 0.204905 | 0.572157 | 0.593544 | 0.875928 | 1.000000 | 0.077201 | 3.875892 | 0.974947 |
0 | Ready_ImplicitALS | 0.305908 | 0.172546 | 0.523871 | 0.591709 | 0.889714 | 1.000000 | 0.502886 | 5.722957 | 0.827507 |
0 | Self_TopPop | 0.214651 | 0.111707 | 0.400939 | 0.555546 | 0.765642 | 1.000000 | 0.038961 | 3.159079 | 0.987317 |
0 | Ready_LightFMcontent | 0.180079 | 0.087429 | 0.337825 | 0.547572 | 0.704136 | 0.974973 | 0.264791 | 4.909893 | 0.926201 |
0 | Ready_Baseline | 0.095957 | 0.043178 | 0.198193 | 0.515501 | 0.437964 | 1.000000 | 0.033911 | 2.836513 | 0.991139 |
0 | Self_GlobalAvg | 0.067695 | 0.027470 | 0.171187 | 0.509546 | 0.384942 | 1.000000 | 0.025974 | 2.711772 | 0.992003 |
0 | Ready_Random | 0.054383 | 0.021119 | 0.133978 | 0.507680 | 0.339343 | 0.986957 | 0.177489 | 5.088670 | 0.907676 |
0 | Self_BaselineUI | 0.000752 | 0.000168 | 0.001677 | 0.496424 | 0.009544 | 0.600530 | 0.005051 | 1.803126 | 0.996380 |