warsztaty-B/P6. LightFM.ipynb
2020-06-13 15:34:33 +02:00

56 KiB
Raw Blame History

import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
import time
import matplotlib.pyplot as plt

# Load the MovieLens 100k train/test splits (tab-separated, no header:
# presumably user_id, item_id, rating, timestamp -- confirm against the
# ml-100k README) and convert them with the project helper into CSR
# user-item interaction matrices plus code<->id lookup dicts.
train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)
train_ui
<943x1682 sparse matrix of type '<class 'numpy.longlong'>'
	with 80000 stored elements in Compressed Sparse Row format>

Let's prepare user and item features

Item features

# u.item: pipe-separated movie metadata -- id, title, release date, video
# release date, IMDb URL, then 19 binary genre flags (24 columns total,
# as the preview below shows).
movies = pd.read_csv('./Datasets/ml-100k/u.item', sep='|', encoding='latin-1', header=None)
movies[:3]
0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

3 rows × 24 columns

# Cast to object so get_dummies treats the integer id column as categorical,
# then one-hot encode movie id (column 0) and release date (column 2) with
# 'id'/'date' prefixes.
movies=movies.astype(object)
id_date=pd.get_dummies(movies[[0,2]], ['id', 'date'])
id_date[:3]
id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... date_30-Mar-1996 date_30-May-1997 date_30-Nov-1996 date_30-Oct-1995 date_30-Oct-1996 date_31-Dec-1997 date_31-Jan-1997 date_31-Jul-1996 date_31-May-1996 date_4-Feb-1971
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1922 columns

# u.genre maps genre name (column 0) to genre index (column 1).
genres = pd.read_csv('./Datasets/ml-100k/u.genre', sep='|', header=None,
                     encoding='latin-1')
genres[:3]
0 1
0 unknown 0
1 Action 1
2 Adventure 2
# Columns 5..23 of u.item are the 19 binary genre indicator flags;
# relabel them with the genre names and append to the id/date dummies.
item_genres=movies[np.arange(5,24)]
item_genres.columns=list(genres[0])
item_features_df=pd.concat([id_date, item_genres], axis=1).astype(int)
item_features_df
id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
3 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1677 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1678 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0
1679 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
1680 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1681 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1682 rows × 1941 columns

# Sparse item-feature matrix: one row per movie (1682 x 1941 features).
item_features=sparse.csr_matrix(item_features_df.values)
item_features
<1682x1941 sparse matrix of type '<class 'numpy.longlong'>'
	with 6256 stored elements in Compressed Sparse Row format>

User features

# u.user: pipe-separated user metadata -- id, age, gender, occupation, zip code.
users = pd.read_csv('./Datasets/ml-100k/u.user', sep='|', encoding='latin-1', header=None)
users[:3]
0 1 2 3 4
0 1 24 M technician 85711
1 2 53 F other 94043
2 3 23 M writer 32067
# Cast to object so every column (including the numeric id and age) is
# one-hot encoded by get_dummies.
users=users.astype(object)
# Fixed typo in the column-name prefix list: 'profesion' -> 'profession'.
# Nothing downstream matches on this prefix (later cells only filter on
# 'id_'), so the fix is safe.
user_features_df=pd.get_dummies(users, ['id', 'age', 'sex','profession','zip_code'])
# Preview the *user* features just built (the original cell mistakenly
# displayed item_features_df here, as its output showed genre columns).
user_features_df[:3]
id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

3 rows × 1941 columns

# Re-display the train interaction matrix for comparison with the feature shapes.
train_ui
<943x1682 sparse matrix of type '<class 'numpy.longlong'>'
	with 80000 stored elements in Compressed Sparse Row format>
# Sparse user-feature matrix: one row per user (943 x 1822 features).
user_features=sparse.csr_matrix(user_features_df.values)
user_features
<943x1822 sparse matrix of type '<class 'numpy.uint8'>'
	with 4715 stored elements in Compressed Sparse Row format>

Model

LightFM with user and item features

from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# Train a 10-component LightFM hybrid model (user + item features) under each
# of the three supported losses and report precision@10 on train and test.
for loss in ['logistic', 'bpr', 'warp']:

    model = LightFM(no_components=10, loss=loss)
    model.fit(train_ui, user_features=user_features, item_features=item_features, epochs=30, num_threads=4)

    print(loss)
    # Train precision is measured on the training interactions themselves.
    print("Train precision: %.2f" % precision_at_k(model, test_interactions=train_ui, 
                                                   user_features=user_features, item_features=item_features, k=10).mean())
    # train_interactions excludes already-seen items from the test ranking.
    print("Test precision: %.2f" % precision_at_k(model, test_interactions=test_ui, train_interactions=train_ui,
                                                  user_features=user_features, item_features=item_features, k=10).mean())
logistic
Train precision: 0.09
Test precision: 0.03
bpr
Train precision: 0.57
Test precision: 0.24
warp
Train precision: 0.63
Test precision: 0.34
def top_k_recommendations(model, user_features, item_features, user_code_id, item_code_id, topK=10, train_ui=None, test_ui=None):
    """Return top-K unseen-item recommendations for every user.

    Fix: the original implicitly read the module-level ``train_ui``/``test_ui``
    globals while taking everything else as parameters.  They are now explicit
    (optional) parameters; when omitted they fall back to the notebook globals,
    so all existing call sites keep working.

    Parameters
    ----------
    model : fitted LightFM model (anything exposing a compatible ``predict``).
    user_features, item_features : sparse feature matrices forwarded to predict.
    user_code_id, item_code_id : dicts mapping internal codes to raw ids.
    topK : number of recommendations per user.
    train_ui : CSR train interactions; used to mask already-rated items.
    test_ui : CSR test interactions; only its row count (number of users) is used.

    Returns
    -------
    list of rows: [user_id, item_id_1, score_1, ..., item_id_K, score_K].
    """
    if train_ui is None:
        train_ui = globals()['train_ui']  # backward-compat with the original global usage
    if test_ui is None:
        test_ui = globals()['test_ui']
    result=[]
    for user_code in range(test_ui.shape[0]):
        # Items this user already rated in training (CSR row slice).
        user_rated=train_ui.indices[train_ui.indptr[user_code]:train_ui.indptr[user_code+1]]
        scores = model.predict(user_code, np.arange(train_ui.shape[1]), user_features=user_features, item_features=item_features)
        
        scores[user_rated]=-np.inf # to put rated items at the end of the list
        
        top_items=[item_code_id[item] for item in np.argsort(-scores)[:topK]]
        result.append([user_code_id[user_code]]+list(chain(*zip(top_items,-np.sort(-scores)[:topK]))))
    return result

def estimate(model, user_features, item_features, user_code_id, item_code_id, test_ui):
    """Predict a score for every (user, item) pair present in ``test_ui``.

    Returns a list of [user_id, item_id, predicted_score] rows, one per
    non-zero entry of the test interaction matrix, in nonzero() order.
    """
    rows, cols = test_ui.nonzero()
    return [
        [
            user_code_id[user],
            item_code_id[item],
            model.predict(user, np.array([item]), user_features=user_features, item_features=item_features)[0],
        ]
        for user, item in zip(rows, cols)
    ]
# Persist recommendations and point estimations for the hybrid model in the
# headerless CSV format the evaluation step expects.
# NOTE(review): `model` here is whatever the preceding loss loop left behind
# (its last iteration, loss='warp') -- confirm that is the intended model.
top_n=pd.DataFrame(top_k_recommendations(model=model, user_features=user_features, item_features=item_features, user_code_id=user_code_id, item_code_id=item_code_id, topK=10))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFM_reco.csv', index=False, header=False)

estimations=pd.DataFrame(estimate(model=model, user_features=user_features, item_features=item_features, user_code_id=user_code_id, item_code_id=item_code_id, test_ui=test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFM_estimations.csv', index=False, header=False)

Pure MF with LightFM

# Keep only the one-hot id columns: id dummies alone form an identity-like
# matrix, which reduces LightFM to pure matrix factorisation.
# Fix: startswith('id_') replaces the original substring test ('id_' in col),
# so a feature name merely *containing* 'id_' can never slip through; on the
# current column names the selection is identical.
item_features_interactions=sparse.csr_matrix(item_features_df[[item_feature for item_feature in item_features_df.columns
                                    if item_feature.startswith('id_')]].values)
user_features_interactions=sparse.csr_matrix(user_features_df[[user_feature for user_feature in user_features_df.columns
                                    if user_feature.startswith('id_')]].values)
from lightfm import LightFM

# Pure-MF baseline: fit with id-only dummies as the features.
model = LightFM(loss='warp')
model.fit(train_ui, user_features=user_features_interactions, item_features=item_features_interactions, epochs=30, num_threads=4)

from lightfm.evaluation import precision_at_k

# NOTE(review): no features are passed here, so precision_at_k falls back to
# identity feature matrices.  That presumably matches the id-only dummies used
# in fit only because id dummies form an identity matrix -- confirm this still
# holds if the feature construction above changes.
print("Train precision: %.2f" % precision_at_k(model, test_interactions=train_ui, k=10).mean())
print("Test precision: %.2f" % precision_at_k(model, test_interactions=test_ui, train_interactions=train_ui, k=10).mean())
Train precision: 0.63
Test precision: 0.33
# Persist recommendations and estimations for the pure-MF variant
# (id-only features) in the same headerless CSV format.
top_n=pd.DataFrame(top_k_recommendations(model=model, user_features=user_features_interactions, item_features=item_features_interactions, user_code_id=user_code_id, item_code_id=item_code_id, topK=10))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFMpureMF_reco.csv', index=False, header=False)

estimations=pd.DataFrame(estimate(model=model, user_features=user_features_interactions, item_features=item_features_interactions, user_code_id=user_code_id, item_code_id=item_code_id, test_ui=test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFMpureMF_estimations.csv', index=False, header=False)

LightFM with user/item attributes only (without treating id as a feature)

# Content-only variant: drop the id dummies and keep just the attribute
# columns (dates/genres for items; age/sex/occupation/zip for users).
# Fix: not col.startswith('id_') replaces the original substring test
# ('id_' not in col) so the intent -- "column is NOT an id dummy" -- is
# matched exactly; on the current column names the selection is identical.
item_features_only=sparse.csr_matrix(item_features_df[[item_feature for item_feature in item_features_df.columns
                                    if not item_feature.startswith('id_')]].values)
user_features_only=sparse.csr_matrix(user_features_df[[user_feature for user_feature in user_features_df.columns
                                    if not user_feature.startswith('id_')]].values)
from lightfm import LightFM

# Content-only model: attributes only, no id dummies.
model = LightFM(loss='warp')
model.fit(train_ui, user_features=user_features_only, item_features=item_features_only, epochs=30, num_threads=4)

from lightfm.evaluation import precision_at_k

# Here the attribute feature matrices must be passed explicitly, since they
# are not identity-like (unlike the id-only case above).
print("Train precision: %.2f" % precision_at_k(model, test_interactions=train_ui, 
                                    user_features=user_features_only, item_features=item_features_only, k=10).mean())
print("Test precision: %.2f" % precision_at_k(model, test_interactions=test_ui, train_interactions=train_ui,
                                    user_features=user_features_only, item_features=item_features_only, k=10).mean())
Train precision: 0.40
Test precision: 0.16
# Persist recommendations and estimations for the content-only variant
# (attribute features, no id dummies) in the same headerless CSV format.
top_n=pd.DataFrame(top_k_recommendations(model=model, user_features=user_features_only, item_features=item_features_only, user_code_id=user_code_id, item_code_id=item_code_id, topK=10))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFMcontent_reco.csv', index=False, header=False)

estimations=pd.DataFrame(estimate(model=model, user_features=user_features_only, item_features=item_features_only, user_code_id=user_code_id, item_code_id=item_code_id, test_ui=test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFMcontent_estimations.csv', index=False, header=False)
import evaluation_measures as ev

# Evaluate every recommender output found in dir_path against the test set.
# Ratings 4 and 5 count as "super" (strong positive) reactions.
dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

df=ev.evaluate_all(test, dir_path, super_reactions)
display(df.iloc[:,:9])  # accuracy-style metrics (RMSE/MAE/precision/recall/...)
display(df.iloc[:,np.append(0,np.arange(9, df.shape[1]))])  # ranking/diversity metrics
943it [00:00, 8294.93it/s]
943it [00:00, 7480.38it/s]
943it [00:00, 8182.78it/s]
943it [00:00, 7942.50it/s]
943it [00:00, 7571.16it/s]
943it [00:00, 7715.40it/s]
943it [00:00, 8094.16it/s]
943it [00:00, 9015.90it/s]
943it [00:00, 7848.42it/s]
943it [00:00, 7401.02it/s]
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super
0 Ready_LightFMpureMF 7.953192 7.462008 0.334464 0.219997 0.217225 0.254981 0.233798 0.266952
0 Ready_LightFM 162.707436 160.855483 0.340827 0.217682 0.217990 0.258010 0.243884 0.260663
0 Self_P3 3.702446 3.527273 0.282185 0.192092 0.186749 0.216980 0.204185 0.240096
0 Ready_ImplicitALS 3.266101 3.065824 0.255037 0.188653 0.176852 0.201189 0.166631 0.214925
0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 0.141584 0.130472 0.137473
0 Ready_LightFMcontent 182.471340 180.405210 0.160339 0.101224 0.102198 0.121074 0.102682 0.112455
0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 0.061286 0.079614 0.056463
0 Self_GlobalAvg 1.125760 0.943534 0.061188 0.025968 0.031383 0.041343 0.040558 0.032107
0 Ready_Random 1.514355 1.216383 0.049735 0.022300 0.025782 0.033598 0.028219 0.021751
0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 0.000463 0.000644 0.000189
Model NDCG mAP MRR LAUC HR Reco in test Test coverage Shannon Gini
0 Ready_LightFMpureMF 0.398778 0.263058 0.629129 0.607709 0.913043 1.000000 0.275613 5.085818 0.913665
0 Ready_LightFM 0.403850 0.268266 0.637590 0.606568 0.898197 1.000000 0.351371 5.366291 0.885046
0 Self_P3 0.339114 0.204905 0.572157 0.593544 0.875928 1.000000 0.077201 3.875892 0.974947
0 Ready_ImplicitALS 0.305908 0.172546 0.523871 0.591709 0.889714 1.000000 0.502886 5.722957 0.827507
0 Self_TopPop 0.214651 0.111707 0.400939 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317
0 Ready_LightFMcontent 0.180079 0.087429 0.337825 0.547572 0.704136 0.974973 0.264791 4.909893 0.926201
0 Ready_Baseline 0.095957 0.043178 0.198193 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139
0 Self_GlobalAvg 0.067695 0.027470 0.171187 0.509546 0.384942 1.000000 0.025974 2.711772 0.992003
0 Ready_Random 0.054383 0.021119 0.133978 0.507680 0.339343 0.986957 0.177489 5.088670 0.907676
0 Self_BaselineUI 0.000752 0.000168 0.001677 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380