In [1]:
import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
import time
import matplotlib.pyplot as plt

train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)

In [2]:
train_ui

<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 80000 stored elements in Compressed Sparse Row format>

# Let's prepare user and item features

### Item features

In [3]:
movies = pd.read_csv('./Datasets/ml-100k/u.item', sep='|', encoding='latin-1', header=None)
movies[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
movies=movies.astype(object)
id_date=pd.get_dummies(movies[[0,2]], ['id', 'date'])
id_date[:3]

Unnamed: 0,id_1,id_2,id_3,id_4,id_5,id_6,id_7,id_8,id_9,id_10,...,date_30-Mar-1996,date_30-May-1997,date_30-Nov-1996,date_30-Oct-1995,date_30-Oct-1996,date_31-Dec-1997,date_31-Jan-1997,date_31-Jul-1996,date_31-May-1996,date_4-Feb-1971
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
genres = pd.read_csv('./Datasets/ml-100k/u.genre', sep='|', header=None,
                     encoding='latin-1')
genres[:3]

Unnamed: 0,0,1
0,unknown,0
1,Action,1
2,Adventure,2


In [6]:
item_genres=movies[np.arange(5,24)]
item_genres.columns=list(genres[0])

In [7]:
item_features_df=pd.concat([id_date, item_genres], axis=1).astype(int)
item_features_df

Unnamed: 0,id_1,id_2,id_3,id_4,id_5,id_6,id_7,id_8,id_9,id_10,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
item_features=sparse.csr_matrix(item_features_df.values)
item_features

<1682x1941 sparse matrix of type '<class 'numpy.intc'>'
	with 6256 stored elements in Compressed Sparse Row format>

### User features

In [9]:
users = pd.read_csv('./Datasets/ml-100k/u.user', sep='|', encoding='latin-1', header=None)
users[:3]

Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067


In [10]:
users=users.astype(object)
user_features_df=pd.get_dummies(users, ['id', 'age', 'sex','profesion','zip_code'])
item_features_df[:3]

Unnamed: 0,id_1,id_2,id_3,id_4,id_5,id_6,id_7,id_8,id_9,id_10,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
train_ui

<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 80000 stored elements in Compressed Sparse Row format>

In [12]:
user_features=sparse.csr_matrix(user_features_df.values)
user_features

<943x1822 sparse matrix of type '<class 'numpy.uint8'>'
	with 4715 stored elements in Compressed Sparse Row format>

# Model

#### LightFM with user and item features

In [13]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

for loss in ['logistic', 'bpr', 'warp']:

    model = LightFM(no_components=10, loss=loss)
    model.fit(train_ui, user_features=user_features, item_features=item_features, epochs=30, num_threads=4)

    print(loss)
    print("Train precision: %.2f" % precision_at_k(model, test_interactions=train_ui, 
                                                   user_features=user_features, item_features=item_features, k=10).mean())
    print("Test precision: %.2f" % precision_at_k(model, test_interactions=test_ui, train_interactions=train_ui,
                                                  user_features=user_features, item_features=item_features, k=10).mean())

ModuleNotFoundError: No module named 'lightfm'

In [None]:
def top_k_recommendations(model, user_features, item_features, user_code_id, item_code_id, topK=10):
    result=[]
    for user_code in range(test_ui.shape[0]):
        user_rated=train_ui.indices[train_ui.indptr[user_code]:train_ui.indptr[user_code+1]]
        scores = model.predict(user_code, np.arange(train_ui.shape[1]), user_features=user_features, item_features=item_features)
        
        scores[user_rated]=-np.inf # to put rated items at the end of the list
        
        top_items=[item_code_id[item] for item in np.argsort(-scores)[:topK]]
        result.append([user_code_id[user_code]]+list(chain(*zip(top_items,-np.sort(-scores)[:topK]))))
    return result

def estimate(model, user_features, item_features, user_code_id, item_code_id, test_ui):
    result=[]
    for user, item in zip(*test_ui.nonzero()):
        result.append([user_code_id[user], item_code_id[item], 
                       model.predict(user, np.array([item]), user_features=user_features, item_features=item_features)[0]])
    return result

In [14]:
top_n=pd.DataFrame(top_k_recommendations(model=model, user_features=user_features, item_features=item_features, user_code_id=user_code_id, item_code_id=item_code_id, topK=10))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFM_reco.csv', index=False, header=False)

estimations=pd.DataFrame(estimate(model=model, user_features=user_features, item_features=item_features, user_code_id=user_code_id, item_code_id=item_code_id, test_ui=test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFM_estimations.csv', index=False, header=False)

NameError: name 'top_k_recommendations' is not defined

#### Pure MF with LightFM

In [15]:
item_features_interactions=sparse.csr_matrix(item_features_df[[item_feature for item_feature in item_features_df.columns 
                                    if 'id_' in item_feature]].values)
user_features_interactions=sparse.csr_matrix(user_features_df[[user_feature for user_feature in user_features_df.columns 
                                    if 'id_' in user_feature]].values)

In [16]:
from lightfm import LightFM

model = LightFM(loss='warp')
model.fit(train_ui, user_features=user_features_interactions, item_features=item_features_interactions, epochs=30, num_threads=4)

from lightfm.evaluation import precision_at_k

print("Train precision: %.2f" % precision_at_k(model, test_interactions=train_ui, k=10).mean())
print("Test precision: %.2f" % precision_at_k(model, test_interactions=test_ui, train_interactions=train_ui, k=10).mean())

ModuleNotFoundError: No module named 'lightfm'

In [17]:
top_n=pd.DataFrame(top_k_recommendations(model=model, user_features=user_features_interactions, item_features=item_features_interactions, user_code_id=user_code_id, item_code_id=item_code_id, topK=10))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFMpureMF_reco.csv', index=False, header=False)

estimations=pd.DataFrame(estimate(model=model, user_features=user_features_interactions, item_features=item_features_interactions, user_code_id=user_code_id, item_code_id=item_code_id, test_ui=test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFMpureMF_estimations.csv', index=False, header=False)

NameError: name 'top_k_recommendations' is not defined

#### LightFM with user/item attributes only (without treating id as a feature)

In [18]:
item_features_only=sparse.csr_matrix(item_features_df[[item_feature for item_feature in item_features_df.columns 
                                    if 'id_' not in item_feature]].values)
user_features_only=sparse.csr_matrix(user_features_df[[user_feature for user_feature in user_features_df.columns 
                                    if 'id_' not in user_feature]].values)

In [22]:
from lightfm import LightFM

model = LightFM(loss='warp')
model.fit(train_ui, user_features=user_features_only, item_features=item_features_only, epochs=30, num_threads=4)

from lightfm.evaluation import precision_at_k

print("Train precision: %.2f" % precision_at_k(model, test_interactions=train_ui, 
                                    user_features=user_features_only, item_features=item_features_only, k=10).mean())
print("Test precision: %.2f" % precision_at_k(model, test_interactions=test_ui, train_interactions=train_ui,
                                    user_features=user_features_only, item_features=item_features_only, k=10).mean())



Train precision: 0.39
Test precision: 0.16


In [24]:
top_n=pd.DataFrame(top_k_recommendations(model=model, user_features=user_features_only, item_features=item_features_only, user_code_id=user_code_id, item_code_id=item_code_id, topK=10))
top_n.to_csv('Recommendations generated/ml-100k/Ready_LightFMcontent_reco.csv', index=False, header=False)

estimations=pd.DataFrame(estimate(model=model, user_features=user_features_only, item_features=item_features_only, user_code_id=user_code_id, item_code_id=item_code_id, test_ui=test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Ready_LightFMcontent_estimations.csv', index=False, header=False)

NameError: name 'top_k_recommendations' is not defined

In [21]:
import evaluation_measures as ev

dir_path="Recommendations generated/ml-100k/"
super_reactions=[4,5]
test=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)

df=ev.evaluate_all(test, dir_path, super_reactions)
display(df.iloc[:,:9])
display(df.iloc[:,np.append(0,np.arange(9, df.shape[1]))])

943it [00:00, 10622.43it/s]
943it [00:00, 9747.58it/s]
943it [00:00, 10554.65it/s]
943it [00:00, 9450.92it/s]
943it [00:00, 10058.79it/s]
943it [00:00, 10744.58it/s]
943it [00:00, 10390.37it/s]
943it [00:00, 10578.65it/s]
943it [00:00, 11388.05it/s]
943it [00:00, 11256.24it/s]
943it [00:00, 10166.93it/s]
943it [00:00, 10388.40it/s]
943it [00:00, 10058.69it/s]
943it [00:00, 10740.81it/s]
943it [00:00, 9636.75it/s]
943it [00:00, 10511.93it/s]
943it [00:00, 10864.37it/s]


Unnamed: 0,Model,RMSE,MAE,precision,recall,F_1,F_05,precision_super,recall_super
0,Self_RP3Beta,3.704589,3.529397,0.286744,0.196524,0.191117,0.221375,0.213948,0.251263
0,Self_P3,3.702446,3.527273,0.282185,0.192092,0.186749,0.21698,0.204185,0.240096
0,Self_TopPop,2.508258,2.217909,0.188865,0.116919,0.118732,0.141584,0.130472,0.137473
0,Ready_SVD,0.949165,0.746667,0.093955,0.044969,0.051197,0.065474,0.083906,0.073996
0,Self_SVD,0.91633,0.720153,0.103393,0.044455,0.053177,0.070073,0.093884,0.079366
0,Ready_Baseline,0.949459,0.752487,0.09141,0.037652,0.04603,0.061286,0.079614,0.056463
0,Ready_SVDBiased,0.938146,0.739917,0.086532,0.037067,0.044832,0.058877,0.078004,0.057865
0,Self_GlobalAvg,1.12576,0.943534,0.061188,0.025968,0.031383,0.041343,0.040558,0.032107
0,Ready_Random,1.51003,1.211848,0.050053,0.022367,0.025984,0.033727,0.030687,0.023255
0,Ready_I-KNN,1.030386,0.813067,0.026087,0.006908,0.010593,0.016046,0.021137,0.009522


Unnamed: 0,Model,NDCG,mAP,MRR,LAUC,HR,Reco in test,Test coverage,Shannon,Gini
0,Self_RP3Beta,0.344598,0.207836,0.587953,0.59577,0.885472,0.998197,0.193362,4.291821,0.960775
0,Self_P3,0.339114,0.204905,0.572157,0.593544,0.875928,1.0,0.077201,3.875892,0.974947
0,Self_TopPop,0.214651,0.111707,0.400939,0.555546,0.765642,1.0,0.038961,3.159079,0.987317
0,Ready_SVD,0.104672,0.048211,0.220757,0.519187,0.483563,0.997985,0.204906,4.408913,0.954288
0,Self_SVD,0.107792,0.051281,0.20021,0.518957,0.47508,0.853022,0.147186,3.911356,0.971196
0,Ready_Baseline,0.095957,0.043178,0.198193,0.515501,0.437964,1.0,0.033911,2.836513,0.991139
0,Ready_SVDBiased,0.094583,0.043013,0.202391,0.515202,0.433722,0.996076,0.166667,4.168354,0.964092
0,Self_GlobalAvg,0.067695,0.02747,0.171187,0.509546,0.384942,1.0,0.025974,2.711772,0.992003
0,Ready_Random,0.055392,0.021602,0.13769,0.507713,0.338282,0.987911,0.18759,5.111878,0.906685
0,Ready_I-KNN,0.024214,0.008958,0.048068,0.499885,0.154825,0.402333,0.434343,5.13365,0.877999
