WSS-project/P7. LightFM.ipynb
Robert Kwiecinski c16fd781bb script added
2021-05-26 22:32:10 +02:00

69 KiB
Raw Blame History

import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
import time
import matplotlib.pyplot as plt

# Load the ml-100k train/test splits: tab-separated files without a header row.
train_read = pd.read_csv("./Datasets/ml-100k/train.csv", sep="\t", header=None)
test_read = pd.read_csv("./Datasets/ml-100k/test.csv", sep="\t", header=None)
# helpers.data_to_csr is defined outside this notebook.  Judging by the target
# names it presumably returns user x item CSR matrices for both splits plus
# code<->id lookup tables for users and items -- TODO confirm in helpers.py.
(
    train_ui,
    test_ui,
    user_code_id,
    user_id_code,
    item_code_id,
    item_id_code,
) = helpers.data_to_csr(train_read, test_read)

User and item features preparation

Item features

# Movie metadata: pipe-separated, latin-1 encoded, no header row.
movies = pd.read_csv(
    "./Datasets/ml-100k/u.item",
    header=None,
    sep="|",
    encoding="latin-1",
)
# Cast every column to object so that numeric columns (e.g. the id in
# column 0) are treated as categorical when they are one-hot encoded.
movies = movies.astype(object)

movies.head(3)
0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

3 rows × 24 columns

# One-hot encode column 0 (movie id) and column 2 (release date); the prefixes
# produce dummy columns named like "id_1" and "date_30-Mar-1996".
id_date = pd.get_dummies(data=movies[[0, 2]], prefix=["id", "date"])
id_date[:3]
id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... date_30-Mar-1996 date_30-May-1997 date_30-Nov-1996 date_30-Oct-1995 date_30-Oct-1996 date_31-Dec-1997 date_31-Jan-1997 date_31-Jul-1996 date_31-May-1996 date_4-Feb-1971
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 1922 columns

# Genre lookup table: pipe-separated, latin-1 encoded, no header row.
genres = pd.read_csv(
    "./Datasets/ml-100k/u.genre",
    header=None,
    encoding="latin-1",
    sep="|",
)
genres.head(3)
0 1
0 unknown 0
1 Action 1
2 Adventure 2
# Columns 5..23 of u.item hold the per-genre 0/1 flags.
item_genres = movies[np.arange(5, 24)]
# Label those flags with the genre names from the first column of u.genre.
item_genres.columns = list(genres[0])
# Full item feature table: id/date dummies followed by the genre flags, as ints.
item_features_df = pd.concat([id_date, item_genres], axis=1).astype(int)
item_features_df
id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
3 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1677 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1678 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0
1679 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
1680 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1681 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1682 rows × 1941 columns

# Sparse (CSR) version of the dense item-feature table.
item_features = sparse.csr_matrix(item_features_df.to_numpy())
item_features
<1682x1941 sparse matrix of type '<class 'numpy.int64'>'
	with 6256 stored elements in Compressed Sparse Row format>

User features

# User metadata: pipe-separated, latin-1 encoded, no header row.
users = pd.read_csv(
    "./Datasets/ml-100k/u.user",
    header=None,
    encoding="latin-1",
    sep="|",
)
users.head(3)
0 1 2 3 4
0 1 24 M technician 85711
1 2 53 F other 94043
2 3 23 M writer 32067
# Cast to object so that the numeric columns (id, age, ...) are one-hot
# encoded as categories as well.
users = users.astype(object)
# One dummy column per distinct value; the list supplies one prefix per column
# of u.user.  The "profesion" spelling is kept on purpose because it becomes
# part of the generated column names.
user_features_df = pd.get_dummies(
    users, prefix=["id", "age", "sex", "profesion", "zip_code"]
)
# Preview the user features that were just built (the cell previously
# displayed item_features_df here, which was a copy/paste slip).
user_features_df[:3]
id_1 id_2 id_3 id_4 id_5 id_6 id_7 id_8 id_9 id_10 ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

3 rows × 1941 columns

# Display the summary repr (shape, dtype, stored elements) of the training matrix.
train_ui
<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 80000 stored elements in Compressed Sparse Row format>
# Sparse (CSR) version of the dense user-feature table.
user_features = sparse.csr_matrix(user_features_df.to_numpy())
user_features
<943x1822 sparse matrix of type '<class 'numpy.uint8'>'
	with 4715 stored elements in Compressed Sparse Row format>

Model

LightFM with user and item features

from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# Train one 10-component model per loss function and report precision@10.
# NOTE: `model` is rebound on every pass, so once the loop has finished it
# refers to the model trained with the last loss in the list ("warp").
for loss in ("logistic", "bpr", "warp"):
    model = LightFM(loss=loss, no_components=10)
    model.fit(
        train_ui,
        epochs=30,
        num_threads=4,
        user_features=user_features,
        item_features=item_features,
    )

    print(loss)

    # Precision on the interactions the model was trained on.
    train_precision = precision_at_k(
        model,
        test_interactions=train_ui,
        k=10,
        user_features=user_features,
        item_features=item_features,
        preserve_rows=True,
    ).mean()
    print("Train precision: %.2f" % train_precision)

    # Precision on the held-out split; train_ui is supplied as
    # train_interactions alongside the test interactions.
    test_precision = precision_at_k(
        model,
        test_interactions=test_ui,
        train_interactions=train_ui,
        k=10,
        user_features=user_features,
        item_features=item_features,
        preserve_rows=True,
    ).mean()
    print("Test precision: %.2f" % test_precision)
/Users/robert.kwiecinski/opt/anaconda3/lib/python3.8/site-packages/lightfm/_lightfm_fast.py:9: UserWarning: LightFM was compiled without OpenMP support. Only a single thread will be used.
  warnings.warn(
logistic
Train precision: 0.0896
Test precision: 0.0329
bpr
Train precision: 0.5801
Test precision: 0.2488
warp
Train precision: 0.6406
Test precision: 0.3456
def top_k_recommendations(
    model, user_features, item_features, user_code_id, item_code_id, topK=10
):
    """Build the ``topK`` best-scored items for every user.

    Relies on the notebook-level ``train_ui`` and ``test_ui`` matrices: the
    number of users is taken from ``test_ui.shape[0]`` and the items each user
    already has in ``train_ui`` are pushed to the bottom of the ranking.

    Args:
        model: object exposing ``predict(user, item_ids, user_features=...,
            item_features=...)`` that returns one score per item id.
        user_features: forwarded unchanged to ``model.predict``.
        item_features: forwarded unchanged to ``model.predict``.
        user_code_id: lookup from user code (row index) to the user id that is
            written as the first element of each output row.
        item_code_id: lookup from item code (column index) to item id.
        topK: number of recommendations produced per user.

    Returns:
        One list per user, best item first:
        ``[user_id, item_id_1, score_1, ..., item_id_K, score_K]``.
    """
    n_users = test_ui.shape[0]
    all_items = np.arange(train_ui.shape[1])  # loop-invariant, built once

    result = []
    for user_code in range(n_users):
        row_start = train_ui.indptr[user_code]
        row_end = train_ui.indptr[user_code + 1]
        user_rated = train_ui.indices[row_start:row_end]

        scores = model.predict(
            user_code,
            all_items,
            user_features=user_features,
            item_features=item_features,
        )

        scores[user_rated] = -np.inf  # to put rated items at the end of the list

        # Sort once and reuse the order for both the item ids and their scores.
        best = np.argsort(-scores)[:topK]

        row = [user_code_id[user_code]]
        for item_code, score in zip(best, scores[best]):
            row.extend((item_code_id[item_code], score))
        result.append(row)
    return result


def estimate(model, user_features, item_features, user_code_id, item_code_id, test_ui):
    """Score every non-zero (user, item) pair of ``test_ui`` with ``model``.

    All pairs are scored with a single batched ``model.predict`` call instead
    of one call per pair.

    Args:
        model: object exposing ``predict(user_ids, item_ids, user_features=...,
            item_features=...)`` that returns one score per (user, item) pair.
        user_features: forwarded unchanged to ``model.predict``.
        item_features: forwarded unchanged to ``model.predict``.
        user_code_id: lookup from user code (row index) to user id.
        item_code_id: lookup from item code (column index) to item id.
        test_ui: sparse user x item matrix whose non-zero entries are scored.

    Returns:
        A list of ``[user_id, item_id, score]`` rows in the order produced by
        ``test_ui.nonzero()``; an empty list when there are no such entries.
    """
    user_codes, item_codes = test_ui.nonzero()
    if len(user_codes) == 0:
        # Nothing to score; also avoids handing empty arrays to the model.
        return []

    scores = model.predict(
        np.asarray(user_codes, dtype=np.int32),
        np.asarray(item_codes, dtype=np.int32),
        user_features=user_features,
        item_features=item_features,
    )
    return [
        [user_code_id[user], item_code_id[item], score]
        for user, item, score in zip(user_codes, item_codes, scores)
    ]
# Top-10 recommendations per user from the current `model`, written as a
# header-less, index-less CSV.
top_n = pd.DataFrame(
    top_k_recommendations(
        model, user_features, item_features, user_code_id, item_code_id, topK=10
    )
)
top_n.to_csv(
    "Recommendations generated/ml-100k/Ready_LightFM_reco.csv",
    header=False,
    index=False,
)

# Model scores for every non-zero (user, item) pair of the test matrix.
estimations = pd.DataFrame(
    estimate(model, user_features, item_features, user_code_id, item_code_id, test_ui)
)
estimations.to_csv(
    "Recommendations generated/ml-100k/Ready_LightFM_estimations.csv",
    header=False,
    index=False,
)

Pure MF with LightFM

# Keep only the one-hot id columns.  A prefix test is used rather than a
# substring test so that an attribute column that merely contains "id_"
# somewhere in its name is never mistaken for an id column.
item_features_interactions = sparse.csr_matrix(
    item_features_df[
        [name for name in item_features_df.columns if name.startswith("id_")]
    ].values
)
user_features_interactions = sparse.csr_matrix(
    user_features_df[
        [name for name in user_features_df.columns if name.startswith("id_")]
    ].values
)
from lightfm import LightFM

# Pure matrix factorisation: WARP loss, trained on the id-only feature matrices.
model = LightFM(loss="warp")
model.fit(
    train_ui,
    epochs=30,
    num_threads=4,
    user_features=user_features_interactions,
    item_features=item_features_interactions,
)

from lightfm.evaluation import precision_at_k

# NOTE(review): no feature matrices are passed to precision_at_k here,
# unlike in fit(); presumably LightFM's default features are equivalent to
# the id-only matrices -- confirm.
train_precision = precision_at_k(model, test_interactions=train_ui, k=10).mean()
print("Train precision: %.2f" % train_precision)

test_precision = precision_at_k(
    model, test_interactions=test_ui, train_interactions=train_ui, k=10
).mean()
print("Test precision: %.2f" % test_precision)
Train precision: 0.62
Test precision: 0.34
# Top-10 recommendations per user from the current `model`, scored with the
# id-only feature matrices and written as a header-less, index-less CSV.
top_n = pd.DataFrame(
    top_k_recommendations(
        model,
        user_features_interactions,
        item_features_interactions,
        user_code_id,
        item_code_id,
        topK=10,
    )
)
top_n.to_csv(
    "Recommendations generated/ml-100k/Ready_LightFMpureMF_reco.csv",
    header=False,
    index=False,
)

# Model scores for every non-zero (user, item) pair of the test matrix.
estimations = pd.DataFrame(
    estimate(
        model,
        user_features_interactions,
        item_features_interactions,
        user_code_id,
        item_code_id,
        test_ui,
    )
)
estimations.to_csv(
    "Recommendations generated/ml-100k/Ready_LightFMpureMF_estimations.csv",
    header=False,
    index=False,
)

LightFM with user/item attributes only (without treating id as a feature)

# Drop the one-hot id columns and keep only the attribute columns.  A prefix
# test is used rather than a substring test so that an attribute column that
# merely contains "id_" somewhere in its name is not dropped by accident.
item_features_only = sparse.csr_matrix(
    item_features_df[
        [name for name in item_features_df.columns if not name.startswith("id_")]
    ].values
)
user_features_only = sparse.csr_matrix(
    user_features_df[
        [name for name in user_features_df.columns if not name.startswith("id_")]
    ].values
)
from lightfm import LightFM

# Attribute-only model: WARP loss, trained on the features without id columns.
model = LightFM(loss="warp")
model.fit(
    train_ui,
    epochs=30,
    num_threads=4,
    user_features=user_features_only,
    item_features=item_features_only,
)

from lightfm.evaluation import precision_at_k

# Precision@10 on the interactions the model was trained on.
train_precision = precision_at_k(
    model,
    test_interactions=train_ui,
    k=10,
    user_features=user_features_only,
    item_features=item_features_only,
).mean()
print("Train precision: %.2f" % train_precision)

# Precision@10 on the held-out split, with train_ui given as train_interactions.
test_precision = precision_at_k(
    model,
    test_interactions=test_ui,
    train_interactions=train_ui,
    k=10,
    user_features=user_features_only,
    item_features=item_features_only,
).mean()
print("Test precision: %.2f" % test_precision)
Train precision: 0.39
Test precision: 0.16
# Top-10 recommendations per user from the current `model`, scored with the
# attribute-only feature matrices and written as a header-less, index-less CSV.
top_n = pd.DataFrame(
    top_k_recommendations(
        model,
        user_features_only,
        item_features_only,
        user_code_id,
        item_code_id,
        topK=10,
    )
)
top_n.to_csv(
    "Recommendations generated/ml-100k/Ready_LightFMcontent_reco.csv",
    header=False,
    index=False,
)

# Model scores for every non-zero (user, item) pair of the test matrix.
estimations = pd.DataFrame(
    estimate(
        model,
        user_features_only,
        item_features_only,
        user_code_id,
        item_code_id,
        test_ui,
    )
)
estimations.to_csv(
    "Recommendations generated/ml-100k/Ready_LightFMcontent_estimations.csv",
    header=False,
    index=False,
)
import evaluation_measures as ev

# Inputs for the project-local evaluation helper: the folder holding the
# generated CSV files, the rating values passed as `super_reactions`, and the
# raw tab-separated test split.
dir_path = "Recommendations generated/ml-100k/"
super_reactions = [4, 5]
test = pd.read_csv("./Datasets/ml-100k/test.csv", header=None, sep="\t")

df = ev.evaluate_all(test, dir_path, super_reactions)

# Show the wide result table in two parts: the first nine columns, then
# column 0 again followed by every remaining column.
display(df.iloc[:, :9])
display(df.iloc[:, [0] + list(range(9, df.shape[1]))])
943it [00:00, 7916.13it/s]
943it [00:00, 7411.38it/s]
943it [00:00, 7288.77it/s]
943it [00:00, 8270.47it/s]
943it [00:00, 8356.60it/s]
943it [00:00, 8515.68it/s]
943it [00:00, 7612.45it/s]
943it [00:00, 8137.53it/s]
943it [00:00, 8291.41it/s]
943it [00:00, 8935.79it/s]
943it [00:00, 9276.67it/s]
943it [00:00, 8497.64it/s]
943it [00:00, 9071.75it/s]
943it [00:00, 8091.43it/s]
943it [00:00, 8078.89it/s]
943it [00:00, 9082.25it/s]
943it [00:00, 8886.58it/s]
943it [00:00, 7250.24it/s]
Model RMSE MAE precision recall F_1 F_05 precision_super recall_super
0 Ready_LightFMpureMF 7.971534 7.489846 0.336267 0.219775 0.216963 0.255495 0.239485 0.262821
0 Ready_LightFM 164.987667 163.062242 0.345599 0.218064 0.220719 0.261767 0.242597 0.256644
0 Self_P3 3.702446 3.527273 0.282185 0.192092 0.186749 0.216980 0.204185 0.240096
0 Ready_ImplicitALS 3.268391 3.069209 0.252068 0.183575 0.174441 0.198723 0.167918 0.212330
0 Self_TopPop 2.508258 2.217909 0.188865 0.116919 0.118732 0.141584 0.130472 0.137473
0 Ready_LightFMcontent 184.473200 182.333123 0.164051 0.101701 0.103410 0.123120 0.103648 0.112447
0 Ready_SVD 0.951652 0.750975 0.096394 0.047252 0.052870 0.067257 0.085515 0.074754
0 Self_SVD 0.914393 0.717199 0.101697 0.042334 0.051787 0.068811 0.092489 0.072360
0 Ready_Baseline 0.949459 0.752487 0.091410 0.037652 0.046030 0.061286 0.079614 0.056463
0 Ready_SVDBiased 0.940413 0.739571 0.086002 0.035478 0.043196 0.057507 0.075751 0.053460
0 Ready_Random 1.527935 1.225393 0.049311 0.020479 0.024944 0.032990 0.032189 0.024725
0 Ready_I-KNN 1.030386 0.813067 0.026087 0.006908 0.010593 0.016046 0.021137 0.009522
0 Ready_I-KNNBaseline 0.935327 0.737424 0.002545 0.000755 0.001105 0.001602 0.002253 0.000930
0 Ready_U-KNN 1.023495 0.807913 0.000742 0.000205 0.000305 0.000449 0.000536 0.000198
0 Self_BaselineIU 0.958136 0.754051 0.000954 0.000188 0.000298 0.000481 0.000644 0.000223
0 Self_TopRated 1.030712 0.820904 0.000954 0.000188 0.000298 0.000481 0.000644 0.000223
0 Self_BaselineUI 0.967585 0.762740 0.000954 0.000170 0.000278 0.000463 0.000644 0.000189
0 Self_IKNN 1.018363 0.808793 0.000318 0.000108 0.000140 0.000189 0.000000 0.000000
Model NDCG mAP MRR LAUC HR Reco in test Test coverage Shannon Gini
0 Ready_LightFMpureMF 0.401680 0.264756 0.639739 0.607591 0.913043 1.000000 0.276335 5.106009 0.911870
0 Ready_LightFM 0.406439 0.273339 0.628360 0.606774 0.901379 1.000000 0.366522 5.397986 0.880639
0 Self_P3 0.339114 0.204905 0.572157 0.593544 0.875928 1.000000 0.077201 3.875892 0.974947
0 Ready_ImplicitALS 0.303179 0.170318 0.533574 0.589147 0.872747 0.999682 0.506494 5.735292 0.823380
0 Self_TopPop 0.214651 0.111707 0.400939 0.555546 0.765642 1.000000 0.038961 3.159079 0.987317
0 Ready_LightFMcontent 0.184974 0.090747 0.349255 0.547829 0.724284 0.978579 0.272727 4.893099 0.927459
0 Ready_SVD 0.109578 0.051562 0.235567 0.520341 0.496288 0.995546 0.208514 4.455755 0.951624
0 Self_SVD 0.104839 0.048970 0.196117 0.517889 0.480382 0.867338 0.147186 3.852545 0.972694
0 Ready_Baseline 0.095957 0.043178 0.198193 0.515501 0.437964 1.000000 0.033911 2.836513 0.991139
0 Ready_SVDBiased 0.094897 0.043361 0.209124 0.514405 0.428420 0.997349 0.177489 4.212509 0.962656
0 Ready_Random 0.053647 0.020462 0.136036 0.506763 0.339343 0.986108 0.191198 5.101215 0.907796
0 Ready_I-KNN 0.024214 0.008958 0.048068 0.499885 0.154825 0.402333 0.434343 5.133650 0.877999
0 Ready_I-KNNBaseline 0.003444 0.001362 0.011760 0.496724 0.021209 0.482821 0.059885 2.232578 0.994487
0 Ready_U-KNN 0.000845 0.000274 0.002744 0.496441 0.007423 0.602121 0.010823 2.089186 0.995706
0 Self_BaselineIU 0.001043 0.000335 0.003348 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669
0 Self_TopRated 0.001043 0.000335 0.003348 0.496433 0.009544 0.699046 0.005051 1.945910 0.995669
0 Self_BaselineUI 0.000752 0.000168 0.001677 0.496424 0.009544 0.600530 0.005051 1.803126 0.996380
0 Self_IKNN 0.000214 0.000037 0.000368 0.496391 0.003181 0.392153 0.115440 4.174741 0.965327