74 KiB
74 KiB
Self made SVD
import helpers
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
import matplotlib.pyplot as plt
train_read = pd.read_csv("./Datasets/ml-100k/train.csv", sep="\t", header=None)
test_read = pd.read_csv("./Datasets/ml-100k/test.csv", sep="\t", header=None)
(
train_ui,
test_ui,
user_code_id,
user_id_code,
item_code_id,
item_id_code,
) = helpers.data_to_csr(train_read, test_read)
# Done similarly to https://github.com/albertauyeung/matrix-factorization-in-python
from tqdm import tqdm
class SVD:
def __init__(self, train_ui, learning_rate, regularization, nb_factors, iterations):
self.train_ui = train_ui
self.uir = list(
zip(*[train_ui.nonzero()[0], train_ui.nonzero()[1], train_ui.data])
)
self.learning_rate = learning_rate
self.regularization = regularization
self.iterations = iterations
self.nb_users, self.nb_items = train_ui.shape
self.nb_ratings = train_ui.nnz
self.nb_factors = nb_factors
self.Pu = np.random.normal(
loc=0, scale=1.0 / self.nb_factors, size=(self.nb_users, self.nb_factors)
)
self.Qi = np.random.normal(
loc=0, scale=1.0 / self.nb_factors, size=(self.nb_items, self.nb_factors)
)
def train(self, test_ui=None):
if test_ui != None:
self.test_uir = list(
zip(*[test_ui.nonzero()[0], test_ui.nonzero()[1], test_ui.data])
)
self.learning_process = []
pbar = tqdm(range(self.iterations))
for i in pbar:
pbar.set_description(
f"Epoch {i} RMSE: {self.learning_process[-1][1] if i>0 else 0}. Training epoch {i+1}..."
)
np.random.shuffle(self.uir)
self.sgd(self.uir)
if test_ui == None:
self.learning_process.append([i + 1, self.RMSE_total(self.uir)])
else:
self.learning_process.append(
[i + 1, self.RMSE_total(self.uir), self.RMSE_total(self.test_uir)]
)
def sgd(self, uir):
for u, i, score in uir:
# Computer prediction and error
prediction = self.get_rating(u, i)
e = score - prediction
# Update user and item latent feature matrices
Pu_update = self.learning_rate * (
e * self.Qi[i] - self.regularization * self.Pu[u]
)
Qi_update = self.learning_rate * (
e * self.Pu[u] - self.regularization * self.Qi[i]
)
self.Pu[u] += Pu_update
self.Qi[i] += Qi_update
def get_rating(self, u, i):
prediction = self.Pu[u].dot(self.Qi[i].T)
return prediction
def RMSE_total(self, uir):
RMSE = 0
for u, i, score in uir:
prediction = self.get_rating(u, i)
RMSE += (score - prediction) ** 2
return np.sqrt(RMSE / len(uir))
def estimations(self):
self.estimations = np.dot(self.Pu, self.Qi.T)
def recommend(self, user_code_id, item_code_id, topK=10):
top_k = defaultdict(list)
for nb_user, user in enumerate(self.estimations):
user_rated = self.train_ui.indices[
self.train_ui.indptr[nb_user] : self.train_ui.indptr[nb_user + 1]
]
for item, score in enumerate(user):
if item not in user_rated and not np.isnan(score):
top_k[user_code_id[nb_user]].append((item_code_id[item], score))
result = []
# Let's choose k best items in the format: (user, item1, score1, item2, score2, ...)
for uid, item_scores in top_k.items():
item_scores.sort(key=lambda x: x[1], reverse=True)
result.append([uid] + list(chain(*item_scores[:topK])))
return result
def estimate(self, user_code_id, item_code_id, test_ui):
result = []
for user, item in zip(*test_ui.nonzero()):
result.append(
[
user_code_id[user],
item_code_id[item],
self.estimations[user, item]
if not np.isnan(self.estimations[user, item])
else 1,
]
)
return result
model = SVD(
train_ui, learning_rate=0.005, regularization=0.02, nb_factors=100, iterations=40
)
model.train(test_ui)
Epoch 39 RMSE: 0.7477090330529405. Training epoch 40...: 100%|██████████| 40/40 [01:03<00:00, 1.59s/it]
df = pd.DataFrame(model.learning_process).iloc[:, :2]
df.columns = ["epoch", "train_RMSE"]
plt.plot("epoch", "train_RMSE", data=df, color="blue")
plt.legend()
<matplotlib.legend.Legend at 0x7fdc5a9a2370>
df = pd.DataFrame(
model.learning_process[10:], columns=["epoch", "train_RMSE", "test_RMSE"]
)
plt.plot("epoch", "train_RMSE", data=df, color="blue")
plt.plot("epoch", "test_RMSE", data=df, color="green", linestyle="dashed")
plt.legend()
<matplotlib.legend.Legend at 0x7fdc6037bb20>
Saving and evaluating recommendations
model.estimations()
top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))
top_n.to_csv(
"Recommendations generated/ml-100k/Self_SVD_reco.csv", index=False, header=False
)
estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv(
"Recommendations generated/ml-100k/Self_SVD_estimations.csv",
index=False,
header=False,
)
import evaluation_measures as ev
estimations_df = pd.read_csv(
"Recommendations generated/ml-100k/Self_SVD_estimations.csv", header=None
)
reco = np.loadtxt("Recommendations generated/ml-100k/Self_SVD_reco.csv", delimiter=",")
ev.evaluate(
test=pd.read_csv("./Datasets/ml-100k/test.csv", sep="\t", header=None),
estimations_df=estimations_df,
reco=reco,
super_reactions=[4, 5],
)
943it [00:00, 11138.92it/s]
RMSE | MAE | precision | recall | F_1 | F_05 | precision_super | recall_super | NDCG | mAP | MRR | LAUC | HR | Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.914143 | 0.717131 | 0.101803 | 0.042134 | 0.05161 | 0.068543 | 0.091953 | 0.071255 | 0.104015 | 0.048817 | 0.193027 | 0.517784 | 0.471898 | 0.867232 | 0.147908 | 3.871296 | 0.97182 |
dir_path = "Recommendations generated/ml-100k/"
super_reactions = [4, 5]
test = pd.read_csv("./Datasets/ml-100k/test.csv", sep="\t", header=None)
ev.evaluate_all(test, dir_path, super_reactions)
943it [00:00, 10694.55it/s] 943it [00:00, 11600.99it/s] 943it [00:00, 11461.54it/s] 943it [00:00, 11660.39it/s] 943it [00:00, 9872.18it/s] 943it [00:00, 11443.77it/s] 943it [00:00, 11990.88it/s] 943it [00:00, 11615.02it/s] 943it [00:00, 11874.78it/s] 943it [00:00, 12387.19it/s]
Model | RMSE | MAE | precision | recall | F_1 | F_05 | precision_super | recall_super | NDCG | mAP | MRR | LAUC | HR | Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Self_TopPop | 2.508258 | 2.217909 | 0.188865 | 0.116919 | 0.118732 | 0.141584 | 0.130472 | 0.137473 | 0.214651 | 0.111707 | 0.400939 | 0.555546 | 0.765642 | 1.000000 | 0.038961 | 3.159079 | 0.987317 |
0 | Self_SVD | 0.914143 | 0.717131 | 0.101803 | 0.042134 | 0.051610 | 0.068543 | 0.091953 | 0.071255 | 0.104015 | 0.048817 | 0.193027 | 0.517784 | 0.471898 | 0.867232 | 0.147908 | 3.871296 | 0.971820 |
0 | Ready_Baseline | 0.949459 | 0.752487 | 0.091410 | 0.037652 | 0.046030 | 0.061286 | 0.079614 | 0.056463 | 0.095957 | 0.043178 | 0.198193 | 0.515501 | 0.437964 | 1.000000 | 0.033911 | 2.836513 | 0.991139 |
0 | Ready_Random | 1.521845 | 1.225949 | 0.047190 | 0.020753 | 0.024810 | 0.032269 | 0.029506 | 0.023707 | 0.050075 | 0.018728 | 0.121957 | 0.506893 | 0.329799 | 0.986532 | 0.184704 | 5.099706 | 0.907217 |
0 | Ready_I-KNN | 1.030386 | 0.813067 | 0.026087 | 0.006908 | 0.010593 | 0.016046 | 0.021137 | 0.009522 | 0.024214 | 0.008958 | 0.048068 | 0.499885 | 0.154825 | 0.402333 | 0.434343 | 5.133650 | 0.877999 |
0 | Ready_I-KNNBaseline | 0.935327 | 0.737424 | 0.002545 | 0.000755 | 0.001105 | 0.001602 | 0.002253 | 0.000930 | 0.003444 | 0.001362 | 0.011760 | 0.496724 | 0.021209 | 0.482821 | 0.059885 | 2.232578 | 0.994487 |
0 | Ready_U-KNN | 1.023495 | 0.807913 | 0.000742 | 0.000205 | 0.000305 | 0.000449 | 0.000536 | 0.000198 | 0.000845 | 0.000274 | 0.002744 | 0.496441 | 0.007423 | 0.602121 | 0.010823 | 2.089186 | 0.995706 |
0 | Self_TopRated | 1.030712 | 0.820904 | 0.000954 | 0.000188 | 0.000298 | 0.000481 | 0.000644 | 0.000223 | 0.001043 | 0.000335 | 0.003348 | 0.496433 | 0.009544 | 0.699046 | 0.005051 | 1.945910 | 0.995669 |
0 | Self_BaselineUI | 0.967585 | 0.762740 | 0.000954 | 0.000170 | 0.000278 | 0.000463 | 0.000644 | 0.000189 | 0.000752 | 0.000168 | 0.001677 | 0.496424 | 0.009544 | 0.600530 | 0.005051 | 1.803126 | 0.996380 |
0 | Self_IKNN | 1.018363 | 0.808793 | 0.000318 | 0.000108 | 0.000140 | 0.000189 | 0.000000 | 0.000000 | 0.000214 | 0.000037 | 0.000368 | 0.496391 | 0.003181 | 0.392153 | 0.115440 | 4.174741 | 0.965327 |
Embeddings
item = random.choice(list(set(train_ui.indices)))
embeddings_norm = (
model.Qi / np.linalg.norm(model.Qi, axis=1)[:, None]
) # we do not mean-center here
# omitting normalization also makes sense, but items with a greater magnitude will be recommended more often
similarity_scores = np.dot(embeddings_norm, embeddings_norm[item].T)
top_similar_items = pd.DataFrame(
enumerate(similarity_scores), columns=["code", "score"]
).sort_values(by=["score"], ascending=[False])[:10]
top_similar_items["item_id"] = top_similar_items["code"].apply(
lambda x: item_code_id[x]
)
items = pd.read_csv("./Datasets/ml-100k/movies.csv")
result = pd.merge(top_similar_items, items, left_on="item_id", right_on="id")
result
code | score | item_id | id | title | genres | |
---|---|---|---|---|---|---|
0 | 405 | 1.000000 | 406 | 406 | Thinner (1996) | Horror, Thriller |
1 | 827 | 0.968354 | 828 | 828 | Alaska (1996) | Adventure, Children's |
2 | 818 | 0.967103 | 819 | 819 | Eddie (1996) | Comedy |
3 | 904 | 0.963944 | 905 | 905 | Great Expectations (1998) | Drama, Romance |
4 | 1293 | 0.962779 | 1294 | 1294 | Ayn Rand: A Sense of Life (1997) | Documentary |
5 | 347 | 0.961946 | 348 | 348 | Desperate Measures (1998) | Crime, Drama, Thriller |
6 | 807 | 0.960952 | 808 | 808 | Program, The (1993) | Action, Drama |
7 | 560 | 0.960885 | 561 | 561 | Mary Shelley's Frankenstein (1994) | Drama, Horror |
8 | 1392 | 0.958724 | 1393 | 1393 | Stag (1997) | Action, Thriller |
9 | 787 | 0.957891 | 788 | 788 | Relative Fear (1994) | Horror, Thriller |
project task 5: implement SVD on top baseline (as it is in Surprise library)
# making changes to our implementation by considering additional parameters in the gradient descent procedure
# seems to be the fastest option
# please save the output in 'Recommendations generated/ml-100k/Self_SVDBaseline_reco.csv' and
# 'Recommendations generated/ml-100k/Self_SVDBaseline_estimations.csv'
Ready-made SVD - Surprise implementation
SVD
import helpers
import surprise as sp
algo = sp.SVD(biased=False) # to use unbiased version
helpers.ready_made(
algo,
reco_path="Recommendations generated/ml-100k/Ready_SVD_reco.csv",
estimations_path="Recommendations generated/ml-100k/Ready_SVD_estimations.csv",
)
Generating predictions... Generating top N recommendations... Generating predictions...
SVD biased - on top baseline
algo = sp.SVD() # default is biased=True
helpers.ready_made(
algo,
reco_path="Recommendations generated/ml-100k/Ready_SVDBiased_reco.csv",
estimations_path="Recommendations generated/ml-100k/Ready_SVDBiased_estimations.csv",
)
Generating predictions... Generating top N recommendations... Generating predictions...
dir_path = "Recommendations generated/ml-100k/"
super_reactions = [4, 5]
test = pd.read_csv("./Datasets/ml-100k/test.csv", sep="\t", header=None)
ev.evaluate_all(test, dir_path, super_reactions)
943it [00:00, 11249.52it/s] 943it [00:00, 10927.13it/s] 943it [00:00, 11816.00it/s] 943it [00:00, 11204.84it/s] 943it [00:00, 11803.13it/s] 943it [00:00, 10580.63it/s] 943it [00:00, 11843.28it/s] 943it [00:00, 12313.76it/s] 943it [00:00, 10678.21it/s] 943it [00:00, 9772.22it/s] 943it [00:00, 10699.52it/s] 943it [00:00, 11789.55it/s]
Model | RMSE | MAE | precision | recall | F_1 | F_05 | precision_super | recall_super | NDCG | mAP | MRR | LAUC | HR | Reco in test | Test coverage | Shannon | Gini | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Self_TopPop | 2.508258 | 2.217909 | 0.188865 | 0.116919 | 0.118732 | 0.141584 | 0.130472 | 0.137473 | 0.214651 | 0.111707 | 0.400939 | 0.555546 | 0.765642 | 1.000000 | 0.038961 | 3.159079 | 0.987317 |
0 | Ready_SVD | 0.950347 | 0.749312 | 0.100636 | 0.050514 | 0.055794 | 0.070753 | 0.091202 | 0.082734 | 0.114054 | 0.053200 | 0.248803 | 0.521983 | 0.517497 | 0.992153 | 0.210678 | 4.418683 | 0.952848 |
0 | Self_SVD | 0.914143 | 0.717131 | 0.101803 | 0.042134 | 0.051610 | 0.068543 | 0.091953 | 0.071255 | 0.104015 | 0.048817 | 0.193027 | 0.517784 | 0.471898 | 0.867232 | 0.147908 | 3.871296 | 0.971820 |
0 | Ready_Baseline | 0.949459 | 0.752487 | 0.091410 | 0.037652 | 0.046030 | 0.061286 | 0.079614 | 0.056463 | 0.095957 | 0.043178 | 0.198193 | 0.515501 | 0.437964 | 1.000000 | 0.033911 | 2.836513 | 0.991139 |
0 | Ready_SVDBiased | 0.939472 | 0.739816 | 0.085896 | 0.036073 | 0.043528 | 0.057643 | 0.077039 | 0.057463 | 0.097753 | 0.045546 | 0.219839 | 0.514709 | 0.431601 | 0.997455 | 0.168831 | 4.217578 | 0.962577 |
0 | Ready_Random | 1.521845 | 1.225949 | 0.047190 | 0.020753 | 0.024810 | 0.032269 | 0.029506 | 0.023707 | 0.050075 | 0.018728 | 0.121957 | 0.506893 | 0.329799 | 0.986532 | 0.184704 | 5.099706 | 0.907217 |
0 | Ready_I-KNN | 1.030386 | 0.813067 | 0.026087 | 0.006908 | 0.010593 | 0.016046 | 0.021137 | 0.009522 | 0.024214 | 0.008958 | 0.048068 | 0.499885 | 0.154825 | 0.402333 | 0.434343 | 5.133650 | 0.877999 |
0 | Ready_I-KNNBaseline | 0.935327 | 0.737424 | 0.002545 | 0.000755 | 0.001105 | 0.001602 | 0.002253 | 0.000930 | 0.003444 | 0.001362 | 0.011760 | 0.496724 | 0.021209 | 0.482821 | 0.059885 | 2.232578 | 0.994487 |
0 | Ready_U-KNN | 1.023495 | 0.807913 | 0.000742 | 0.000205 | 0.000305 | 0.000449 | 0.000536 | 0.000198 | 0.000845 | 0.000274 | 0.002744 | 0.496441 | 0.007423 | 0.602121 | 0.010823 | 2.089186 | 0.995706 |
0 | Self_TopRated | 1.030712 | 0.820904 | 0.000954 | 0.000188 | 0.000298 | 0.000481 | 0.000644 | 0.000223 | 0.001043 | 0.000335 | 0.003348 | 0.496433 | 0.009544 | 0.699046 | 0.005051 | 1.945910 | 0.995669 |
0 | Self_BaselineUI | 0.967585 | 0.762740 | 0.000954 | 0.000170 | 0.000278 | 0.000463 | 0.000644 | 0.000189 | 0.000752 | 0.000168 | 0.001677 | 0.496424 | 0.009544 | 0.600530 | 0.005051 | 1.803126 | 0.996380 |
0 | Self_IKNN | 1.018363 | 0.808793 | 0.000318 | 0.000108 | 0.000140 | 0.000189 | 0.000000 | 0.000000 | 0.000214 | 0.000037 | 0.000368 | 0.496391 | 0.003181 | 0.392153 | 0.115440 | 4.174741 | 0.965327 |