Preparing the dataset
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
# Let's prepare the dataset
train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)
train_and_test['user_code'] = train_and_test['user'].astype("category").cat.codes
train_and_test['item_code'] = train_and_test['item'].astype("category").cat.codes
user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
user_id_code = dict((v, k) for k, v in user_code_id.items())
item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
item_id_code = dict((v, k) for k, v in item_code_id.items())
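These four dictionaries translate between raw ids and the contiguous codes used as matrix indices; a quick round-trip check (a sketch; the variable name is ours):
uid = train_and_test['user'].iloc[0]           # some raw user id (here: 664)
assert user_code_id[user_id_code[uid]] == uid  # id -> code -> id round-trips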
train_and_test[:5]
|   | user | item | rating | timestamp | user_code | item_code |
|---|------|------|--------|-----------|-----------|-----------|
| 0 | 664 | 525 | 4 | 876526580 | 663 | 524 |
| 1 | 49 | 1 | 2 | 888068651 | 48 | 0 |
| 2 | 352 | 273 | 2 | 884290328 | 351 | 272 |
| 3 | 618 | 96 | 3 | 891307749 | 617 | 95 |
| 4 | 560 | 24 | 2 | 879976772 | 559 | 23 |
train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))
test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))
# Get the number of users and items
(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)
# Create sparse csr matrices
train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))
# The steps above are the same for many algorithms, so the code is kept in a separate file:
import helpers
train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)
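helpers.data_to_csr is assumed to wrap exactly the steps shown above; a minimal sketch of what such a helper could look like (the actual helpers.py may differ):
def data_to_csr(train_read, test_read):
    train_read.columns = ['user', 'item', 'rating', 'timestamp']
    test_read.columns = ['user', 'item', 'rating', 'timestamp']
    train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)
    # encode raw ids as contiguous codes usable as matrix indices
    train_and_test['user_code'] = train_and_test['user'].astype('category').cat.codes
    train_and_test['item_code'] = train_and_test['item'].astype('category').cat.codes
    user_code_id = dict(enumerate(train_and_test['user'].astype('category').cat.categories))
    user_id_code = {v: k for k, v in user_code_id.items()}
    item_code_id = dict(enumerate(train_and_test['item'].astype('category').cat.categories))
    item_id_code = {v: k for k, v in item_code_id.items()}
    train_df = pd.merge(train_read, train_and_test, on=list(train_read.columns))
    test_df = pd.merge(test_read, train_and_test, on=list(train_read.columns))
    (U, I) = (train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)
    train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
    test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))
    return train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code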
CSR matrices - what are they?
row = np.array([0, 0, 0, 1, 1, 2, 2, 2])
col = np.array([0, 1, 2, 1, 3, 2, 0, 3])
data = np.array([4, 1, 3, 2, 1, 5, 2, 4])
sample_csr=sparse.csr_matrix((data, (row, col)))
sample_csr
<3x4 sparse matrix of type '<class 'numpy.int64'>' with 8 stored elements in Compressed Sparse Row format>
print('Ratings matrix with missing entries replaced by zeros:')
display(sample_csr.todense())
print(f'Number of ratings: {sample_csr.nnz}')
print(f'Number of users: {sample_csr.shape[0]}')
print(f'Number of items: {sample_csr.shape[1]}')
Ratings matrix with missing entries replaced by zeros:
matrix([[4, 1, 3, 0],
        [0, 2, 0, 1],
        [2, 0, 5, 4]])
Number of ratings: 8
Number of users: 3
Number of items: 4
print('Ratings data:', sample_csr.data)
print('Regarding items:', sample_csr.indices)
for i in range(sample_csr.shape[0]):
    print(f'Ratings from positions {sample_csr.indptr[i]} to {sample_csr.indptr[i+1]-1} belong to user {i}.')
Ratings data: [4 1 3 2 1 2 5 4]
Regarding items: [0 1 2 1 3 0 2 3]
Ratings from positions 0 to 2 belong to user 0.
Ratings from positions 3 to 4 belong to user 1.
Ratings from positions 5 to 7 belong to user 2.
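To see how data, indices and indptr fit together, the dense matrix can be rebuilt by hand (a sketch):
rebuilt = np.zeros(sample_csr.shape, dtype=sample_csr.dtype)
for u in range(sample_csr.shape[0]):
    # positions indptr[u]:indptr[u+1] hold user u's ratings and their column indices
    for pos in range(sample_csr.indptr[u], sample_csr.indptr[u+1]):
        rebuilt[u, sample_csr.indices[pos]] = sample_csr.data[pos]
assert np.array_equal(rebuilt, sample_csr.toarray())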
user=123
print('Efficient way to access items rated by user:')
display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])
%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]
print('Inefficient way to access items rated by user:')
display(train_ui[user].indices)
%timeit train_ui[user].indices
Efficient way to access items rated by user:
array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167, 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)
1.13 µs ± 79.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
Inefficient way to access items rated by user:
array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167, 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)
149 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
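The second version is slow because train_ui[user] builds a whole new one-row sparse matrix. For readability, the fast slice can be wrapped in a small helper (the function name below is ours):
def items_rated_by(ui_csr, u):
    # items rated by user u, read directly from the CSR index arrays (no new matrix is built)
    return ui_csr.indices[ui_csr.indptr[u]:ui_csr.indptr[u+1]]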
Example: subtracting row means
print('Our matrix:')
display(sample_csr.todense())
print('List of row sums:')
sample_csr.sum(axis=1).ravel()
Our matrix:
matrix([[4, 1, 3, 0],
        [0, 2, 0, 1],
        [2, 0, 5, 4]])
List of row sums:
matrix([[ 8, 3, 11]])
print('Array with row means:')
row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)
display(row_means)
print('Diagonal sparse matrix with row means on the diagonal:')
display(sparse.diags(row_means).todense())
print("""Let's apply them in nonzero entries:""")
to_subtract=sparse.diags(row_means)*(sample_csr>0)
display(to_subtract.todense())
print("Finally after subtraction:")
sample_csr-to_subtract.todense()
Array with row means:
array([2.66666667, 1.5 , 3.66666667])
Diagonal sparse matrix with row means on the diagonal:
matrix([[2.66666667, 0.        , 0.        ],
        [0.        , 1.5       , 0.        ],
        [0.        , 0.        , 3.66666667]])
Let's put the row means at the nonzero entries:
matrix([[2.66666667, 2.66666667, 2.66666667, 0.        ],
        [0.        , 1.5       , 0.        , 1.5       ],
        [3.66666667, 0.        , 3.66666667, 3.66666667]])
Finally after subtraction:
matrix([[ 1.33333333, -1.66666667,  0.33333333,  0.        ],
        [ 0.        ,  0.5       ,  0.        , -0.5       ],
        [-1.66666667,  0.        ,  1.33333333,  0.33333333]])
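Since CSR stores .data row by row, the same centering can also be done in place, without building the to_subtract matrix; a sketch using the row_means computed above:
centered = sample_csr.astype(float)
# repeat each row's mean once per nonzero entry in that row, then subtract
centered.data -= np.repeat(row_means, np.diff(centered.indptr))
display(centered.todense())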
Transposing
import numpy as np
from scipy import sparse
row = np.array([0, 0, 0, 1, 1, 2, 2, 2])
col = np.array([0, 1, 2, 1, 3, 2, 0, 3])
data = np.array([4, 1, 3, 2, 1, 5, 2, 4])
sample=sparse.csr_matrix((data, (row, col)))
print('Sample matrix: \n', sample.A)
print('\nIndices: \n', sample.indices)
transposed=sample.transpose()
print('\nTransposed matrix: \n', transposed.A)
print('\nIndices of transposed matrix: \n', transposed.indices)
print('\nReason: ', type(transposed))
print('\nAfter converting to csr: \n', transposed.tocsr().indices)
Sample matrix:
 [[4 1 3 0]
 [0 2 0 1]
 [2 0 5 4]]

Indices:
 [0 1 2 1 3 0 2 3]

Transposed matrix:
 [[4 0 2]
 [1 2 0]
 [3 0 5]
 [0 1 4]]

Indices of transposed matrix:
 [0 1 2 1 3 0 2 3]

Reason:  <class 'scipy.sparse.csc.csc_matrix'>

After converting to csr:
 [0 2 0 1 0 2 1 2]
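In short: transpose() only reinterprets the CSR arrays as CSC, so .indices is left untouched; tocsr() actually rebuilds them. A quick sanity check (a sketch):
# the values agree regardless of the storage format used for the transpose
assert (transposed.tocsr().toarray() == sample.toarray().T).all()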
Self-made top popular
import os
if not os.path.exists('./Recommendations generated/'):
    os.mkdir('./Recommendations generated/')
    os.mkdir('./Recommendations generated/ml-100k/')
    os.mkdir('./Recommendations generated/toy-example/')
top_pop = []
train_iu = train_ui.transpose().tocsr()
scaling_factor = train_ui.max()/max(np.diff(train_iu.indptr))
for i in range(train_iu.shape[0]):
    top_pop.append((i, (train_iu.indptr[i+1]-train_iu.indptr[i])*scaling_factor))
top_pop.sort(key=lambda x: x[1], reverse=True)
# top_pop is a list of pairs (item, rescaled_popularity), sorted descending from the most popular
k = 10
result = []
for u in range(train_ui.shape[0]):
    user_rated = train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]
    rec_user = []
    item_pos = 0
    while len(rec_user) < k:
        if top_pop[item_pos][0] not in user_rated:
            rec_user.append((item_code_id[top_pop[item_pos][0]], top_pop[item_pos][1]))
        item_pos += 1
    result.append([user_code_id[u]]+list(chain(*rec_user)))
(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)
# estimations - the score is a bit artificial, since this method is designed for ranking rather than scoring
estimations=[]
for user, item in zip(*test_ui.nonzero()):
    estimations.append([user_code_id[user], item_code_id[item],
                        (train_iu.indptr[item+1]-train_iu.indptr[item])*scaling_factor])
(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)
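The popularity loop above can also be vectorized: the rating count of each item is just the difference of consecutive indptr entries. A sketch under the same definitions:
pop = np.diff(train_iu.indptr) * scaling_factor  # rescaled popularity per item code
# items ordered from the most to the least popular, matching top_pop
top_pop_vectorized = sorted(enumerate(pop), key=lambda x: x[1], reverse=True)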
Self-made top rated
top_rated = []
global_avg = sum(train_iu.data)/train_ui.nnz
for i in range(train_iu.shape[0]):
    ratings = train_iu.data[train_iu.indptr[i]:train_iu.indptr[i+1]]
    avg = np.mean(ratings) if len(ratings) > 0 else global_avg
    top_rated.append((i, avg))
top_rated.sort(key=lambda x: x[1], reverse=True)
k=10
result=[]
for u in range(train_ui.shape[0]):
    user_rated = train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]]
    rec_user = []
    item_pos = 0
    while len(rec_user) < k:
        if top_rated[item_pos][0] not in user_rated:
            rec_user.append((item_code_id[top_rated[item_pos][0]], top_rated[item_pos][1]))
        item_pos += 1
    result.append([user_code_id[u]]+list(chain(*rec_user)))
(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)
estimations=[]
d = dict(top_rated)
for user, item in zip(*test_ui.nonzero()):
    estimations.append([user_code_id[user], item_code_id[item], d[item]])
(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)
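As with popularity, the per-item averages can be computed without the explicit loop; a sketch reusing global_avg for items with no ratings:
item_counts = np.diff(train_iu.indptr)                # number of ratings per item
item_sums = np.asarray(train_iu.sum(axis=1)).ravel()  # sum of ratings per item
item_means = np.divide(item_sums, item_counts,
                       out=np.full(train_iu.shape[0], global_avg),
                       where=item_counts > 0)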
pd.DataFrame(result)[:2]
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
|---|---|---|---|---|---|---|---|---|---|---|-----|----|----|----|----|----|----|----|----|----|----|
| 0 | 1 | 814 | 5.0 | 1122 | 5.0 | 1189 | 5.0 | 1201 | 5.0 | 1293 | ... | 1306 | 5.0 | 1467 | 5.0 | 1491 | 5.0 | 1500 | 5.0 | 1536 | 5.0 |
| 1 | 2 | 119 | 5.0 | 814 | 5.0 | 1122 | 5.0 | 1189 | 5.0 | 1201 | ... | 1293 | 5.0 | 1306 | 5.0 | 1467 | 5.0 | 1491 | 5.0 | 1500 | 5.0 |

2 rows × 21 columns
Self-made baseline
class selfBaselineUI():
    def fit(self, train_ui):
        self.train_ui = train_ui.copy()
        self.train_iu = train_ui.transpose().tocsr()
        result = self.train_ui.copy()
        self.row_means = np.asarray(result.sum(axis=1).ravel())[0]/np.diff(result.indptr)
        # in csr format, 0 entries "disappear" after addition or multiplication - so some workarounds are needed
        # (another option is to define addition/multiplication in the desired way)
        row_means = self.row_means.copy()
        max_row_mean = np.max(row_means)
        row_means[row_means == 0] = max_row_mean+1
        to_subtract_rows = sparse.diags(row_means)*(result.power(0))
        to_subtract_rows.sort_indices()  # needed to have valid .data
        subtract = to_subtract_rows.data
        subtract[subtract == max_row_mean+1] = 0
        result.data = result.data-subtract
        # we can't do result = train_ui - to_subtract_rows, since 0 entries would then "disappear" in csr format
        self.col_means = np.divide(np.asarray(result.sum(axis=0).ravel())[0], np.diff(self.train_iu.indptr),
                                   out=np.zeros(self.train_iu.shape[0]), where=np.diff(self.train_iu.indptr) != 0)  # handling items without ratings
        # again - it is possible that some mean will be zero, so let's use the same workaround
        col_means = self.col_means.copy()
        max_col_mean = np.max(col_means)
        col_means[col_means == 0] = max_col_mean+1
        to_subtract_cols = result.power(0)*sparse.diags(col_means)
        to_subtract_cols.sort_indices()  # needed to have valid .data
        subtract = to_subtract_cols.data
        subtract[subtract == max_col_mean+1] = 0
        result.data = result.data-subtract
        return result

    def recommend(self, user_code_id, item_code_id, topK=10):
        estimations = np.tile(self.row_means[:, None], [1, self.train_ui.shape[1]]) + np.tile(self.col_means, [self.train_ui.shape[0], 1])
        top_k = defaultdict(list)
        for nb_user, user in enumerate(estimations):
            user_rated = self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]]
            for item, score in enumerate(user):
                if item not in user_rated:
                    top_k[user_code_id[nb_user]].append((item_code_id[item], score))
        result = []
        # Let's choose the topK best items in the format: (user, item1, score1, item2, score2, ...)
        for uid, item_scores in top_k.items():
            item_scores.sort(key=lambda x: x[1], reverse=True)
            result.append([uid]+list(chain(*item_scores[:topK])))
        return result

    def estimate(self, user_code_id, item_code_id, test_ui):
        result = []
        for user, item in zip(*test_ui.nonzero()):
            result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])
        return result
toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \
toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)
print('Training data:')
display(toy_train_ui.todense())
model=selfBaselineUI()
print('After subtracting rows and columns:')
display(model.fit(toy_train_ui).todense())
print('Recommend best unseen item:')
display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))
print('Estimations on unseen items:')
estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))
estimations.columns=['user', 'item', 'est_score']
display(estimations)
top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))
top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)
estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))
estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)
Training data:
matrix([[3, 4, 0, 0, 5, 0, 0, 4],
        [0, 1, 2, 3, 0, 0, 0, 0],
        [0, 0, 0, 5, 0, 3, 4, 0]])
After subtracting rows and columns:
matrix([[ 0. ,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
        [ 0. , -0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ]])
Recommend best unseen item:
[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]
Estimations on unseen items:
|   | user | item | est_score |
|---|------|------|-----------|
| 0 | 0 | 60 | 4.0 |
| 1 | 10 | 40 | 3.0 |
| 2 | 20 | 0 | 3.0 |
| 3 | 20 | 20 | 4.0 |
| 4 | 20 | 70 | 4.0 |
model=selfBaselineUI()
model.fit(train_ui)
top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))
top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)
estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)
Project task 1: implement a self-made BaselineIU
Implement a recommender system that recommends movies the user hasn't seen. It should be analogous to BaselineUI, but subtract the column means first and the row means second.
The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv' and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'.
Additional clarification:
Summarizing: the predicted rating of user u for item i should equal b_u + b_i. The procedure to obtain b_u and b_i is the following (a tiny worked example follows the list):
- We start from the original user-item ratings matrix M.
- For each column representing item i, we compute the mean of its ratings and denote it by b_i. From each rating in M we subtract the corresponding column mean (b_i) to obtain a new matrix M'.
- For each row of M' representing user u, we compute the mean of its ratings and denote it by b_u.
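As announced above, a tiny worked example of this order of operations (a sketch, not the solution to the task):
# M = [[4, .],        (. denotes a missing rating)
#      [2, 4]]
# b_i = column means over observed ratings: b_1 = (4+2)/2 = 3, b_2 = 4/1 = 4
# M' = M - b_i at observed entries: [[ 1, .],
#                                    [-1, 0]]
# b_u = row means of M': b_u1 = 1/1 = 1, b_u2 = (-1+0)/2 = -0.5
# predicted rating of user 1 for item 2: b_u1 + b_2 = 1 + 4 = 5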
Ready-made baseline - Surprise implementation
import surprise as sp
import time
# Based on surprise.readthedocs.io
def get_top_n(predictions, n=10):
    # Here we create a dictionary whose values are lists of pairs (item, score)
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    result = []
    # Let's choose the n best items in the format: (user, item1, score1, item2, score2, ...)
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        result.append([uid]+list(chain(*user_ratings[:n])))
    return result
reader = sp.Reader(line_format='user item rating timestamp', sep='\t')
trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
trainset = trainset.build_full_trainset()  # <class 'surprise.trainset.Trainset'> - needed to use the Surprise package
testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
testset = sp.Trainset.build_testset(testset.build_full_trainset())
algo = sp.BaselineOnly()
# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})
# observe how bad the results of the above algorithm are
# more details: http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1
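# the default method is ALS; its hyperparameters can be passed the same way - an illustrative sketch
# (the values below are Surprise's documented defaults for the ALS baseline):
# algo = sp.BaselineOnly(bsl_options={'method': 'als', 'n_epochs': 10, 'reg_u': 15, 'reg_i': 10})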
algo.fit(trainset)
antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set
predictions = algo.test(antitrainset)
top_n = get_top_n(predictions, n=10)
top_n=pd.DataFrame(top_n)
top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)
Estimating biases using als...
# Compute RMSE on the test set using built-in functions
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)
# Let's also save the results to a file
predictions_df=[]
for uid, iid, true_r, est, _ in predictions:
    predictions_df.append([uid, iid, est])
predictions_df=pd.DataFrame(predictions_df)
predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)
sp.accuracy.mae(predictions, verbose=True)
RMSE: 0.9495
MAE: 0.7525
0.7524871012820799
Let's compare with random
# in Surprise, "random" is an algorithm predicting a random value drawn from a normal distribution estimated from the train set
algo = sp.NormalPredictor()
algo.fit(trainset)
antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set
predictions = algo.test(antitrainset)
top_n = get_top_n(predictions, n=10)
top_n=pd.DataFrame(top_n)
top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)
# Compute RMSE on the test set using built-in functions
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)
# Let's also save the results to a file
predictions_df=[]
for uid, iid, true_r, est, _ in predictions:
    predictions_df.append([uid, iid, est])
predictions_df=pd.DataFrame(predictions_df)
predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)
sp.accuracy.mae(predictions, verbose=True)
RMSE: 1.5239
MAE: 1.2268
1.2267993503843746