37 KiB
37 KiB
Preparing dataset
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random
# Both splits share the same tab-separated layout: user, item, rating, timestamp.
train_read, test_read = (
    pd.read_csv(path, sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
    for path in ('./Datasets/ml-100k/train.csv', './Datasets/ml-100k/test.csv')
)

# Codes are assigned on the union of both splits, so a user/item gets the same code everywhere.
train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)
train_and_test = train_and_test.assign(
    user_code=train_and_test['user'].astype("category").cat.codes,
    item_code=train_and_test['item'].astype("category").cat.codes,
)

# Two-way lookups between consecutive codes and the original ids.
user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
user_id_code = {user_id: code for code, user_id in user_code_id.items()}
item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
item_id_code = {item_id: code for code, item_id in item_code_id.items()}

train_and_test.head(5)
user | item | rating | timestamp | user_code | item_code | |
---|---|---|---|---|---|---|
0 | 664 | 525 | 4 | 876526580 | 663 | 524 |
1 | 49 | 1 | 2 | 888068651 | 48 | 0 |
2 | 352 | 273 | 2 | 884290328 | 351 | 272 |
3 | 618 | 96 | 3 | 891307749 | 617 | 95 |
4 | 560 | 24 | 2 | 879976772 | 559 | 23 |
# Attach the user/item codes to each split by joining on the four raw columns.
train_df = train_read.merge(train_and_test, on=list(train_read.columns))
test_df = test_read.merge(train_and_test, on=list(train_read.columns))

# Number of users and items (codes start at 0, hence the +1).
U = train_and_test['user_code'].max() + 1
I = train_and_test['item_code'].max() + 1

# Sparse user x item rating matrices; both share the same (U, I) shape.
train_ui = sparse.csr_matrix(
    (train_df['rating'], (train_df['user_code'], train_df['item_code'])),
    shape=(U, I),
)
test_ui = sparse.csr_matrix(
    (test_df['rating'], (test_df['user_code'], test_df['item_code'])),
    shape=(U, I),
)
# The steps above are the same for many algorithms, so the code was moved to a separate file:
import helpers
# Re-read the raw splits (no column names this time) and let the helper module
# build the matrices and the code <-> id lookups in one call.
train_read = pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read = pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
(
    train_ui, test_ui,
    user_code_id, user_id_code,
    item_code_id, item_id_code,
) = helpers.data_to_csr(train_read, test_read)
CSR matrices - what are they?
# Coordinate triplets of a tiny ratings matrix: row[i], col[i] -> data[i].
row, col, data = (
    np.array(values)
    for values in (
        [0, 0, 0, 1, 1, 2, 2, 2],
        [0, 1, 2, 1, 3, 2, 0, 3],
        [4, 1, 3, 2, 1, 5, 2, 4],
    )
)
sample_csr = sparse.csr_matrix((data, (row, col)))
sample_csr
<3x4 sparse matrix of type '<class 'numpy.int64'>' with 8 stored elements in Compressed Sparse Row format>
# Dense view of the sample matrix followed by its basic dimensions.
print('Ratings matrix with missing entries replaced by zeros:')
display(sample_csr.todense())

print(
    f'Number of ratings: {sample_csr.nnz}',
    f'Number of users: {sample_csr.shape[0]}',
    f'Number of items: {sample_csr.shape[1]}',
    sep='\n',
)
Ratings matrix with missing entries replaced by zeros:
matrix([[4, 1, 3, 0], [0, 2, 0, 1], [2, 0, 5, 4]])
Number of ratings: 8 Number of users: 3 Number of items: 4
# The three arrays behind a CSR matrix: values, column indices and row pointers.
print('Ratings data:', sample_csr.data)

print('Regarding items:', sample_csr.indices)

# Consecutive indptr entries delimit the slice of data/indices owned by each row.
for i, (start, stop) in enumerate(zip(sample_csr.indptr[:-1], sample_csr.indptr[1:])):
    print(f'Where ratings from {start} to {stop-1} belongs to user {i}.')
Ratings data: [4 1 3 2 1 2 5 4] Regarding items: [0 1 2 1 3 0 2 3] Where ratings from 0 to 2 belongs to user 0. Where ratings from 3 to 4 belongs to user 1. Where ratings from 5 to 7 belongs to user 2.
# Compare two ways of listing the item codes rated by one user (row of train_ui).
user=123
# Fast path: slice the CSR `indices` array directly with the row pointers from `indptr`.
print('Efficient way to access items rated by user:')
display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])
%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]
# Slow path: index the matrix by row first and read `.indices` of that result;
# the timings recorded in the output below show it is roughly 100x slower.
print('Inefficient way to access items rated by user:')
display(train_ui[user].indices)
%timeit train_ui[user].indices
Efficient way to access items rated by user:
array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167, 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)
1.13 µs ± 79.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each) Inefficient way to access items rated by user:
array([ 0, 6, 10, 27, 49, 78, 95, 97, 116, 143, 153, 156, 167, 171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)
149 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Example: subtracting row means
# Show the sample matrix and the sum of each of its rows as a single row vector.
print('Our matrix:')
display(sample_csr.todense())

print('List of row sums:')
sample_csr.sum(axis=1).T
Our matrix:
matrix([[4, 1, 3, 0], [0, 2, 0, 1], [2, 0, 5, 4]])
List of row sums:
matrix([[ 8, 3, 11]])
# Subtract each row's mean from that row's stored (rated) entries only.

# Row mean = row sum / number of stored entries in the row (taken from indptr).
print('Array with row means:')
row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)
display(row_means)

# The label used to read "inverse of row sums", but the diagonal holds the row means
# and sparse.diags does not build a CSR matrix - the text now matches what is displayed.
print('Diagonal matrix with row means on diagonal:')
display(sparse.diags(row_means).todense())

# (sample_csr>0) marks the rated cells; multiplying by the diagonal matrix
# puts the row mean into every marked cell of that row.
print("""Let's apply them in nonzero entries:""")
to_subtract=sparse.diags(row_means)*(sample_csr>0)
display(to_subtract.todense())

print("Finally after subtraction:")
sample_csr-to_subtract.todense()
Array with row means:
array([2.66666667, 1.5 , 3.66666667])
Diagonal csr matrix with inverse of row sums on diagonal:
matrix([[2.66666667, 0. , 0. ], [0. , 1.5 , 0. ], [0. , 0. , 3.66666667]])
Let's apply them in nonzero entries:
matrix([[2.66666667, 2.66666667, 2.66666667, 0. ], [0. , 1.5 , 0. , 1.5 ], [3.66666667, 0. , 3.66666667, 3.66666667]])
Finally after subtraction:
matrix([[ 1.33333333, -1.66666667, 0.33333333, 0. ], [ 0. , 0.5 , 0. , -0.5 ], [-1.66666667, 0. , 1.33333333, 0.33333333]])
Transposing
import numpy as np
from scipy import sparse
# Demonstrates that transposing a CSR matrix yields a CSC matrix: the `indices`
# array is reused as-is and only gets rearranged after an explicit tocsr().
row = np.array([0, 0, 0, 1, 1, 2, 2, 2])
col = np.array([0, 1, 2, 1, 3, 2, 0, 3])
data = np.array([4, 1, 3, 2, 1, 5, 2, 4])
sample = sparse.csr_matrix((data, (row, col)))

# toarray() is used instead of the `.A` shorthand, which is deprecated/removed
# in recent SciPy releases; the printed arrays are the same.
print('Sample matrix: \n', sample.toarray())
print('\nIndices: \n', sample.indices)

transposed = sample.transpose()
print('\nTransposed matrix: \n', transposed.toarray())
print('\nIndices of transposed matrix: \n', transposed.indices)

print('\nReason: ', type(transposed))

print('\nAfter converting to csr: \n', transposed.tocsr().indices)
Sample matrix: [[4 1 3 0] [0 2 0 1] [2 0 5 4]] Indices: [0 1 2 1 3 0 2 3] Transposed matrix: [[4 0 2] [1 2 0] [3 0 5] [0 1 4]] Indices of transposed matrix: [0 1 2 1 3 0 2 3] Reason: <class 'scipy.sparse.csc.csc_matrix'> After converting to csr: [0 2 0 1 0 2 1 2]
Self made top popular
import os
# Create the output folders for generated recommendations.
# makedirs(..., exist_ok=True) creates missing parents, is safe to re-run, and also
# repairs a partially created tree (parent present but a subfolder missing), which the
# previous single existence check on the parent folder could not do.
os.makedirs('./Recommendations generated/ml-100k/', exist_ok=True)
os.makedirs('./Recommendations generated/toy-example/', exist_ok=True)
# Item x user view of the training data: row i holds the ratings received by item i.
train_iu = train_ui.transpose().tocsr()

# Rescale popularity (number of ratings) so that the most rated item gets the maximal rating value.
scaling_factor = train_ui.max()/max(np.diff(train_iu.indptr))

# top_pop is a list of pairs (item, rescaled_popularity) sorted descending from the most popular
top_pop = sorted(
    (
        (i, (train_iu.indptr[i+1]-train_iu.indptr[i])*scaling_factor)
        for i in range(train_iu.shape[0])
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
# Recommend to every user the k most popular items the user has not rated yet.
# Output row format: (user, item1, score1, item2, score2, ...)
k = 10

result = []
for u in range(train_ui.shape[0]):
    # Set membership is O(1); the codes are converted to plain ints first.
    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())
    rec_user = []
    item_pos = 0
    # Honour k (it used to be hard-coded as 10 here) and stop at the end of the ranking,
    # so a user with fewer than k unseen items gets a shorter row instead of an IndexError.
    while len(rec_user) < k and item_pos < len(top_pop):
        item_code, item_score = top_pop[item_pos]
        if item_code not in user_rated:
            rec_user.append((item_code_id[item_code], item_score))
        item_pos += 1
    result.append([user_code_id[u]] + list(chain(*rec_user)))

(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)
# estimations - the score is a bit artificial since this method is designed for ranking, not for scoring:
# every (user, item) pair present in the test set gets the item's rescaled popularity.
estimations = [
    [
        user_code_id[user],
        item_code_id[item],
        (train_iu.indptr[item+1]-train_iu.indptr[item])*scaling_factor,
    ]
    for user, item in zip(*test_ui.nonzero())
]

pd.DataFrame(estimations).to_csv(
    'Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False
)
Self made top rated
# Average training rating per item; items without any rating fall back to the global average.
global_avg = sum(train_iu.data)/train_ui.nnz

top_rated = []
for i in range(train_iu.shape[0]):
    first, last = train_iu.indptr[i], train_iu.indptr[i+1]
    ratings = train_iu.data[first:last]
    if len(ratings) > 0:
        avg = np.mean(ratings)
    else:
        avg = global_avg
    top_rated.append((i, avg))

# Pairs (item, average_rating) ordered from the best rated item downwards.
top_rated = sorted(top_rated, key=lambda pair: pair[1], reverse=True)
# Recommend to every user the k best rated items the user has not rated yet.
# Output row format: (user, item1, score1, item2, score2, ...)
k = 10

result = []
for u in range(train_ui.shape[0]):
    # Set membership is O(1); the codes are converted to plain ints first.
    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())
    rec_user = []
    item_pos = 0
    # Honour k (it used to be hard-coded as 10 here) and stop at the end of the ranking,
    # so a user with fewer than k unseen items gets a shorter row instead of an IndexError.
    while len(rec_user) < k and item_pos < len(top_rated):
        item_code, item_score = top_rated[item_pos]
        if item_code not in user_rated:
            rec_user.append((item_code_id[item_code], item_score))
        item_pos += 1
    result.append([user_code_id[u]] + list(chain(*rec_user)))

(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)
# Estimated score of a test pair = average training rating of the item.
d = dict(top_rated)  # item code -> average rating
estimations = [
    [user_code_id[user], item_code_id[item], d[item]]
    for user, item in zip(*test_ui.nonzero())
]

pd.DataFrame(estimations).to_csv(
    'Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False
)

# Peek at the first two recommendation rows.
pd.DataFrame(result).head(2)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 814 | 5.0 | 1122 | 5.0 | 1189 | 5.0 | 1201 | 5.0 | 1293 | ... | 1306 | 5.0 | 1467 | 5.0 | 1491 | 5.0 | 1500 | 5.0 | 1536 | 5.0 |
1 | 2 | 119 | 5.0 | 814 | 5.0 | 1122 | 5.0 | 1189 | 5.0 | 1201 | ... | 1293 | 5.0 | 1306 | 5.0 | 1467 | 5.0 | 1491 | 5.0 | 1500 | 5.0 |
2 rows × 21 columns
Self-made baseline
class selfBaselineUI():
    """Baseline recommender: estimate(user, item) = user mean + item mean.

    The user (row) mean is the average of the user's stored ratings. The item
    (column) mean is the average of the item's ratings after the user means
    have been subtracted.
    """

    def fit(self, train_ui):
        """Compute row and column means from a CSR user x item matrix.

        Sets `self.train_ui`, `self.train_iu`, `self.row_means` and `self.col_means`.

        Returns:
            CSR matrix with the same stored entries as `train_ui`, holding the
            residuals left after subtracting row means and then column means.
        """
        self.train_ui = train_ui.copy()
        self.train_iu = train_ui.transpose().tocsr()

        result = self.train_ui.copy()
        ratings_per_user = np.diff(result.indptr)
        ratings_per_item = np.diff(self.train_iu.indptr)

        # Users without ratings get mean 0 instead of NaN from 0/0
        # (the same guard the column means already had).
        row_sums = np.asarray(result.sum(axis=1)).ravel()
        self.row_means = np.divide(row_sums, ratings_per_user,
                                   out=np.zeros(result.shape[0]), where=ratings_per_user != 0)

        # Work directly on .data so that entries which become 0 stay stored.
        # np.repeat expands each row mean to the number of entries stored in that row, which is
        # exactly the layout of .data - no sentinel values or power(0) tricks are needed
        # (some SciPy versions are believed to reject power(0) - TODO confirm).
        result.data = result.data - np.repeat(self.row_means, ratings_per_user)

        # Items without ratings get mean 0.
        col_sums = np.asarray(result.sum(axis=0)).ravel()
        self.col_means = np.divide(col_sums, ratings_per_item,
                                   out=np.zeros(result.shape[1]), where=ratings_per_item != 0)

        # .indices holds the column of every stored entry, in the same order as .data.
        result.data = result.data - self.col_means[result.indices]
        return result

    def recommend(self, user_code_id, item_code_id, topK=10):
        """Return the topK best scored unrated items for every user.

        Args:
            user_code_id: mapping from row number to user id.
            item_code_id: mapping from column number to item id.
            topK: maximal number of recommended items per user.

        Returns:
            List of rows [user, item1, score1, item2, score2, ...]; users who
            rated every item are skipped. Ties keep ascending column order.
        """
        result = []
        for nb_user in range(self.train_ui.shape[0]):
            start, end = self.train_ui.indptr[nb_user], self.train_ui.indptr[nb_user + 1]
            user_rated = set(self.train_ui.indices[start:end].tolist())
            # One row of scores at a time - avoids materialising a dense users x items array.
            scores = self.row_means[nb_user] + self.col_means
            item_scores = [(item_code_id[item], score)
                           for item, score in enumerate(scores)
                           if item not in user_rated]
            if not item_scores:
                continue
            item_scores.sort(key=lambda x: x[1], reverse=True)
            result.append([user_code_id[nb_user]] + list(chain(*item_scores[:topK])))
        return result

    def estimate(self, user_code_id, item_code_id, test_ui):
        """Return [user, item, estimated_score] for every nonzero entry of `test_ui`."""
        return [
            [user_code_id[user], item_code_id[item], self.row_means[user] + self.col_means[item]]
            for user, item in zip(*test_ui.nonzero())
        ]
# Run the baseline on the small toy dataset, where every step can be checked by hand.
toy_train_read = pd.read_csv(
    './Datasets/toy-example/train.csv', sep='\t', header=None,
    names=['user', 'item', 'rating', 'timestamp'],
)
toy_test_read = pd.read_csv(
    './Datasets/toy-example/test.csv', sep='\t', header=None,
    names=['user', 'item', 'rating', 'timestamp'],
)

(
    toy_train_ui, toy_test_ui,
    toy_user_code_id, toy_user_id_code,
    toy_item_code_id, toy_item_id_code,
) = helpers.data_to_csr(toy_train_read, toy_test_read)

print('Training data:')
display(toy_train_ui.todense())

# fit() returns what is left of the ratings after removing row and column means.
model = selfBaselineUI()
print('After subtracting rows and columns:')
display(model.fit(toy_train_ui).todense())

print('Recommend best unseen item:')
display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))

print('Print estimations on unseen items:')
estimations = pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))
estimations.columns = ['user', 'item', 'est_score']
display(estimations)

# Store the recommendations and the estimations (without headers) for later evaluation.
top_n = pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))
top_n.to_csv(
    'Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False
)

estimations = pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))
estimations.to_csv(
    'Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False
)
Training data:
matrix([[3, 4, 0, 0, 5, 0, 0, 4], [0, 1, 2, 3, 0, 0, 0, 0], [0, 0, 0, 5, 0, 3, 4, 0]])
After subtracting rows and columns:
matrix([[ 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , -0.5, 0. , 0. , 0. , 0. , 0. , 0. ], [ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])
Recommend best unseen item:
[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]
Print estimations on unseen items:
user | item | est_score | |
---|---|---|---|
0 | 0 | 60 | 4.0 |
1 | 10 | 40 | 3.0 |
2 | 20 | 0 | 3.0 |
3 | 20 | 20 | 4.0 |
4 | 20 | 70 | 4.0 |
# Fit the self-made baseline on ml-100k and store its outputs for later evaluation.
model = selfBaselineUI()
model.fit(train_ui)

# Ten best unseen items per user.
top_n = pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))
top_n.to_csv(
    'Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False
)

# Estimated scores for the (user, item) pairs of the test set.
estimations = pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv(
    'Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False
)
project task 1: implement self-made BaselineIU
# Implement a recommender system, analogous to BaselineUI, that recommends movies the user hasn't seen yet,
# but subtracts the column means first and the row means second.
# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'
# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'
Ready-made baseline - Surprise implementation
import surprise as sp
import time
# Based on surprise.readthedocs.io
def get_top_n(predictions, n=10):
    """Pick the n best scored items for every user.

    Args:
        predictions: iterable of 5-tuples (uid, iid, true_r, est, details).
        n: maximal number of items kept per user.

    Returns:
        List of rows [user, item1, score1, item2, score2, ...], one per user,
        in order of first appearance; items are sorted by descending estimate.
    """
    # user -> list of (item, estimated score) pairs
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    rows = []
    for uid, scored_items in per_user.items():
        best = sorted(scored_items, key=lambda pair: pair[1], reverse=True)[:n]
        rows.append([uid, *chain.from_iterable(best)])
    return rows
# Surprise parses the tab-separated files itself, so describe their layout once.
reader = sp.Reader(line_format='user item rating timestamp', sep='\t')

# Algorithms are fitted on a surprise.trainset.Trainset, hence build_full_trainset().
trainset = sp.Dataset.load_from_file(
    './Datasets/ml-100k/train.csv', reader=reader
).build_full_trainset()

# The test file goes through the same Trainset step and is then turned into a test set.
testset = sp.Dataset.load_from_file(
    './Datasets/ml-100k/test.csv', reader=reader
).build_full_trainset().build_testset()

algo = sp.BaselineOnly()
# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})
# observe how bad the results of the algorithm above are
# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1
algo.fit(trainset)

# We want to predict ratings of pairs (user, item) which are not in the train set
antitrainset = trainset.build_anti_testset()
predictions = algo.test(antitrainset)

top_n = pd.DataFrame(get_top_n(predictions, n=10))
top_n.to_csv(
    'Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False
)
Estimating biases using als...
# Compute RMSE on the test set using built-in functions
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)

# Let's also save the results in a file: one (user, item, estimate) row per prediction
predictions_df = pd.DataFrame(
    [[uid, iid, est] for uid, iid, true_r, est, _ in predictions]
)
predictions_df.to_csv(
    'Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False
)

sp.accuracy.mae(predictions, verbose=True)
RMSE: 0.9495 MAE: 0.7525
0.7524871012820799
Let's compare with random
# In Surprise, "random" is an algorithm predicting a random value drawn from a normal distribution
# estimated from the train set. The predictions are stochastic, so both random generators are
# seeded first (as the Surprise FAQ recommends) to make the files and metrics below reproducible.
random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)

algo = sp.NormalPredictor()
algo.fit(trainset)

# We want to predict ratings of pairs (user, item) which are not in the train set
antitrainset = trainset.build_anti_testset()
predictions = algo.test(antitrainset)

top_n = get_top_n(predictions, n=10)
top_n = pd.DataFrame(top_n)
top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)

# Compute RMSE on the test set using built-in functions
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)

# Let's also save the results in a file: one (user, item, estimate) row per prediction
predictions_df = []
for uid, iid, true_r, est, _ in predictions:
    predictions_df.append([uid, iid, est])
predictions_df = pd.DataFrame(predictions_df)
predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)

sp.accuracy.mae(predictions, verbose=True)
RMSE: 1.5239 MAE: 1.2268
1.2267993503843746