WSS-project/P1. Baseline.ipynb
2021-04-11 16:30:13 +02:00

37 KiB
Raw Blame History

Preparing dataset

import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random

# Load the tab-separated train/test splits; the files have no header row, so
# the four column names are supplied explicitly.
train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
# Let's prepare dataset
# Both splits are concatenated so that user/item codes are assigned over the
# union of ids and stay consistent between train and test.
train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)
# Category codes map every distinct id to a 0-based integer code.
train_and_test['user_code'] = train_and_test['user'].astype("category").cat.codes
train_and_test['item_code'] = train_and_test['item'].astype("category").cat.codes

# Lookup tables in both directions: code -> original id and original id -> code.
user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
user_id_code = dict((v, k) for k, v in user_code_id.items())
item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
item_id_code = dict((v, k) for k, v in item_code_id.items())
# Preview of the first five rows (shown as the cell output).
train_and_test[:5]
user item rating timestamp user_code item_code
0 664 525 4 876526580 663 524
1 49 1 2 888068651 48 0
2 352 273 2 884290328 351 272
3 618 96 3 891307749 617 95
4 560 24 2 879976772 559 23
# Attach user_code/item_code to each split by joining on the four original columns.
# NOTE(review): a row that appears identically more than once in the concatenated
# frame would be multiplied by this join - confirm the splits contain no such duplicates.
train_df=pd.merge(train_read, train_and_test, on=list(train_read.columns))
test_df=pd.merge(test_read, train_and_test, on=list(train_read.columns))
# Take number of users and items
(U,I)=(train_and_test['user_code'].max()+1, train_and_test['item_code'].max()+1)

# Create sparse csr matrices
# Both matrices share the shape (U, I), so a code indexes the same row/column in train and test.
train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))
# Above steps are the same for many algorithms, so I put the code in separate file:
import helpers
# The splits are re-read here without column names.
# NOTE(review): presumably helpers.data_to_csr handles the columns positionally - confirm in helpers.py.
train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
# The six returned values overwrite the matrices and lookup tables built by hand above.
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)

CSR matrices - what are they?

# Toy ratings in coordinate form: data[k] is the value stored at (row[k], col[k]).
row = np.array([0, 0, 0, 1, 1, 2, 2, 2])
col = np.array([0, 1, 2, 1, 3, 2, 0, 3])
data = np.array([4, 1, 3, 2,1, 5, 2, 4])
# No shape is passed, so it is inferred from the largest row and column index.
sample_csr=sparse.csr_matrix((data, (row, col)))
sample_csr
<3x4 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>
# Dense view for illustration only: entries that are not stored are shown as zeros.
print('Ratings matrix with missing entries replaced by zeros:')
display(sample_csr.todense())

# nnz is the number of stored entries; shape is (rows, columns), labelled here as users and items.
print(f'Number of ratings: {sample_csr.nnz}')
print(f'Number of users: {sample_csr.shape[0]}')
print(f'Number of items: {sample_csr.shape[1]}')
Ratings matrix with missing entries replaced by zeros:
matrix([[4, 1, 3, 0],
        [0, 2, 0, 1],
        [2, 0, 5, 4]])
Number of ratings: 8
Number of users: 3
Number of items: 4
# data: the stored values, laid out row after row.
print('Ratings data:', sample_csr.data)

# indices: the column index of each stored value, aligned with data.
print('Regarding items:', sample_csr.indices)

# indptr[i] .. indptr[i+1]-1 are the positions in data/indices that belong to row i.
for i in range(sample_csr.shape[0]):
    print(f'Where ratings from {sample_csr.indptr[i]} to {sample_csr.indptr[i+1]-1} belongs to user {i}.')
Ratings data: [4 1 3 2 1 2 5 4]
Regarding items: [0 1 2 1 3 0 2 3]
Where ratings from 0 to 2 belongs to user 0.
Where ratings from 3 to 4 belongs to user 1.
Where ratings from 5 to 7 belongs to user 2.
user=123

# Slicing the raw indices array with indptr reads the user's items directly,
# without building a new sparse matrix.
print('Efficient way to access items rated by user:')
display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])
%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]

# Row indexing first constructs a one-row sparse matrix and only then exposes
# its indices; the timings printed below show the cost of that extra step.
print('Inefficient way to access items rated by user:')
display(train_ui[user].indices)
%timeit train_ui[user].indices
Efficient way to access items rated by user:
array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,
       171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)
1.13 µs ± 79.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
Inefficient way to access items rated by user:
array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,
       171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)
149 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Example: subtracting row means
print('Our matrix:')
display(sample_csr.todense())
print('List of row sums:')
# sum(axis=1) yields one sum per row as a column; ravel() flattens it into a single row.
sample_csr.sum(axis=1).ravel()
Our matrix:
matrix([[4, 1, 3, 0],
        [0, 2, 0, 1],
        [2, 0, 5, 4]])
List of row sums:
matrix([[ 8,  3, 11]])
print('Array with row means:')
# Row sums divided by the number of stored entries per row (np.diff of indptr),
# i.e. the mean is taken over rated items only, not over the whole row.
row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)
display(row_means)

# NOTE(review): the label below says "inverse of row sums", but the diagonal
# built here holds row_means; sparse.diags also returns DIA format by default, not CSR.
print('Diagonal csr matrix with inverse of row sums on diagonal:')
display(sparse.diags(row_means).todense())

print("""Let's apply them in nonzero entries:""")
# (sample_csr>0) is a boolean mask of the positive entries; left-multiplying by the
# diagonal matrix writes each row's mean into that row's masked positions.
to_subtract=sparse.diags(row_means)*(sample_csr>0)
display(to_subtract.todense())

print("Finally after subtraction:")
# Subtracting a dense matrix gives a dense result; unrated positions stay at 0.
sample_csr-to_subtract.todense()
Array with row means:
array([2.66666667, 1.5       , 3.66666667])
Diagonal csr matrix with inverse of row sums on diagonal:
matrix([[2.66666667, 0.        , 0.        ],
        [0.        , 1.5       , 0.        ],
        [0.        , 0.        , 3.66666667]])
Let's apply them in nonzero entries:
matrix([[2.66666667, 2.66666667, 2.66666667, 0.        ],
        [0.        , 1.5       , 0.        , 1.5       ],
        [3.66666667, 0.        , 3.66666667, 3.66666667]])
Finally after subtraction:
matrix([[ 1.33333333, -1.66666667,  0.33333333,  0.        ],
        [ 0.        ,  0.5       ,  0.        , -0.5       ],
        [-1.66666667,  0.        ,  1.33333333,  0.33333333]])
Transposing
import numpy as np
from scipy import sparse

# Toy ratings in coordinate form: data[k] is the value stored at (row[k], col[k]).
row = np.array([0, 0, 0, 1, 1, 2, 2, 2])
col = np.array([0, 1, 2, 1, 3, 2, 0, 3])
data = np.array([4, 1, 3, 2,1, 5, 2, 4])
sample=sparse.csr_matrix((data, (row, col)))
# .toarray() is the documented way to densify; the legacy `.A` shorthand is
# deprecated in SciPy and is not available on the newer sparse array classes.
print('Sample matrix: \n', sample.toarray())
print('\nIndices: \n', sample.indices)
transposed=sample.transpose()
print('\nTransposed matrix: \n', transposed.toarray())
# The indices array is unchanged by transpose() ...
print('\nIndices of transposed matrix: \n', transposed.indices)

# ... because the result is reinterpreted in another storage format instead of
# being re-laid-out; the printed type shows which one.
print('\nReason: ', type(transposed))

# Converting back to CSR rebuilds indices as per-row column positions.
print('\nAfter converting to csr: \n', transposed.tocsr().indices)
Sample matrix: 
 [[4 1 3 0]
 [0 2 0 1]
 [2 0 5 4]]

Indices: 
 [0 1 2 1 3 0 2 3]

Transposed matrix: 
 [[4 0 2]
 [1 2 0]
 [3 0 5]
 [0 1 4]]

Indices of transposed matrix: 
 [0 1 2 1 3 0 2 3]

Reason:  <class 'scipy.sparse.csc.csc_matrix'>

After converting to csr: 
 [0 2 0 1 0 2 1 2]

Self made top popular

import os

# Create the output tree used by all recommenders below.
# makedirs builds missing parents and, with exist_ok=True, does nothing for
# directories that are already there - so a partially created tree (parent
# present, a sub-directory missing) is completed instead of being skipped.
for output_dir in ('./Recommendations generated/ml-100k/',
                   './Recommendations generated/toy-example/'):
    os.makedirs(output_dir, exist_ok=True)
# Item-by-user view: row i of train_iu holds the ratings given to item i.
train_iu = train_ui.transpose().tocsr()
# Number of ratings per item.
item_counts = np.diff(train_iu.indptr)
# Rescale counts so that the most popular item gets the highest rating value.
scaling_factor = train_ui.max()/item_counts.max()

top_pop = [(i, item_counts[i]*scaling_factor) for i in range(train_iu.shape[0])]
top_pop.sort(key=lambda x: x[1], reverse=True)
#top_pop is an array of pairs (item, rescaled_popularity) sorted descending from the most popular

k = 10
result = []

for u in range(train_ui.shape[0]):
    # A set gives O(1) membership tests instead of scanning a numpy array per candidate.
    user_rated = set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())
    rec_user = []
    # Walk the popularity ranking and keep the first k items the user has not rated.
    # Iterating over top_pop (instead of an unbounded index) simply yields a shorter
    # list when fewer than k unrated items exist, rather than raising IndexError.
    for item, score in top_pop:
        if len(rec_user) >= k:
            break
        if item not in user_rated:
            rec_user.append((item_code_id[item], score))
    # Output row format: user, item1, score1, item2, score2, ...
    result.append([user_code_id[u]]+list(chain(*rec_user)))

(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)


# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking

estimations=[]

for user, item in zip(*test_ui.nonzero()):
    estimations.append([user_code_id[user], item_code_id[item],
        item_counts[item]*scaling_factor])
(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)

Self made top rated

top_rated = []
# Mean over all stored training ratings; used for items that have no ratings.
global_avg = train_iu.data.sum()/train_ui.nnz

for i in range(train_iu.shape[0]):
    ratings = train_iu.data[train_iu.indptr[i]: train_iu.indptr[i+1]]
    avg = np.mean(ratings) if len(ratings)>0 else global_avg
    top_rated.append((i, avg))

# top_rated is a list of pairs (item, average_rating) sorted descending by average
top_rated.sort(key=lambda x: x[1], reverse=True)

k=10
result=[]

for u in range(train_ui.shape[0]):
    # A set gives O(1) membership tests instead of scanning a numpy array per candidate.
    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())
    rec_user=[]
    # Keep the first k ranked items the user has not rated; iterating over the
    # ranking yields a shorter list (not an IndexError) when fewer than k remain.
    for item, score in top_rated:
        if len(rec_user) >= k:
            break
        if item not in user_rated:
            rec_user.append((item_code_id[item], score))
    # Output row format: user, item1, score1, item2, score2, ...
    result.append([user_code_id[u]]+list(chain(*rec_user)))

(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)



estimations=[]
# item code -> average rating, for direct lookup of the test pairs
d = dict(top_rated)

for user, item in zip(*test_ui.nonzero()):
    estimations.append([user_code_id[user], item_code_id[item], d[item]])
(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)
pd.DataFrame(result)[:2]
0 1 2 3 4 5 6 7 8 9 ... 11 12 13 14 15 16 17 18 19 20
0 1 814 5.0 1122 5.0 1189 5.0 1201 5.0 1293 ... 1306 5.0 1467 5.0 1491 5.0 1500 5.0 1536 5.0
1 2 119 5.0 814 5.0 1122 5.0 1189 5.0 1201 ... 1293 5.0 1306 5.0 1467 5.0 1491 5.0 1500 5.0

2 rows × 21 columns

Self-made baseline

class selfBaselineUI():
    """Baseline predictor: estimated rating = user (row) mean + item (column) mean.

    The row means are computed on the raw ratings; the column means are computed
    on the ratings after the row means have been subtracted. Means are taken
    over stored (rated) entries only.
    """

    def fit(self, train_ui):
        """Compute ``row_means`` and ``col_means`` from a user-item CSR matrix.

        Parameters
        ----------
        train_ui : scipy.sparse CSR matrix
            Ratings with users in rows and items in columns.

        Returns
        -------
        CSR matrix with the same sparsity structure as ``train_ui`` whose stored
        values are the ratings minus the row mean and minus the column mean.
        """
        self.train_ui = train_ui.copy()
        self.train_iu = train_ui.transpose().tocsr()

        result = self.train_ui.copy()
        result.data = result.data.astype(float)

        # Mean of the stored ratings per user. Users without any training rating
        # get mean 0 instead of a 0/0 NaN (same guard as used for items below).
        row_counts = np.diff(result.indptr)
        row_sums = np.asarray(result.sum(axis=1)).ravel()
        self.row_means = np.divide(row_sums, row_counts,
                                   out=np.zeros(result.shape[0]), where=row_counts != 0)

        # Subtract directly on .data: in CSR the entries of row r occupy
        # data[indptr[r]:indptr[r+1]], so repeating each mean row_counts[r] times
        # aligns it with data. Sparse arithmetic is avoided on purpose, because it
        # would drop entries that become exactly 0.
        result.data = result.data - np.repeat(self.row_means, row_counts)

        # Mean of the row-centred ratings per item; items without ratings get 0.
        col_counts = np.diff(self.train_iu.indptr)
        col_sums = np.asarray(result.sum(axis=0)).ravel()
        self.col_means = np.divide(col_sums, col_counts,
                                   out=np.zeros(result.shape[1]), where=col_counts != 0)

        # indices[k] is the column of data[k], so fancy indexing aligns the means.
        result.data = result.data - self.col_means[result.indices]

        return result

    def recommend(self, user_code_id, item_code_id, topK=10):
        """Return the ``topK`` best-scored unrated items for every user.

        Parameters
        ----------
        user_code_id, item_code_id : mapping
            Translate internal row/column codes to the original ids.
        topK : int
            Maximum number of items per user.

        Returns
        -------
        list of lists in the format ``[user, item1, score1, item2, score2, ...]``.
        Users who have rated every item are omitted.
        """
        # Dense (users x items) matrix of row_mean + col_mean via broadcasting.
        estimations = self.row_means[:, None] + self.col_means[None, :]
        indptr, indices = self.train_ui.indptr, self.train_ui.indices

        result = []
        for nb_user, user_scores in enumerate(estimations):
            # A set gives O(1) membership tests instead of scanning a numpy array per item.
            user_rated = set(indices[indptr[nb_user]:indptr[nb_user+1]].tolist())
            item_scores = [(item_code_id[item], score)
                           for item, score in enumerate(user_scores)
                           if item not in user_rated]
            if not item_scores:
                continue
            # Stable sort: items with equal scores keep their item-code order.
            item_scores.sort(key=lambda x: x[1], reverse=True)
            result.append([user_code_id[nb_user]]+list(chain(*item_scores[:topK])))
        return result

    def estimate(self, user_code_id, item_code_id, test_ui):
        """Return ``[user, item, estimated_rating]`` for every stored entry of ``test_ui``."""
        result = []
        for user, item in zip(*test_ui.nonzero()):
            result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])
        return result
# Load the small toy dataset (tab-separated, no header row).
toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])

# NOTE(review): presumably data_to_csr performs the same steps as the manual
# preparation at the top of the notebook - confirm in helpers.py.
toy_train_ui, toy_test_ui, toy_user_code_id, toy_user_id_code, \
toy_item_code_id, toy_item_id_code = helpers.data_to_csr(toy_train_read, toy_test_read)

print('Training data:')
display(toy_train_ui.todense())

model=selfBaselineUI()
print('After subtracting rows and columns:')
# fit() returns the centred matrix and stores row_means/col_means on the model.
display(model.fit(toy_train_ui).todense())

print('Recommend best unseen item:')
display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))

print('Print estimations on unseen items:')
estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))
# Column names are set for display only.
estimations.columns=['user', 'item', 'est_score']
display(estimations)

top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))

top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)

# Estimations are recomputed and written without a header row.
estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))
estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)
Training data:
matrix([[3, 4, 0, 0, 5, 0, 0, 4],
        [0, 1, 2, 3, 0, 0, 0, 0],
        [0, 0, 0, 5, 0, 3, 4, 0]])
After subtracting rows and columns:
matrix([[ 0. ,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
        [ 0. , -0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ]])
Recommend best unseen item:
[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]
Print estimations on unseen items:
user item est_score
0 0 60 4.0
1 10 40 3.0
2 20 0 3.0
3 20 20 4.0
4 20 70 4.0
# Fit the self-made baseline on the ml-100k training matrix; the returned
# centred matrix is not needed here, only the means stored on the model.
model=selfBaselineUI()
model.fit(train_ui)

# Top-10 recommendations per user, one row per user: user, item1, score1, ...
top_n=pd.DataFrame(model.recommend(user_code_id, item_code_id, topK=10))

top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)

# Estimated ratings for every (user, item) pair stored in the test matrix.
estimations=pd.DataFrame(model.estimate(user_code_id, item_code_id, test_ui))
estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)

project task 1: implement self-made BaselineIU

Implement a recommender system that recommends movies the user has not seen yet. It should be similar to BaselineUI, but it should subtract the column means first and then the row means.

The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv' and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'.



Additional clarification:

Summarizing, the prediction of the rating of the user u regarding the item i should be equal to b_u + b_i. The procedure to get b_u and b_i is the following:

  • We have the original user-item ratings matrix M.
  • For each column representing the item i, we compute the mean of ratings and denote by b_i. From each rating in matrix M we subtract the corresponding column mean (b_i) to receive new matrix M'.
  • For each row of matrix M' representing the user u, we compute the mean of ratings and denote by b_u.

Ready-made baseline - Surprise implementation

import surprise as sp
import time

# Based on surprise.readthedocs.io
def get_top_n(predictions, n=10):
    """Pick the n highest-estimated items for every user.

    `predictions` is an iterable of 5-tuples (uid, iid, true_r, est, details).
    The result is a list with one row per user, in order of first appearance,
    in the format [user, item1, score1, item2, score2, ...].
    """
    # Group the (item, estimate) pairs by user.
    scored_by_user = {}
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        scored_by_user.setdefault(uid, []).append((iid, est))

    # Rank each user's pairs by estimate (descending, stable) and flatten the best n.
    result = []
    for uid, scored in scored_by_user.items():
        best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:n]
        row = [uid]
        for pair in best:
            row.extend(pair)
        result.append(row)
    return result


# Surprise reads the same tab-separated files directly.
reader = sp.Reader(line_format='user item rating timestamp', sep='\t')
trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
trainset = trainset.build_full_trainset() # <class 'surprise.trainset.Trainset'> -> it is needed for using Surprise package

# The test file is loaded the same way and then turned into a list of test tuples.
testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
testset = sp.Trainset.build_testset(testset.build_full_trainset())

algo = sp.BaselineOnly()
# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})
# observe how poor the results of the configuration above are
# more details http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1

algo.fit(trainset)

antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set
predictions = algo.test(antitrainset)

# Keep the 10 best-estimated unseen items per user.
top_n = get_top_n(predictions, n=10)

top_n=pd.DataFrame(top_n)

top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)
Estimating biases using als...
# Compute RMSE on testset using built-in functions
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)

# Let's also save the results in file
# Only user, item and the estimate are kept from each prediction tuple.
predictions_df=[]
for uid, iid, true_r, est, _ in predictions:
    predictions_df.append([uid, iid, est])
    
predictions_df=pd.DataFrame(predictions_df)
predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)

# MAE is printed (verbose=True) and, as the last expression, also shown as the cell value.
sp.accuracy.mae(predictions, verbose=True)
RMSE: 0.9495
MAE:  0.7525
0.7524871012820799
Let's compare with random
# In Surprise, the "random" algorithm predicts values drawn from a normal distribution estimated from the train set
algo = sp.NormalPredictor()
algo.fit(trainset)

antitrainset = trainset.build_anti_testset() # We want to predict ratings of pairs (user, item) which are not in train set
predictions = algo.test(antitrainset)

# Keep the 10 best-estimated unseen items per user.
top_n = get_top_n(predictions, n=10)

top_n=pd.DataFrame(top_n)

top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)

# Compute RMSE on testset using built-in functions
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)

# Let's also save the results in file
# Only user, item and the estimate are kept from each prediction tuple.
predictions_df=[]
for uid, iid, true_r, est, _ in predictions:
    predictions_df.append([uid, iid, est])
    
predictions_df=pd.DataFrame(predictions_df)
predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)

# MAE is printed (verbose=True) and, as the last expression, also shown as the cell value.
sp.accuracy.mae(predictions, verbose=True)
RMSE: 1.5239
MAE:  1.2268
1.2267993503843746