warsztaty2/.ipynb_checkpoints/P1. Baseline-checkpoint.ipynb
2020-06-16 19:40:37 +02:00

42 KiB
Raw Blame History

Preparing dataset

import pandas as pd
import numpy as np
import scipy.sparse as sparse
from collections import defaultdict
from itertools import chain
import random

# Load the tab-separated MovieLens 100k splits; the files have no header row.
rating_columns=['user', 'item', 'rating', 'timestamp']
train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None, names=rating_columns)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None, names=rating_columns)

# Stack both splits so user/item codes are assigned consistently across them.
train_and_test=pd.concat([train_read, test_read], axis=0, ignore_index=True)
user_categories=train_and_test['user'].astype("category").cat
item_categories=train_and_test['item'].astype("category").cat
train_and_test['user_code'] = user_categories.codes
train_and_test['item_code'] = item_categories.codes

# Lookup tables in both directions: code -> original id and original id -> code.
user_code_id = {code: user_id for code, user_id in enumerate(user_categories.categories)}
user_id_code = {user_id: code for code, user_id in user_code_id.items()}
item_code_id = {code: item_id for code, item_id in enumerate(item_categories.categories)}
item_id_code = {item_id: code for code, item_id in item_code_id.items()}
train_and_test[:5]
user item rating timestamp user_code item_code
0 664 525 4 876526580 663 524
1 49 1 2 888068651 48 0
2 352 273 2 884290328 351 272
3 618 96 3 891307749 617 95
4 560 24 2 879976772 559 23
# Attach the integer codes to each split by joining back on the original columns.
join_columns=list(train_read.columns)
train_df=train_read.merge(train_and_test, on=join_columns)
test_df=test_read.merge(train_and_test, on=join_columns)

# Number of users and items: codes are 0-based, hence the +1.
U=train_and_test['user_code'].max()+1
I=train_and_test['item_code'].max()+1

# Sparse user-item matrices; both share the same (U, I) shape.
train_ui = sparse.csr_matrix(
    (train_df['rating'], (train_df['user_code'], train_df['item_code'])),
    shape=(U, I))
test_ui = sparse.csr_matrix(
    (test_df['rating'], (test_df['user_code'], test_df['item_code'])),
    shape=(U, I))
# The steps above are the same for many algorithms, so the code lives in a separate module.
# NOTE(review): helpers.data_to_csr is not visible here; per the note above it presumably
# repeats the preparation done manually earlier - confirm against helpers.py.
import helpers
# Unlike the manual version above, no column names are passed, so columns are labelled 0..3.
train_read=pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
test_read=pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
# Rebinds the matrices and lookup dictionaries created by the manual steps.
train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = helpers.data_to_csr(train_read, test_read)

CSR matrices - what are they?

# Toy ratings as (user, item, rating) triplets; unzipped into the three
# parallel arrays expected by the coordinate-style csr_matrix constructor.
triplets = [(0, 0, 4), (0, 1, 1), (0, 2, 3),
            (1, 1, 2), (1, 3, 1),
            (2, 2, 5), (2, 0, 2), (2, 3, 4)]
row, col, data = (np.array(values) for values in zip(*triplets))
sample_csr=sparse.csr_matrix((data, (row, col)))
sample_csr
<3x4 sparse matrix of type '<class 'numpy.intc'>'
	with 8 stored elements in Compressed Sparse Row format>
# Dense view of the toy matrix, followed by its basic counts.
dense_ratings=sample_csr.todense()
n_users, n_items=sample_csr.shape

print('Ratings matrix with missing entries replaced by zeros:')
display(dense_ratings)

summary='\nNumber of ratings: {} \nNumber of users: {} \nNumber of items: {} \n'
print(summary.format(sample_csr.nnz, n_users, n_items))
Ratings matrix with missing entries replaced by zeros:
matrix([[4, 1, 3, 0],
        [0, 2, 0, 1],
        [2, 0, 5, 4]], dtype=int32)
Number of ratings: 8 
Number of users: 3 
Number of items: 4 

# The three arrays behind a CSR matrix: values, their column indices, and row boundaries.
print('Ratings data:', sample_csr.data)

print('Regarding items:', sample_csr.indices)

# Consecutive indptr entries delimit each row's slice of data/indices.
row_bounds=zip(sample_csr.indptr[:-1], sample_csr.indptr[1:])
for i, (first, past_last) in enumerate(row_bounds):
    print('Where ratings from {} to {} belongs to user {}.'.format(first, past_last-1, i))
Ratings data: [4 1 3 2 1 2 5 4]
Regarding items: [0 1 2 1 3 0 2 3]
Where ratings from 0 to 2 belongs to user 0.
Where ratings from 3 to 4 belongs to user 1.
Where ratings from 5 to 7 belongs to user 2.
# Arbitrary user code used to compare two ways of reading one row of the CSR matrix.
user=123

# Direct approach: slice .indices with the row's indptr bounds.
print('Efficient way to access items rated by user:')
display(train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]])
%timeit train_ui.indices[train_ui.indptr[user]:train_ui.indptr[user+1]]

# Indirect approach: index the row first (train_ui[user]) and then read its .indices;
# the recorded timings below show this is much slower than the slice above.
print('Inefficient way to access items rated by user:')
display(train_ui[user].indices)
%timeit train_ui[user].indices
Efficient way to access items rated by user:
array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,
       171, 172, 173, 194, 208, 225, 473, 495, 549, 615], dtype=int32)
679 ns ± 11.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
Inefficient way to access items rated by user:
array([  0,   6,  10,  27,  49,  78,  95,  97, 116, 143, 153, 156, 167,
       171, 172, 173, 194, 208, 225, 473, 495, 549, 615])
57.1 µs ± 894 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Example: subtracting row means
# Compute both views first, then show them; the last expression is the cell's displayed value.
dense_view=sample_csr.todense()
row_sums=sample_csr.sum(axis=1).ravel()

print('Our matrix:')
display(dense_view)
print('List of row sums:')
row_sums
Our matrix:
matrix([[4, 1, 3, 0],
        [0, 2, 0, 1],
        [2, 0, 5, 4]], dtype=int32)
List of row sums:
matrix([[ 8,  3, 11]])
# Demonstrates subtracting each row's mean from that row's stored entries only.
print('Array with row means:')
# Row sum divided by the number of stored entries in the row (difference of indptr).
row_means=np.asarray(sample_csr.sum(axis=1).ravel())[0]/np.diff(sample_csr.indptr)
display(row_means)

# Label fixed: the diagonal holds the row means (not inverse row sums), and
# sparse.diags does not return a CSR matrix.
print('Diagonal sparse matrix with row means on the diagonal:')
display(sparse.diags(row_means).todense())

print("""Let's apply them in nonzero entries:""")
# power(0) turns every stored entry into 1, so the product places each row's
# mean exactly where that row has a rating.
to_subtract=sparse.diags(row_means)*sample_csr.power(0)
display(to_subtract.todense())

print("Finally after subtraction:")
sample_csr-to_subtract.todense()
Array with row means:
array([2.66666667, 1.5       , 3.66666667])
Diagonal csr matrix with inverse of row sums on diagonal:
matrix([[2.66666667, 0.        , 0.        ],
        [0.        , 1.5       , 0.        ],
        [0.        , 0.        , 3.66666667]])
Let's apply them in nonzero entries:
matrix([[2.66666667, 2.66666667, 2.66666667, 0.        ],
        [0.        , 1.5       , 0.        , 1.5       ],
        [3.66666667, 0.        , 3.66666667, 3.66666667]])
Finally after subtraction:
matrix([[ 1.33333333, -1.66666667,  0.33333333,  0.        ],
        [ 0.        ,  0.5       ,  0.        , -0.5       ],
        [-1.66666667,  0.        ,  1.33333333,  0.33333333]])
Transposing
import numpy as np
from scipy import sparse
# Shows that transposing a CSR matrix yields a CSC matrix that reuses the same
# index array, so .indices only changes after an explicit conversion back to CSR.
row = np.array([0, 0, 0, 1, 1, 2, 2, 2])
col = np.array([0, 1, 2, 1, 3, 2, 0, 3])
data = np.array([4, 1, 3, 2,1, 5, 2, 4])
sample=sparse.csr_matrix((data, (row, col)))
# .toarray() replaces the .A shorthand, which recent SciPy releases no longer provide.
print('Sample matrix: \n', sample.toarray())
print('\nIndices: \n', sample.indices)
transposed=sample.transpose()
print('\nTransposed matrix: \n', transposed.toarray())
print('\nIndices of transposed matrix: \n', transposed.indices)

print('\nReason: ', type(transposed))

print('\nAfter converting to csr: \n', transposed.tocsr().indices)
Sample matrix: 
 [[4 1 3 0]
 [0 2 0 1]
 [2 0 5 4]]

Indices: 
 [0 1 2 1 3 0 2 3]

Transposed matrix: 
 [[4 0 2]
 [1 2 0]
 [3 0 5]
 [0 1 4]]

Indices of transposed matrix: 
 [0 1 2 1 3 0 2 3]

Reason:  <class 'scipy.sparse.csc.csc_matrix'>

After converting to csr: 
 [0 2 0 1 0 2 1 2]

Self made top popular

import os
# Create each output directory on its own. Checking only the parent directory
# would leave a missing subdirectory uncreated, and later to_csv calls would fail.
for output_dir in ('./Recommendations generated/ml-100k/',
                   './Recommendations generated/toy-example/'):
    os.makedirs(output_dir, exist_ok=True)
# Self-made "top popular" recommender: items are scored by how many training
# ratings they received, rescaled so the most popular item gets the maximum rating.
train_iu=train_ui.transpose().tocsr()
item_popularity=np.diff(train_iu.indptr)  # number of training ratings per item
scaling_factor=train_ui.max()/max(item_popularity)

TopPop=[(i, item_popularity[i]*scaling_factor) for i in range(train_iu.shape[0])]
TopPop.sort(key=lambda x: x[1], reverse=True)
#TopPop is an array of pairs (item, rescaled_popularity) sorted descending from the most popular

k=10
result=[]

for u in range(train_ui.shape[0]):
    # A set gives constant-time membership tests while walking the ranking.
    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())
    rec_user=[]
    # Walk the ranking until k unseen items are found; the loop simply ends if
    # the user has fewer than k unseen items instead of indexing past the list.
    for item, score in TopPop:
        if len(rec_user)>=k:
            break
        if item not in user_rated:
            rec_user.append((item_code_id[item], score))
    # Row format: (user, item1, score1, item2, score2, ...)
    result.append([user_code_id[u]]+list(chain(*rec_user)))

(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopPop_reco.csv', index=False, header=False)


# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking

estimations=[]

for user, item in zip(*test_ui.nonzero()):
    estimations.append([user_code_id[user], item_code_id[item],
        item_popularity[item]*scaling_factor])
(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopPop_estimations.csv', index=False, header=False)

Self made global average

# Self-made "global average" recommender: every item gets the same score, the
# mean of all training ratings, so each user receives the first k unseen items.
avg=np.sum(train_ui)/train_ui.nnz

# The item count is read from train_ui itself, so this cell does not depend on
# train_iu having been created by an earlier cell.
GlobalAvg=[(i, avg) for i in range(train_ui.shape[1])]

k=10
result=[]

for u in range(train_ui.shape[0]):
    # A set gives constant-time membership tests while walking the item list.
    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())
    rec_user=[]
    # Stop after k unseen items; the loop also ends cleanly when the user has
    # fewer than k unseen items, instead of indexing past the end of the list.
    for item, score in GlobalAvg:
        if len(rec_user)>=k:
            break
        if item not in user_rated:
            rec_user.append((item_code_id[item], score))
    # Row format: (user, item1, score1, item2, score2, ...)
    result.append([user_code_id[u]]+list(chain(*rec_user)))

(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_reco.csv', index=False, header=False)


# estimations - score is a bit artificial since that method is not designed for scoring, but for ranking

estimations=[]

for user, item in zip(*test_ui.nonzero()):
    estimations.append([user_code_id[user], item_code_id[item], avg])
(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_GlobalAvg_estimations.csv', index=False, header=False)
pd.DataFrame(result)[:2]
0 1 2 3 4 5 6 7 8 9 ... 11 12 13 14 15 16 17 18 19 20
0 1 5 3.529975 10 3.529975 25 3.529975 32 3.529975 33 ... 44 3.529975 46 3.529975 50 3.529975 52 3.529975 55 3.529975
1 2 1 3.529975 2 3.529975 3 3.529975 4 3.529975 5 ... 6 3.529975 7 3.529975 8 3.529975 9 3.529975 11 3.529975

2 rows × 21 columns

Project task 1 - self made top rated

# project task 1:  implement TopRated
# Implement recommender system which will recommend movies (which user hasn't seen) with the highest average rating
# The output should be saved in 'Recommendations generated/ml-100k/Self_TopRated_reco.csv'
# and 'Recommendations generated/ml-100k/Self_TopRated_estimations.csv'
# Self-made "top rated" recommender: items are ranked by their mean training
# rating; items without any training rating are left out of the ranking.
TopRated=[]
train_iu=train_ui.transpose().tocsr()
item_avg={}  # item code -> mean training rating (only items with ratings)
for i in range(train_iu.shape[0]):
    item_ratings=train_iu.data[train_iu.indptr[i]:train_iu.indptr[i+1]]
    if len(item_ratings)!=0:
        item_avg[i]=np.sum(item_ratings)/len(item_ratings)
        TopRated.append((i, item_avg[i]))
TopRated.sort(key=lambda x: x[1], reverse=True)

k=10
result=[]
for u in range(train_ui.shape[0]):
    # A set gives constant-time membership tests while walking the ranking.
    user_rated=set(train_ui.indices[train_ui.indptr[u]:train_ui.indptr[u+1]].tolist())
    rec_user=[]
    # Stop after k unseen items; the loop also ends cleanly when fewer than k
    # unseen rated items exist, instead of indexing past the end of the list.
    for item, score in TopRated:
        if len(rec_user)>=k:
            break
        if item not in user_rated:
            rec_user.append((item_code_id[item], score))
    # Row format: (user, item1, score1, item2, score2, ...)
    result.append([user_code_id[u]]+list(chain(*rec_user)))
(pd.DataFrame(result)).to_csv('Recommendations generated/ml-100k/Self_TopRated_reco.csv', index=False, header=False)

# Estimations: the predicted rating is the item's mean training rating (not the
# rescaled popularity used by TopPop). Items never rated in training fall back
# to the mean of all training ratings.
global_avg=train_ui.sum()/train_ui.nnz
estimations=[]
for user, item in zip(*test_ui.nonzero()):
    estimations.append([user_code_id[user], item_code_id[item],
        item_avg.get(item, global_avg)])
(pd.DataFrame(estimations)).to_csv('Recommendations generated/ml-100k/Self_TopRated_estimations.csv', index=False, header=False)

Self-made baseline

class selfBaselineUI():
    """Baseline recommender that removes user means first, then item means.

    After ``fit`` the predicted score for a (user, item) pair is
    ``row_means[user] + col_means[item]``, where ``row_means`` holds each
    user's mean rating and ``col_means`` holds each item's mean residual
    once the user means have been subtracted.
    """

    def fit(self, train_ui):
        """Estimate user and item biases from a user-item ratings matrix.

        Parameters
        ----------
        train_ui : scipy.sparse.csr_matrix
            Ratings with users in rows and items in columns. Every stored
            entry counts as an observed rating.

        Returns
        -------
        scipy.sparse.csr_matrix
            Copy of ``train_ui`` whose stored entries hold what is left after
            subtracting the user mean and then the item mean.
        """
        self.train_ui=train_ui.copy()
        self.train_iu=train_ui.transpose().tocsr()

        result=self.train_ui.copy()

        ratings_per_user=np.diff(result.indptr)
        ratings_per_item=np.diff(self.train_iu.indptr)

        # Users without ratings get mean 0 instead of a 0/0 NaN.
        self.row_means=np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,
                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)

        # Subtract on .data directly: a sparse subtraction would drop entries
        # that become 0. The stored values of row u occupy
        # data[indptr[u]:indptr[u+1]], so repeating each mean by its row
        # length lines the means up with .data.
        result.data=result.data-np.repeat(self.row_means, ratings_per_user)

        # Items without ratings get mean 0.
        self.col_means=np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,
                                 out=np.zeros(result.shape[1]), where=ratings_per_item!=0)

        # .indices stores the column of every stored value, so indexing the
        # means with it lines them up with .data.
        result.data=result.data-self.col_means[result.indices]

        return result


    def recommend(self, user_code_id, item_code_id, topK=10):
        """Return the ``topK`` best-scoring unseen items for every user.

        Parameters
        ----------
        user_code_id : mapping
            Maps a row index of the training matrix to the original user id.
        item_code_id : mapping
            Maps a column index of the training matrix to the original item id.
        topK : int
            Maximum number of items per user.

        Returns
        -------
        list of list
            One row per user in the format
            ``[user, item1, score1, item2, score2, ...]``. Users who rated
            every item are omitted.
        """
        # Broadcasting builds the full users x items score table.
        estimations=self.row_means[:,None]+self.col_means[None,:]

        result=[]
        for nb_user, user_scores in enumerate(estimations):
            # A set gives constant-time "already rated" checks.
            user_rated=set(self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]].tolist())
            item_scores=[(item_code_id[item], score)
                         for item, score in enumerate(user_scores) if item not in user_rated]
            if not item_scores:
                continue
            # Stable sort: ties keep the original item order.
            item_scores.sort(key=lambda x: x[1], reverse=True)
            result.append([user_code_id[nb_user]]+list(chain(*item_scores[:topK])))
        return result

    def estimate(self, user_code_id, item_code_id, test_ui):
        """Predict a score for every stored (user, item) pair of ``test_ui``.

        Returns a list of ``[user, item, estimated_score]`` rows, with the
        ids translated through ``user_code_id`` and ``item_code_id``.
        """
        result=[]
        for user, item in zip(*test_ui.nonzero()):
            result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])
        return result
# Run the user-then-item baseline on the toy dataset and export its outputs.
toy_read_options=dict(sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
toy_train_read=pd.read_csv('./Datasets/toy-example/train.csv', **toy_read_options)
toy_test_read=pd.read_csv('./Datasets/toy-example/test.csv', **toy_read_options)

(toy_train_ui, toy_test_ui,
 toy_user_code_id, toy_user_id_code,
 toy_item_code_id, toy_item_id_code) = helpers.data_to_csr(toy_train_read, toy_test_read)

print('Training data:')
display(toy_train_ui.todense())

model=selfBaselineUI()
print('After subtracting rows and columns:')
toy_residuals=model.fit(toy_train_ui)
display(toy_residuals.todense())

print('Recommend best unseen item:')
display(model.recommend(toy_user_code_id, toy_item_code_id, topK=1))

print('Print estimations on unseen items:')
estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))
estimations.columns=['user', 'item', 'est_score']
display(estimations)

# Top-3 recommendations per user, written without header or index.
top_n=pd.DataFrame(model.recommend(toy_user_code_id, toy_item_code_id, topK=3))
top_n.to_csv('Recommendations generated/toy-example/Self_BaselineUI_reco.csv', index=False, header=False)

# Estimations are recomputed so the exported frame keeps default column labels.
estimations=pd.DataFrame(model.estimate(toy_user_code_id, toy_item_code_id, toy_test_ui))
estimations.to_csv('Recommendations generated/toy-example/Self_BaselineUI_estimations.csv', index=False, header=False)
Training data:
matrix([[3, 4, 0, 0, 5, 0, 0, 4],
        [0, 1, 2, 3, 0, 0, 0, 0],
        [0, 0, 0, 5, 0, 3, 4, 0]], dtype=int64)
After subtracting rows and columns:
matrix([[ 0. ,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
        [ 0. , -0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ]])
Recommend best unseen item:
[[0, 30, 5.0], [10, 40, 3.0], [20, 40, 5.0]]
Print estimations on unseen items:
user item est_score
0 0 60 4.0
1 10 40 3.0
2 20 0 3.0
3 20 20 4.0
4 20 70 4.0
# Fit the user-then-item baseline on the ml-100k training matrix and export
# the top-10 recommendations and the test-set estimations.
model=selfBaselineUI()
model.fit(train_ui)

reco_rows=model.recommend(user_code_id, item_code_id, topK=10)
top_n=pd.DataFrame(reco_rows)
top_n.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_reco.csv', index=False, header=False)

estimation_rows=model.estimate(user_code_id, item_code_id, test_ui)
estimations=pd.DataFrame(estimation_rows)
estimations.to_csv('Recommendations generated/ml-100k/Self_BaselineUI_estimations.csv', index=False, header=False)

project task 2: implement self-made BaselineIU

# Implement recommender system which will recommend movies (which user hasn't seen) which is similar to BaselineUI
# but first subtract col means then row means
# The output should be saved in 'Recommendations generated/ml-100k/Self_BaselineIU_reco.csv'
# and 'Recommendations generated/ml-100k/Self_BaselineIU_estimations.csv'
class selfBaselineIU():
    """Baseline recommender that removes item means first, then user means.

    After ``fit`` the predicted score for a (user, item) pair is
    ``row_means[user] + col_means[item]``, where ``col_means`` holds each
    item's mean rating and ``row_means`` holds each user's mean residual
    once the item means have been subtracted.
    """

    def fit(self, train_ui):
        """Estimate item and user biases from a user-item ratings matrix.

        Parameters
        ----------
        train_ui : scipy.sparse.csr_matrix
            Ratings with users in rows and items in columns. Every stored
            entry counts as an observed rating.

        Returns
        -------
        scipy.sparse.csr_matrix
            Copy of ``train_ui`` whose stored entries hold what is left after
            subtracting the item mean and then the user mean.
        """
        self.train_ui=train_ui.copy()
        self.train_iu=train_ui.transpose().tocsr()

        result=self.train_ui.copy()

        ratings_per_user=np.diff(result.indptr)
        ratings_per_item=np.diff(self.train_iu.indptr)

        # Items without ratings get mean 0 instead of a 0/0 NaN.
        self.col_means=np.divide(np.asarray(result.sum(axis=0)).ravel(), ratings_per_item,
                                 out=np.zeros(result.shape[1]), where=ratings_per_item!=0)

        # Subtract on .data directly: a sparse subtraction would drop entries
        # that become 0. .indices stores the column of every stored value, so
        # indexing the means with it lines them up with .data.
        result.data=result.data-self.col_means[result.indices]

        # Users without ratings get mean 0 instead of a 0/0 NaN.
        self.row_means=np.divide(np.asarray(result.sum(axis=1)).ravel(), ratings_per_user,
                                 out=np.zeros(result.shape[0]), where=ratings_per_user!=0)

        # The stored values of row u occupy data[indptr[u]:indptr[u+1]], so
        # repeating each mean by its row length lines the means up with .data.
        result.data=result.data-np.repeat(self.row_means, ratings_per_user)

        return result


    def recommend(self, user_code_id, item_code_id, topK=10):
        """Return the ``topK`` best-scoring unseen items for every user.

        Parameters
        ----------
        user_code_id : mapping
            Maps a row index of the training matrix to the original user id.
        item_code_id : mapping
            Maps a column index of the training matrix to the original item id.
        topK : int
            Maximum number of items per user.

        Returns
        -------
        list of list
            One row per user in the format
            ``[user, item1, score1, item2, score2, ...]``. Users who rated
            every item are omitted.
        """
        # Broadcasting builds the full users x items score table.
        estimations=self.row_means[:,None]+self.col_means[None,:]

        result=[]
        for nb_user, user_scores in enumerate(estimations):
            # A set gives constant-time "already rated" checks.
            user_rated=set(self.train_ui.indices[self.train_ui.indptr[nb_user]:self.train_ui.indptr[nb_user+1]].tolist())
            item_scores=[(item_code_id[item], score)
                         for item, score in enumerate(user_scores) if item not in user_rated]
            if not item_scores:
                continue
            # Stable sort: ties keep the original item order.
            item_scores.sort(key=lambda x: x[1], reverse=True)
            result.append([user_code_id[nb_user]]+list(chain(*item_scores[:topK])))
        return result

    def estimate(self, user_code_id, item_code_id, test_ui):
        """Predict a score for every stored (user, item) pair of ``test_ui``.

        Returns a list of ``[user, item, estimated_score]`` rows, with the
        ids translated through ``user_code_id`` and ``item_code_id``.
        """
        result=[]
        for user, item in zip(*test_ui.nonzero()):
            result.append([user_code_id[user], item_code_id[item], self.row_means[user]+self.col_means[item]])
        return result

Ready-made baseline - Surprise implementation

import surprise as sp
import time

# Based on surprise.readthedocs.io
def get_top_n(predictions, n=10):
    """Build top-n recommendation rows from Surprise-style predictions.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each element unpacks as (user id, item id, true rating, estimate, details).
    n : int
        Maximum number of items kept per user.

    Returns
    -------
    list of list
        One row per user, in order of first appearance, in the format
        ``[user, item1, score1, item2, score2, ...]`` with items sorted by
        estimate, highest first.
    """
    # Group (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    rows = []
    for uid in per_user:
        # Stable descending sort: ties keep their original order.
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        row = [uid]
        for pair in ranked[:n]:
            row.extend(pair)
        rows.append(row)
    return rows


# Both splits are tab-separated files with the columns user, item, rating, timestamp.
reader = sp.Reader(line_format='user item rating timestamp', sep='\t')

# Surprise fits on a Trainset object, so the loaded training data is converted first.
train_data = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
trainset = train_data.build_full_trainset()

# The test split goes through a Trainset as well and is then flattened into a testset.
test_data = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
testset = test_data.build_full_trainset().build_testset()

algo = sp.BaselineOnly()
# algo = sp.BaselineOnly(bsl_options={'method':'sgd', 'reg':0, 'n_epochs':2000})
# Observe how bad the results of the algorithm above are.
# More details: http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf - chapter 2.1

algo.fit(trainset)

# Score every (user, item) pair that is NOT in the training set.
antitrainset = trainset.build_anti_testset()
predictions = algo.test(antitrainset)

# Keep the 10 best items per user and write them without header or index.
top_n = pd.DataFrame(get_top_n(predictions, n=10))
top_n.to_csv('Recommendations generated/ml-100k/Ready_Baseline_reco.csv', index=False, header=False)
Estimating biases using als...
# Compute RMSE on the test set using built-in functions.
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)

# Also save the estimations to a file as (user, item, estimate) rows.
predictions_df=pd.DataFrame([[uid, iid, est] for uid, iid, true_r, est, _ in predictions])
predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Baseline_estimations.csv', index=False, header=False)

sp.accuracy.mae(predictions, verbose=True)
RMSE: 0.9495
MAE:  0.7525
0.7524871012820799
Let's compare with random
# In Surprise, "random" is an algorithm that predicts a random value drawn from
# a normal distribution estimated from the training set.
algo = sp.NormalPredictor()
algo.fit(trainset)

# Score every (user, item) pair that is NOT in the training set.
antitrainset = trainset.build_anti_testset()
predictions = algo.test(antitrainset)

# Keep the 10 best items per user and write them without header or index.
top_n = pd.DataFrame(get_top_n(predictions, n=10))
top_n.to_csv('Recommendations generated/ml-100k/Ready_Random_reco.csv', index=False, header=False)

# Compute RMSE on the test set using built-in functions.
predictions = algo.test(testset)
sp.accuracy.rmse(predictions, verbose=True)

# Also save the estimations to a file as (user, item, estimate) rows.
predictions_df=pd.DataFrame([[uid, iid, est] for uid, iid, true_r, est, _ in predictions])
predictions_df.to_csv('Recommendations generated/ml-100k/Ready_Random_estimations.csv', index=False, header=False)

sp.accuracy.mae(predictions, verbose=True)
RMSE: 1.5100
MAE:  1.2118
1.211847558071457