"""Helpers for MovieLens-style rating data: id encoding, sparse CSR matrices,
and top-N recommendation generation via the Surprise library."""

import pandas as pd
import numpy as np
import scipy.sparse as sparse
import surprise as sp
import time
from collections import defaultdict
from itertools import chain


def data_to_csr(train_read, test_read):
    """Encode user/item ids as contiguous codes and build CSR rating matrices.

    Parameters
    ----------
    train_read, test_read : pd.DataFrame
        Raw frames with four columns; they are relabeled in place to
        ('user', 'item', 'rating', 'timestamp').

    Returns
    -------
    tuple
        (train_ui, test_ui, user_code_id, user_id_code,
         item_code_id, item_id_code) where the *_ui entries are
        scipy.sparse.csr_matrix of shape (n_users, n_items) and the
        dicts translate between raw ids and integer codes.
    """
    train_read.columns = ['user', 'item', 'rating', 'timestamp']
    test_read.columns = ['user', 'item', 'rating', 'timestamp']

    # Concatenate both splits so the category -> code assignment is
    # consistent across train and test.
    train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)
    train_and_test['user_code'] = train_and_test['user'].astype('category').cat.codes
    train_and_test['item_code'] = train_and_test['item'].astype('category').cat.codes

    user_code_id = dict(enumerate(train_and_test['user'].astype('category').cat.categories))
    user_id_code = {v: k for k, v in user_code_id.items()}
    item_code_id = dict(enumerate(train_and_test['item'].astype('category').cat.categories))
    item_id_code = {v: k for k, v in item_code_id.items()}

    # Map raw ids to codes directly. The previous pd.merge against the
    # concatenated frame duplicated rows whenever an identical record
    # appeared in both splits, and csr_matrix SUMS duplicate entries,
    # silently corrupting those ratings.
    train_df = train_read.copy()
    train_df['user_code'] = train_df['user'].map(user_id_code)
    train_df['item_code'] = train_df['item'].map(item_id_code)

    test_df = test_read.copy()
    test_df['user_code'] = test_df['user'].map(user_id_code)
    test_df['item_code'] = test_df['item'].map(item_id_code)

    # Matrix dimensions: number of distinct users and items overall.
    U = train_and_test['user_code'].max() + 1
    I = train_and_test['item_code'].max() + 1

    train_ui = sparse.csr_matrix(
        (train_df['rating'], (train_df['user_code'], train_df['item_code'])),
        shape=(U, I))
    test_ui = sparse.csr_matrix(
        (test_df['rating'], (test_df['user_code'], test_df['item_code'])),
        shape=(U, I))

    return train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code


def get_top_n(predictions, n=10):
    """Select each user's n highest-estimated items from Surprise predictions.

    Parameters
    ----------
    predictions : iterable
        Tuples (uid, iid, true_rating, estimated_rating, details) as
        produced by ``algo.test(...)``.
    n : int
        Number of items to keep per user.

    Returns
    -------
    list of lists
        One row per user: [uid, item1, score1, item2, score2, ...],
        items sorted by estimated score, descending.
    """
    # Group (item, score) pairs by user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    result = []
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        # Flatten the best n pairs into [uid, item1, score1, ...].
        result.append([uid] + list(chain(*user_ratings[:n])))
    return result


def ready_made(algo, reco_path, estimations_path,
               train_path='./Datasets/ml-100k/train.csv',
               test_path='./Datasets/ml-100k/test.csv'):
    """Fit a Surprise algorithm and dump recommendations and estimations.

    Parameters
    ----------
    algo : surprise algorithm instance
        Any object implementing Surprise's ``fit`` / ``test`` API.
    reco_path : str
        Output CSV for top-10 recommendations per user.
    estimations_path : str
        Output CSV for (user, item, estimated_rating) rows on the test set.
    train_path, test_path : str
        Tab-separated rating files; defaults keep the original hard-coded
        MovieLens ml-100k locations, so existing callers are unaffected.
    """
    reader = sp.Reader(line_format='user item rating timestamp', sep='\t')

    trainset = sp.Dataset.load_from_file(train_path, reader=reader)
    trainset = trainset.build_full_trainset()  # Surprise requires a full Trainset

    testset = sp.Dataset.load_from_file(test_path, reader=reader)
    testset = testset.build_full_trainset().build_testset()

    algo.fit(trainset)

    # Predict ratings for all (user, item) pairs absent from the train set.
    antitrainset = trainset.build_anti_testset()
    print('Generating predictions...')
    predictions = algo.test(antitrainset)

    print('Generating top N recommendations...')
    top_n = get_top_n(predictions, n=10)
    top_n = pd.DataFrame(top_n)
    top_n.to_csv(reco_path, index=False, header=False)

    print('Generating estimations...')
    predictions = algo.test(testset)
    predictions_df = [[uid, iid, est] for uid, iid, true_r, est, _ in predictions]
    predictions_df = pd.DataFrame(predictions_df)
    predictions_df.to_csv(estimations_path, index=False, header=False)