import pandas as pd
import numpy as np
import scipy.sparse as sparse
import surprise as sp
import time
from collections import defaultdict
from itertools import chain
from six.moves.urllib.request import urlretrieve
import zipfile
import os


def data_to_csr(train_read, test_read):
    train_read.columns = ['user', 'item', 'rating', 'timestamp']
    test_read.columns = ['user', 'item', 'rating', 'timestamp']

    # Build the combined dataset so that user/item codes are consistent
    # across the train and test sets
    train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)
    train_and_test['user_code'] = train_and_test['user'].astype("category").cat.codes
    train_and_test['item_code'] = train_and_test['item'].astype("category").cat.codes

    # Mappings between raw ids and 0-based integer codes, in both directions
    user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
    user_id_code = dict((v, k) for k, v in user_code_id.items())
    item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
    item_id_code = dict((v, k) for k, v in item_code_id.items())

    train_df = pd.merge(train_read, train_and_test, on=list(train_read.columns))
    test_df = pd.merge(test_read, train_and_test, on=list(train_read.columns))

    # Number of users and items (codes are 0-based, hence max + 1)
    (U, I) = (train_and_test['user_code'].max() + 1, train_and_test['item_code'].max() + 1)

    # Create sparse CSR matrices
    train_ui = sparse.csr_matrix(
        (train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
    test_ui = sparse.csr_matrix(
        (test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))

    return train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code


def get_top_n(predictions, n=10):
    # Build a dictionary whose values are lists of (item, score) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Pick the n best items per user, in the format:
    # (user, item1, score1, item2, score2, ...)
    result = []
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        result.append([uid] + list(chain(*user_ratings[:n])))
    return result


def ready_made(algo, reco_path, estimations_path):
    reader = sp.Reader(line_format='user item rating timestamp', sep='\t')

    trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
    trainset = trainset.build_full_trainset()  # Surprise algorithms expect a Trainset object

    testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
    testset = testset.build_full_trainset().build_testset()

    algo.fit(trainset)

    # We want to predict ratings of (user, item) pairs which are not in the train set
    antitrainset = trainset.build_anti_testset()
    print('Generating predictions...')
    predictions = algo.test(antitrainset)

    print('Generating top N recommendations...')
    top_n = get_top_n(predictions, n=10)
    top_n = pd.DataFrame(top_n)
    top_n.to_csv(reco_path, index=False, header=False)

    print('Generating estimations...')
    predictions = algo.test(testset)
    predictions_df = []
    for uid, iid, true_r, est, _ in predictions:
        predictions_df.append([uid, iid, est])
    predictions_df = pd.DataFrame(predictions_df)
    predictions_df.to_csv(estimations_path, index=False, header=False)


def download_movielens_100k_dataset(force=False):
    os.makedirs('./Datasets/', exist_ok=True)
    if not os.path.isdir('Datasets/ml-100k') or force:
        url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
        tmp_file_path = 'Datasets/ml-100k.zip'
        urlretrieve(url, tmp_file_path)
        with zipfile.ZipFile(tmp_file_path, 'r') as tmp_zip:
            tmp_zip.extractall('Datasets/')
        os.remove(tmp_file_path)
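

# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal sketch of how the helpers above fit together. It assumes that
# ./Datasets/ml-100k/train.csv and test.csv exist as tab-separated,
# header-less (user, item, rating, timestamp) files, i.e. that a train/test
# split has already been produced elsewhere (the raw ml-100k archive only
# ships files such as u.data and the u1.base/u1.test splits). sp.SVD is just
# one ready-made Surprise algorithm, and the output file names below are
# hypothetical.
if __name__ == '__main__':
    download_movielens_100k_dataset()

    train_read = pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
    test_read = pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
    train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = \
        data_to_csr(train_read, test_read)
    print('Train matrix shape:', train_ui.shape)

    # Fit one ready-made algorithm end to end: top-N recommendations for the
    # anti-trainset plus rating estimations for the test set
    ready_made(sp.SVD(), reco_path='Reco_SVD.csv', estimations_path='Est_SVD.csv')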