import os
import time
import zipfile
from collections import defaultdict
from itertools import chain
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import surprise as sp


def data_to_csr(train_read, test_read):
    train_read.columns = ['user', 'item', 'rating', 'timestamp']
    test_read.columns = ['user', 'item', 'rating', 'timestamp']

    # Build the whole dataset so that user and item codes are consistent
    # across the train and test sets
    train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)
    train_and_test['user_code'] = train_and_test['user'].astype("category").cat.codes
    train_and_test['item_code'] = train_and_test['item'].astype("category").cat.codes

    # Dictionaries translating between original ids and consecutive integer codes
    user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
    user_id_code = dict((v, k) for k, v in user_code_id.items())
    item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
    item_id_code = dict((v, k) for k, v in item_code_id.items())

    # Recover the train/test split, now with the code columns attached
    train_df = pd.merge(train_read, train_and_test, on=list(train_read.columns))
    test_df = pd.merge(test_read, train_and_test, on=list(train_read.columns))

    # Take the numbers of users and items
    (U, I) = (train_and_test['user_code'].max() + 1, train_and_test['item_code'].max() + 1)

    # Create sparse CSR matrices
    train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
    test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))

    return train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code
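
# A minimal usage sketch (an assumption: the tab-separated, headerless train/test
# files used by ready_made() below already exist):
#
# train_read = pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
# test_read = pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
# train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = \
#     data_to_csr(train_read, test_read)
# train_ui.shape  # (number of users, number of items)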


def get_top_n(predictions, n=10):
    # Build a dictionary whose keys are users and whose values are
    # lists of (item, estimated score) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    result = []
    # Keep the n best items per user, flattened into the format:
    # (user, item1, score1, item2, score2, ...)
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        result.append([uid] + list(chain(*user_ratings[:n])))
    return result
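
# An illustrative run (made-up values): for
#   predictions = [('u1', 'i1', 4.0, 3.8, {}), ('u1', 'i2', 3.0, 4.2, {})]
# get_top_n(predictions, n=2) returns
#   [['u1', 'i2', 4.2, 'i1', 3.8]]
# i.e. one flattened row per user, items sorted by estimated score.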


def ready_made(algo, reco_path, estimations_path):
    reader = sp.Reader(line_format='user item rating timestamp', sep='\t')
    trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
    # Surprise algorithms consume a surprise.trainset.Trainset, not a raw Dataset
    trainset = trainset.build_full_trainset()

    testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
    testset = testset.build_full_trainset().build_testset()

    algo.fit(trainset)

    # We want to predict ratings of (user, item) pairs which are not in the train set
    antitrainset = trainset.build_anti_testset()
    print('Generating predictions...')
    predictions = algo.test(antitrainset)
    print('Generating top N recommendations...')
    top_n = get_top_n(predictions, n=10)
    top_n = pd.DataFrame(top_n)
    top_n.to_csv(reco_path, index=False, header=False)

    print('Generating estimations...')
    predictions = algo.test(testset)
    predictions_df = []
    for uid, iid, true_r, est, _ in predictions:
        predictions_df.append([uid, iid, est])
    predictions_df = pd.DataFrame(predictions_df)
    predictions_df.to_csv(estimations_path, index=False, header=False)
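
# A hedged usage sketch: any Surprise algorithm can be passed in; SVD is just
# one choice, and the output paths are illustrative:
#
# ready_made(sp.SVD(), reco_path='svd_reco.csv', estimations_path='svd_estimations.csv')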


def download_movielens_100k_dataset(force=False):
    os.makedirs('./Datasets/', exist_ok=True)
    if not os.path.isdir('Datasets/ml-100k') or force:
        url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
        tmp_file_path = 'Datasets/ml-100k.zip'
        urlretrieve(url, tmp_file_path)

        with zipfile.ZipFile(tmp_file_path, 'r') as tmp_zip:
            tmp_zip.extractall('Datasets/')
        os.remove(tmp_file_path)
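
# Example call (a no-op when Datasets/ml-100k already exists, unless force=True):
#
# download_movielens_100k_dataset()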