# WSS-project/helpers.py

import os
import time
import zipfile
from collections import defaultdict
from itertools import chain

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import surprise as sp
from six.moves.urllib.request import urlretrieve


def data_to_csr(train_read, test_read):
    train_read.columns = ['user', 'item', 'rating', 'timestamp']
    test_read.columns = ['user', 'item', 'rating', 'timestamp']
    # Build the whole dataset so that user and item codes are consistent
    # across the train and test sets
    train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)
    train_and_test['user_code'] = train_and_test['user'].astype("category").cat.codes
    train_and_test['item_code'] = train_and_test['item'].astype("category").cat.codes
    # Dictionaries translating between raw ids and dense integer codes, in both directions
    user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
    user_id_code = dict((v, k) for k, v in user_code_id.items())
    item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
    item_id_code = dict((v, k) for k, v in item_code_id.items())
    # Attach the codes back to the separate train and test frames
    train_df = pd.merge(train_read, train_and_test, on=list(train_read.columns))
    test_df = pd.merge(test_read, train_and_test, on=list(test_read.columns))
    # Number of users and items
    (U, I) = (train_and_test['user_code'].max() + 1, train_and_test['item_code'].max() + 1)
    # Create sparse CSR user-item matrices of shape (U, I)
    train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
    test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))
    return train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code
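
# A minimal usage sketch for data_to_csr. The paths and the tab separator are
# assumptions, mirroring how ready_made() below reads the MovieLens 100k split:
#
#   train_read = pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
#   test_read = pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
#   train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = \
#       data_to_csr(train_read, test_read)
#   print(train_ui.shape)  # (number of users, number of items)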

def get_top_n(predictions, n=10):
    # Build a dictionary whose values are lists of (item, estimated score) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    result = []
    # For each user take the n best items, flattened into the format:
    # (user, item1, score1, item2, score2, ...)
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        result.append([uid] + list(chain(*user_ratings[:n])))
    return result
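
# A small sketch of get_top_n on hand-made tuples in the
# (uid, iid, true_r, est, details) format that Surprise's algo.test() returns:
#
#   preds = [('u1', 'i1', 4.0, 3.5, None), ('u1', 'i2', 3.0, 4.2, None),
#            ('u2', 'i1', 5.0, 4.8, None)]
#   get_top_n(preds, n=2)
#   # -> [['u1', 'i2', 4.2, 'i1', 3.5], ['u2', 'i1', 4.8]]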

def ready_made(algo, reco_path, estimations_path):
    reader = sp.Reader(line_format='user item rating timestamp', sep='\t')
    trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
    trainset = trainset.build_full_trainset()  # Surprise algorithms need a Trainset object
    testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
    testset = testset.build_full_trainset().build_testset()
    algo.fit(trainset)
    # We want to predict ratings of (user, item) pairs which are not in the train set
    antitrainset = trainset.build_anti_testset()
    print('Generating predictions...')
    predictions = algo.test(antitrainset)
    print('Generating top N recommendations...')
    top_n = get_top_n(predictions, n=10)
    top_n = pd.DataFrame(top_n)
    top_n.to_csv(reco_path, index=False, header=False)
    print('Generating estimations for the test set...')
    predictions = algo.test(testset)
    predictions_df = []
    for uid, iid, true_r, est, _ in predictions:
        predictions_df.append([uid, iid, est])
    predictions_df = pd.DataFrame(predictions_df)
    predictions_df.to_csv(estimations_path, index=False, header=False)
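
# Usage sketch for ready_made (the output file names are hypothetical; the
# function itself expects the tab-separated split under ./Datasets/ml-100k/):
#
#   algo = sp.KNNBasic()  # any Surprise algorithm works here
#   ready_made(algo, reco_path='reco.csv', estimations_path='estimations.csv')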

def download_movielens_100k_dataset(force=False):
    os.makedirs('./Datasets/', exist_ok=True)
    if not os.path.isdir('Datasets/ml-100k') or force:
        url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
        tmp_file_path = 'Datasets/ml-100k.zip'
        urlretrieve(url, tmp_file_path)
        with zipfile.ZipFile(tmp_file_path, 'r') as tmp_zip:
            tmp_zip.extractall('Datasets/')
        os.remove(tmp_file_path)
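

if __name__ == '__main__':
    # A minimal end-to-end sketch, assuming train.csv and test.csv already
    # exist under ./Datasets/ml-100k/ (the zip itself ships u.data and the
    # ua.base/ua.test splits, so that file pair has to be prepared
    # separately). The algorithm choice and output file names below are
    # assumptions for illustration only.
    download_movielens_100k_dataset()
    algo = sp.SVD(n_factors=100, random_state=0)  # any Surprise algorithm
    ready_made(algo, reco_path='reco.csv', estimations_path='estimations.csv')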