# warsztaty-B/helpers.py

import pandas as pd
import numpy as np
import scipy.sparse as sparse
import surprise as sp
import time
from collections import defaultdict
from itertools import chain


def data_to_csr(train_read, test_read):
    train_read.columns = ['user', 'item', 'rating', 'timestamp']
    test_read.columns = ['user', 'item', 'rating', 'timestamp']

    # Let's build the whole dataset and assign dense integer codes to users and items
    train_and_test = pd.concat([train_read, test_read], axis=0, ignore_index=True)
    train_and_test['user_code'] = train_and_test['user'].astype("category").cat.codes
    train_and_test['item_code'] = train_and_test['item'].astype("category").cat.codes

    # Mappings between original ids and integer codes (in both directions)
    user_code_id = dict(enumerate(train_and_test['user'].astype("category").cat.categories))
    user_id_code = dict((v, k) for k, v in user_code_id.items())
    item_code_id = dict(enumerate(train_and_test['item'].astype("category").cat.categories))
    item_id_code = dict((v, k) for k, v in item_code_id.items())

    # Bring the codes back onto the separate train and test frames
    train_df = pd.merge(train_read, train_and_test, on=list(train_read.columns))
    test_df = pd.merge(test_read, train_and_test, on=list(test_read.columns))

    # Take the number of users and items
    (U, I) = (train_and_test['user_code'].max() + 1, train_and_test['item_code'].max() + 1)

    # Create sparse CSR matrices
    train_ui = sparse.csr_matrix((train_df['rating'], (train_df['user_code'], train_df['item_code'])), shape=(U, I))
    test_ui = sparse.csr_matrix((test_df['rating'], (test_df['user_code'], test_df['item_code'])), shape=(U, I))

    return train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code
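
# Example usage (a sketch, not part of the original file); it assumes the ML-100k
# splits are tab-separated files without a header, as in ready_made() below:
#
#   train_read = pd.read_csv('./Datasets/ml-100k/train.csv', sep='\t', header=None)
#   test_read = pd.read_csv('./Datasets/ml-100k/test.csv', sep='\t', header=None)
#   train_ui, test_ui, user_code_id, user_id_code, item_code_id, item_id_code = \
#       data_to_csr(train_read, test_read)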


def get_top_n(predictions, n=10):
    # Here we create a dictionary whose values are lists of (item, estimated score) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    result = []
    # Let's choose the n best items per user, in the format: (user, item1, score1, item2, score2, ...)
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        result.append([uid] + list(chain(*user_ratings[:n])))
    return result
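
# Illustration (a sketch, not from the original file): given Surprise prediction
# tuples such as
#   [('u1', 'i3', 4.0, 4.6, {}), ('u1', 'i7', 3.0, 3.9, {})]
# get_top_n returns one row per user, with item/score pairs sorted by score:
#   [['u1', 'i3', 4.6, 'i7', 3.9]]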


def ready_made(algo, reco_path, estimations_path):
    reader = sp.Reader(line_format='user item rating timestamp', sep='\t')

    trainset = sp.Dataset.load_from_file('./Datasets/ml-100k/train.csv', reader=reader)
    trainset = trainset.build_full_trainset()  # surprise.Trainset object, required by the Surprise API
    testset = sp.Dataset.load_from_file('./Datasets/ml-100k/test.csv', reader=reader)
    testset = testset.build_full_trainset().build_testset()

    algo.fit(trainset)

    # We want to predict ratings of (user, item) pairs which are not in the train set
    antitrainset = trainset.build_anti_testset()
    print('Generating predictions...')
    predictions = algo.test(antitrainset)

    print('Generating top N recommendations...')
    top_n = get_top_n(predictions, n=10)
    top_n = pd.DataFrame(top_n)
    top_n.to_csv(reco_path, index=False, header=False)

    print('Generating estimations for the test set...')
    predictions = algo.test(testset)
    predictions_df = []
    for uid, iid, true_r, est, _ in predictions:
        predictions_df.append([uid, iid, est])
    predictions_df = pd.DataFrame(predictions_df)
    predictions_df.to_csv(estimations_path, index=False, header=False)
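

# A minimal end-to-end sketch (an assumption, not part of the original helpers):
# any Surprise algorithm exposing fit()/test() should work here; the SVD
# hyperparameters and output paths below are purely illustrative.
if __name__ == '__main__':
    example_algo = sp.SVD(n_factors=100, random_state=0)
    ready_made(example_algo,
               reco_path='example_reco.csv',
               estimations_path='example_estimations.csv')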