first commit

commit f6ce2585b8

108  .gitignore  vendored  Normal file
@@ -0,0 +1,108 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# PyCharm project settings
.idea

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

simulation.spec
BIN  Class_1_Recommender_systems_introduction.pdf  Normal file
Binary file not shown.
BIN  Class_4_Content_based_recommenders.one  Normal file
Binary file not shown.
BIN  Class_4_Content_based_recommenders.pdf  Normal file
Binary file not shown.
BIN  Class_4_Recommender_systems_testing_and_evaluation.pdf  Normal file
Binary file not shown.
BIN  Class_5_Amazon_recommender.one  Normal file
Binary file not shown.
BIN  Class_5_Amazon_recommender.pdf  Normal file
Binary file not shown.
0       __init__.py  Normal file
4669    class_2_numpy_pandas.ipynb  Normal file
File diff suppressed because it is too large.
1131    class_3_content_based_recommenders.ipynb  Normal file
File diff suppressed because it is too large.
1719    class_5_amazon_recommender.ipynb  Normal file
File diff suppressed because it is too large.
2599    class_6_collaborative_filtering.ipynb  Normal file
File diff suppressed because it is too large.
16103   data/hotel_data/hotel_data_interactions_df.csv  Normal file
File diff suppressed because it is too large.
17251   data/hotel_data/hotel_data_original.csv  Normal file
File diff suppressed because it is too large.
16103   data/hotel_data/hotel_data_preprocessed.csv  Normal file
File diff suppressed because it is too large.
9743    data/movielens_small/links.csv  Normal file
File diff suppressed because it is too large.
9743    data/movielens_small/movies.csv  Normal file
File diff suppressed because it is too large.
100837  data/movielens_small/ratings.csv  Normal file
File diff suppressed because it is too large.
153     data/movielens_small/readme.txt  Normal file
@@ -0,0 +1,153 @@
Summary
=======

This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows.

This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.

This and other GroupLens data sets are publicly available for download at <http://grouplens.org/datasets/>.


Usage License
=============

Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions:

* The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group.
* The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information).
* The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions.
* The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota.
* The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction.

In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate).

If you have any further questions or comments, please email <grouplens-info@umn.edu>


Citation
========

To acknowledge use of the dataset in publications, please cite the following paper:

> F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>

Further Information About GroupLens
===================================

GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including:

* recommender systems
* online communities
* mobile and ubiquitous technologies
* digital libraries
* local geographic information systems

GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit <http://movielens.org> to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at <grouplens-info@cs.umn.edu> - we are always interested in working with external collaborators.

Content and Use of Files
========================

Formatting and Encoding
-----------------------

The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8.


User Ids
--------

MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files).


Movie Ids
---------

Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL <https://movielens.org/movies/1>). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).


Ratings Data File Structure (ratings.csv)
-----------------------------------------

All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:

userId,movieId,rating,timestamp

The lines within this file are ordered first by userId, then, within user, by movieId.

Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.


Tags Data File Structure (tags.csv)
-----------------------------------

All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format:

userId,movieId,tag,timestamp

The lines within this file are ordered first by userId, then, within user, by movieId.

Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.


Movies Data File Structure (movies.csv)
---------------------------------------

Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:

movieId,title,genres

Movie titles are entered manually or imported from <https://www.themoviedb.org/>, and include the year of release in parentheses. Errors and inconsistencies may exist in these titles.

Genres are a pipe-separated list, and are selected from the following:

* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)


Links Data File Structure (links.csv)
-------------------------------------

Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:

movieId,imdbId,tmdbId

movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.

imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.

tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.

Use of the resources listed above is subject to the terms of each provider.


Cross-Validation
----------------

Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples.
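The file layouts described in this readme map directly onto a few pandas calls. A minimal loading sketch (paths assume the `data/movielens_small/` directory added in this commit; the joined frame is only an illustration, not part of the dataset):

```python
import pandas as pd

base = "data/movielens_small/"

# Each file has a single header row; commas inside titles are quoted, encoding is UTF-8.
ratings = pd.read_csv(base + "ratings.csv", encoding="utf-8")  # userId,movieId,rating,timestamp
movies = pd.read_csv(base + "movies.csv", encoding="utf-8")    # movieId,title,genres
tags = pd.read_csv(base + "tags.csv", encoding="utf-8")        # userId,movieId,tag,timestamp
links = pd.read_csv(base + "links.csv", encoding="utf-8")      # movieId,imdbId,tmdbId

# Genres are a pipe-separated list; split them into Python lists.
movies["genres"] = movies["genres"].str.split("|")

# Timestamps are seconds since the Unix epoch (UTC).
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")

# movieId is consistent across all four files, so ratings can be joined with titles directly.
rated_movies = ratings.merge(movies, on="movieId", how="left")
print(rated_movies.head())
```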
3684    data/movielens_small/tags.csv  Normal file
File diff suppressed because it is too large.
18      data/steam/readme.txt  Normal file
@@ -0,0 +1,18 @@
https://www.kaggle.com/tamber/steam-video-games

Context
Steam is the world's most popular PC Gaming hub, with over 6,000 games and a community of millions of gamers. With a massive collection that includes everything from AAA blockbusters to small indie titles, great discovery tools are a highly valuable asset for Steam. How can we make them better?

Content
This dataset is a list of user behaviors, with columns: user-id, game-title, behavior-name, value. The behaviors included are 'purchase' and 'play'. The value indicates the degree to which the behavior was performed - in the case of 'purchase' the value is always 1, and in the case of 'play' the value represents the number of hours the user has played the game.

Acknowledgements
This dataset is generated entirely from public Steam data, so we want to thank Steam for building such an awesome platform and community!

Inspiration
The dataset is formatted to be compatible with Tamber. Build a Tamber engine and take it for a spin!

Combine our collaborative filter's results with your favorite Machine Learning techniques with Ensemble Learning, or make Tamber do battle with something else you've built.

Have fun,
The Tamber Team
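The column layout above (user-id, game-title, behavior-name, value) reads straight into pandas. A minimal sketch, assuming the `data/steam/steam-200k.csv` file from this commit; the raw Kaggle export usually ships without a header row and sometimes with a trailing constant column, so both are handled defensively:

```python
import pandas as pd

# No header row in the raw export; supply column names from the readme.
steam = pd.read_csv("data/steam/steam-200k.csv", header=None)
steam = steam.iloc[:, :4]  # keep the four documented columns; drop any extra constant column
steam.columns = ["user_id", "game_title", "behavior", "value"]

# 'purchase' rows always carry value 1; 'play' rows carry hours played.
purchases = steam[steam["behavior"] == "purchase"]
playtime = steam[steam["behavior"] == "play"]
print(purchases.shape, playtime.shape)
```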
200000  data/steam/steam-200k.csv  Normal file
File diff suppressed because it is too large.
0       data_preprocessing/__init__.py  Normal file
278     data_preprocessing/data_preprocessing_toolkit.py  Normal file
@@ -0,0 +1,278 @@
# Load libraries ---------------------------------------------

from datetime import datetime, timedelta
from dateutil.easter import easter
from data_preprocessing.dataset_specification import DatasetSpecification

import pandas as pd
import numpy as np
# ------------------------------------------------------------


class DataPreprocessingToolkit(object):

    def __init__(self):
        dataset_specification = DatasetSpecification()

        self.sum_columns = dataset_specification.get_sum_columns()
        self.mean_columns = dataset_specification.get_mean_columns()
        self.mode_columns = dataset_specification.get_mode_columns()
        self.first_columns = dataset_specification.get_first_columns()

        self.nights_buckets = dataset_specification.get_nights_buckets()
        self.npeople_buckets = dataset_specification.get_npeople_buckets()
        self.room_segment_buckets = dataset_specification.get_room_segment_buckets()

        self.arrival_terms = dataset_specification.get_arrival_terms()

        self.item_features_columns = dataset_specification.get_items_df_feature_columns()

    # #########################
    # Entire datasets functions
    # #########################

    def fix_date_to(self, df):
        df.loc[:, "date_to"] = df["date_to"].apply(lambda x: x + timedelta(days=1))
        return df

    def add_length_of_stay(self, df):
        df.loc[:, "length_of_stay"] = (df["date_to"] - df["date_from"]).dt.days
        return df

    def add_book_to_arrival(self, df):
        df.loc[:, "book_to_arrival"] = (df["date_from"] - df["booking_date"]).dt.days
        return df

    def add_nrooms(self, df):
        df.loc[:, "n_rooms"] = 1
        return df

    def add_weekend_stay(self, df):
        s = df["date_from"].dt.dayofweek
        e = df["date_to"].dt.dayofweek
        dt = (df["date_to"] - df["date_from"]).dt.days
        df.loc[:, "weekend_stay"] = (((s >= 4) & (s != 6)) | (e >= 5) | ((e < s) & (s != 6)) | (dt >= 6))
        df.loc[:, "weekend_stay"] = df["weekend_stay"].replace({True: 'True', False: 'False'})
        return df

    def add_night_price(self, df):
        df.loc[:, "night_price"] = np.round(df["accomodation_price"] / df["length_of_stay"] / df["n_rooms"], 2)
        return df

    def clip_book_to_arrival(self, df):
        df.loc[:, "book_to_arrival"] = np.maximum(df["book_to_arrival"], 0)
        return df

    def sum_npeople(self, df):
        df.loc[:, "n_people"] = np.maximum(df["n_people"] + df["n_children_1"] + df["n_children_2"] + df["n_children_3"], 1)
        return df

    def filter_out_company_clients(self, df):
        df = df.loc[df["is_company"] == 0]
        return df

    def filter_out_long_stays(self, df):
        df = df.loc[df["length_of_stay"] <= 21]
        return df

    def leave_one_from_group_reservations(self, df):
        unique_group_rows = []

        df.loc[:, "group_id"] = df["group_id"].fillna(-1)

        group_ids = []
        for idx, row in df.iterrows():
            if row["group_id"] != -1:
                if row["group_id"] not in group_ids:
                    unique_group_rows.append(row)
                    group_ids.append(row["group_id"])
            else:
                unique_group_rows.append(row)

        cleaned_dataset = pd.DataFrame(unique_group_rows, columns=df.columns)

        return cleaned_dataset

    def aggregate_group_reservations(self, df):
        non_group_reservations = df.loc[df["group_id"] == "",
                                        self.sum_columns + self.mean_columns + self.mode_columns + self.first_columns]
        group_reservations = df.loc[df["group_id"] != ""]

        agg_datasets = [group_reservations.loc[:, ["group_id"] + self.sum_columns].groupby("group_id").sum(),
                        group_reservations.loc[:, ["group_id"] + self.mean_columns].groupby("group_id").mean(),
                        group_reservations.loc[:, ["group_id"] + self.mode_columns].groupby("group_id").agg(lambda x: x.value_counts().index[0]),
                        group_reservations.loc[:, ["group_id"] + self.first_columns].groupby("group_id").first()]

        group_reservations = agg_datasets[0]
        for i in range(1, len(agg_datasets)):
            group_reservations = group_reservations.merge(agg_datasets[i], on="group_id")

        group_reservations = group_reservations.reset_index(drop=True)

        df = pd.concat([non_group_reservations, group_reservations])

        return df

    def leave_only_ota(self, df):
        df = df.loc[df.loc[:, "Source"].apply(lambda x: "booking" in x.lower() or "expedia" in x.lower())]
        return df

    def map_date_to_term_datasets(self, df):
        df.loc[:, "date_from"] = df["date_from"].astype(str).apply(lambda x: x[:10])
        df.loc[:, 'term'] = df['date_from'].apply(lambda x: self.map_date_to_term(x))
        return df

    def map_length_of_stay_to_nights_buckets(self, df):
        df.loc[:, 'length_of_stay_bucket'] = df['length_of_stay'].apply(lambda x: self.map_value_to_bucket(x, self.nights_buckets))
        return df

    def map_night_price_to_room_segment_buckets(self, df):
        night_prices = df.loc[df['accomodation_price'] > 1]\
            .groupby('room_group_id')['night_price'].mean().reset_index()
        night_prices.columns = ['room_group_id', 'room_night_price']
        df = pd.merge(df, night_prices, on=['room_group_id'], how='left')
        df.loc[df['room_night_price'].isnull(), 'room_night_price'] = 0.0
        df.loc[:, 'room_segment'] = df['room_night_price'].apply(
            lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
        df = df.drop(columns=['room_night_price'])
        return df

    # def map_night_price_to_room_segment_buckets(self, df):
    #     night_prices = df.loc[df['accomodation_price'] > 1]\
    #         .groupby(['term', 'room_group_id'])['night_price'].mean().reset_index()
    #     night_prices.columns = ['term', 'room_group_id', 'termnight_price']
    #     df = pd.merge(df, night_prices, on=['term', 'room_group_id'], how='left')
    #     df.loc[:, 'room_segment'] = df['termnight_price'].apply(
    #         lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
    #     df = df.drop(columns=['termnight_price'])
    #     return df

    def map_npeople_to_npeople_buckets(self, df):
        df.loc[:, 'n_people_bucket'] = df['n_people'].apply(lambda x: self.map_value_to_bucket(x, self.npeople_buckets))
        return df

    def map_item_to_item_id(self, df):
        df.loc[:, 'item'] = df[self.item_features_columns].astype(str).agg(' '.join, axis=1)

        ids = df['item'].unique().tolist()
        mapping = {ids[i]: i for i in range(len(ids))}

        df['item_id'] = df['item'].apply(lambda x: mapping[x])

        return df

    def add_interaction_id(self, df):
        df.loc[:, 'interaction_id'] = range(df.shape[0])
        return df

    # ################
    # Column functions
    # ################

    def bundle_period(self, diff):
        diff = float(diff)
        if int(diff) < 0:
            return "<0"
        elif int(diff) <= 7:
            return diff
        elif 7 < int(diff) <= 14:
            return "<14"
        elif 14 < int(diff) <= 30:
            return "<30"
        elif 30 < int(diff) <= 60:
            return "<60"
        elif 60 < int(diff) <= 180:
            return "<180"
        elif int(diff) > 180:
            return ">180"

    def bundle_price(self, price):
        mod = 300.0
        return int((price + mod / 2) / mod) * mod

    def map_date_to_season(self, date):
        day = int(date[8:10])
        month = int(date[5:7])
        if (month == 12 and day >= 21) or (month == 1) or (month == 2) or (month == 3 and day <= 19):
            return "Winter"
        if (month == 3 and day >= 20) or (month == 4) or (month == 5) or (month == 6 and day <= 20):
            return "Spring"
        if (month == 6 and day >= 21) or (month == 7) or (month == 8) or (month == 9 and day <= 22):
            return "Summer"
        if (month == 9 and day >= 23) or (month == 10) or (month == 11) or (month == 12 and day <= 20):
            return "Autumn"

    def map_value_to_bucket(self, value, buckets):
        if value == "":
            return str(buckets[0]).replace(", ", "-")
        for bucket in buckets:
            if bucket[0] <= value <= bucket[1]:
                return str(bucket).replace(", ", "-")

    def map_date_to_term(self, date):

        m = int(date[5:7])
        d = int(date[8:10])
        term = None

        for arrival_term in self.arrival_terms:
            if arrival_term == "Easter":
                year = int(date[:4])
                easter_date = easter(year)
                easter_start = easter_date + timedelta(days=-4)
                easter_end = easter_date + timedelta(days=1)
                esm = easter_start.month
                esd = easter_start.day
                eem = easter_end.month
                eed = easter_end.day
                if ((m > esm) or (m == esm and d >= esd)) and ((m < eem) or (m == eem and d <= eed)):
                    term = arrival_term
                    break

            elif arrival_term == "NewYear":
                sm = self.arrival_terms[arrival_term][0]["start"]["m"]
                sd = self.arrival_terms[arrival_term][0]["start"]["d"]
                em = self.arrival_terms[arrival_term][0]["end"]["m"]
                ed = self.arrival_terms[arrival_term][0]["end"]["d"]
                if ((m > sm) or (m == sm and d >= sd)) or ((m < em) or (m == em and d <= ed)):
                    term = arrival_term
                    break

            else:
                is_match = False

                for i in range(len(self.arrival_terms[arrival_term])):
                    sm = self.arrival_terms[arrival_term][i]["start"]["m"]
                    sd = self.arrival_terms[arrival_term][i]["start"]["d"]
                    em = self.arrival_terms[arrival_term][i]["end"]["m"]
                    ed = self.arrival_terms[arrival_term][i]["end"]["d"]
                    if ((m > sm) or (m == sm and d >= sd)) and ((m < em) or (m == em and d <= ed)):
                        term = arrival_term
                        is_match = True
                        break

                if is_match:
                    break

        return term

    def map_dates_to_terms(self, dates):

        terms = []
        for date in dates:
            term = self.map_date_to_term(date)
            terms.append(term)

        return terms

    def filter_out_historical_dates(self, date_list):
        """
        Filters out past dates from a list of dates.
        """
        future_dates = []

        for date in date_list:
            if date >= datetime.now():
                future_dates.append(date.strftime("%Y-%m-%d"))

        return future_dates
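A quick sketch of how these methods are typically chained on the raw hotel reservation frame. Column names follow `dataset_specification.py`; the path and the exact call order are illustrative assumptions, not the canonical pipeline of this repository:

```python
import pandas as pd

from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit

# Assumes the raw CSV exposes the date columns used by the toolkit.
raw_df = pd.read_csv("data/hotel_data/hotel_data_original.csv",
                     parse_dates=["date_from", "date_to", "booking_date"])

tk = DataPreprocessingToolkit()

# Each method takes a DataFrame and returns the transformed DataFrame,
# so the steps compose into a simple pipeline.
df = tk.fix_date_to(raw_df)
df = tk.add_length_of_stay(df)
df = tk.add_book_to_arrival(df)
df = tk.add_nrooms(df)
df = tk.add_weekend_stay(df)
df = tk.add_night_price(df)
df = tk.clip_book_to_arrival(df)
df = tk.filter_out_company_clients(df)
df = tk.filter_out_long_stays(df)

# Bucketize features and derive the item ids used by the recommenders.
df = tk.map_date_to_term_datasets(df)
df = tk.map_length_of_stay_to_nights_buckets(df)
df = tk.map_night_price_to_room_segment_buckets(df)
df = tk.map_npeople_to_npeople_buckets(df)
df = tk.map_item_to_item_id(df)
df = tk.add_interaction_id(df)
```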
88      data_preprocessing/dataset_specification.py  Normal file
@@ -0,0 +1,88 @@
# Load libraries ---------------------------------------------

from collections import defaultdict
import numpy as np

# ------------------------------------------------------------


class DatasetSpecification(object):

    def __init__(self):
        pass

    # ################
    # Original dataset functions
    # ################

    def get_sum_columns(self):
        return ["n_people", "n_children_1", "n_children_2", "n_children_3", "accomodation_price", "meal_price",
                "service_price", "paid", "n_rooms"]

    def get_mean_columns(self):
        return ['discount']

    def get_mode_columns(self):
        return ["room_id", "room_group_id", "date_from", "date_to", "booking_date", "rate_plan",
                "length_of_stay", "book_to_arrival", "weekend_stay"]

    def get_first_columns(self):
        return ["user_id", "client_id", "client_name", "email", "phone", "is_company"]

    def get_id_columns(self):
        return ["client_id", "client_name", "email", "phone"]

    # ################
    # Output dataset functions
    # ################

    def get_people_df_id_columns(self):
        return ['user_id']

    def get_people_df_feature_columns(self):
        return []

    def get_items_df_id_columns(self):
        return ['item_id']

    def get_items_df_feature_columns(self):
        return ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

    def get_purchases_df_id_columns(self):
        return ['user_id', 'item_id']

    def get_purchases_df_feature_columns(self):
        return []

    # ################
    # Mapping functions
    # ################

    def get_nights_buckets(self):
        return [[0, 1], [2, 3], [4, 7], [8, np.inf]]

    def get_npeople_buckets(self):
        return [[1, 1], [2, 2], [3, 4], [5, np.inf]]

    def get_room_segment_buckets(self):
        return [[0, 160], [160, 260], [260, 360], [360, 500], [500, 900], [900, np.inf]]

    def get_book_to_arrival_buckets(self):
        return [[0, 0], [1, 2], [3, 4], [5, 7], [8, 14], [15, 30], [31, 60], [61, 90], [91, 180], [181, np.inf]]

    def get_arrival_terms(self):
        arrival_terms = {"Easter": [{"start": {"m": np.nan, "d": np.nan}, "end": {"m": np.nan, "d": np.nan}}],  # Treated with priority
                         "Christmas": [{"start": {"m": 12, "d": 22}, "end": {"m": 12, "d": 27}}],
                         "NewYear": [{"start": {"m": 12, "d": 28}, "end": {"m": 1, "d": 4}}],
                         "WinterVacation": [{"start": {"m": 1, "d": 5}, "end": {"m": 2, "d": 29}}],
                         "OffSeason": [
                             {"start": {"m": 3, "d": 1}, "end": {"m": 4, "d": 27}},
                             {"start": {"m": 5, "d": 6}, "end": {"m": 6, "d": 20}},
                             {"start": {"m": 9, "d": 26}, "end": {"m": 12, "d": 21}}],
                         "MayLongWeekend": [{"start": {"m": 4, "d": 28}, "end": {"m": 5, "d": 5}}],
                         "LowSeason": [
                             {"start": {"m": 6, "d": 21}, "end": {"m": 7, "d": 10}},
                             {"start": {"m": 8, "d": 23}, "end": {"m": 9, "d": 25}}],
                         "HighSeason": [{"start": {"m": 7, "d": 11}, "end": {"m": 8, "d": 22}}]}
        return arrival_terms
77      data_preprocessing/people_identifier.py  Normal file
@@ -0,0 +1,77 @@
# Load libraries ---------------------------------------------

# ------------------------------------------------------------


class PeopleIdentifier(object):

    def __init__(self):
        self.id_column_names = []
        self.pid_cname = ""
        self.next_available_pid = 0
        self.cid_to_pid = {}  # {"col1": {cid1: pid1, cid2: pid2}, "col2": ...}
        self.pid_to_cid = {}  # {pid1: {"col1": set(cid1, cid2, ...), "col2": set(...), ...}, pid2: ...}
        self.data = None

    def add_pid(self, data, id_column_names, pid_cname):
        self.id_column_names = id_column_names
        self.pid_cname = pid_cname

        for cid_cname in id_column_names:
            self.cid_to_pid[cid_cname] = {}

        for idx, reservation in data.iterrows():
            pids = set()
            for cid_cname in id_column_names:
                if reservation[cid_cname] in self.cid_to_pid[cid_cname]:
                    pids.add(self.cid_to_pid[cid_cname][reservation[cid_cname]])
                    # print(cid_cname, reservation[cid_cname], self.cid_to_pid[cid_cname][reservation[cid_cname]])

            if len(pids) > 0:
                min_pid = min(pids)

                self.set_pid(min_pid, reservation)

                # Merge pids connected through this node
                if len(pids) > 1:
                    pids.remove(min_pid)
                    self.merge_pids(pids, min_pid)

                # print("Chosen pid: {}".format(min_pid))
            else:
                new_pid = self.next_available_pid
                self.next_available_pid += 1

                self.set_pid(new_pid, reservation)
                # print("Chosen pid: {}".format(new_pid))

            # print("=======")
            # print(self.pid_to_cid)
            # print("=======")

        data_pid = data.copy()
        data_pid.loc[:, pid_cname] = data_pid.loc[:, id_column_names[0]].apply(lambda x: self.cid_to_pid[id_column_names[0]][x])
        self.data = data_pid

        return data_pid

    def set_pid(self, pid, reservation):
        for cid_cname in self.id_column_names:
            if reservation[cid_cname] != "":
                self.cid_to_pid[cid_cname][reservation[cid_cname]] = pid
        if pid in self.pid_to_cid:
            for cid_cname in self.id_column_names:
                self.pid_to_cid[pid][cid_cname] |= {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
        else:
            self.pid_to_cid[pid] = {cid_cname: {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
                                    for cid_cname in self.id_column_names}

    def merge_pids(self, pids_from, pid_to):
        # print("Merge pids", pids_from, pid_to, self.pid_to_cid)
        for pid_from in pids_from:
            for cid_cname in self.id_column_names:
                for cid in self.pid_to_cid[pid_from][cid_cname]:
                    self.cid_to_pid[cid_cname][cid] = pid_to
                self.pid_to_cid[pid_to][cid_cname] |= self.pid_to_cid[pid_from][cid_cname]
            self.pid_to_cid.pop(pid_from)
86      environment.yml  Normal file
@@ -0,0 +1,86 @@
name: rs-class-env
channels:
  - defaults
dependencies:
  - pip=21.0.1
  - python=3.8.8
  - setuptools=52.0.0
  - sqlite=3.35.1
  - wheel=0.36.2
  - pip:
    - anyio==2.2.0
    - argon2-cffi==20.1.0
    - async-generator==1.10
    - attrs==20.3.0
    - babel==2.9.0
    - backcall==0.2.0
    - bleach==3.3.0
    - cffi==1.14.5
    - chardet==4.0.0
    - colorama==0.4.4
    - cycler==0.10.0
    - decorator==4.4.2
    - defusedxml==0.7.1
    - entrypoints==0.3
    - idna==2.10
    - ipykernel==5.5.0
    - ipython==7.21.0
    - ipython-genutils==0.2.0
    - jedi==0.18.0
    - jinja2==2.11.3
    - joblib==1.0.1
    - json5==0.9.5
    - jsonschema==3.2.0
    - jupyter-client==6.1.12
    - jupyter-core==4.7.1
    - jupyter-packaging==0.7.12
    - jupyter-server==1.4.1
    - jupyterlab==3.0.11
    - jupyterlab-pygments==0.1.2
    - jupyterlab-server==2.3.0
    - kiwisolver==1.3.1
    - markupsafe==1.1.1
    - matplotlib==3.3.4
    - mistune==0.8.4
    - nbclassic==0.2.6
    - nbclient==0.5.3
    - nbconvert==6.0.7
    - nbformat==5.1.2
    - nest-asyncio==1.5.1
    - notebook==6.2.0
    - numpy==1.20.1
    - packaging==20.9
    - pandas==1.2.3
    - pandocfilters==1.4.3
    - parso==0.8.1
    - patsy==0.5.1
    - pickleshare==0.7.5
    - pillow==8.1.2
    - prometheus-client==0.9.0
    - prompt-toolkit==3.0.17
    - pycparser==2.20
    - pygments==2.8.1
    - pyparsing==2.4.7
    - pyrsistent==0.17.3
    - python-dateutil==2.8.1
    - pytz==2021.1
    - pyzmq==22.0.3
    - requests==2.25.1
    - scikit-learn==0.24.1
    - scipy==1.6.1
    - seaborn==0.11.1
    - send2trash==1.5.0
    - six==1.15.0
    - sklearn==0.0
    - sniffio==1.2.0
    - statsmodels==0.12.2
    - terminado==0.9.3
    - testpath==0.4.4
    - threadpoolctl==2.1.0
    - torch==1.8.0
    - tornado==6.1
    - traitlets==5.0.5
    - typing-extensions==3.7.4.3
    - urllib3==1.26.4
    - wcwidth==0.2.5
    - webencodings==0.5.1
0       evaluation_and_testing/__init__.py  Normal file
87      evaluation_and_testing/evaluation_measures.py  Normal file
@@ -0,0 +1,87 @@
# Load libraries ---------------------------------------------

import numpy as np
import pandas as pd
from collections import defaultdict

# ------------------------------------------------------------


def rmse(r_pred, r_real):
    return np.sqrt(np.sum(np.power(r_pred - r_real, 2)) / len(r_pred))


def mape(r_pred, r_real):
    return 1 / len(r_pred) * np.sum(np.abs(r_pred - r_real) / np.abs(r_real))


def tre(r_pred, r_real):
    return np.sum(np.abs(r_pred - r_real)) / np.sum(np.abs(r_real))


def hr(recommendations, real_interactions, n=1):
    """
    Assumes recommendations are ordered by user_id and then by score.

    :param pd.DataFrame recommendations:
    :param pd.DataFrame real_interactions:
    :param int n:
    """
    # Transform real_interactions to a dict for a large speed-up
    rui = defaultdict(lambda: 0)

    for idx, row in real_interactions.iterrows():
        rui[(row['user_id'], row['item_id'])] = 1

    result = 0.0

    previous_user_id = -1
    rank = 0
    for idx, row in recommendations.iterrows():
        if previous_user_id == row['user_id']:
            rank += 1
        else:
            rank = 1

        if rank <= n:
            result += rui[(row['user_id'], row['item_id'])]

        previous_user_id = row['user_id']

    if len(recommendations['user_id'].unique()) > 0:
        result /= len(recommendations['user_id'].unique())

    return result


def ndcg(recommendations, real_interactions, n=1):
    """
    Assumes recommendations are ordered by user_id and then by score.

    :param pd.DataFrame recommendations:
    :param pd.DataFrame real_interactions:
    :param int n:
    """
    # Transform real_interactions to a dict for a large speed-up
    rui = defaultdict(lambda: 0)

    for idx, row in real_interactions.iterrows():
        rui[(row['user_id'], row['item_id'])] = 1

    result = 0.0

    previous_user_id = -1
    rank = 0
    for idx, row in recommendations.iterrows():
        if previous_user_id == row['user_id']:
            rank += 1
        else:
            rank = 1

        if rank <= n:
            result += rui[(row['user_id'], row['item_id'])] / np.log2(1 + rank)

        previous_user_id = row['user_id']

    if len(recommendations['user_id'].unique()) > 0:
        result /= len(recommendations['user_id'].unique())

    return result
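A toy example of what these metrics expect (hypothetical data, purely to illustrate the DataFrame shapes): recommendations must be sorted by `user_id` and, within a user, by descending score; real interactions only need `user_id` and `item_id` columns.

```python
import pandas as pd

from evaluation_and_testing.evaluation_measures import hr, ndcg

# Two users, three ranked recommendations each (already ordered by user and score).
recommendations = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2, 2],
    'item_id': [10, 11, 12, 20, 21, 22],
    'score':   [0.9, 0.5, 0.1, 0.8, 0.6, 0.2],
})

# Held-out interactions: user 1 actually interacted with item 11, user 2 with item 20.
real_interactions = pd.DataFrame({
    'user_id': [1, 2],
    'item_id': [11, 20],
})

print(hr(recommendations, real_interactions, n=1))    # 0.5 -> only user 2's hit is in the top-1
print(hr(recommendations, real_interactions, n=3))    # 1.0 -> both hits fall within the top-3
print(ndcg(recommendations, real_interactions, n=3))  # user 1's hit at rank 2 is discounted by 1/log2(3)
```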
209     evaluation_and_testing/testing.py  Normal file
@@ -0,0 +1,209 @@
# Load libraries ---------------------------------------------

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from evaluation_and_testing.evaluation_measures import rmse
from evaluation_and_testing.evaluation_measures import mape
from evaluation_and_testing.evaluation_measures import tre
from evaluation_and_testing.evaluation_measures import hr
from evaluation_and_testing.evaluation_measures import ndcg

# ------------------------------------------------------------


def evaluate_train_test_split_explicit(recommender, interactions_df, items_df, seed=6789):
    rng = np.random.RandomState(seed=seed)

    if isinstance(interactions_df, dict):
        # If interactions_df is a dict with already split data, use the split
        interactions_df_train = interactions_df['train']
        interactions_df_test = interactions_df['test']
    else:
        # Otherwise split the dataset into train and test

        shuffle = np.arange(len(interactions_df))
        rng.shuffle(shuffle)
        shuffle = list(shuffle)

        train_test_split = 0.8
        split_index = int(len(interactions_df) * train_test_split)

        interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
        interactions_df_test = interactions_df.iloc[shuffle[split_index:]]

    # Train the recommender

    recommender.fit(interactions_df_train, None, items_df)

    # Gather predictions

    r_pred = []

    for idx, row in interactions_df_test.iterrows():
        users_df = pd.DataFrame([row['user_id']], columns=['user_id'])
        eval_items_df = pd.DataFrame([row['item_id']], columns=['item_id'])
        eval_items_df = pd.merge(eval_items_df, items_df, on='item_id')
        recommendations = recommender.recommend(users_df, eval_items_df, n_recommendations=1)

        r_pred.append(recommendations.iloc[0]['score'])

    # Gather real ratings

    r_real = np.array(interactions_df_test['rating'].tolist())

    # Return evaluation metrics

    return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)


def evaluate_train_test_split_implicit(recommender, interactions_df, items_df, seed=6789):
    rng = np.random.RandomState(seed=seed)

    if isinstance(interactions_df, dict):
        # If interactions_df is a dict with already split data, use the split
        interactions_df_train = interactions_df['train']
        interactions_df_test = interactions_df['test']
    else:
        # Otherwise split the dataset into train and test

        shuffle = np.arange(len(interactions_df))
        rng.shuffle(shuffle)
        shuffle = list(shuffle)

        train_test_split = 0.8
        split_index = int(len(interactions_df) * train_test_split)

        interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
        interactions_df_test = interactions_df.iloc[shuffle[split_index:]]

    hr_1 = []
    hr_3 = []
    hr_5 = []
    hr_10 = []
    ndcg_1 = []
    ndcg_3 = []
    ndcg_5 = []
    ndcg_10 = []

    # Train the recommender

    recommender.fit(interactions_df_train, None, items_df)

    # Make recommendations for each user in the test set and calculate the metric
    # against all items of that user in the test set

    test_user_interactions = interactions_df_test.groupby(by='user_id')

    for user_id, user_interactions in test_user_interactions:

        recommendations = recommender.recommend(pd.DataFrame([user_id], columns=['user_id']),
                                                items_df, n_recommendations=10)

        hr_1.append(hr(recommendations, user_interactions, n=1))
        hr_3.append(hr(recommendations, user_interactions, n=3))
        hr_5.append(hr(recommendations, user_interactions, n=5))
        hr_10.append(hr(recommendations, user_interactions, n=10))
        ndcg_1.append(ndcg(recommendations, user_interactions, n=1))
        ndcg_3.append(ndcg(recommendations, user_interactions, n=3))
        ndcg_5.append(ndcg(recommendations, user_interactions, n=5))
        ndcg_10.append(ndcg(recommendations, user_interactions, n=10))

    hr_1 = np.mean(hr_1)
    hr_3 = np.mean(hr_3)
    hr_5 = np.mean(hr_5)
    hr_10 = np.mean(hr_10)
    ndcg_1 = np.mean(ndcg_1)
    ndcg_3 = np.mean(ndcg_3)
    ndcg_5 = np.mean(ndcg_5)
    ndcg_10 = np.mean(ndcg_10)

    return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10


def evaluate_leave_one_out_explicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
    rng = np.random.RandomState(seed=seed)

    # Prepare splits of the datasets
    kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)

    # For each split of the dataset train the recommender, generate recommendations and evaluate

    r_pred = []
    r_real = []
    n_eval = 1
    for train_index, test_index in kf.split(interactions_df.index):
        interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
        interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]

        recommender.fit(interactions_df_train, None, items_df)
        recommendations = recommender.recommend(
            interactions_df_test.loc[:, ['user_id']],
            items_df.loc[items_df['item_id'] == interactions_df_test.iloc[0]['item_id']])

        r_pred.append(recommendations.iloc[0]['score'])
        r_real.append(interactions_df_test.iloc[0]['rating'])

        if n_eval == max_evals:
            break
        n_eval += 1

    r_pred = np.array(r_pred)
    r_real = np.array(r_real)

    # Return evaluation metrics

    return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)


def evaluate_leave_one_out_implicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
    rng = np.random.RandomState(seed=seed)

    # Prepare splits of the datasets
    kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)

    hr_1 = []
    hr_3 = []
    hr_5 = []
    hr_10 = []
    ndcg_1 = []
    ndcg_3 = []
    ndcg_5 = []
    ndcg_10 = []

    # For each split of the dataset train the recommender, generate recommendations and evaluate

    n_eval = 1
    for train_index, test_index in kf.split(interactions_df.index):
        interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
        interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]

        recommender.fit(interactions_df_train, None, items_df)
        recommendations = recommender.recommend(
            interactions_df_test.loc[:, ['user_id']], items_df, n_recommendations=10)

        hr_1.append(hr(recommendations, interactions_df_test, n=1))
        hr_3.append(hr(recommendations, interactions_df_test, n=3))
        hr_5.append(hr(recommendations, interactions_df_test, n=5))
        hr_10.append(hr(recommendations, interactions_df_test, n=10))
        ndcg_1.append(ndcg(recommendations, interactions_df_test, n=1))
        ndcg_3.append(ndcg(recommendations, interactions_df_test, n=3))
        ndcg_5.append(ndcg(recommendations, interactions_df_test, n=5))
        ndcg_10.append(ndcg(recommendations, interactions_df_test, n=10))

        if n_eval == max_evals:
            break
        n_eval += 1

    hr_1 = np.mean(hr_1)
    hr_3 = np.mean(hr_3)
    hr_5 = np.mean(hr_5)
    hr_10 = np.mean(hr_10)
    ndcg_1 = np.mean(ndcg_1)
    ndcg_3 = np.mean(ndcg_3)
    ndcg_5 = np.mean(ndcg_5)
    ndcg_10 = np.mean(ndcg_10)

    return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10
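For context, a rough sketch of how one of these evaluators is invoked. The `recommender` object is assumed to expose the `fit(interactions_df, users_df, items_df)` / `recommend(users_df, items_df, n_recommendations)` interface used above, and the way `items_df` is built here is only illustrative:

```python
import pandas as pd

from evaluation_and_testing.testing import evaluate_train_test_split_implicit

interactions_df = pd.read_csv("data/hotel_data/hotel_data_interactions_df.csv")
items_df = interactions_df[['item_id']].drop_duplicates()  # in practice, include the item feature columns as well

# recommender = SomeRecommender()  # hypothetical class implementing fit() and recommend()
results = evaluate_train_test_split_implicit(recommender, interactions_df, items_df, seed=6789)
print(dict(zip(['HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'], results)))
```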
27      git_configuration/.bash_profile  Normal file
@@ -0,0 +1,27 @@
# Enable tab completion
source ~/git-completion.bash

# colors!
green="\[\033[0;32m\]"
blue="\[\033[0;34m\]"
purple="\[\033[0;35m\]"
yellow="\[\033[0;33m\]"
reset="\[\033[0m\]"

# Change command prompt
source ~/git-prompt.sh
export GIT_PS1_SHOWDIRTYSTATE=1
# '\u' adds the name of the current user to the prompt
# '\$(__git_ps1)' adds git-related stuff
# '\W' adds the name of the current directory
export PS1="$purple\u$green\$(__git_ps1)$yellow \W $ $reset"

alias ntpd="C:/Program\ Files\ \(x86\)/Notepad++/notepad++.exe"

test -f ~/.profile && . ~/.profile
test -f ~/.bashrc && . ~/.bashrc
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
eval "$('/c/ProgramData/Anaconda3/Scripts/conda.exe' 'shell.bash' 'hook')"
# <<< conda initialize <<<
23      git_configuration/.bashrc  Normal file
@@ -0,0 +1,23 @@
env=~/.ssh/agent.env

agent_load_env () { test -f "$env" && . "$env" >| /dev/null ; }

agent_start () {
    (umask 077; ssh-agent >| "$env")
    . "$env" >| /dev/null ; }

agent_load_env

# agent_run_state: 0=agent running w/ key; 1=agent w/o key; 2=agent not running
agent_run_state=$(ssh-add -l >| /dev/null 2>&1; echo $?)

if [ ! "$SSH_AUTH_SOCK" ] || [ $agent_run_state = 2 ]; then
    agent_start
    ssh-add ~/.ssh/PZ_BitBucket_key;
    ssh-add ~/.ssh/PZ_GitHub_key;
elif [ "$SSH_AUTH_SOCK" ] && [ $agent_run_state = 1 ]; then
    ssh-add ~/.ssh/PZ_BitBucket_key;
    ssh-add ~/.ssh/PZ_GitHub_key;
fi

unset env
537     git_configuration/git-prompt.sh  Normal file
@@ -0,0 +1,537 @@
|
||||
# bash/zsh git prompt support
|
||||
#
|
||||
# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
|
||||
# Distributed under the GNU General Public License, version 2.0.
|
||||
#
|
||||
# This script allows you to see repository status in your prompt.
|
||||
#
|
||||
# To enable:
|
||||
#
|
||||
# 1) Copy this file to somewhere (e.g. ~/.git-prompt.sh).
|
||||
# 2) Add the following line to your .bashrc/.zshrc:
|
||||
# source ~/.git-prompt.sh
|
||||
# 3a) Change your PS1 to call __git_ps1 as
|
||||
# command-substitution:
|
||||
# Bash: PS1='[\u@\h \W$(__git_ps1 " (%s)")]\$ '
|
||||
# ZSH: setopt PROMPT_SUBST ; PS1='[%n@%m %c$(__git_ps1 " (%s)")]\$ '
|
||||
# the optional argument will be used as format string.
|
||||
# 3b) Alternatively, for a slightly faster prompt, __git_ps1 can
|
||||
# be used for PROMPT_COMMAND in Bash or for precmd() in Zsh
|
||||
# with two parameters, <pre> and <post>, which are strings
|
||||
# you would put in $PS1 before and after the status string
|
||||
# generated by the git-prompt machinery. e.g.
|
||||
# Bash: PROMPT_COMMAND='__git_ps1 "\u@\h:\w" "\\\$ "'
|
||||
# will show username, at-sign, host, colon, cwd, then
|
||||
# various status string, followed by dollar and SP, as
|
||||
# your prompt.
|
||||
# ZSH: precmd () { __git_ps1 "%n" ":%~$ " "|%s" }
|
||||
# will show username, pipe, then various status string,
|
||||
# followed by colon, cwd, dollar and SP, as your prompt.
|
||||
# Optionally, you can supply a third argument with a printf
|
||||
# format string to finetune the output of the branch status
|
||||
#
|
||||
# The repository status will be displayed only if you are currently in a
|
||||
# git repository. The %s token is the placeholder for the shown status.
|
||||
#
|
||||
# The prompt status always includes the current branch name.
|
||||
#
|
||||
# In addition, if you set GIT_PS1_SHOWDIRTYSTATE to a nonempty value,
|
||||
# unstaged (*) and staged (+) changes will be shown next to the branch
|
||||
# name. You can configure this per-repository with the
|
||||
# bash.showDirtyState variable, which defaults to true once
|
||||
# GIT_PS1_SHOWDIRTYSTATE is enabled.
|
||||
#
|
||||
# You can also see if currently something is stashed, by setting
|
||||
# GIT_PS1_SHOWSTASHSTATE to a nonempty value. If something is stashed,
|
||||
# then a '$' will be shown next to the branch name.
|
||||
#
|
||||
# If you would like to see if there're untracked files, then you can set
|
||||
# GIT_PS1_SHOWUNTRACKEDFILES to a nonempty value. If there're untracked
|
||||
# files, then a '%' will be shown next to the branch name. You can
|
||||
# configure this per-repository with the bash.showUntrackedFiles
|
||||
# variable, which defaults to true once GIT_PS1_SHOWUNTRACKEDFILES is
|
||||
# enabled.
|
||||
#
|
||||
# If you would like to see the difference between HEAD and its upstream,
|
||||
# set GIT_PS1_SHOWUPSTREAM="auto". A "<" indicates you are behind, ">"
|
||||
# indicates you are ahead, "<>" indicates you have diverged and "="
|
||||
# indicates that there is no difference. You can further control
|
||||
# behaviour by setting GIT_PS1_SHOWUPSTREAM to a space-separated list
|
||||
# of values:
|
||||
#
|
||||
# verbose show number of commits ahead/behind (+/-) upstream
|
||||
# name if verbose, then also show the upstream abbrev name
|
||||
# legacy don't use the '--count' option available in recent
|
||||
# versions of git-rev-list
|
||||
# git always compare HEAD to @{upstream}
|
||||
# svn always compare HEAD to your SVN upstream
|
||||
#
|
||||
# You can change the separator between the branch name and the above
|
||||
# state symbols by setting GIT_PS1_STATESEPARATOR. The default separator
|
||||
# is SP.
|
||||
#
|
||||
# By default, __git_ps1 will compare HEAD to your SVN upstream if it can
|
||||
# find one, or @{upstream} otherwise. Once you have set
|
||||
# GIT_PS1_SHOWUPSTREAM, you can override it on a per-repository basis by
|
||||
# setting the bash.showUpstream config variable.
|
||||
#
|
||||
# If you would like to see more information about the identity of
|
||||
# commits checked out as a detached HEAD, set GIT_PS1_DESCRIBE_STYLE
|
||||
# to one of these values:
|
||||
#
|
||||
# contains relative to newer annotated tag (v1.6.3.2~35)
|
||||
# branch relative to newer tag or branch (master~4)
|
||||
# describe relative to older annotated tag (v1.6.3.1-13-gdd42c2f)
|
||||
# tag relative to any older tag (v1.6.3.1-13-gdd42c2f)
|
||||
# default exactly matching tag
|
||||
#
|
||||
# If you would like a colored hint about the current dirty state, set
|
||||
# GIT_PS1_SHOWCOLORHINTS to a nonempty value. The colors are based on
|
||||
# the colored output of "git status -sb" and are available only when
|
||||
# using __git_ps1 for PROMPT_COMMAND or precmd.
|
||||
#
|
||||
# If you would like __git_ps1 to do nothing in the case when the current
|
||||
# directory is set up to be ignored by git, then set
|
||||
# GIT_PS1_HIDE_IF_PWD_IGNORED to a nonempty value. Override this on the
|
||||
# repository level by setting bash.hideIfPwdIgnored to "false".
|
||||
|
||||
# check whether printf supports -v
|
||||
__git_printf_supports_v=
|
||||
printf -v __git_printf_supports_v -- '%s' yes >/dev/null 2>&1
|
||||
|
||||
# stores the divergence from upstream in $p
|
||||
# used by GIT_PS1_SHOWUPSTREAM
|
||||
__git_ps1_show_upstream ()
|
||||
{
|
||||
local key value
|
||||
local svn_remote svn_url_pattern count n
|
||||
local upstream=git legacy="" verbose="" name=""
|
||||
|
||||
svn_remote=()
|
||||
# get some config options from git-config
|
||||
local output="$(git config -z --get-regexp '^(svn-remote\..*\.url|bash\.showupstream)$' 2>/dev/null | tr '\0\n' '\n ')"
|
||||
while read -r key value; do
|
||||
case "$key" in
|
||||
bash.showupstream)
|
||||
GIT_PS1_SHOWUPSTREAM="$value"
|
||||
if [[ -z "${GIT_PS1_SHOWUPSTREAM}" ]]; then
|
||||
p=""
|
||||
return
|
||||
fi
|
||||
;;
|
||||
svn-remote.*.url)
|
||||
svn_remote[$((${#svn_remote[@]} + 1))]="$value"
|
||||
svn_url_pattern="$svn_url_pattern\\|$value"
|
||||
upstream=svn+git # default upstream is SVN if available, else git
|
||||
;;
|
||||
esac
|
||||
done <<< "$output"
|
||||
|
||||
# parse configuration values
|
||||
for option in ${GIT_PS1_SHOWUPSTREAM}; do
|
||||
case "$option" in
|
||||
git|svn) upstream="$option" ;;
|
||||
verbose) verbose=1 ;;
|
||||
legacy) legacy=1 ;;
|
||||
name) name=1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Find our upstream
|
||||
case "$upstream" in
|
||||
git) upstream="@{upstream}" ;;
|
||||
svn*)
|
||||
# get the upstream from the "git-svn-id: ..." in a commit message
|
||||
# (git-svn uses essentially the same procedure internally)
|
||||
local -a svn_upstream
|
||||
svn_upstream=($(git log --first-parent -1 \
|
||||
--grep="^git-svn-id: \(${svn_url_pattern#??}\)" 2>/dev/null))
|
||||
if [[ 0 -ne ${#svn_upstream[@]} ]]; then
|
||||
svn_upstream=${svn_upstream[${#svn_upstream[@]} - 2]}
|
||||
svn_upstream=${svn_upstream%@*}
|
||||
local n_stop="${#svn_remote[@]}"
|
||||
for ((n=1; n <= n_stop; n++)); do
|
||||
svn_upstream=${svn_upstream#${svn_remote[$n]}}
|
||||
done
|
||||
|
||||
if [[ -z "$svn_upstream" ]]; then
|
||||
# default branch name for checkouts with no layout:
|
||||
upstream=${GIT_SVN_ID:-git-svn}
|
||||
else
|
||||
upstream=${svn_upstream#/}
|
||||
fi
|
||||
elif [[ "svn+git" = "$upstream" ]]; then
|
||||
upstream="@{upstream}"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
# Find how many commits we are ahead/behind our upstream
|
||||
if [[ -z "$legacy" ]]; then
|
||||
count="$(git rev-list --count --left-right \
|
||||
"$upstream"...HEAD 2>/dev/null)"
|
||||
else
|
||||
# produce equivalent output to --count for older versions of git
|
||||
local commits
|
||||
if commits="$(git rev-list --left-right "$upstream"...HEAD 2>/dev/null)"
|
||||
then
|
||||
local commit behind=0 ahead=0
|
||||
for commit in $commits
|
||||
do
|
||||
case "$commit" in
|
||||
"<"*) ((behind++)) ;;
|
||||
*) ((ahead++)) ;;
|
||||
esac
|
||||
done
|
||||
count="$behind $ahead"
|
||||
else
|
||||
count=""
|
||||
fi
|
||||
fi
|
||||
|
||||
# calculate the result
|
||||
if [[ -z "$verbose" ]]; then
|
||||
case "$count" in
|
||||
"") # no upstream
|
||||
p="" ;;
|
||||
"0 0") # equal to upstream
|
||||
p="=" ;;
|
||||
"0 "*) # ahead of upstream
|
||||
p=">" ;;
|
||||
*" 0") # behind upstream
|
||||
p="<" ;;
|
||||
*) # diverged from upstream
|
||||
p="<>" ;;
|
||||
esac
|
||||
else
|
||||
case "$count" in
|
||||
"") # no upstream
|
||||
p="" ;;
|
||||
"0 0") # equal to upstream
|
||||
p=" u=" ;;
|
||||
"0 "*) # ahead of upstream
|
||||
p=" u+${count#0 }" ;;
|
||||
*" 0") # behind upstream
|
||||
p=" u-${count% 0}" ;;
|
||||
*) # diverged from upstream
|
||||
p=" u+${count#* }-${count% *}" ;;
|
||||
esac
|
||||
if [[ -n "$count" && -n "$name" ]]; then
|
||||
__git_ps1_upstream_name=$(git rev-parse \
|
||||
--abbrev-ref "$upstream" 2>/dev/null)
|
||||
if [ $pcmode = yes ] && [ $ps1_expanded = yes ]; then
|
||||
p="$p \${__git_ps1_upstream_name}"
|
||||
else
|
||||
p="$p ${__git_ps1_upstream_name}"
|
||||
# not needed anymore; keep user's
|
||||
# environment clean
|
||||
unset __git_ps1_upstream_name
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
# Helper function that is meant to be called from __git_ps1. It
|
||||
# injects color codes into the appropriate gitstring variables used
|
||||
# to build a gitstring.
|
||||
__git_ps1_colorize_gitstring ()
|
||||
{
|
||||
if [[ -n ${ZSH_VERSION-} ]]; then
|
||||
local c_red='%F{red}'
|
||||
local c_green='%F{green}'
|
||||
local c_lblue='%F{blue}'
|
||||
local c_yellow='%F{yellow}'
|
||||
local c_clear='%f'
|
||||
else
|
||||
# Using \[ and \] around colors is necessary to prevent
|
||||
# issues with command line editing/browsing/completion!
|
||||
local c_red='\[\e[31m\]'
|
||||
local c_green='\[\e[32m\]'
|
||||
local c_lblue='\[\e[1;34m\]'
|
||||
local c_yellow='\[\033[0;33m\]'
|
||||
local c_clear='\[\e[0m\]'
|
||||
fi
|
||||
local bad_color=$c_red
|
||||
local ok_color=$c_green
|
||||
local flags_color=$c_yellow
|
||||
|
||||
local branch_color=""
|
||||
if [ $detached = no ]; then
|
||||
branch_color="$ok_color"
|
||||
else
|
||||
branch_color="$bad_color"
|
||||
fi
|
||||
c="$branch_color$c"
|
||||
|
||||
z="$c_clear$z"
|
||||
if [ "$w" = "*" ]; then
|
||||
w="$bad_color$w"
|
||||
fi
|
||||
if [ -n "$i" ]; then
|
||||
i="$ok_color$i"
|
||||
fi
|
||||
if [ -n "$s" ]; then
|
||||
s="$flags_color$s"
|
||||
fi
|
||||
if [ -n "$u" ]; then
|
||||
u="$bad_color$u"
|
||||
fi
|
||||
r="$c_clear$r"
|
||||
}
|
||||
|
||||
# Helper function to read the first line of a file into a variable.
|
||||
# __git_eread requires 2 arguments, the file path and the name of the
|
||||
# variable, in that order.
|
||||
__git_eread ()
|
||||
{
|
||||
test -r "$1" && IFS=$'\r\n' read "$2" <"$1"
|
||||
}
|
||||
|
||||
# __git_ps1 accepts 0 or 1 arguments (i.e., format string)
|
||||
# when called from PS1 using command substitution
|
||||
# in this mode it prints text to add to bash PS1 prompt (includes branch name)
|
||||
#
|
||||
# __git_ps1 requires 2 or 3 arguments when called from PROMPT_COMMAND (pc)
|
||||
# in that case it _sets_ PS1. The arguments are parts of a PS1 string.
|
||||
# when two arguments are given, the first is prepended and the second appended
|
||||
# to the state string when assigned to PS1.
|
||||
# The optional third parameter will be used as printf format string to further
|
||||
# customize the output of the git-status string.
|
||||
# In this mode you can request colored hints using GIT_PS1_SHOWCOLORHINTS=true
|
||||
__git_ps1 ()
|
||||
{
|
||||
# preserve exit status
|
||||
local exit=$?
|
||||
local pcmode=no
|
||||
local detached=no
|
||||
local ps1pc_start='\u@\h:\w '
|
||||
local ps1pc_end='\$ '
|
||||
local printf_format=' (%s)'
|
||||
|
||||
case "$#" in
|
||||
2|3) pcmode=yes
|
||||
ps1pc_start="$1"
|
||||
ps1pc_end="$2"
|
||||
printf_format="${3:-$printf_format}"
|
||||
# set PS1 to a plain prompt so that we can
|
||||
# simply return early if the prompt should not
|
||||
# be decorated
|
||||
PS1="$ps1pc_start$ps1pc_end"
|
||||
;;
|
||||
0|1) printf_format="${1:-$printf_format}"
|
||||
;;
|
||||
*) return $exit
|
||||
;;
|
||||
esac
|
||||
|
||||
# ps1_expanded: This variable is set to 'yes' if the shell
|
||||
# subjects the value of PS1 to parameter expansion:
|
||||
#
|
||||
# * bash does unless the promptvars option is disabled
|
||||
# * zsh does not unless the PROMPT_SUBST option is set
|
||||
# * POSIX shells always do
|
||||
#
|
||||
# If the shell would expand the contents of PS1 when drawing
|
||||
# the prompt, a raw ref name must not be included in PS1.
|
||||
# This protects the user from arbitrary code execution via
|
||||
# specially crafted ref names. For example, a ref named
|
||||
# 'refs/heads/$(IFS=_;cmd=sudo_rm_-rf_/;$cmd)' might cause the
|
||||
# shell to execute 'sudo rm -rf /' when the prompt is drawn.
|
||||
#
|
||||
# Instead, the ref name should be placed in a separate global
|
||||
# variable (in the __git_ps1_* namespace to avoid colliding
|
||||
# with the user's environment) and that variable should be
|
||||
# referenced from PS1. For example:
|
||||
#
|
||||
# __git_ps1_foo=$(do_something_to_get_ref_name)
|
||||
# PS1="...stuff...\${__git_ps1_foo}...stuff..."
|
||||
#
|
||||
# If the shell does not expand the contents of PS1, the raw
|
||||
# ref name must be included in PS1.
|
||||
#
|
||||
# The value of this variable is only relevant when in pcmode.
|
||||
#
|
||||
# Assume that the shell follows the POSIX specification and
|
||||
# expands PS1 unless determined otherwise. (This is more
|
||||
# likely to be correct if the user has a non-bash, non-zsh
|
||||
# shell and safer than the alternative if the assumption is
|
||||
# incorrect.)
|
||||
#
|
||||
local ps1_expanded=yes
|
||||
[ -z "${ZSH_VERSION-}" ] || [[ -o PROMPT_SUBST ]] || ps1_expanded=no
|
||||
[ -z "${BASH_VERSION-}" ] || shopt -q promptvars || ps1_expanded=no
|
||||
|
||||
local repo_info rev_parse_exit_code
|
||||
repo_info="$(git rev-parse --git-dir --is-inside-git-dir \
|
||||
--is-bare-repository --is-inside-work-tree \
|
||||
--short HEAD 2>/dev/null)"
|
||||
rev_parse_exit_code="$?"
|
||||
|
||||
if [ -z "$repo_info" ]; then
|
||||
return $exit
|
||||
fi
|
||||
|
||||
local short_sha=""
|
||||
if [ "$rev_parse_exit_code" = "0" ]; then
|
||||
short_sha="${repo_info##*$'\n'}"
|
||||
repo_info="${repo_info%$'\n'*}"
|
||||
fi
|
||||
local inside_worktree="${repo_info##*$'\n'}"
|
||||
repo_info="${repo_info%$'\n'*}"
|
||||
local bare_repo="${repo_info##*$'\n'}"
|
||||
repo_info="${repo_info%$'\n'*}"
|
||||
local inside_gitdir="${repo_info##*$'\n'}"
|
||||
local g="${repo_info%$'\n'*}"
|
||||
|
||||
if [ "true" = "$inside_worktree" ] &&
|
||||
[ -n "${GIT_PS1_HIDE_IF_PWD_IGNORED-}" ] &&
|
||||
[ "$(git config --bool bash.hideIfPwdIgnored)" != "false" ] &&
|
||||
git check-ignore -q .
|
||||
then
|
||||
return $exit
|
||||
fi
|
||||
|
||||
local r=""
|
||||
local b=""
|
||||
local step=""
|
||||
local total=""
|
||||
if [ -d "$g/rebase-merge" ]; then
|
||||
__git_eread "$g/rebase-merge/head-name" b
|
||||
__git_eread "$g/rebase-merge/msgnum" step
|
||||
__git_eread "$g/rebase-merge/end" total
|
||||
if [ -f "$g/rebase-merge/interactive" ]; then
|
||||
r="|REBASE-i"
|
||||
else
|
||||
r="|REBASE-m"
|
||||
fi
|
||||
else
|
||||
if [ -d "$g/rebase-apply" ]; then
|
||||
__git_eread "$g/rebase-apply/next" step
|
||||
__git_eread "$g/rebase-apply/last" total
|
||||
if [ -f "$g/rebase-apply/rebasing" ]; then
|
||||
__git_eread "$g/rebase-apply/head-name" b
|
||||
r="|REBASE"
|
||||
elif [ -f "$g/rebase-apply/applying" ]; then
|
||||
r="|AM"
|
||||
else
|
||||
r="|AM/REBASE"
|
||||
fi
|
||||
elif [ -f "$g/MERGE_HEAD" ]; then
|
||||
r="|MERGING"
|
||||
elif [ -f "$g/CHERRY_PICK_HEAD" ]; then
|
||||
r="|CHERRY-PICKING"
|
||||
elif [ -f "$g/REVERT_HEAD" ]; then
|
||||
r="|REVERTING"
|
||||
elif [ -f "$g/BISECT_LOG" ]; then
|
||||
r="|BISECTING"
|
||||
fi
|
||||
|
||||
if [ -n "$b" ]; then
|
||||
:
|
||||
elif [ -h "$g/HEAD" ]; then
|
||||
# symlink symbolic ref
|
||||
b="$(git symbolic-ref HEAD 2>/dev/null)"
|
||||
else
|
||||
local head=""
|
||||
if ! __git_eread "$g/HEAD" head; then
|
||||
return $exit
|
||||
fi
|
||||
# is it a symbolic ref?
|
||||
b="${head#ref: }"
|
||||
if [ "$head" = "$b" ]; then
|
||||
detached=yes
|
||||
b="$(
|
||||
case "${GIT_PS1_DESCRIBE_STYLE-}" in
|
||||
(contains)
|
||||
git describe --contains HEAD ;;
|
||||
(branch)
|
||||
git describe --contains --all HEAD ;;
|
||||
(tag)
|
||||
git describe --tags HEAD ;;
|
||||
(describe)
|
||||
git describe HEAD ;;
|
||||
(* | default)
|
||||
git describe --tags --exact-match HEAD ;;
|
||||
esac 2>/dev/null)" ||
|
||||
|
||||
b="$short_sha..."
|
||||
b="($b)"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$step" ] && [ -n "$total" ]; then
|
||||
r="$r $step/$total"
|
||||
fi
|
||||
|
||||
local w=""
|
||||
local i=""
|
||||
local s=""
|
||||
local u=""
|
||||
local c=""
|
||||
local p=""
|
||||
|
||||
if [ "true" = "$inside_gitdir" ]; then
|
||||
if [ "true" = "$bare_repo" ]; then
|
||||
c="BARE:"
|
||||
else
|
||||
b="GIT_DIR!"
|
||||
fi
|
||||
elif [ "true" = "$inside_worktree" ]; then
|
||||
if [ -n "${GIT_PS1_SHOWDIRTYSTATE-}" ] &&
|
||||
[ "$(git config --bool bash.showDirtyState)" != "false" ]
|
||||
then
|
||||
git diff --no-ext-diff --quiet || w="*"
|
||||
git diff --no-ext-diff --cached --quiet || i="+"
|
||||
if [ -z "$short_sha" ] && [ -z "$i" ]; then
|
||||
i="#"
|
||||
fi
|
||||
fi
|
||||
if [ -n "${GIT_PS1_SHOWSTASHSTATE-}" ] &&
|
||||
git rev-parse --verify --quiet refs/stash >/dev/null
|
||||
then
|
||||
s="$"
|
||||
fi
|
||||
|
||||
if [ -n "${GIT_PS1_SHOWUNTRACKEDFILES-}" ] &&
|
||||
[ "$(git config --bool bash.showUntrackedFiles)" != "false" ] &&
|
||||
git ls-files --others --exclude-standard --directory --no-empty-directory --error-unmatch -- ':/*' >/dev/null 2>/dev/null
|
||||
then
|
||||
u="%${ZSH_VERSION+%}"
|
||||
fi
|
||||
|
||||
if [ -n "${GIT_PS1_SHOWUPSTREAM-}" ]; then
|
||||
__git_ps1_show_upstream
|
||||
fi
|
||||
fi
|
||||
|
||||
local z="${GIT_PS1_STATESEPARATOR-" "}"
|
||||
|
||||
# NO color option unless in PROMPT_COMMAND mode
|
||||
if [ $pcmode = yes ] && [ -n "${GIT_PS1_SHOWCOLORHINTS-}" ]; then
|
||||
__git_ps1_colorize_gitstring
|
||||
fi
|
||||
|
||||
b=${b##refs/heads/}
|
||||
if [ $pcmode = yes ] && [ $ps1_expanded = yes ]; then
|
||||
__git_ps1_branch_name=$b
|
||||
b="\${__git_ps1_branch_name}"
|
||||
fi
|
||||
|
||||
local f="$w$i$s$u"
|
||||
local gitstring="$c$b${f:+$z$f}$r$p"
|
||||
|
||||
if [ $pcmode = yes ]; then
|
||||
if [ "${__git_printf_supports_v-}" != yes ]; then
|
||||
gitstring=$(printf -- "$printf_format" "$gitstring")
|
||||
else
|
||||
printf -v gitstring -- "$printf_format" "$gitstring"
|
||||
fi
|
||||
PS1="$ps1pc_start$gitstring$ps1pc_end"
|
||||
else
|
||||
printf -- "$printf_format" "$gitstring"
|
||||
fi
|
||||
|
||||
return $exit
|
||||
}
|
BIN
img/git_bash.png
Normal file
Binary file not shown.
224
jupyter_test.ipynb
Normal file
File diff suppressed because one or more lines are too long
2210
project_1_data_preparation.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
1744
project_1_recommender_and_evaluation.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
131
readme.md
Normal file
@ -0,0 +1,131 @@
|
||||
# Recommender Systems class
|
||||
|
||||
Department of Mathematics and Computer Science, Adam Mickiewicz University, 2021
|
||||
|
||||
Author: Piotr Zioło
|
||||
|
||||
## Preparing your computer
|
||||
|
||||
1. Install [Anaconda](https://www.anaconda.com/products/individual) with Python 3.8.
|
||||
|
||||
|
||||
2. Install [Git](https://git-scm.com/downloads).
|
||||
|
||||
|
||||
3. Install [PyCharm](https://www.jetbrains.com/pycharm/) (community version).
|
||||
|
||||
|
||||
4. Fork this repository to your GitHub account.
|
||||
|
||||
|
||||
5. Go to the folder on your machine where you want to keep a local copy of the repository. Right-click inside the folder and choose "Git Bash Here" from the context menu. Run the following command to clone the repository forked to your GitHub account onto your local machine:
|
||||
|
||||
<pre>git clone <i>your_repository_address_which_you'll_find_in_your_github</i></pre>
|
||||
|
||||
Alternatively, open Git Bash (installed with Git), change the directory to the folder where you want to keep a local copy of the repository, and execute the above command.
|
||||
|
||||
|
||||
6. Prepare your conda environment (instructions given for Windows, but it should be similar on other systems):
|
||||
|
||||
1. Open Anaconda Prompt as administrator.
|
||||
|
||||
2. Make sure you're in the repository main folder. Run the following command:
|
||||
|
||||
conda env create --name rs-class-env -f environment.yml
|
||||
|
||||
You can replace *rs-class-env* with your own environment name.
|
||||
|
||||
You may need to install a C++ compiler to install certain packages.
|
||||
|
||||
|
||||
7. In Git Bash, open the repository folder and activate the newly created environment with the following command:
|
||||
|
||||
conda activate rs-class-env
|
||||
|
||||
|
||||
8. In Git Bash type:
|
||||
|
||||
jupyter notebook
|
||||
|
||||
A new tab with Jupyter Notebook should open in your browser.
|
||||
|
||||
|
||||
9. In Jupyter Notebook open jupyter_test.ipynb.
|
||||
|
||||
|
||||
10. Click the first cell and hit shift+enter. The cell should execute without errors. Do the same for all other cells (you can keep hitting shift+enter until all cells have been executed).
|
||||
|
||||
The most common error you may encounter is "ImportError: No module named...". In such a case:
|
||||
|
||||
- copy the package name,
|
||||
|
||||
- close the Jupyter browser tabs and press ctrl+c in the Git Bash window where you started Jupyter Notebook,
|
||||
|
||||
- run the following command:
|
||||
pip install package_name
|
||||
|
||||
- the package should get installed successfully,
|
||||
|
||||
- after that you can open Jupyter Notebook again and test if it works now.
|
||||
|
||||
|
||||
11. After you finish a piece of code in your repository, run the following commands in Git Bash (in the repository folder):
|
||||
|
||||
git add -A
|
||||
|
||||
git commit -m "Commit message"
|
||||
|
||||
git push
|
||||
|
||||
The first command stages all changes and new files for the next commit. The second command commits your changes (a kind of checkpoint/save to which you can later return if need be). The third one pushes your commit to GitHub (or, in general, to any remote repository).
|
||||
|
||||
**Convention:** Use the imperative mood in your commit messages, e.g. "Do this, do that". Try to keep them to informative one-liners.
|
||||
|
||||
|
||||
12. (Optional) Set up your Git Bash to make it look like the screenshot below:
|
||||
|
||||
![Git Bash](img/git_bash.png)
|
||||
|
||||
Copy the .bash_profile and git-prompt.sh files from the git_configuration folder of this repository to your user folder (tested on Windows 10; on other systems they may need to go somewhere else).
|
||||
|
||||
|
||||
13. (Optional) Set up SSH on your machine for easier access to your GitHub repositories through Git. You can find tutorials online explaining how to do that.
|
||||
|
||||
To additionally get an automatic prompt for your SSH key password in Git Bash, copy a script similar to .bashrc from the git_configuration folder to your user folder. In that file, change the name of the key (the provided file lists two keys; you can keep just one).
|
||||
|
||||
|
||||
**In the case of any problems, consult your best friend - [StackOverflow](https://stackoverflow.com/)**.
|
||||
|
||||
|
||||
## Before every class
|
||||
|
||||
Fetch the new code from this repository and merge it into your code.
|
||||
|
||||
1. In Git Bash open your repository folder.
|
||||
|
||||
|
||||
2. Add the original repository as an upstream remote:
|
||||
|
||||
git remote add upstream git@github.com:PiotrZiolo/recommender-systems-class.git
|
||||
|
||||
|
||||
3. Fetch new changes from the original repository:
|
||||
|
||||
git fetch upstream
|
||||
|
||||
|
||||
4. Merge the changes into your local branch (if you don't mind your commits being interleaved with the commits from the original repository) or rebase your local branch onto the changes (if you want your commits to come after all commits from the original repository):
|
||||
|
||||
git merge upstream/master
|
||||
|
||||
or
|
||||
|
||||
git rebase upstream/master
|
||||
|
||||
|
||||
5. In the case of conflicts you can resolve them manually, but it's easier to use PyCharm, especially for Jupyter Notebooks, where manual merging is extremely painful. PyCharm provides a side-by-side view of the changes and lets you accept one of the conflicting versions with a single click.
|
||||
|
||||
|
||||
|
||||
|
||||
|
0
recommenders/__init__.py
Normal file
231
recommenders/amazon_recommender.py
Normal file
@ -0,0 +1,231 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import scipy.special as scisp
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class AmazonRecommender(Recommender):
|
||||
"""
|
||||
Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:
|
||||
- Linden G., Smith B., York J., Amazon.com Recommendations: Item-to-Item Collaborative Filtering,
|
||||
IEEE Internet Computing, 2003,
|
||||
- Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
self.interactions_df = None
|
||||
self.item_id_mapping = None
|
||||
self.user_id_mapping = None
|
||||
self.item_id_reverse_mapping = None
|
||||
self.user_id_reverse_mapping = None
|
||||
self.e_xy = None
|
||||
self.n_xy = None
|
||||
self.scores = None
|
||||
self.most_popular_items = None
|
||||
self.should_recommend_already_bought = False
|
||||
|
||||
def initialize(self, **params):
|
||||
if 'should_recommend_already_bought' in params:
|
||||
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||
user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||
by item_id and the item feature columns.
|
||||
"""
|
||||
|
||||
# Shift item ids and user ids so that they are consecutive
|
||||
|
||||
unique_item_ids = interactions_df['item_id'].unique()
|
||||
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||
unique_user_ids = interactions_df['user_id'].unique()
|
||||
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||
|
||||
interactions_df = interactions_df.copy()
|
||||
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||
|
||||
# Get the number of items and users
|
||||
|
||||
self.interactions_df = interactions_df
|
||||
n_items = np.max(interactions_df['item_id']) + 1
|
||||
n_users = np.max(interactions_df['user_id']) + 1
|
||||
|
||||
# Get maximal number of interactions
|
||||
|
||||
n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
|
||||
# Unnecessary, but added for readability
|
||||
n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
|
||||
max_interactions = n_user_interactions['n_items'].max()
|
||||
|
||||
# Calculate P_Y's
|
||||
|
||||
n_interactions = len(interactions_df)
|
||||
p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
|
||||
p_y = p_y.rename(columns={'user_id': 'P_Y'})
|
||||
p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
|
||||
p_y = dict(zip(p_y['item_id'], p_y['P_Y']))
|
||||
|
||||
# Get the series of all items
|
||||
|
||||
# items = list(range(n_items))
|
||||
items = interactions_df['item_id'].unique()
|
||||
|
||||
# For every X calculate the E[Y|X]
|
||||
|
||||
e_xy = np.zeros(shape=(n_items, n_items))
|
||||
e_xy[:][:] = -1e100
|
||||
|
||||
p_y_powers = {}
|
||||
for y in items:
|
||||
p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
|
||||
|
||||
# In the next version calculate all alpha_k first (this works well with parallelization)
|
||||
|
||||
for x in items:
|
||||
# Get users who bought X
|
||||
c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()
|
||||
|
||||
# Get users who bought only X
|
||||
c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
|
||||
c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))
|
||||
|
||||
# Calculate the number of non-X interactions for each user who bought X
|
||||
# Include users with zero non-X interactions
|
||||
n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
|
||||
n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
|
||||
# Unnecessary, but added for readability
|
||||
n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})
|
||||
|
||||
zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x) # Remove
|
||||
n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
|
||||
|
||||
n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]
|
||||
|
||||
# Calculate the expected numbers of Y products bought by clients who bought X
|
||||
alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
|
||||
for abs_c in n_non_x_interactions["n_items"]])
|
||||
for k in range(1, max_interactions + 1)])
|
||||
|
||||
for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y
|
||||
if y != x:
|
||||
e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
|
||||
else:
|
||||
e_xy[x][y] = n_users * p_y[x]
|
||||
|
||||
self.e_xy = e_xy
|
||||
|
||||
# Calculate the number of users who bought both X and Y
|
||||
|
||||
# Simple and slow method (commented out)
|
||||
|
||||
# n_xy = np.zeros(shape=(n_items, n_items))
|
||||
|
||||
# for x in items:
|
||||
# for y in items:
|
||||
# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
|
||||
# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
|
||||
# users_x_and_y = users_x & users_y
|
||||
# n_xy[x][y] = len(users_x_and_y)
|
||||
|
||||
# Optimized method (can be further optimized by using sparse matrices)
|
||||
|
||||
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||
r = np.zeros(shape=(n_users, n_items))
|
||||
for idx, interaction in interactions_df.iterrows():
|
||||
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||
|
||||
# Get the number of users who bought both X and Y
|
||||
|
||||
n_xy = np.matmul(r.T, r)
|
||||
|
||||
self.n_xy = n_xy
|
||||
|
||||
self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)
|
||||
|
||||
# Find the most popular items for the cold start problem
|
||||
|
||||
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||
self.most_popular_items = offers_count.index
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||
recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
# Clean previous recommendations (iloc could be used alternatively)
|
||||
self.recommender_df = self.recommender_df[:0]
|
||||
|
||||
# Handle users not in the training data
|
||||
|
||||
# Map item ids
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||
|
||||
# Generate recommendations
|
||||
|
||||
for idx, user in users_df.iterrows():
|
||||
recommendations = []
|
||||
|
||||
user_id = user['user_id']
|
||||
|
||||
if user_id in self.user_id_mapping:
|
||||
mapped_user_id = self.user_id_mapping[user_id]
|
||||
|
||||
x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
final_scores = np.sum(self.scores[x_list], axis=0)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
final_scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-final_scores)[:n_recommendations]
|
||||
|
||||
for item_id in chosen_ids:
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||
'item_id': self.item_id_reverse_mapping[item_id],
|
||||
'score': final_scores[item_id]
|
||||
}
|
||||
)
|
||||
else: # For new users recommend most popular items
|
||||
for i in range(n_recommendations):
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': user['user_id'],
|
||||
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||
'score': 1.0
|
||||
}
|
||||
)
|
||||
|
||||
user_recommendations = pd.DataFrame(recommendations)
|
||||
|
||||
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||
|
||||
return self.recommender_df
|
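# ------------------------------------------------------------
# A minimal usage sketch (added for illustration; not part of the original file).
# The tiny interaction/user/item frames below are made-up sample data; only the
# column names ('user_id', 'item_id') follow the docstrings above. Relies on the
# pandas import at the top of this module.
if __name__ == '__main__':
    interactions = pd.DataFrame({'user_id': [1, 1, 2, 2, 3],
                                 'item_id': [10, 20, 10, 30, 20]})
    users = pd.DataFrame({'user_id': [1, 2, 3]})
    items = pd.DataFrame({'item_id': [10, 20, 30]})

    recommender = AmazonRecommender()
    recommender.fit(interactions, users, items)

    # Each user gets the highest-scoring item they have not bought yet
    print(recommender.recommend(users, items, n_recommendations=1))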
233
recommenders/nearest_neighbors_recommender.py
Normal file
@ -0,0 +1,233 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class NearestNeighborsRecommender(Recommender):
|
||||
"""
|
||||
Nearest neighbors recommender allowing to do user-based or item-based collaborative filtering.
|
||||
|
||||
Possible similarity measures:
|
||||
- 'cosine',
|
||||
- 'pearson'.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
self.interactions_df = None
|
||||
self.item_id_mapping = None
|
||||
self.user_id_mapping = None
|
||||
self.item_id_reverse_mapping = None
|
||||
self.user_id_reverse_mapping = None
|
||||
self.r = None
|
||||
self.similarities = None
|
||||
self.most_popular_items = None
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'cosine'
|
||||
self.n_neighbors = 10
|
||||
self.should_recommend_already_bought = False
|
||||
|
||||
def initialize(self, **params):
|
||||
if 'n_neighbors' in params:
|
||||
self.n_neighbors = params['n_neighbors']
|
||||
if 'should_recommend_already_bought' in params:
|
||||
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||
user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||
by item_id and the item feature columns.
|
||||
"""
|
||||
|
||||
del users_df, items_df
|
||||
|
||||
# Shift item ids and user ids so that they are consecutive
|
||||
|
||||
unique_item_ids = interactions_df['item_id'].unique()
|
||||
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||
unique_user_ids = interactions_df['user_id'].unique()
|
||||
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||
|
||||
interactions_df = interactions_df.copy()
|
||||
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||
|
||||
# Get the number of items and users
|
||||
|
||||
self.interactions_df = interactions_df
|
||||
n_items = np.max(interactions_df['item_id']) + 1
|
||||
n_users = np.max(interactions_df['user_id']) + 1
|
||||
|
||||
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||
r = np.zeros(shape=(n_users, n_items))
|
||||
for idx, interaction in interactions_df.iterrows():
|
||||
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||
|
||||
if self.collaboration_type == 'item':
|
||||
r = r.T
|
||||
|
||||
self.r = r
|
||||
|
||||
# Calculate all similarities
|
||||
|
||||
similarities = None
|
||||
if self.similarity_measure == 'cosine':
|
||||
n_uv = np.matmul(r, r.T)
|
||||
norms = np.sqrt(np.diag(n_uv))
|
||||
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
|
||||
elif self.similarity_measure == 'pearson':
|
||||
r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)
|
||||
n_uv = np.matmul(r_shifted, r_shifted.T)
|
||||
norms = np.sqrt(np.diag(n_uv))
|
||||
norms[norms == 0] = 0.000001
|
||||
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
|
||||
|
||||
np.fill_diagonal(similarities, -1000)
|
||||
|
||||
self.similarities = similarities
|
||||
|
||||
# Find the most popular items for the cold start problem
|
||||
|
||||
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||
self.most_popular_items = offers_count.index
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||
recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
# Clean previous recommendations (iloc could be used alternatively)
|
||||
self.recommender_df = self.recommender_df[:0]
|
||||
|
||||
# Handle users not in the training data
|
||||
|
||||
# Map item ids
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
|
||||
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||
|
||||
# Generate recommendations
|
||||
|
||||
for idx, user in users_df.iterrows():
|
||||
recommendations = []
|
||||
|
||||
user_id = user['user_id']
|
||||
|
||||
if user_id in self.user_id_mapping:
|
||||
chosen_ids = []
|
||||
scores = []
|
||||
mapped_user_id = self.user_id_mapping[user_id]
|
||||
|
||||
if self.collaboration_type == 'user':
|
||||
neighbor_ids = np.argsort(-self.similarities[mapped_user_id])[:self.n_neighbors]
|
||||
user_similarities = self.similarities[mapped_user_id][neighbor_ids]
|
||||
|
||||
item_ids = items_df['item_id'].tolist()
|
||||
|
||||
v_i = self.r[neighbor_ids][:, item_ids]
|
||||
|
||||
scores = np.matmul(user_similarities, v_i) / np.sum(user_similarities)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
x_list = self.interactions_df.loc[
|
||||
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-scores)[:n_recommendations]
|
||||
|
||||
elif self.collaboration_type == 'item':
|
||||
x_list = self.interactions_df.loc[
|
||||
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
scores = np.sum(self.similarities[x_list], axis=0)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-scores)[:n_recommendations]
|
||||
|
||||
for item_id in chosen_ids:
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||
'item_id': self.item_id_reverse_mapping[item_id],
|
||||
'score': scores[item_id]
|
||||
}
|
||||
)
|
||||
else: # For new users recommend most popular items
|
||||
for i in range(n_recommendations):
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': user['user_id'],
|
||||
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||
'score': 1.0
|
||||
}
|
||||
)
|
||||
|
||||
user_recommendations = pd.DataFrame(recommendations)
|
||||
|
||||
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||
|
||||
return self.recommender_df
|
||||
|
||||
|
||||
class UserBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'cosine'
|
||||
|
||||
|
||||
class UserBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'pearson'
|
||||
|
||||
|
||||
class ItemBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'item'
|
||||
self.similarity_measure = 'cosine'
|
||||
|
||||
|
||||
class ItemBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'item'
|
||||
self.similarity_measure = 'pearson'
|
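# ------------------------------------------------------------
# A minimal usage sketch (added for illustration; not part of the original file).
# The sample frames are made up; only the column names ('user_id', 'item_id')
# follow the docstrings above. Relies on the pandas import at the top of this module.
if __name__ == '__main__':
    interactions = pd.DataFrame({'user_id': [1, 1, 2, 2, 3],
                                 'item_id': [10, 20, 10, 30, 20]})
    users = pd.DataFrame({'user_id': [1, 2, 3]})
    items = pd.DataFrame({'item_id': [10, 20, 30]})

    # User-based collaborative filtering with cosine similarity and 2 neighbors
    recommender = UserBasedCosineNearestNeighborsRecommender()
    recommender.initialize(n_neighbors=2)
    recommender.fit(interactions, users, items)
    print(recommender.recommend(users, items, n_recommendations=1))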
52
recommenders/recommender.py
Normal file
@ -0,0 +1,52 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
import pandas as pd
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class Recommender(object):
|
||||
"""
|
||||
Base recommender class.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Initialize base recommender params and variables.
|
||||
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
|
||||
"""
|
||||
pass
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
|
||||
for ix, user in users_df.iterrows():
|
||||
user_recommendations = pd.DataFrame({'user_id': user['user_id'],
|
||||
'item_id': [-1] * n_recommendations,
|
||||
'score': [3.0] * n_recommendations})
|
||||
|
||||
recommendations = pd.concat([recommendations, user_recommendations])
|
||||
|
||||
return recommendations
|
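# ------------------------------------------------------------
# A minimal subclassing sketch (added for illustration; not part of the original file).
# It shows the fit/recommend contract defined above with a toy recommender that
# simply returns the first n items from items_df with a constant score.
class FirstItemsRecommender(Recommender):
    """
    Toy recommender used only to illustrate the interface.
    """

    def recommend(self, users_df, items_df, n_recommendations=1):
        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])

        for ix, user in users_df.iterrows():
            top_items = items_df['item_id'].head(n_recommendations).tolist()
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': top_items,
                                                 'score': [1.0] * len(top_items)})
            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations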
102
recommenders/tfidf_recommender.py
Normal file
@ -0,0 +1,102 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from collections import defaultdict
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class TFIDFRecommender(Recommender):
|
||||
"""
|
||||
Recommender based on the TF-IDF method.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Initialize base recommender params and variables.
|
||||
"""
|
||||
super().__init__()
|
||||
self.tfidf_scores = None
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by user_id
|
||||
and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined by item_id
|
||||
and the item feature columns.
|
||||
"""
|
||||
|
||||
self.tfidf_scores = defaultdict(lambda: 0.0)
|
||||
|
||||
# Prepare the corpus for tfidf calculation
|
||||
|
||||
interactions_df = pd.merge(interactions_df, items_df, on='item_id')
|
||||
user_genres = interactions_df.loc[:, ['user_id', 'genres']]
|
||||
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("-", "_", regex=False)
|
||||
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace(" ", "_", regex=False)
|
||||
user_genres = user_genres.groupby('user_id').aggregate(lambda x: "|".join(x))
|
||||
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("|", " ", regex=False)
|
||||
user_ids = user_genres.index.tolist()
|
||||
genres_corpus = user_genres['genres'].tolist()
|
||||
|
||||
# Calculate tf-idf scores
|
||||
|
||||
vectorizer = TfidfVectorizer()
|
||||
tfidf_scores = vectorizer.fit_transform(genres_corpus)
|
||||
|
||||
# Transform results into a dict {(user_id, genre): score}
|
||||
|
||||
for u in range(tfidf_scores.shape[0]):
|
||||
for g in range(tfidf_scores.shape[1]):
|
||||
self.tfidf_scores[(user_ids[u], vectorizer.get_feature_names()[g])] = tfidf_scores[u, g]
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which recommendations
|
||||
should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
|
||||
# Transform genres to a unified form used by the vectorizer
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df.loc[:, 'genres'] = items_df['genres'].str.replace("-", "_", regex=False)
|
||||
items_df.loc[:, 'genres'] = items_df['genres'].str.replace(" ", "_", regex=False)
|
||||
items_df.loc[:, 'genres'] = items_df['genres'].str.lower()
|
||||
items_df.loc[:, 'genres'] = items_df['genres'].str.split("|")
|
||||
|
||||
# Score items
|
||||
|
||||
for uix, user in users_df.iterrows():
|
||||
items = []
|
||||
for iix, item in items_df.iterrows():
|
||||
score = 0.0
|
||||
for genre in item['genres']:
|
||||
score += self.tfidf_scores[(user['user_id'], genre)]
|
||||
score /= len(item['genres'])
|
||||
items.append((item['item_id'], score))
|
||||
|
||||
items = sorted(items, key=lambda x: x[1], reverse=True)
|
||||
user_recommendations = pd.DataFrame({'user_id': user['user_id'],
|
||||
'item_id': [item[0] for item in items][:n_recommendations],
|
||||
'score': [item[1] for item in items][:n_recommendations]})
|
||||
|
||||
recommendations = pd.concat([recommendations, user_recommendations])
|
||||
|
||||
return recommendations
|
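# ------------------------------------------------------------
# A minimal usage sketch (added for illustration; not part of the original file).
# The sample frames are made up; items_df needs a 'genres' column with
# '|'-separated genre names, as assumed by fit() and recommend() above.
# Relies on the pandas import at the top of this module.
if __name__ == '__main__':
    interactions = pd.DataFrame({'user_id': [1, 1, 2],
                                 'item_id': [10, 20, 10]})
    items = pd.DataFrame({'item_id': [10, 20, 30],
                          'genres': ['Comedy|Drama', 'Action', 'Sci-Fi|Drama']})
    users = pd.DataFrame({'user_id': [1, 2]})

    recommender = TFIDFRecommender()
    recommender.fit(interactions, users, items)
    print(recommender.recommend(users, items, n_recommendations=2))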