Commit 61c41dc046, message: "meh"

108     recommender-systems-class-master/.gitignore (vendored, new file)
@@ -0,0 +1,108 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# PyCharm project settings
.idea

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

simulation.spec
BIN     recommender-systems-class-master/Class_11_Neural_networks.one (new file, binary, not shown)
BIN     recommender-systems-class-master/Class_11_Neural_networks.pdf (new file, binary, not shown)
BIN     recommender-systems-class-master/Class_5_Amazon_recommender.one (new file, binary, not shown)
BIN     recommender-systems-class-master/Class_5_Amazon_recommender.pdf (new file, binary, not shown)
BIN     recommender-systems-class-master/Class_8_9_10_Optimization.one (new file, binary, not shown)
BIN     recommender-systems-class-master/Class_8_9_10_Optimization.pdf (new file, binary, not shown)
0       recommender-systems-class-master/__init__.py (new file)
1582    recommender-systems-class-master/class_12_pytorch.ipynb (new file, diff suppressed)
4669    recommender-systems-class-master/class_2_numpy_pandas.ipynb (new file, diff suppressed)
1719    recommender-systems-class-master/class_5_amazon_recommender.ipynb (new file, diff suppressed)
17251   recommender-systems-class-master/data/hotel_data/hotel_data_original.csv (new file, diff suppressed)
9743    recommender-systems-class-master/data/movielens_small/links.csv (new file, diff suppressed)
9743    recommender-systems-class-master/data/movielens_small/movies.csv (new file, diff suppressed)
100837  recommender-systems-class-master/data/movielens_small/ratings.csv (new file, diff suppressed)
153     recommender-systems-class-master/data/movielens_small/readme.txt (new file)
@@ -0,0 +1,153 @@
Summary
=======

This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows.

This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.

This and other GroupLens data sets are publicly available for download at <http://grouplens.org/datasets/>.


Usage License
=============

Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions:

* The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group.
* The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information).
* The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions.
* The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota.
* The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction.

In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate).

If you have any further questions or comments, please email <grouplens-info@umn.edu>


Citation
========

To acknowledge use of the dataset in publications, please cite the following paper:

> F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>


Further Information About GroupLens
===================================

GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including:

* recommender systems
* online communities
* mobile and ubiquitous technologies
* digital libraries
* local geographic information systems

GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit <http://movielens.org> to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at <grouplens-info@cs.umn.edu> - we are always interested in working with external collaborators.


Content and Use of Files
========================

Formatting and Encoding
-----------------------

The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8.


User Ids
--------

MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files).


Movie Ids
---------

Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL <https://movielens.org/movies/1>). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).


Ratings Data File Structure (ratings.csv)
-----------------------------------------

All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:

    userId,movieId,rating,timestamp

The lines within this file are ordered first by userId, then, within user, by movieId.

Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
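For illustration, a minimal pandas sketch for loading `ratings.csv` (not part of the original readme; the relative path is an assumption based on the repository layout above):

```python
import pandas as pd

ratings = pd.read_csv(
    "recommender-systems-class-master/data/movielens_small/ratings.csv",  # assumed path
    encoding="utf-8",
)

# Timestamps are seconds since the Unix epoch (UTC)
ratings["datetime"] = pd.to_datetime(ratings["timestamp"], unit="s", utc=True)

print(ratings[["userId", "movieId", "rating", "datetime"]].head())
```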
Tags Data File Structure (tags.csv)
-----------------------------------

All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format:

    userId,movieId,tag,timestamp

The lines within this file are ordered first by userId, then, within user, by movieId.

Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.


Movies Data File Structure (movies.csv)
---------------------------------------

Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:

    movieId,title,genres

Movie titles are entered manually or imported from <https://www.themoviedb.org/>, and include the year of release in parentheses. Errors and inconsistencies may exist in these titles.

Genres are a pipe-separated list, and are selected from the following:

* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)


Links Data File Structure (links.csv)
-------------------------------------

Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:

    movieId,imdbId,tmdbId

movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.

imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.

tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.

Use of the resources listed above is subject to the terms of each provider.


Cross-Validation
----------------

Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples.
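A hedged companion sketch (also not part of the readme) for `movies.csv` and `links.csv`: splitting the pipe-separated genres and rebuilding IMDb URLs, assuming `imdbId` values need zero-padding to seven digits as in the `tt0114709` example above:

```python
import pandas as pd

base = "recommender-systems-class-master/data/movielens_small"  # assumed path

movies = pd.read_csv(f"{base}/movies.csv")
links = pd.read_csv(f"{base}/links.csv")

# Pipe-separated genres -> Python lists
movies["genres"] = movies["genres"].str.split("|")

# Join external identifiers and build IMDb URLs (zero-padding is an assumption)
movies = movies.merge(links, on="movieId", how="left")
movies["imdb_url"] = "http://www.imdb.com/title/tt" + movies["imdbId"].astype(str).str.zfill(7) + "/"

print(movies[["movieId", "title", "genres", "imdb_url"]].head())
```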
3684    recommender-systems-class-master/data/movielens_small/tags.csv (new file, diff suppressed)
18      recommender-systems-class-master/data/steam/readme.txt (new file)

@@ -0,0 +1,18 @@
https://www.kaggle.com/tamber/steam-video-games

Context
Steam is the world's most popular PC gaming hub, with over 6,000 games and a community of millions of gamers. With a massive collection that includes everything from AAA blockbusters to small indie titles, great discovery tools are a highly valuable asset for Steam. How can we make them better?

Content
This dataset is a list of user behaviors, with columns: user-id, game-title, behavior-name, value. The behaviors included are 'purchase' and 'play'. The value indicates the degree to which the behavior was performed - in the case of 'purchase' the value is always 1, and in the case of 'play' the value represents the number of hours the user has played the game.

Acknowledgements
This dataset is generated entirely from public Steam data, so we want to thank Steam for building such an awesome platform and community!

Inspiration
The dataset is formatted to be compatible with Tamber. Build a Tamber engine and take it for a spin!

Combine our collaborative filter's results with your favorite Machine Learning techniques with Ensemble Learning, or make Tamber do battle with something else you've built.

Have fun,
The Tamber Team

200000  recommender-systems-class-master/data/steam/steam-200k.csv (new file, diff suppressed)
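A small hedged sketch (not part of the dataset description) for loading the Steam interactions; the Kaggle CSV is assumed to have no header row, with columns as described above:

```python
import pandas as pd

steam = pd.read_csv(
    "recommender-systems-class-master/data/steam/steam-200k.csv",  # assumed path
    header=None,
)
steam = steam.iloc[:, :4]  # keep the four documented columns
steam.columns = ["user_id", "game_title", "behavior", "value"]

purchases = steam[steam["behavior"] == "purchase"]  # value is always 1
play = steam[steam["behavior"] == "play"]           # value is hours played
print(len(purchases), len(play))
```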
@@ -0,0 +1,271 @@
# Load libraries ---------------------------------------------

from datetime import datetime, timedelta
from dateutil.easter import easter
from data_preprocessing.dataset_specification import DatasetSpecification

import pandas as pd
import numpy as np
# ------------------------------------------------------------


class DataPreprocessingToolkit(object):
    """Preprocessing steps for the hotel reservations dataset: date fixes, group-reservation handling and bucket/term mappings."""

    def __init__(self):
        dataset_specification = DatasetSpecification()

        self.sum_columns = dataset_specification.get_sum_columns()
        self.mean_columns = dataset_specification.get_mean_columns()
        self.mode_columns = dataset_specification.get_mode_columns()
        self.first_columns = dataset_specification.get_first_columns()

        self.nights_buckets = dataset_specification.get_nights_buckets()
        self.npeople_buckets = dataset_specification.get_npeople_buckets()
        self.room_segment_buckets = dataset_specification.get_room_segment_buckets()

        self.arrival_terms = dataset_specification.get_arrival_terms()

        self.item_features_columns = dataset_specification.get_items_df_feature_columns()

    # #########################
    # Entire datasets functions
    # #########################

    def fix_date_to(self, df):
        df.loc[:, "date_to"] = df["date_to"].apply(lambda x: x + timedelta(days=1))
        return df

    def add_length_of_stay(self, df):
        # Write your code here
        return df

    def add_book_to_arrival(self, df):
        df.loc[:, "book_to_arrival"] = (df["date_from"] - df["booking_date"]).dt.days
        return df

    def add_nrooms(self, df):
        df.loc[:, "n_rooms"] = 1
        return df

    def add_weekend_stay(self, df):
        s = df["date_from"].dt.dayofweek
        e = df["date_to"].dt.dayofweek
        dt = (df["date_to"] - df["date_from"]).dt.days
        df.loc[:, "weekend_stay"] = (((s >= 4) & (s != 6)) | (e >= 5) | ((e < s) & (s != 6)) | (dt >= 6))
        df.loc[:, "weekend_stay"] = df["weekend_stay"].replace({True: 'True', False: 'False'})
        return df

    def add_night_price(self, df):
        # Write your code here
        return df

    def clip_book_to_arrival(self, df):
        df.loc[:, "book_to_arrival"] = np.maximum(df["book_to_arrival"], 0)
        return df

    def sum_npeople(self, df):
        df.loc[:, "n_people"] = np.maximum(df["n_people"] + df["n_children_1"] + df["n_children_2"] + df["n_children_3"], 1)
        return df

    def filter_out_company_clients(self, df):
        df = df.loc[df["is_company"] == 0]
        return df

    def filter_out_long_stays(self, df):
        df = df.loc[df["length_of_stay"] <= 21]
        return df

    def leave_one_from_group_reservations(self, df):
        unique_group_rows = []

        df.loc[:, "group_id"] = df["group_id"].fillna(-1)

        group_ids = []
        for idx, row in df.iterrows():
            if row["group_id"] != -1:
                if row["group_id"] not in group_ids:
                    unique_group_rows.append(row)
                    group_ids.append(row["group_id"])
            else:
                unique_group_rows.append(row)

        cleaned_dataset = pd.DataFrame(unique_group_rows, columns=df.columns)

        # Return the deduplicated reservations (the original returned the unmodified df)
        return cleaned_dataset

    def aggregate_group_reservations(self, df):
        non_group_reservations = df.loc[df["group_id"] == "",
                                        self.sum_columns + self.mean_columns + self.mode_columns + self.first_columns]
        group_reservations = df.loc[df["group_id"] != ""]

        agg_datasets = [group_reservations.loc[:, ["group_id"] + self.sum_columns].groupby("group_id").sum(),
                        group_reservations.loc[:, ["group_id"] + self.mean_columns].groupby("group_id").mean(),
                        group_reservations.loc[:, ["group_id"] + self.mode_columns].groupby("group_id").agg(lambda x: x.value_counts().index[0]),
                        group_reservations.loc[:, ["group_id"] + self.first_columns].groupby("group_id").first()]

        group_reservations = agg_datasets[0]
        for i in range(1, len(agg_datasets)):
            group_reservations = group_reservations.merge(agg_datasets[i], on="group_id")

        group_reservations = group_reservations.reset_index(drop=True)

        df = pd.concat([non_group_reservations, group_reservations])

        return df

    def leave_only_ota(self, df):
        df = df.loc[df.loc[:, "Source"].apply(lambda x: "booking" in x.lower() or "expedia" in x.lower())]
        return df

    def map_date_to_term_datasets(self, df):
        df.loc[:, "date_from"] = df["date_from"].astype(str).apply(lambda x: x[:10])
        df.loc[:, 'term'] = df['date_from'].apply(lambda x: self.map_date_to_term(x))
        return df

    def map_length_of_stay_to_nights_buckets(self, df):
        df.loc[:, 'length_of_stay_bucket'] = df['length_of_stay'].apply(lambda x: self.map_value_to_bucket(x, self.nights_buckets))
        return df

    def map_night_price_to_room_segment_buckets(self, df):
        # Write your code here
        return df

    # def map_night_price_to_room_segment_buckets(self, df):
    #     night_prices = df.loc[df['accomodation_price'] > 1]\
    #         .groupby(['term', 'room_group_id'])['night_price'].mean().reset_index()
    #     night_prices.columns = ['term', 'room_group_id', 'termnight_price']
    #     df = pd.merge(df, night_prices, on=['term', 'room_group_id'], how='left')
    #     df.loc[:, 'room_segment'] = df['termnight_price'].apply(
    #         lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
    #     df = df.drop(columns=['termnight_price'])
    #     return df

    def map_npeople_to_npeople_buckets(self, df):
        df.loc[:, 'n_people_bucket'] = df['n_people'].apply(lambda x: self.map_value_to_bucket(x, self.npeople_buckets))
        return df

    def map_item_to_item_id(self, df):
        df.loc[:, 'item'] = df[self.item_features_columns].astype(str).agg(' '.join, axis=1)

        ids = df['item'].unique().tolist()
        mapping = {ids[i]: i for i in range(len(ids))}

        df['item_id'] = df['item'].apply(lambda x: mapping[x])

        return df

    def add_interaction_id(self, df):
        df.loc[:, 'interaction_id'] = range(df.shape[0])
        return df

    # ################
    # Column functions
    # ################

    def bundle_period(self, diff):
        diff = float(diff)
        if int(diff) < 0:
            return "<0"
        elif int(diff) <= 7:
            return diff
        elif 7 < int(diff) <= 14:
            return "<14"
        elif 14 < int(diff) <= 30:
            return "<30"
        elif 30 < int(diff) <= 60:
            return "<60"
        elif 60 < int(diff) <= 180:
            return "<180"
        elif int(diff) > 180:
            return ">180"

    def bundle_price(self, price):
        mod = 300.0
        return int((price + mod / 2) / mod) * mod

    def map_date_to_season(self, date):
        day = int(date[8:10])
        month = int(date[5:7])
        if (month == 12 and day >= 21) or (month == 1) or (month == 2) or (month == 3 and day <= 19):
            return "Winter"
        if (month == 3 and day >= 20) or (month == 4) or (month == 5) or (month == 6 and day <= 20):
            return "Spring"
        if (month == 6 and day >= 21) or (month == 7) or (month == 8) or (month == 9 and day <= 22):
            return "Summer"
        if (month == 9 and day >= 23) or (month == 10) or (month == 11) or (month == 12 and day <= 20):
            return "Autumn"

    def map_value_to_bucket(self, value, buckets):
        if value == "":
            return str(buckets[0]).replace(", ", "-")
        for bucket in buckets:
            if bucket[0] <= value <= bucket[1]:
                return str(bucket).replace(", ", "-")

    def map_date_to_term(self, date):

        m = int(date[5:7])
        d = int(date[8:10])
        term = None

        for arrival_term in self.arrival_terms:
            if arrival_term == "Easter":
                year = int(date[:4])
                easter_date = easter(year)
                easter_start = easter_date + timedelta(days=-4)
                easter_end = easter_date + timedelta(days=1)
                esm = easter_start.month
                esd = easter_start.day
                eem = easter_end.month
                eed = easter_end.day
                if ((m > esm) or (m == esm and d >= esd)) and ((m < eem) or (m == eem and d <= eed)):
                    term = arrival_term
                    break

            elif arrival_term == "NewYear":
                sm = self.arrival_terms[arrival_term][0]["start"]["m"]
                sd = self.arrival_terms[arrival_term][0]["start"]["d"]
                em = self.arrival_terms[arrival_term][0]["end"]["m"]
                ed = self.arrival_terms[arrival_term][0]["end"]["d"]
                if ((m > sm) or (m == sm and d >= sd)) or ((m < em) or (m == em and d <= ed)):
                    term = arrival_term
                    break

            else:
                is_match = False

                for i in range(len(self.arrival_terms[arrival_term])):
                    sm = self.arrival_terms[arrival_term][i]["start"]["m"]
                    sd = self.arrival_terms[arrival_term][i]["start"]["d"]
                    em = self.arrival_terms[arrival_term][i]["end"]["m"]
                    ed = self.arrival_terms[arrival_term][i]["end"]["d"]
                    if ((m > sm) or (m == sm and d >= sd)) and ((m < em) or (m == em and d <= ed)):
                        term = arrival_term
                        is_match = True
                        break

                if is_match:
                    break

        return term

    def map_dates_to_terms(self, dates):

        terms = []
        for date in dates:
            term = self.map_date_to_term(date)
            terms.append(term)

        return terms

    def filter_out_historical_dates(self, date_list):
        """
        Filters out past dates from a list of dates.
        """
        future_dates = []

        for date in date_list:
            if date >= datetime.now():
                future_dates.append(date.strftime("%Y-%m-%d"))

        return future_dates
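A hedged usage sketch, not part of the repository, showing how these preprocessing steps might be chained on the hotel dataset; the CSV path and the date parsing are assumptions, while the column names come from the methods above:

```python
import pandas as pd

df = pd.read_csv("recommender-systems-class-master/data/hotel_data/hotel_data_original.csv")  # assumed path
for col in ["booking_date", "date_from", "date_to"]:
    df[col] = pd.to_datetime(df[col])

toolkit = DataPreprocessingToolkit()
df = toolkit.fix_date_to(df)
df = toolkit.add_book_to_arrival(df)
df = toolkit.clip_book_to_arrival(df)
df = toolkit.add_nrooms(df)
df = toolkit.add_weekend_stay(df)
df = toolkit.sum_npeople(df)
df = toolkit.filter_out_company_clients(df)
```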
@@ -0,0 +1,88 @@
# Load libraries ---------------------------------------------

from collections import defaultdict
import numpy as np

# ------------------------------------------------------------


class DatasetSpecification(object):

    def __init__(self):
        pass

    # ################
    # Original dataset functions
    # ################

    def get_sum_columns(self):
        return ["n_people", "n_children_1", "n_children_2", "n_children_3", "accomodation_price", "meal_price",
                "service_price", "paid", "n_rooms"]

    def get_mean_columns(self):
        return ['discount']

    def get_mode_columns(self):
        return ["room_id", "room_group_id", "date_from", "date_to", "booking_date", "rate_plan",
                "length_of_stay", "book_to_arrival", "weekend_stay"]

    def get_first_columns(self):
        return ["user_id", "client_id", "client_name", "email", "phone", "is_company"]

    def get_id_columns(self):
        return ["client_id", "client_name", "email", "phone"]

    # ################
    # Output dataset functions
    # ################

    def get_people_df_id_columns(self):
        return ['user_id']

    def get_people_df_feature_columns(self):
        return []

    def get_items_df_id_columns(self):
        return ['item_id']

    def get_items_df_feature_columns(self):
        return ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

    def get_purchases_df_id_columns(self):
        return ['user_id', 'item_id']

    def get_purchases_df_feature_columns(self):
        return []

    # ################
    # Mapping functions
    # ################

    def get_nights_buckets(self):
        return [[0, 1], [2, 3], [4, 7], [8, np.inf]]

    def get_npeople_buckets(self):
        return [[1, 1], [2, 2], [3, 4], [5, np.inf]]

    def get_room_segment_buckets(self):
        return [[0, 160], [160, 260], [260, 360], [360, 500], [500, 900], [900, np.inf]]

    def get_book_to_arrival_buckets(self):
        return [[0, 0], [1, 2], [3, 4], [5, 7], [8, 14], [15, 30], [31, 60], [61, 90], [91, 180], [181, np.inf]]

    def get_arrival_terms(self):
        arrival_terms = {"Easter": [{"start": {"m": np.nan, "d": np.nan}, "end": {"m": np.nan, "d": np.nan}}],  # Treated with priority
                         "Christmas": [{"start": {"m": 12, "d": 22}, "end": {"m": 12, "d": 27}}],
                         "NewYear": [{"start": {"m": 12, "d": 28}, "end": {"m": 1, "d": 4}}],
                         "WinterVacation": [{"start": {"m": 1, "d": 5}, "end": {"m": 2, "d": 29}}],
                         "OffSeason": [
                             {"start": {"m": 3, "d": 1}, "end": {"m": 4, "d": 27}},
                             {"start": {"m": 5, "d": 6}, "end": {"m": 6, "d": 20}},
                             {"start": {"m": 9, "d": 26}, "end": {"m": 12, "d": 21}}],
                         "MayLongWeekend": [{"start": {"m": 4, "d": 28}, "end": {"m": 5, "d": 5}}],
                         "LowSeason": [
                             {"start": {"m": 6, "d": 21}, "end": {"m": 7, "d": 10}},
                             {"start": {"m": 8, "d": 23}, "end": {"m": 9, "d": 25}}],
                         "HighSeason": [{"start": {"m": 7, "d": 11}, "end": {"m": 8, "d": 22}}]}
        return arrival_terms
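A quick hedged example (not in the repository) of how these bucket definitions translate raw values into labels via `DataPreprocessingToolkit.map_value_to_bucket`:

```python
spec = DatasetSpecification()
toolkit = DataPreprocessingToolkit()

print(toolkit.map_value_to_bucket(3, spec.get_npeople_buckets()))   # "[3-4]"
print(toolkit.map_value_to_bucket(12, spec.get_nights_buckets()))   # "[8-inf]"
```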
@@ -0,0 +1,77 @@
# Load libraries ---------------------------------------------

# ------------------------------------------------------------


class PeopleIdentifier(object):

    def __init__(self):
        self.id_column_names = []
        self.pid_cname = ""
        self.next_available_pid = 0
        self.cid_to_pid = {}  # {"col1": {cid1: pid1, cid2: pid2}, "col2": ...}
        self.pid_to_cid = {}  # {pid1: {"col1": set(cid1, cid2, ...), "col2": set(...), ...}, pid2: ...}
        self.data = None

    def add_pid(self, data, id_column_names, pid_cname):
        self.id_column_names = id_column_names
        self.pid_cname = pid_cname

        for cid_cname in id_column_names:
            self.cid_to_pid[cid_cname] = {}

        for idx, reservation in data.iterrows():
            pids = set()
            for cid_cname in id_column_names:
                if reservation[cid_cname] in self.cid_to_pid[cid_cname]:
                    pids.add(self.cid_to_pid[cid_cname][reservation[cid_cname]])
                    # print(cid_cname, reservation[cid_cname], self.cid_to_pid[cid_cname][reservation[cid_cname]])

            if len(pids) > 0:
                min_pid = min(pids)

                self.set_pid(min_pid, reservation)

                # Merge pids connected through this node
                if len(pids) > 1:
                    pids.remove(min_pid)
                    self.merge_pids(pids, min_pid)

                # print("Chosen pid: {}".format(min_pid))
            else:
                new_pid = self.next_available_pid
                self.next_available_pid += 1

                self.set_pid(new_pid, reservation)
                # print("Chosen pid: {}".format(new_pid))

            # print("=======")
            # print(self.pid_to_cid)
            # print("=======")

        data_pid = data.copy()
        data_pid.loc[:, pid_cname] = data_pid.loc[:, id_column_names[0]].apply(lambda x: self.cid_to_pid[id_column_names[0]][x])
        self.data = data_pid

        return data_pid

    def set_pid(self, pid, reservation):
        for cid_cname in self.id_column_names:
            if reservation[cid_cname] != "":
                self.cid_to_pid[cid_cname][reservation[cid_cname]] = pid
        if pid in self.pid_to_cid:
            for cid_cname in self.id_column_names:
                self.pid_to_cid[pid][cid_cname] |= {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
        else:
            self.pid_to_cid[pid] = {cid_cname: {reservation[cid_cname]} if reservation[cid_cname] != "" else set()
                                    for cid_cname in self.id_column_names}

    def merge_pids(self, pids_from, pid_to):
        # print("Merge pids", pids_from, pid_to, self.pid_to_cid)
        for pid_from in pids_from:
            for cid_cname in self.id_column_names:
                for cid in self.pid_to_cid[pid_from][cid_cname]:
                    self.cid_to_pid[cid_cname][cid] = pid_to
                self.pid_to_cid[pid_to][cid_cname] |= self.pid_to_cid[pid_from][cid_cname]
            self.pid_to_cid.pop(pid_from)
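A hedged illustration (not part of the repository) of how reservations sharing any identifier end up with the same person id; the column names and values are invented for the example:

```python
import pandas as pd

reservations = pd.DataFrame({
    "client_id": ["c1", "c2", "c3", "c2"],
    "email": ["a@x.com", "a@x.com", "b@y.com", ""],
})

identifier = PeopleIdentifier()
result = identifier.add_pid(reservations, ["client_id", "email"], "person_id")
print(result["person_id"].tolist())  # expected: [0, 0, 1, 0]
```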
86      recommender-systems-class-master/environment.yml (new file)

@@ -0,0 +1,86 @@
name: rs-class-env
channels:
  - defaults
dependencies:
  - pip=21.0.1
  - python=3.8.8
  - setuptools=52.0.0
  - sqlite=3.35.1
  - wheel=0.36.2
  - pip:
    - anyio==2.2.0
    - argon2-cffi==20.1.0
    - async-generator==1.10
    - attrs==20.3.0
    - babel==2.9.0
    - backcall==0.2.0
    - bleach==3.3.0
    - cffi==1.14.5
    - chardet==4.0.0
    - colorama==0.4.4
    - cycler==0.10.0
    - decorator==4.4.2
    - defusedxml==0.7.1
    - entrypoints==0.3
    - idna==2.10
    - ipykernel==5.5.0
    - ipython==7.21.0
    - ipython-genutils==0.2.0
    - jedi==0.18.0
    - jinja2==2.11.3
    - joblib==1.0.1
    - json5==0.9.5
    - jsonschema==3.2.0
    - jupyter-client==6.1.12
    - jupyter-core==4.7.1
    - jupyter-packaging==0.7.12
    - jupyter-server==1.4.1
    - jupyterlab==3.0.11
    - jupyterlab-pygments==0.1.2
    - jupyterlab-server==2.3.0
    - kiwisolver==1.3.1
    - markupsafe==1.1.1
    - matplotlib==3.3.4
    - mistune==0.8.4
    - nbclassic==0.2.6
    - nbclient==0.5.3
    - nbconvert==6.0.7
    - nbformat==5.1.2
    - nest-asyncio==1.5.1
    - notebook==6.2.0
    - numpy==1.20.1
    - packaging==20.9
    - pandas==1.2.3
    - pandocfilters==1.4.3
    - parso==0.8.1
    - patsy==0.5.1
    - pickleshare==0.7.5
    - pillow==8.1.2
    - prometheus-client==0.9.0
    - prompt-toolkit==3.0.17
    - pycparser==2.20
    - pygments==2.8.1
    - pyparsing==2.4.7
    - pyrsistent==0.17.3
    - python-dateutil==2.8.1
    - pytz==2021.1
    - pyzmq==22.0.3
    - requests==2.25.1
    - scikit-learn==0.24.1
    - scipy==1.6.1
    - seaborn==0.11.1
    - send2trash==1.5.0
    - six==1.15.0
    - sklearn==0.0
    - sniffio==1.2.0
    - statsmodels==0.12.2
    - terminado==0.9.3
    - testpath==0.4.4
    - threadpoolctl==2.1.0
    - torch==1.8.0
    - tornado==6.1
    - traitlets==5.0.5
    - typing-extensions==3.7.4.3
    - urllib3==1.26.4
    - wcwidth==0.2.5
    - webencodings==0.5.1
@@ -0,0 +1,89 @@
# Load libraries ---------------------------------------------

import numpy as np
import pandas as pd
from collections import defaultdict

# ------------------------------------------------------------


def rmse(r_pred, r_real):
    return np.sqrt(np.sum(np.power(r_pred - r_real, 2)) / len(r_pred))


def mape(r_pred, r_real):
    return 1 / len(r_pred) * np.sum(np.abs(r_pred - r_real) / np.abs(r_real))


def tre(r_pred, r_real):
    return np.sum(np.abs(r_pred - r_real)) / np.sum(np.abs(r_real))


def hr(recommendations, real_interactions, n=1):
    """
    Assumes recommendations are ordered by user_id and then by score.

    :param pd.DataFrame recommendations:
    :param pd.DataFrame real_interactions:
    :param int n:
    """
    # Transform real_interactions to a dict for a large speed-up
    rui = defaultdict(lambda: 0)

    for idx, row in real_interactions.iterrows():
        rui[(row['user_id'], row['item_id'])] = 1

    result = 0.0

    previous_user_id = -1
    rank = 0
    for idx, row in recommendations.iterrows():
        if previous_user_id == row['user_id']:
            rank += 1
        else:
            rank = 1

        if rank <= n:
            result += rui[(row['user_id'], row['item_id'])]

        previous_user_id = row['user_id']

    if len(recommendations['user_id'].unique()) > 0:
        result /= len(recommendations['user_id'].unique())

    return result


def ndcg(recommendations, real_interactions, n=1):
    """
    Assumes recommendations are ordered by user_id and then by score.

    :param pd.DataFrame recommendations:
    :param pd.DataFrame real_interactions:
    :param int n:
    """
    # Transform real_interactions to a dict for a large speed-up
    rui = defaultdict(lambda: 0)

    for idx, row in real_interactions.iterrows():
        rui[(row['user_id'], row['item_id'])] = 1

    result = 0.0

    previous_user_id = -1
    rank = 0
    for idx, row in recommendations.iterrows():
        if previous_user_id == row['user_id']:
            rank += 1
        else:
            rank = 1

        if rank <= n:
            result += rui[(row['user_id'], row['item_id'])] / np.log2(1 + rank)

        previous_user_id = row['user_id']

    if len(recommendations['user_id'].unique()) > 0:
        result /= len(recommendations['user_id'].unique())

    return result
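A small hedged example (not part of the repository) of HR@n and NDCG@n on toy data; the recommendations are ordered by user_id and then by score, as the docstrings require:

```python
import pandas as pd

recommendations = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2, 2],
    'item_id': [10, 20, 30, 40, 50, 60],
    'score':   [0.9, 0.8, 0.7, 0.95, 0.6, 0.5],
})
real_interactions = pd.DataFrame({'user_id': [1, 2], 'item_id': [20, 40]})

print(hr(recommendations, real_interactions, n=1))    # 0.5 (only user 2 hits in the top-1)
print(hr(recommendations, real_interactions, n=3))    # 1.0 (both users hit within the top-3)
print(ndcg(recommendations, real_interactions, n=3))  # ~0.82 (the rank-2 hit is discounted)
```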
@@ -0,0 +1,209 @@
# Load libraries ---------------------------------------------

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from evaluation_and_testing.evaluation_measures import rmse
from evaluation_and_testing.evaluation_measures import mape
from evaluation_and_testing.evaluation_measures import tre
from evaluation_and_testing.evaluation_measures import hr
from evaluation_and_testing.evaluation_measures import ndcg

# ------------------------------------------------------------


def evaluate_train_test_split_explicit(recommender, interactions_df, items_df, seed=6789):
    rng = np.random.RandomState(seed=seed)

    if isinstance(interactions_df, dict):
        # If interactions_df is a dict with already split data, use the split
        interactions_df_train = interactions_df['train']
        interactions_df_test = interactions_df['test']
    else:
        # Otherwise split the dataset into train and test

        shuffle = np.arange(len(interactions_df))
        rng.shuffle(shuffle)
        shuffle = list(shuffle)

        train_test_split = 0.8
        split_index = int(len(interactions_df) * train_test_split)

        interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
        interactions_df_test = interactions_df.iloc[shuffle[split_index:]]

    # Train the recommender

    recommender.fit(interactions_df_train, None, items_df)

    # Gather predictions

    r_pred = []

    for idx, row in interactions_df_test.iterrows():
        users_df = pd.DataFrame([row['user_id']], columns=['user_id'])
        eval_items_df = pd.DataFrame([row['item_id']], columns=['item_id'])
        eval_items_df = pd.merge(eval_items_df, items_df, on='item_id')
        recommendations = recommender.recommend(users_df, eval_items_df, n_recommendations=1)

        r_pred.append(recommendations.iloc[0]['score'])

    # Gather real ratings

    r_real = np.array(interactions_df_test['rating'].tolist())

    # Return evaluation metrics

    return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)


def evaluate_train_test_split_implicit(recommender, interactions_df, items_df, seed=6789):
    # Write your code here
    rng = np.random.RandomState(seed=seed)

    if isinstance(interactions_df, dict):
        # If interactions_df is a dict with already split data, use the split
        interactions_df_train = interactions_df['train']
        interactions_df_test = interactions_df['test']
    else:
        # Otherwise split the dataset into train and test

        shuffle = np.arange(len(interactions_df))
        rng.shuffle(shuffle)
        shuffle = list(shuffle)

        train_test_split = 0.8
        split_index = int(len(interactions_df) * train_test_split)

        interactions_df_train = interactions_df.iloc[shuffle[:split_index]]
        interactions_df_test = interactions_df.iloc[shuffle[split_index:]]

    hr_1 = []
    hr_3 = []
    hr_5 = []
    hr_10 = []
    ndcg_1 = []
    ndcg_3 = []
    ndcg_5 = []
    ndcg_10 = []

    # Train the recommender

    recommender.fit(interactions_df_train, None, items_df)

    # Make recommendations for each user in the test set and calculate the metric
    # against all items of that user in the test set

    test_user_interactions = interactions_df_test.groupby(by='user_id')

    for user_id, user_interactions in test_user_interactions:

        recommendations = recommender.recommend(pd.DataFrame([user_id], columns=['user_id']),
                                                items_df, n_recommendations=10)

        hr_1.append(hr(recommendations, user_interactions, n=1))
        hr_3.append(hr(recommendations, user_interactions, n=3))
        hr_5.append(hr(recommendations, user_interactions, n=5))
        hr_10.append(hr(recommendations, user_interactions, n=10))
        ndcg_1.append(ndcg(recommendations, user_interactions, n=1))
        ndcg_3.append(ndcg(recommendations, user_interactions, n=3))
        ndcg_5.append(ndcg(recommendations, user_interactions, n=5))
        ndcg_10.append(ndcg(recommendations, user_interactions, n=10))

    hr_1 = np.mean(hr_1)
    hr_3 = np.mean(hr_3)
    hr_5 = np.mean(hr_5)
    hr_10 = np.mean(hr_10)
    ndcg_1 = np.mean(ndcg_1)
    ndcg_3 = np.mean(ndcg_3)
    ndcg_5 = np.mean(ndcg_5)
    ndcg_10 = np.mean(ndcg_10)

    return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10


def evaluate_leave_one_out_explicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
    rng = np.random.RandomState(seed=seed)

    # Prepare splits of the datasets
    kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)

    # For each split of the dataset train the recommender, generate recommendations and evaluate

    r_pred = []
    r_real = []
    n_eval = 1
    for train_index, test_index in kf.split(interactions_df.index):
        interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
        interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]

        recommender.fit(interactions_df_train, None, items_df)
        recommendations = recommender.recommend(
            interactions_df_test.loc[:, ['user_id']],
            items_df.loc[items_df['item_id'] == interactions_df_test.iloc[0]['item_id']])

        r_pred.append(recommendations.iloc[0]['score'])
        r_real.append(interactions_df_test.iloc[0]['rating'])

        if n_eval == max_evals:
            break
        n_eval += 1

    r_pred = np.array(r_pred)
    r_real = np.array(r_real)

    # Return evaluation metrics

    return rmse(r_pred, r_real), mape(r_pred, r_real), tre(r_pred, r_real)


def evaluate_leave_one_out_implicit(recommender, interactions_df, items_df, max_evals=300, seed=6789):
    rng = np.random.RandomState(seed=seed)

    # Prepare splits of the datasets
    kf = KFold(n_splits=len(interactions_df), random_state=rng, shuffle=True)

    hr_1 = []
    hr_3 = []
    hr_5 = []
    hr_10 = []
    ndcg_1 = []
    ndcg_3 = []
    ndcg_5 = []
    ndcg_10 = []

    # For each split of the dataset train the recommender, generate recommendations and evaluate

    n_eval = 1
    for train_index, test_index in kf.split(interactions_df.index):
        interactions_df_train = interactions_df.loc[interactions_df.index[train_index]]
        interactions_df_test = interactions_df.loc[interactions_df.index[test_index]]

        recommender.fit(interactions_df_train, None, items_df)
        recommendations = recommender.recommend(
            interactions_df_test.loc[:, ['user_id']], items_df, n_recommendations=10)

        hr_1.append(hr(recommendations, interactions_df_test, n=1))
        hr_3.append(hr(recommendations, interactions_df_test, n=3))
        hr_5.append(hr(recommendations, interactions_df_test, n=5))
        hr_10.append(hr(recommendations, interactions_df_test, n=10))
        ndcg_1.append(ndcg(recommendations, interactions_df_test, n=1))
        ndcg_3.append(ndcg(recommendations, interactions_df_test, n=3))
        ndcg_5.append(ndcg(recommendations, interactions_df_test, n=5))
        ndcg_10.append(ndcg(recommendations, interactions_df_test, n=10))

        if n_eval == max_evals:
            break
        n_eval += 1

    hr_1 = np.mean(hr_1)
    hr_3 = np.mean(hr_3)
    hr_5 = np.mean(hr_5)
    hr_10 = np.mean(hr_10)
    ndcg_1 = np.mean(ndcg_1)
    ndcg_3 = np.mean(ndcg_3)
    ndcg_5 = np.mean(ndcg_5)
    ndcg_10 = np.mean(ndcg_10)

    return hr_1, hr_3, hr_5, hr_10, ndcg_1, ndcg_3, ndcg_5, ndcg_10
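The evaluation functions above assume a recommender object with fit(interactions_df, users_df, items_df) and recommend(users_df, items_df, n_recommendations) returning a DataFrame with user_id, item_id and score, ordered by user_id and then by score. A hedged toy implementation of that implied interface (not a class from the repository):

```python
import numpy as np
import pandas as pd


class RandomScoreRecommender(object):
    """Toy recommender used only to illustrate the interface expected above."""

    def __init__(self, seed=6789):
        self.rng = np.random.RandomState(seed=seed)

    def fit(self, interactions_df, users_df, items_df):
        pass  # nothing to learn for random scores

    def recommend(self, users_df, items_df, n_recommendations=1):
        rows = []
        for user_id in users_df['user_id']:
            candidates = items_df['item_id'].unique()
            scores = self.rng.rand(len(candidates))
            top = np.argsort(-scores)[:n_recommendations]  # descending score order per user
            rows += [{'user_id': user_id, 'item_id': candidates[i], 'score': scores[i]} for i in top]
        return pd.DataFrame(rows, columns=['user_id', 'item_id', 'score'])
```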
@@ -0,0 +1,27 @@
# Enable tab completion
source ~/git-completion.bash

# colors!
green="\[\033[0;32m\]"
blue="\[\033[0;34m\]"
purple="\[\033[0;35m\]"
yellow="\[\033[0;33m\]"
reset="\[\033[0m\]"

# Change command prompt
source ~/git-prompt.sh
export GIT_PS1_SHOWDIRTYSTATE=1
# '\u' adds the name of the current user to the prompt
# '\$(__git_ps1)' adds git-related stuff
# '\W' adds the name of the current directory
export PS1="$purple\u$green\$(__git_ps1)$yellow \W $ $reset"

alias ntpd="C:/Program\ Files\ \(x86\)/Notepad++/notepad++.exe"

test -f ~/.profile && . ~/.profile
test -f ~/.bashrc && . ~/.bashrc
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
eval "$('/c/ProgramData/Anaconda3/Scripts/conda.exe' 'shell.bash' 'hook')"
# <<< conda initialize <<<
23      recommender-systems-class-master/git_configuration/.bashrc (new file)

@@ -0,0 +1,23 @@
env=~/.ssh/agent.env

agent_load_env () { test -f "$env" && . "$env" >| /dev/null ; }

agent_start () {
    (umask 077; ssh-agent >| "$env")
    . "$env" >| /dev/null ; }

agent_load_env

# agent_run_state: 0=agent running w/ key; 1=agent w/o key; 2=agent not running
agent_run_state=$(ssh-add -l >| /dev/null 2>&1; echo $?)

if [ ! "$SSH_AUTH_SOCK" ] || [ $agent_run_state = 2 ]; then
    agent_start
    ssh-add ~/.ssh/PZ_BitBucket_key;
    ssh-add ~/.ssh/PZ_GitHub_key;
elif [ "$SSH_AUTH_SOCK" ] && [ $agent_run_state = 1 ]; then
    ssh-add ~/.ssh/PZ_BitBucket_key;
    ssh-add ~/.ssh/PZ_GitHub_key;
fi

unset env
537     recommender-systems-class-master/git_configuration/git-prompt.sh (new file)
@ -0,0 +1,537 @@
|
||||
# bash/zsh git prompt support
|
||||
#
|
||||
# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
|
||||
# Distributed under the GNU General Public License, version 2.0.
|
||||
#
|
||||
# This script allows you to see repository status in your prompt.
|
||||
#
|
||||
# To enable:
|
||||
#
|
||||
# 1) Copy this file to somewhere (e.g. ~/.git-prompt.sh).
|
||||
# 2) Add the following line to your .bashrc/.zshrc:
|
||||
# source ~/.git-prompt.sh
|
||||
# 3a) Change your PS1 to call __git_ps1 as
|
||||
# command-substitution:
|
||||
# Bash: PS1='[\u@\h \W$(__git_ps1 " (%s)")]\$ '
|
||||
# ZSH: setopt PROMPT_SUBST ; PS1='[%n@%m %c$(__git_ps1 " (%s)")]\$ '
|
||||
# the optional argument will be used as format string.
|
||||
# 3b) Alternatively, for a slightly faster prompt, __git_ps1 can
|
||||
# be used for PROMPT_COMMAND in Bash or for precmd() in Zsh
|
||||
# with two parameters, <pre> and <post>, which are strings
|
||||
# you would put in $PS1 before and after the status string
|
||||
# generated by the git-prompt machinery. e.g.
|
||||
# Bash: PROMPT_COMMAND='__git_ps1 "\u@\h:\w" "\\\$ "'
|
||||
# will show username, at-sign, host, colon, cwd, then
|
||||
# various status string, followed by dollar and SP, as
|
||||
# your prompt.
|
||||
# ZSH: precmd () { __git_ps1 "%n" ":%~$ " "|%s" }
|
||||
# will show username, pipe, then various status string,
|
||||
# followed by colon, cwd, dollar and SP, as your prompt.
|
||||
# Optionally, you can supply a third argument with a printf
|
||||
# format string to finetune the output of the branch status
|
||||
#
|
||||
# The repository status will be displayed only if you are currently in a
|
||||
# git repository. The %s token is the placeholder for the shown status.
|
||||
#
|
||||
# The prompt status always includes the current branch name.
|
||||
#
|
||||
# In addition, if you set GIT_PS1_SHOWDIRTYSTATE to a nonempty value,
|
||||
# unstaged (*) and staged (+) changes will be shown next to the branch
|
||||
# name. You can configure this per-repository with the
|
||||
# bash.showDirtyState variable, which defaults to true once
|
||||
# GIT_PS1_SHOWDIRTYSTATE is enabled.
|
||||
#
|
||||
# You can also see if currently something is stashed, by setting
|
||||
# GIT_PS1_SHOWSTASHSTATE to a nonempty value. If something is stashed,
|
||||
# then a '$' will be shown next to the branch name.
|
||||
#
|
||||
# If you would like to see if there're untracked files, then you can set
|
||||
# GIT_PS1_SHOWUNTRACKEDFILES to a nonempty value. If there're untracked
|
||||
# files, then a '%' will be shown next to the branch name. You can
|
||||
# configure this per-repository with the bash.showUntrackedFiles
|
||||
# variable, which defaults to true once GIT_PS1_SHOWUNTRACKEDFILES is
|
||||
# enabled.
|
||||
#
|
||||
# If you would like to see the difference between HEAD and its upstream,
|
||||
# set GIT_PS1_SHOWUPSTREAM="auto". A "<" indicates you are behind, ">"
|
||||
# indicates you are ahead, "<>" indicates you have diverged and "="
|
||||
# indicates that there is no difference. You can further control
|
||||
# behaviour by setting GIT_PS1_SHOWUPSTREAM to a space-separated list
|
||||
# of values:
|
||||
#
|
||||
# verbose show number of commits ahead/behind (+/-) upstream
|
||||
# name if verbose, then also show the upstream abbrev name
|
||||
# legacy don't use the '--count' option available in recent
|
||||
# versions of git-rev-list
|
||||
# git always compare HEAD to @{upstream}
|
||||
# svn always compare HEAD to your SVN upstream
|
||||
#
|
||||
# You can change the separator between the branch name and the above
|
||||
# state symbols by setting GIT_PS1_STATESEPARATOR. The default separator
|
||||
# is SP.
|
||||
#
|
||||
# By default, __git_ps1 will compare HEAD to your SVN upstream if it can
|
||||
# find one, or @{upstream} otherwise. Once you have set
|
||||
# GIT_PS1_SHOWUPSTREAM, you can override it on a per-repository basis by
|
||||
# setting the bash.showUpstream config variable.
|
||||
#
|
||||
# If you would like to see more information about the identity of
|
||||
# commits checked out as a detached HEAD, set GIT_PS1_DESCRIBE_STYLE
|
||||
# to one of these values:
|
||||
#
|
||||
# contains relative to newer annotated tag (v1.6.3.2~35)
|
||||
# branch relative to newer tag or branch (master~4)
|
||||
# describe relative to older annotated tag (v1.6.3.1-13-gdd42c2f)
|
||||
# tag relative to any older tag (v1.6.3.1-13-gdd42c2f)
|
||||
# default exactly matching tag
|
||||
#
|
||||
# If you would like a colored hint about the current dirty state, set
|
||||
# GIT_PS1_SHOWCOLORHINTS to a nonempty value. The colors are based on
|
||||
# the colored output of "git status -sb" and are available only when
|
||||
# using __git_ps1 for PROMPT_COMMAND or precmd.
|
||||
#
|
||||
# If you would like __git_ps1 to do nothing in the case when the current
|
||||
# directory is set up to be ignored by git, then set
|
||||
# GIT_PS1_HIDE_IF_PWD_IGNORED to a nonempty value. Override this on the
|
||||
# repository level by setting bash.hideIfPwdIgnored to "false".
|
||||
|
||||
# check whether printf supports -v
|
||||
__git_printf_supports_v=
|
||||
printf -v __git_printf_supports_v -- '%s' yes >/dev/null 2>&1
|
||||
|
||||
# stores the divergence from upstream in $p
|
||||
# used by GIT_PS1_SHOWUPSTREAM
|
||||
__git_ps1_show_upstream ()
|
||||
{
|
||||
local key value
|
||||
local svn_remote svn_url_pattern count n
|
||||
local upstream=git legacy="" verbose="" name=""
|
||||
|
||||
svn_remote=()
|
||||
# get some config options from git-config
|
||||
local output="$(git config -z --get-regexp '^(svn-remote\..*\.url|bash\.showupstream)$' 2>/dev/null | tr '\0\n' '\n ')"
|
||||
while read -r key value; do
|
||||
case "$key" in
|
||||
bash.showupstream)
|
||||
GIT_PS1_SHOWUPSTREAM="$value"
|
||||
if [[ -z "${GIT_PS1_SHOWUPSTREAM}" ]]; then
|
||||
p=""
|
||||
return
|
||||
fi
|
||||
;;
|
||||
svn-remote.*.url)
|
||||
svn_remote[$((${#svn_remote[@]} + 1))]="$value"
|
||||
svn_url_pattern="$svn_url_pattern\\|$value"
|
||||
upstream=svn+git # default upstream is SVN if available, else git
|
||||
;;
|
||||
esac
|
||||
done <<< "$output"
|
||||
|
||||
# parse configuration values
|
||||
for option in ${GIT_PS1_SHOWUPSTREAM}; do
|
||||
case "$option" in
|
||||
git|svn) upstream="$option" ;;
|
||||
verbose) verbose=1 ;;
|
||||
legacy) legacy=1 ;;
|
||||
name) name=1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Find our upstream
|
||||
case "$upstream" in
|
||||
git) upstream="@{upstream}" ;;
|
||||
svn*)
|
||||
# get the upstream from the "git-svn-id: ..." in a commit message
|
||||
# (git-svn uses essentially the same procedure internally)
|
||||
local -a svn_upstream
|
||||
svn_upstream=($(git log --first-parent -1 \
|
||||
--grep="^git-svn-id: \(${svn_url_pattern#??}\)" 2>/dev/null))
|
||||
if [[ 0 -ne ${#svn_upstream[@]} ]]; then
|
||||
svn_upstream=${svn_upstream[${#svn_upstream[@]} - 2]}
|
||||
svn_upstream=${svn_upstream%@*}
|
||||
local n_stop="${#svn_remote[@]}"
|
||||
for ((n=1; n <= n_stop; n++)); do
|
||||
svn_upstream=${svn_upstream#${svn_remote[$n]}}
|
||||
done
|
||||
|
||||
if [[ -z "$svn_upstream" ]]; then
|
||||
# default branch name for checkouts with no layout:
|
||||
upstream=${GIT_SVN_ID:-git-svn}
|
||||
else
|
||||
upstream=${svn_upstream#/}
|
||||
fi
|
||||
elif [[ "svn+git" = "$upstream" ]]; then
|
||||
upstream="@{upstream}"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
# Find how many commits we are ahead/behind our upstream
|
||||
if [[ -z "$legacy" ]]; then
|
||||
count="$(git rev-list --count --left-right \
|
||||
"$upstream"...HEAD 2>/dev/null)"
|
||||
else
|
||||
# produce equivalent output to --count for older versions of git
|
||||
local commits
|
||||
if commits="$(git rev-list --left-right "$upstream"...HEAD 2>/dev/null)"
|
||||
then
|
||||
local commit behind=0 ahead=0
|
||||
for commit in $commits
|
||||
do
|
||||
case "$commit" in
|
||||
"<"*) ((behind++)) ;;
|
||||
*) ((ahead++)) ;;
|
||||
esac
|
||||
done
|
||||
count="$behind $ahead"
|
||||
else
|
||||
count=""
|
||||
fi
|
||||
fi
|
||||
|
||||
# calculate the result
|
||||
if [[ -z "$verbose" ]]; then
|
||||
case "$count" in
|
||||
"") # no upstream
|
||||
p="" ;;
|
||||
"0 0") # equal to upstream
|
||||
p="=" ;;
|
||||
"0 "*) # ahead of upstream
|
||||
p=">" ;;
|
||||
*" 0") # behind upstream
|
||||
p="<" ;;
|
||||
*) # diverged from upstream
|
||||
p="<>" ;;
|
||||
esac
|
||||
else
|
||||
case "$count" in
|
||||
"") # no upstream
|
||||
p="" ;;
|
||||
"0 0") # equal to upstream
|
||||
p=" u=" ;;
|
||||
"0 "*) # ahead of upstream
|
||||
p=" u+${count#0 }" ;;
|
||||
*" 0") # behind upstream
|
||||
p=" u-${count% 0}" ;;
|
||||
*) # diverged from upstream
|
||||
p=" u+${count#* }-${count% *}" ;;
|
||||
esac
|
||||
if [[ -n "$count" && -n "$name" ]]; then
|
||||
__git_ps1_upstream_name=$(git rev-parse \
|
||||
--abbrev-ref "$upstream" 2>/dev/null)
|
||||
if [ $pcmode = yes ] && [ $ps1_expanded = yes ]; then
|
||||
p="$p \${__git_ps1_upstream_name}"
|
||||
else
|
||||
p="$p ${__git_ps1_upstream_name}"
|
||||
# not needed anymore; keep user's
|
||||
# environment clean
|
||||
unset __git_ps1_upstream_name
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
# Helper function that is meant to be called from __git_ps1. It
|
||||
# injects color codes into the appropriate gitstring variables used
|
||||
# to build a gitstring.
|
||||
__git_ps1_colorize_gitstring ()
|
||||
{
|
||||
if [[ -n ${ZSH_VERSION-} ]]; then
|
||||
local c_red='%F{red}'
|
||||
local c_green='%F{green}'
|
||||
local c_lblue='%F{blue}'
|
||||
local c_yellow='%F{yellow}'
|
||||
local c_clear='%f'
|
||||
else
|
||||
# Using \[ and \] around colors is necessary to prevent
|
||||
# issues with command line editing/browsing/completion!
|
||||
local c_red='\[\e[31m\]'
|
||||
local c_green='\[\e[32m\]'
|
||||
local c_lblue='\[\e[1;34m\]'
|
||||
local c_yellow='\[\033[0;33m\]'
|
||||
local c_clear='\[\e[0m\]'
|
||||
fi
|
||||
local bad_color=$c_red
|
||||
local ok_color=$c_green
|
||||
local flags_color=$c_yellow
|
||||
|
||||
local branch_color=""
|
||||
if [ $detached = no ]; then
|
||||
branch_color="$ok_color"
|
||||
else
|
||||
branch_color="$bad_color"
|
||||
fi
|
||||
c="$branch_color$c"
|
||||
|
||||
z="$c_clear$z"
|
||||
if [ "$w" = "*" ]; then
|
||||
w="$bad_color$w"
|
||||
fi
|
||||
if [ -n "$i" ]; then
|
||||
i="$ok_color$i"
|
||||
fi
|
||||
if [ -n "$s" ]; then
|
||||
s="$flags_color$s"
|
||||
fi
|
||||
if [ -n "$u" ]; then
|
||||
u="$bad_color$u"
|
||||
fi
|
||||
r="$c_clear$r"
|
||||
}
|
||||
|
||||
# Helper function to read the first line of a file into a variable.
|
||||
# __git_eread requires 2 arguments, the file path and the name of the
|
||||
# variable, in that order.
|
||||
__git_eread ()
|
||||
{
|
||||
test -r "$1" && IFS=$'\r\n' read "$2" <"$1"
|
||||
}
|
||||
|
||||
# __git_ps1 accepts 0 or 1 arguments (i.e., format string)
|
||||
# when called from PS1 using command substitution
|
||||
# in this mode it prints text to add to bash PS1 prompt (includes branch name)
|
||||
#
|
||||
# __git_ps1 requires 2 or 3 arguments when called from PROMPT_COMMAND (pc)
|
||||
# in that case it _sets_ PS1. The arguments are parts of a PS1 string.
|
||||
# when two arguments are given, the first is prepended and the second appended
|
||||
# to the state string when assigned to PS1.
|
||||
# The optional third parameter will be used as printf format string to further
|
||||
# customize the output of the git-status string.
|
||||
# In this mode you can request colored hints using GIT_PS1_SHOWCOLORHINTS=true
|
||||
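# A minimal usage sketch (illustrative; adapt the strings to your own prompt):
#   command substitution:  PS1='\u@\h:\w$(__git_ps1 " (%s)")\$ '
#   PROMPT_COMMAND mode:   PROMPT_COMMAND='__git_ps1 "\u@\h:\w" "\\\$ "'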
__git_ps1 ()
|
||||
{
|
||||
# preserve exit status
|
||||
local exit=$?
|
||||
local pcmode=no
|
||||
local detached=no
|
||||
local ps1pc_start='\u@\h:\w '
|
||||
local ps1pc_end='\$ '
|
||||
local printf_format=' (%s)'
|
||||
|
||||
case "$#" in
|
||||
2|3) pcmode=yes
|
||||
ps1pc_start="$1"
|
||||
ps1pc_end="$2"
|
||||
printf_format="${3:-$printf_format}"
|
||||
# set PS1 to a plain prompt so that we can
|
||||
# simply return early if the prompt should not
|
||||
# be decorated
|
||||
PS1="$ps1pc_start$ps1pc_end"
|
||||
;;
|
||||
0|1) printf_format="${1:-$printf_format}"
|
||||
;;
|
||||
*) return $exit
|
||||
;;
|
||||
esac
|
||||
|
||||
# ps1_expanded: This variable is set to 'yes' if the shell
|
||||
# subjects the value of PS1 to parameter expansion:
|
||||
#
|
||||
# * bash does unless the promptvars option is disabled
|
||||
# * zsh does not unless the PROMPT_SUBST option is set
|
||||
# * POSIX shells always do
|
||||
#
|
||||
# If the shell would expand the contents of PS1 when drawing
|
||||
# the prompt, a raw ref name must not be included in PS1.
|
||||
# This protects the user from arbitrary code execution via
|
||||
# specially crafted ref names. For example, a ref named
|
||||
# 'refs/heads/$(IFS=_;cmd=sudo_rm_-rf_/;$cmd)' might cause the
|
||||
# shell to execute 'sudo rm -rf /' when the prompt is drawn.
|
||||
#
|
||||
# Instead, the ref name should be placed in a separate global
|
||||
# variable (in the __git_ps1_* namespace to avoid colliding
|
||||
# with the user's environment) and that variable should be
|
||||
# referenced from PS1. For example:
|
||||
#
|
||||
# __git_ps1_foo=$(do_something_to_get_ref_name)
|
||||
# PS1="...stuff...\${__git_ps1_foo}...stuff..."
|
||||
#
|
||||
# If the shell does not expand the contents of PS1, the raw
|
||||
# ref name must be included in PS1.
|
||||
#
|
||||
# The value of this variable is only relevant when in pcmode.
|
||||
#
|
||||
# Assume that the shell follows the POSIX specification and
|
||||
# expands PS1 unless determined otherwise. (This is more
|
||||
# likely to be correct if the user has a non-bash, non-zsh
|
||||
# shell and safer than the alternative if the assumption is
|
||||
# incorrect.)
|
||||
#
|
||||
local ps1_expanded=yes
|
||||
[ -z "${ZSH_VERSION-}" ] || [[ -o PROMPT_SUBST ]] || ps1_expanded=no
|
||||
[ -z "${BASH_VERSION-}" ] || shopt -q promptvars || ps1_expanded=no
|
||||
|
||||
local repo_info rev_parse_exit_code
|
||||
repo_info="$(git rev-parse --git-dir --is-inside-git-dir \
|
||||
--is-bare-repository --is-inside-work-tree \
|
||||
--short HEAD 2>/dev/null)"
|
||||
rev_parse_exit_code="$?"
|
||||
|
||||
if [ -z "$repo_info" ]; then
|
||||
return $exit
|
||||
fi
|
||||
|
||||
local short_sha=""
|
||||
if [ "$rev_parse_exit_code" = "0" ]; then
|
||||
short_sha="${repo_info##*$'\n'}"
|
||||
repo_info="${repo_info%$'\n'*}"
|
||||
fi
|
||||
local inside_worktree="${repo_info##*$'\n'}"
|
||||
repo_info="${repo_info%$'\n'*}"
|
||||
local bare_repo="${repo_info##*$'\n'}"
|
||||
repo_info="${repo_info%$'\n'*}"
|
||||
local inside_gitdir="${repo_info##*$'\n'}"
|
||||
local g="${repo_info%$'\n'*}"
|
||||
|
||||
if [ "true" = "$inside_worktree" ] &&
|
||||
[ -n "${GIT_PS1_HIDE_IF_PWD_IGNORED-}" ] &&
|
||||
[ "$(git config --bool bash.hideIfPwdIgnored)" != "false" ] &&
|
||||
git check-ignore -q .
|
||||
then
|
||||
return $exit
|
||||
fi
|
||||
|
||||
local r=""
|
||||
local b=""
|
||||
local step=""
|
||||
local total=""
|
||||
if [ -d "$g/rebase-merge" ]; then
|
||||
__git_eread "$g/rebase-merge/head-name" b
|
||||
__git_eread "$g/rebase-merge/msgnum" step
|
||||
__git_eread "$g/rebase-merge/end" total
|
||||
if [ -f "$g/rebase-merge/interactive" ]; then
|
||||
r="|REBASE-i"
|
||||
else
|
||||
r="|REBASE-m"
|
||||
fi
|
||||
else
|
||||
if [ -d "$g/rebase-apply" ]; then
|
||||
__git_eread "$g/rebase-apply/next" step
|
||||
__git_eread "$g/rebase-apply/last" total
|
||||
if [ -f "$g/rebase-apply/rebasing" ]; then
|
||||
__git_eread "$g/rebase-apply/head-name" b
|
||||
r="|REBASE"
|
||||
elif [ -f "$g/rebase-apply/applying" ]; then
|
||||
r="|AM"
|
||||
else
|
||||
r="|AM/REBASE"
|
||||
fi
|
||||
elif [ -f "$g/MERGE_HEAD" ]; then
|
||||
r="|MERGING"
|
||||
elif [ -f "$g/CHERRY_PICK_HEAD" ]; then
|
||||
r="|CHERRY-PICKING"
|
||||
elif [ -f "$g/REVERT_HEAD" ]; then
|
||||
r="|REVERTING"
|
||||
elif [ -f "$g/BISECT_LOG" ]; then
|
||||
r="|BISECTING"
|
||||
fi
|
||||
|
||||
if [ -n "$b" ]; then
|
||||
:
|
||||
elif [ -h "$g/HEAD" ]; then
|
||||
# symlink symbolic ref
|
||||
b="$(git symbolic-ref HEAD 2>/dev/null)"
|
||||
else
|
||||
local head=""
|
||||
if ! __git_eread "$g/HEAD" head; then
|
||||
return $exit
|
||||
fi
|
||||
# is it a symbolic ref?
|
||||
b="${head#ref: }"
|
||||
if [ "$head" = "$b" ]; then
|
||||
detached=yes
|
||||
b="$(
|
||||
case "${GIT_PS1_DESCRIBE_STYLE-}" in
|
||||
(contains)
|
||||
git describe --contains HEAD ;;
|
||||
(branch)
|
||||
git describe --contains --all HEAD ;;
|
||||
(tag)
|
||||
git describe --tags HEAD ;;
|
||||
(describe)
|
||||
git describe HEAD ;;
|
||||
(* | default)
|
||||
git describe --tags --exact-match HEAD ;;
|
||||
esac 2>/dev/null)" ||
|
||||
|
||||
b="$short_sha..."
|
||||
b="($b)"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$step" ] && [ -n "$total" ]; then
|
||||
r="$r $step/$total"
|
||||
fi
|
||||
|
||||
local w=""
|
||||
local i=""
|
||||
local s=""
|
||||
local u=""
|
||||
local c=""
|
||||
local p=""
|
||||
|
||||
if [ "true" = "$inside_gitdir" ]; then
|
||||
if [ "true" = "$bare_repo" ]; then
|
||||
c="BARE:"
|
||||
else
|
||||
b="GIT_DIR!"
|
||||
fi
|
||||
elif [ "true" = "$inside_worktree" ]; then
|
||||
if [ -n "${GIT_PS1_SHOWDIRTYSTATE-}" ] &&
|
||||
[ "$(git config --bool bash.showDirtyState)" != "false" ]
|
||||
then
|
||||
git diff --no-ext-diff --quiet || w="*"
|
||||
git diff --no-ext-diff --cached --quiet || i="+"
|
||||
if [ -z "$short_sha" ] && [ -z "$i" ]; then
|
||||
i="#"
|
||||
fi
|
||||
fi
|
||||
if [ -n "${GIT_PS1_SHOWSTASHSTATE-}" ] &&
|
||||
git rev-parse --verify --quiet refs/stash >/dev/null
|
||||
then
|
||||
s="$"
|
||||
fi
|
||||
|
||||
if [ -n "${GIT_PS1_SHOWUNTRACKEDFILES-}" ] &&
|
||||
[ "$(git config --bool bash.showUntrackedFiles)" != "false" ] &&
|
||||
git ls-files --others --exclude-standard --directory --no-empty-directory --error-unmatch -- ':/*' >/dev/null 2>/dev/null
|
||||
then
|
||||
u="%${ZSH_VERSION+%}"
|
||||
fi
|
||||
|
||||
if [ -n "${GIT_PS1_SHOWUPSTREAM-}" ]; then
|
||||
__git_ps1_show_upstream
|
||||
fi
|
||||
fi
|
||||
|
||||
local z="${GIT_PS1_STATESEPARATOR-" "}"
|
||||
|
||||
# NO color option unless in PROMPT_COMMAND mode
|
||||
if [ $pcmode = yes ] && [ -n "${GIT_PS1_SHOWCOLORHINTS-}" ]; then
|
||||
__git_ps1_colorize_gitstring
|
||||
fi
|
||||
|
||||
b=${b##refs/heads/}
|
||||
if [ $pcmode = yes ] && [ $ps1_expanded = yes ]; then
|
||||
__git_ps1_branch_name=$b
|
||||
b="\${__git_ps1_branch_name}"
|
||||
fi
|
||||
|
||||
local f="$w$i$s$u"
|
||||
local gitstring="$c$b${f:+$z$f}$r$p"
|
||||
|
||||
if [ $pcmode = yes ]; then
|
||||
if [ "${__git_printf_supports_v-}" != yes ]; then
|
||||
gitstring=$(printf -- "$printf_format" "$gitstring")
|
||||
else
|
||||
printf -v gitstring -- "$printf_format" "$gitstring"
|
||||
fi
|
||||
PS1="$ps1pc_start$gitstring$ps1pc_end"
|
||||
else
|
||||
printf -- "$printf_format" "$gitstring"
|
||||
fi
|
||||
|
||||
return $exit
|
||||
}
|
BIN
recommender-systems-class-master/img/git_bash.png
Normal file
BIN
recommender-systems-class-master/img/git_bash.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 28 KiB |
224
recommender-systems-class-master/jupyter_test.ipynb
Normal file
224
recommender-systems-class-master/jupyter_test.ipynb
Normal file
File diff suppressed because one or more lines are too long
14586
recommender-systems-class-master/project_1_data_preparation.html
Normal file
14586
recommender-systems-class-master/project_1_data_preparation.html
Normal file
File diff suppressed because one or more lines are too long
@ -0,0 +1,338 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "alike-morgan",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"from IPython.display import Markdown, display, HTML\n",
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
||||
"# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n",
|
||||
"import os\n",
|
||||
"os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "friendly-herald",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from data_preprocessing.dataset_specification import DatasetSpecification\n",
|
||||
"from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit\n",
|
||||
"from data_preprocessing.people_identifier import PeopleIdentifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "prepared-signal",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Load original data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "solid-crisis",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = os.path.join(\"data\", \"hotel_data\")\n",
|
||||
"\n",
|
||||
"original_data = pd.read_csv(os.path.join(data_path, \"hotel_data_original.csv\"), index_col=0)\n",
|
||||
"\n",
|
||||
"original_data = original_data.replace({\"\\\\N\": \"\"})\n",
|
||||
"original_data = original_data.fillna(\"\")\n",
|
||||
"\n",
|
||||
"numeric_columns = [\"n_people\", \"n_children_1\", \"n_children_2\", \"n_children_3\",\n",
|
||||
" \"discount\", \"accomodation_price\", \"meal_price\", \"service_price\",\n",
|
||||
" \"paid\"]\n",
|
||||
"\n",
|
||||
"for column in numeric_columns:\n",
|
||||
" original_data.loc[:, column] = pd.to_numeric(original_data.loc[:, column], errors=\"coerce\")\n",
|
||||
"\n",
|
||||
"original_data = original_data.astype(\n",
|
||||
" {\n",
|
||||
" \"date_from\": np.datetime64,\n",
|
||||
" \"date_to\": np.datetime64,\n",
|
||||
" \"booking_time\": np.datetime64,\n",
|
||||
" \"booking_date\": np.datetime64,\n",
|
||||
" \"n_people\": np.int64,\n",
|
||||
" \"n_children_1\": np.int64,\n",
|
||||
" \"n_children_2\": np.int64,\n",
|
||||
" \"n_children_3\": np.int64,\n",
|
||||
" \"discount\": np.float64,\n",
|
||||
" \"accomodation_price\": np.float64,\n",
|
||||
" \"meal_price\": np.float64,\n",
|
||||
" \"service_price\": np.float64,\n",
|
||||
" \"paid\": np.float64,\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"display(HTML(original_data.head(15).to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "endangered-lingerie",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Preprocess the data\n",
|
||||
"\n",
|
||||
"- Identify users by client_id, name hash, phone hash, email hash.\n",
|
||||
"- Fix date_to - originally it points to the last full day of stay, not the departure date.\n",
|
||||
"- Add length of stay.\n",
|
||||
"- Add book to arrival.\n",
|
||||
"- Add number of rooms (important for group reservations).\n",
|
||||
"- Add indicator for stays encompasing a weekend.\n",
|
||||
"- Add night price.\n",
|
||||
"- Fix book to arrival to be not smaller than 0.\n",
|
||||
"- Filter out companies as recommendations for such clients should work differently.\n",
|
||||
"- Aggregate group reservations into single interactions.\n",
|
||||
"\n",
|
||||
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
|
||||
"In the file data_preprocessing/data_preprocessing_toolkit write code for the add_length_of_stay and add_night_price methods:\n",
|
||||
" - add_length_of_stay - should add 'length_of_stay' variable to the DataFrame, which counts the number of nights the customer stayed at the hotel,\n",
|
||||
" - add_night_price - should add 'night_price' column to the dataset DataFrame, which shows the average accomodation price per night per room (there can be many rooms in group reservations - 'n_rooms' column).\n",
|
||||
"You have to pass all assertions."
|
||||
]
|
||||
},
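A minimal sketch of what these two methods could look like (shown here as standalone functions; in the toolkit they are methods of DataPreprocessingToolkit taking self and a DataFrame). It assumes the 'date_from', 'date_to', 'accomodation_price' and 'n_rooms' columns described above and rounds prices to two decimals to match the assertions below; it is one possible implementation, not the reference solution:

```python
import numpy as np

def add_length_of_stay(df):
    # After fix_date_to, date_to points to the departure date, so the
    # difference in days equals the number of nights spent at the hotel.
    df.loc[:, 'length_of_stay'] = (df['date_to'] - df['date_from']).dt.days
    return df

def add_night_price(df):
    # Average accommodation price per night per room
    # (group reservations can span several rooms - the 'n_rooms' column).
    df.loc[:, 'night_price'] = np.round(
        df['accomodation_price'] / (df['length_of_stay'] * df['n_rooms']), 2)
    return df
```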
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "swedish-iceland",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"preprocessed_data = original_data.copy()\n",
|
||||
"\n",
|
||||
"dataset_specification = DatasetSpecification()\n",
|
||||
"dp_toolkit = DataPreprocessingToolkit()\n",
|
||||
"\n",
|
||||
"id_column_names = dataset_specification.get_id_columns()\n",
|
||||
"\n",
|
||||
"people_identifier = PeopleIdentifier()\n",
|
||||
"preprocessed_data = people_identifier.add_pid(preprocessed_data, id_column_names, \"user_id\")\n",
|
||||
"\n",
|
||||
"preprocessed_data = dp_toolkit.fix_date_to(preprocessed_data)\n",
|
||||
"preprocessed_data = dp_toolkit.add_length_of_stay(preprocessed_data) # Code this method\n",
|
||||
"preprocessed_data = dp_toolkit.add_book_to_arrival(preprocessed_data)\n",
|
||||
"preprocessed_data = dp_toolkit.add_nrooms(preprocessed_data)\n",
|
||||
"preprocessed_data = dp_toolkit.add_weekend_stay(preprocessed_data)\n",
|
||||
"preprocessed_data = dp_toolkit.clip_book_to_arrival(preprocessed_data)\n",
|
||||
"\n",
|
||||
"preprocessed_data = dp_toolkit.sum_npeople(preprocessed_data)\n",
|
||||
"\n",
|
||||
"preprocessed_data = dp_toolkit.filter_out_company_clients(preprocessed_data)\n",
|
||||
"preprocessed_data = dp_toolkit.filter_out_long_stays(preprocessed_data)\n",
|
||||
"\n",
|
||||
"preprocessed_data = dp_toolkit.aggregate_group_reservations(preprocessed_data)\n",
|
||||
"\n",
|
||||
"preprocessed_data = dp_toolkit.add_night_price(preprocessed_data) # Code this method (remember that there can be many rooms)\n",
|
||||
"\n",
|
||||
"preprocessed_data = preprocessed_data.reset_index(drop=True)\n",
|
||||
"\n",
|
||||
"assert preprocessed_data.iloc[1]['length_of_stay'] == 3\n",
|
||||
"assert preprocessed_data.iloc[2]['length_of_stay'] == 2\n",
|
||||
"assert preprocessed_data.iloc[3]['length_of_stay'] == 7\n",
|
||||
"\n",
|
||||
"assert preprocessed_data.iloc[0]['night_price'] == 330.76\n",
|
||||
"assert preprocessed_data.iloc[1]['night_price'] == 231.13\n",
|
||||
"assert preprocessed_data.iloc[2]['night_price'] == 183.40\n",
|
||||
"\n",
|
||||
"display(HTML(preprocessed_data.head(15).to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "coupled-river",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Bucket important features to reduce the offer space size\n",
|
||||
"\n",
|
||||
"Without this step every pair (user_id, item_id) would have at most a single interaction. The base item space has around $2^{25} \\sim 3.3 \\text{mln}$ elements. Therefore, values for selected features are aggregated into buckets:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"column_values_dict = {\n",
|
||||
" 'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],\n",
|
||||
" 'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],\n",
|
||||
" 'rate_plan': ['Standard', 'Nonref'],\n",
|
||||
" 'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],\n",
|
||||
" 'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],\n",
|
||||
" 'weekend_stay': ['True', 'False']\n",
|
||||
"}\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Explanation:\n",
|
||||
" - term - the term of the arrival date,\n",
|
||||
" - length_of_stay_bucket - aggregated length of stay,\n",
|
||||
" - rate_plan - rate plan which distinguishes if a given booking was refundable or nonrefundable (in reality rate plans are much more complex, they define prices for all rooms for every date, they include features like free breakfast, wine in the room etc.),\n",
|
||||
" - room_segment - for every room its average price is calculated, then every room assigned to an appropriate price range, which is a proxy for room quality,\n",
|
||||
" - n_people_bucket - aggregated number of people in a reservation,\n",
|
||||
" - weekend_stay - indicates if the stay encompassed a weekend.\n",
|
||||
"\n",
|
||||
"The buckets are chosen based on expert knowledge of people working in the hotel industry for many years. Alternatively, clustering techniques could be used, but on a relatively small dataset expert methods are significantly better.\n",
|
||||
"\n",
|
||||
"The above aggregations reduce the number of possible items to $8 * 4 * 2 * 5 * 4 * 2 = 2560$.\n",
|
||||
"\n",
|
||||
"### The recommenders will be trained and evaluated on such aggregated data. To get a proper offer for a user one would have to decode those buckets into specific values, but this is a much easier task and can be achieved based on simple rules.\n",
|
||||
"\n",
|
||||
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
|
||||
"In the file data_preprocessing/data_preprocessing_toolkit write code for the map_night_price_to_room_segment_buckets method. You must calculate average of night prices for every **room_group_id** and map those prices to buckets (you can apply the map_value_to_bucket method which is available in the data_preprocessing_toolkit, the buckets are available under self.room_segment_buckets). The new column should be named 'room_segment'. You have to pass all assertions."
|
||||
]
|
||||
},
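A hedged sketch of the room segment mapping described in the task above. It assumes a 'room_group_id' column and a map_value_to_bucket(value, buckets) helper with that signature (the exact signature in the toolkit may differ); it is one possible approach, not the reference solution:

```python
def map_night_price_to_room_segment_buckets(self, df):
    # Average night price per room_group_id is used as a proxy for room quality.
    avg_prices = (df.groupby('room_group_id')['night_price']
                    .mean()
                    .rename('avg_room_price')
                    .reset_index())
    df = df.merge(avg_prices, on='room_group_id', how='left')
    # Map every average price into one of the predefined price ranges,
    # e.g. '[160-260]' (assumed helper signature: map_value_to_bucket(value, buckets)).
    df.loc[:, 'room_segment'] = df['avg_room_price'].apply(
        lambda x: self.map_value_to_bucket(x, self.room_segment_buckets))
    return df.drop(columns='avg_room_price')
```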
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "interracial-rendering",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"preprocessed_data = dp_toolkit.map_date_to_term_datasets(preprocessed_data)\n",
|
||||
"preprocessed_data = dp_toolkit.map_length_of_stay_to_nights_buckets(preprocessed_data)\n",
|
||||
"preprocessed_data = dp_toolkit.map_night_price_to_room_segment_buckets(preprocessed_data) # Code this method\n",
|
||||
"preprocessed_data = dp_toolkit.map_npeople_to_npeople_buckets(preprocessed_data)\n",
|
||||
"\n",
|
||||
"assert preprocessed_data.iloc[0]['room_segment'] == '[260-360]'\n",
|
||||
"assert preprocessed_data.iloc[1]['room_segment'] == '[160-260]'\n",
|
||||
"assert preprocessed_data.iloc[4]['room_segment'] == '[0-160]'\n",
|
||||
"\n",
|
||||
"preprocessed_data = dp_toolkit.map_item_to_item_id(preprocessed_data)\n",
|
||||
"\n",
|
||||
"preprocessed_data.to_csv(os.path.join(data_path, \"hotel_data_preprocessed.csv\"))\n",
|
||||
"\n",
|
||||
"display(HTML(preprocessed_data.head(15).to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "offshore-biography",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Base statistics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "acknowledged-crime",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Number of users: {}\".format(len(preprocessed_data['user_id'].unique())))\n",
|
||||
"print()\n",
|
||||
"print(\"Number of items: {}\".format(len(preprocessed_data['item_id'].unique())))\n",
|
||||
"print()\n",
|
||||
"print(\"Number of interactions: {}\".format(len(preprocessed_data)))\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"n_user = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('item_id').count().sort_values(by='user_id', ascending=False)\n",
|
||||
"n_user = n_user.rename(columns={'user_id': 'n_users'})\n",
|
||||
"display(HTML(n_user.head(10).to_html()))\n",
|
||||
"\n",
|
||||
"n_item = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('user_id').count().sort_values(by='item_id', ascending=False)\n",
|
||||
"n_item = n_item.rename(columns={'item_id': 'n_items'})\n",
|
||||
"display(HTML(n_item.head(10).to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "blessed-knitting",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Prepare the dataset for recommenders\n",
|
||||
"\n",
|
||||
"One could consider many features describing each interaction but from the business perspective term, length_of_stay_bucket, room_segment, weekend_stay are the most important."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "victorian-bottom",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']\n",
|
||||
"\n",
|
||||
"interactions_df = preprocessed_data.loc[\n",
|
||||
" :, ['user_id', 'item_id'] + item_features]\n",
|
||||
"\n",
|
||||
"column_values_dict = {\n",
|
||||
" 'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],\n",
|
||||
" 'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],\n",
|
||||
" 'rate_plan': ['Standard', 'Nonref'],\n",
|
||||
" 'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],\n",
|
||||
" 'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],\n",
|
||||
" 'weekend_stay': ['True', 'False']\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"interactions_df.loc[:, 'term'] = pd.Categorical(\n",
|
||||
" interactions_df['term'], categories=column_values_dict['term'])\n",
|
||||
"interactions_df.loc[:, 'length_of_stay_bucket'] = pd.Categorical(\n",
|
||||
" interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])\n",
|
||||
"interactions_df.loc[:, 'rate_plan'] = pd.Categorical(\n",
|
||||
" interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])\n",
|
||||
"interactions_df.loc[:, 'room_segment'] = pd.Categorical(\n",
|
||||
" interactions_df['room_segment'], categories=column_values_dict['room_segment'])\n",
|
||||
"interactions_df.loc[:, 'n_people_bucket'] = pd.Categorical(\n",
|
||||
" interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])\n",
|
||||
"interactions_df.loc[:, 'weekend_stay'] = pd.Categorical(\n",
|
||||
" interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])\n",
|
||||
"\n",
|
||||
"interactions_df.to_csv(os.path.join(data_path, \"hotel_data_interactions_df.csv\"))\n",
|
||||
"\n",
|
||||
"display(HTML(interactions_df.head(15).to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "incredible-feeling",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,743 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "alike-morgan",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"from IPython.display import Markdown, display, HTML\n",
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
||||
"# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n",
|
||||
"import os\n",
|
||||
"os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "blessed-knitting",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Load the dataset for recommenders"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "victorian-bottom",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = os.path.join(\"data\", \"hotel_data\")\n",
|
||||
"\n",
|
||||
"interactions_df = pd.read_csv(os.path.join(data_path, \"hotel_data_interactions_df.csv\"), index_col=0)\n",
|
||||
"\n",
|
||||
"base_item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']\n",
|
||||
"\n",
|
||||
"column_values_dict = {\n",
|
||||
" 'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],\n",
|
||||
" 'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],\n",
|
||||
" 'rate_plan': ['Standard', 'Nonref'],\n",
|
||||
" 'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],\n",
|
||||
" 'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],\n",
|
||||
" 'weekend_stay': ['True', 'False']\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"interactions_df.loc[:, 'term'] = pd.Categorical(\n",
|
||||
" interactions_df['term'], categories=column_values_dict['term'])\n",
|
||||
"interactions_df.loc[:, 'length_of_stay_bucket'] = pd.Categorical(\n",
|
||||
" interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])\n",
|
||||
"interactions_df.loc[:, 'rate_plan'] = pd.Categorical(\n",
|
||||
" interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])\n",
|
||||
"interactions_df.loc[:, 'room_segment'] = pd.Categorical(\n",
|
||||
" interactions_df['room_segment'], categories=column_values_dict['room_segment'])\n",
|
||||
"interactions_df.loc[:, 'n_people_bucket'] = pd.Categorical(\n",
|
||||
" interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])\n",
|
||||
"interactions_df.loc[:, 'weekend_stay'] = interactions_df['weekend_stay'].astype('str')\n",
|
||||
"interactions_df.loc[:, 'weekend_stay'] = pd.Categorical(\n",
|
||||
" interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])\n",
|
||||
"\n",
|
||||
"display(HTML(interactions_df.head(15).to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "realistic-third",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Define user features based on reservations\n",
|
||||
"\n",
|
||||
"The content-based recommenders will be forecasting the probability of interaction between user and item based on user features vector and item features vector:\n",
|
||||
"\n",
|
||||
"<center>\n",
|
||||
"$$\n",
|
||||
" r_{u, i} = f(user\\_features, item\\_features)\n",
|
||||
"$$\n",
|
||||
"</center>\n",
|
||||
"\n",
|
||||
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
|
||||
"Design numerical user features based on user reservations. Code the following method which for a given interactions DataFrame (it will be used in the fit method of the recommender) returns a DataFrame with user_id and user features as well as a list with names of user features (this will be important to select the right columns for an ML algorithm). Remember to name the columns differently than item features which you will create in the next task. Validate your features on users with several interactions (sample user ids are already given below).\n",
|
||||
"\n",
|
||||
"Ideas for user features:\n",
|
||||
"- Find the vector of most popular feature values from all user reservations and encode every feature with one-hot encoding.\n",
|
||||
"- For every reservation feature calculate the probability distribution of its values among all user's reservations.\n",
|
||||
"- For numerical buckets (length_of_stay, room_segment, n_people) you can calculate the average value for every user from their reservations (you will have to map the buckets back to numerical values before averaging them).\n",
|
||||
"\n",
|
||||
"Remember that you will have to select the best features (with the highest explanatory power). Using all above features at once would make the number of variables too large for this dataset and would also introduce too much correlations between features.\n",
|
||||
"\n",
|
||||
"You can also prepare several version of the prepare_users_df method and test which works best in your recommender."
|
||||
]
|
||||
},
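For illustration, a minimal sketch of the second idea above (the per-user distribution of feature values), with one-hot columns prefixed with 'user_' to avoid clashes with item feature names. It is one possible variant under those assumptions, not the reference solution:

```python
def prepare_users_df(interactions_df):
    # One-hot encode the interaction features and average them per user:
    # for every user this gives the empirical distribution of feature values
    # over that user's reservations.
    feature_columns = ['term', 'length_of_stay_bucket', 'rate_plan',
                       'room_segment', 'n_people_bucket', 'weekend_stay']
    one_hot = pd.get_dummies(
        interactions_df.loc[:, ['user_id'] + feature_columns],
        columns=feature_columns,
        prefix=['user_' + f for f in feature_columns])
    users_df = one_hot.groupby('user_id').mean().reset_index()
    user_features = [col for col in users_df.columns if col != 'user_id']
    return users_df, user_features
```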
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "variable-jaguar",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def prepare_users_df(interactions_df):\n",
|
||||
"\n",
|
||||
" # Write your code here\n",
|
||||
" \n",
|
||||
" return users_df, user_features\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"users_df, user_features = prepare_users_df(interactions_df)\n",
|
||||
"\n",
|
||||
"print(user_features)\n",
|
||||
"\n",
|
||||
"display(HTML(users_df.loc[users_df['user_id'].isin([706, 1736, 7779, 96, 1, 50, 115])].head(15).to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "built-complaint",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Prepare numerical item features\n",
|
||||
"\n",
|
||||
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
|
||||
"Code the prepare_items_df method which will be used in the recommender fit and recommend methods to map items to numerical features. This method should take the interactions_df DataFrame as input and return a DataFrame containing one record per item_id with item_id column and numerical item feature columns.\n",
|
||||
"\n",
|
||||
"You can try turning all item features into on-hot representations. You can use the get_dummies method from pandas. It will return the same columns on any dataset of interactions because of the categorical variables with all possible values have been defined in the second cell in this notebook.\n",
|
||||
"\n",
|
||||
"You are welcome to design your own numerical item features."
|
||||
]
|
||||
},
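A possible sketch of prepare_items_df based on the one-hot idea above (one variant, not the reference solution; it relies on the categorical dtypes defined in the second cell so that get_dummies always produces the same columns):

```python
def prepare_items_df(interactions_df):
    # One record per item_id with one-hot encoded item features.
    feature_columns = ['term', 'length_of_stay_bucket', 'rate_plan',
                       'room_segment', 'n_people_bucket', 'weekend_stay']
    items_df = (interactions_df
                .loc[:, ['item_id'] + feature_columns]
                .drop_duplicates(subset='item_id'))
    items_df = pd.get_dummies(items_df, columns=feature_columns)
    item_features = [col for col in items_df.columns if col != 'item_id']
    return items_df, item_features
```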
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "formal-munich",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def prepare_items_df(interactions_df):\n",
|
||||
" \n",
|
||||
" # Write your code here\n",
|
||||
" \n",
|
||||
" return items_df, item_features\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"items_df, item_features = prepare_items_df(interactions_df)\n",
|
||||
"\n",
|
||||
"print(item_features)\n",
|
||||
"\n",
|
||||
"display(HTML(items_df.loc[items_df['item_id'].isin([0, 1, 2, 3, 4, 5, 6])].head(15).to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "figured-imaging",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Content-based recommender\n",
|
||||
"\n",
|
||||
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
|
||||
"Code the content-based recommender. User features should be calculated within the fit method based on available training data and should be saved in the object for later use in the recommend method. Overwrite the users_df variable. Item features should be calculated both in the fit method (from interactions_df) and in the recommend method (from items_df - the items to be evaluated).\n",
|
||||
"\n",
|
||||
"In the fit method you have to randomly generate non-existing interactions and add them to the training data for the regressor. You should add the target variable to interactions - equal to 1 for real interactions and equal to 0 for those newly added interactions. Generate several negative interactions per every positive interactions (n_neg_per_pos). Treat the proportion as a tunable parameter of the model.\n",
|
||||
"\n",
|
||||
"Remember to keep control over randomness - in the init method add seed as a parameter and use initialize the random seed generator with that seed:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"self.seed = seed\n",
|
||||
"self.rng = np.random.RandomState(seed=seed)\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Below the base content-based recommender class there are several classes which inherit from the base class and use different ML models:\n",
|
||||
" - LinearRegressionCBUIRecommender - based on linear regression,\n",
|
||||
" - SVRCBUIRecommender - based on Support Vector Regressor (if you want to test it, sample the data in the fit method, as the training can take many hours on the entire dataset of interactions),\n",
|
||||
" - RandomForestCBUIRecommender - based on Random Forest,\n",
|
||||
" - XGBoostCBUIRecommender - based on XGBoost.\n",
|
||||
" \n",
|
||||
"There is no need to change anything in those inheriting classes, although you can experiment with other tunable parameters of the underlying models.\n",
|
||||
"\n",
|
||||
"You are encouraged to experiment with:\n",
|
||||
" - Other numerical user and item features (but always train and evaluate the model on buckets defined in the first notebook).\n",
|
||||
" - Other ML models, e.g. Huber regression, Lasso regression, Ridge regression, LARS regression, Linear SVR, Decision Tree, Naive Bayes, Neural Networks or any model of your choice.\n",
|
||||
" - A different approach where you treat each item as a class, you train directly on categorical features of items and users (you would have to design appropriate categorical features for users) and you fit classifiers (e.g. Decision Tree classifier, Naive Bayes classifier etc.) instead of regressors."
|
||||
]
|
||||
},
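As an illustration of the negative sampling step described above, here is a simple rejection-sampling sketch of the part marked '# Write your code here' in the fit method below (it assumes self.rng and self.n_neg_per_pos as in the class; a vectorized batch version would be faster on large data):

```python
# Draw (user_id, item_id, 0) tuples for pairs that never interacted.
existing_pairs = set(zip(interactions_df['user_id'], interactions_df['item_id']))
user_ids = interactions_df['user_id'].unique()
item_ids = interactions_df['item_id'].unique()

negative_interactions = []
n_needed = self.n_neg_per_pos * len(interactions_df)
while len(negative_interactions) < n_needed:
    user_id = self.rng.choice(user_ids)
    item_id = self.rng.choice(item_ids)
    if (user_id, item_id) not in existing_pairs:
        negative_interactions.append((user_id, item_id, 0))
```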
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "unlike-recipient",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.linear_model import LinearRegression\n",
|
||||
"from sklearn.svm import SVR\n",
|
||||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||||
"from sklearn.ensemble import GradientBoostingRegressor\n",
|
||||
"\n",
|
||||
"from recommenders.recommender import Recommender\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class ContentBasedUserItemRecommender(Recommender):\n",
|
||||
" \"\"\"\n",
|
||||
" Linear recommender class based on user and item features.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" def __init__(self, seed=6789, n_neg_per_pos=5):\n",
|
||||
" \"\"\"\n",
|
||||
" Initialize base recommender params and variables.\n",
|
||||
" \"\"\"\n",
|
||||
" self.model = LinearRegression()\n",
|
||||
" self.n_neg_per_pos = n_neg_per_pos\n",
|
||||
" \n",
|
||||
" self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n",
|
||||
" self.users_df = None\n",
|
||||
" self.user_features = None\n",
|
||||
" \n",
|
||||
" self.seed = seed\n",
|
||||
" self.rng = np.random.RandomState(seed=seed)\n",
|
||||
" \n",
|
||||
" def fit(self, interactions_df, users_df, items_df):\n",
|
||||
" \"\"\"\n",
|
||||
" Training of the recommender.\n",
|
||||
" \n",
|
||||
" :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items \n",
|
||||
" defined by user_id, item_id and features of the interaction.\n",
|
||||
" :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.\n",
|
||||
" :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" interactions_df = interactions_df.copy()\n",
|
||||
" \n",
|
||||
" # Prepare users_df and items_df\n",
|
||||
" \n",
|
||||
" users_df, user_features = prepare_users_df(interactions_df)\n",
|
||||
" \n",
|
||||
" self.users_df = users_df\n",
|
||||
" self.user_features = user_features\n",
|
||||
" \n",
|
||||
" items_df, item_features = prepare_items_df(interactions_df)\n",
|
||||
" items_df = items_df.loc[:, ['item_id'] + item_features]\n",
|
||||
" \n",
|
||||
" # Generate negative interactions\n",
|
||||
" \n",
|
||||
" interactions_df = interactions_df.loc[:, ['user_id', 'item_id']]\n",
|
||||
" \n",
|
||||
" interactions_df.loc[:, 'interacted'] = 1\n",
|
||||
" \n",
|
||||
" negative_interactions = []\n",
|
||||
" \n",
|
||||
" # Write your code here\n",
|
||||
" # Generate tuples (user_id, item_id, 0) for pairs (user_id, item_id) which do not\n",
|
||||
" # appear in the interactions_df and add those tuples to the list negative_interactions.\n",
|
||||
" # Generate self.n_neg_per_pos * len(interactions_df) negative interactions \n",
|
||||
" # (self.n_neg_per_pos per one positive).\n",
|
||||
" # Make sure the code is efficient and runs fast, otherwise you will not be able to properly tune your model.\n",
|
||||
" \n",
|
||||
" interactions_df = pd.concat(\n",
|
||||
" [interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])\n",
|
||||
" \n",
|
||||
" # Get the input data for the model\n",
|
||||
" \n",
|
||||
" interactions_df = pd.merge(interactions_df, users_df, on=['user_id'])\n",
|
||||
" interactions_df = pd.merge(interactions_df, items_df, on=['item_id'])\n",
|
||||
" \n",
|
||||
" x = interactions_df.loc[:, user_features + item_features].values\n",
|
||||
" y = interactions_df['interacted'].values\n",
|
||||
" \n",
|
||||
" self.model.fit(x, y)\n",
|
||||
" \n",
|
||||
" def recommend(self, users_df, items_df, n_recommendations=1):\n",
|
||||
" \"\"\"\n",
|
||||
" Serving of recommendations. Scores items in items_df for each user in users_df and returns \n",
|
||||
" top n_recommendations for each user.\n",
|
||||
" \n",
|
||||
" :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.\n",
|
||||
" :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.\n",
|
||||
" :param int n_recommendations: Number of recommendations to be returned for each user.\n",
|
||||
" :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations \n",
|
||||
" for each user.\n",
|
||||
" :rtype: pd.DataFrame\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" # Clean previous recommendations (iloc could be used alternatively)\n",
|
||||
" self.recommender_df = self.recommender_df[:0]\n",
|
||||
" \n",
|
||||
" # Write your code here\n",
|
||||
" # Prepare users_df and items_df\n",
|
||||
" # For users_df you just need to merge user features from self.users_df to users_df \n",
|
||||
" # (the users for which you generate recommendations)\n",
|
||||
" # For items you have to apply the prepare_items_df method to items_df.\n",
|
||||
" \n",
|
||||
" # Score the items\n",
|
||||
" \n",
|
||||
" recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n",
|
||||
" \n",
|
||||
" for ix, user in users_df.iterrows():\n",
|
||||
" \n",
|
||||
" # Write your code here\n",
|
||||
" # Create a Carthesian product of users from users_df and items from items_df\n",
|
||||
"\n",
|
||||
" # Write your code here\n",
|
||||
" # Use self.model.predict method to calculate scores for all records in the just created DataFrame\n",
|
||||
" # of users and items\n",
|
||||
" \n",
|
||||
" # Write your code here\n",
|
||||
" # Obtain item ids with the highest score and save those ids under the chosen_ids variable\n",
|
||||
" # Do not exclude already booked items.\n",
|
||||
" chosen_ids = None\n",
|
||||
" \n",
|
||||
" recommendations = []\n",
|
||||
" for item_id in chosen_ids:\n",
|
||||
" recommendations.append(\n",
|
||||
" {\n",
|
||||
" 'user_id': user['user_id'],\n",
|
||||
" 'item_id': item_id,\n",
|
||||
" 'score': scores[item_id]\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" user_recommendations = pd.DataFrame(recommendations)\n",
|
||||
"\n",
|
||||
" self.recommender_df = pd.concat([self.recommender_df, user_recommendations])\n",
|
||||
"\n",
|
||||
" return self.recommender_df\n",
|
||||
" \n",
|
||||
" \n",
|
||||
"class LinearRegressionCBUIRecommender(ContentBasedUserItemRecommender):\n",
|
||||
" \"\"\"\n",
|
||||
" Linear regression recommender class based on user and item features.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):\n",
|
||||
" \"\"\"\n",
|
||||
" Initialize base recommender params and variables.\n",
|
||||
" \"\"\"\n",
|
||||
" super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)\n",
|
||||
" self.model = LinearRegression()\n",
|
||||
" \n",
|
||||
" \n",
|
||||
"class SVRCBUIRecommender(ContentBasedUserItemRecommender):\n",
|
||||
" \"\"\"\n",
|
||||
" SVR recommender class based on user and item features.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):\n",
|
||||
" \"\"\"\n",
|
||||
" Initialize base recommender params and variables.\n",
|
||||
" \"\"\"\n",
|
||||
" super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)\n",
|
||||
" if 'kernel' in model_params:\n",
|
||||
" self.kernel = model_params['kernel']\n",
|
||||
" else:\n",
|
||||
" self.kernel = 'rbf'\n",
|
||||
" if 'C' in model_params:\n",
|
||||
" self.C = model_params['C']\n",
|
||||
" else:\n",
|
||||
" self.C = 1.0\n",
|
||||
" if 'epsilon' in model_params:\n",
|
||||
" self.epsilon = model_params['epsilon']\n",
|
||||
" else:\n",
|
||||
" self.epsilon = 0.1\n",
|
||||
" self.model = SVR(kernel=self.kernel, C=self.C, epsilon=self.epsilon)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
"class RandomForestCBUIRecommender(ContentBasedUserItemRecommender):\n",
|
||||
" \"\"\"\n",
|
||||
" Random forest recommender class based on user and item features.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):\n",
|
||||
" \"\"\"\n",
|
||||
" Initialize base recommender params and variables.\n",
|
||||
" \"\"\"\n",
|
||||
" super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)\n",
|
||||
" if 'n_estimators' in model_params:\n",
|
||||
" self.n_estimators = int(model_params['n_estimators'])\n",
|
||||
" else:\n",
|
||||
" self.n_estimators = 100\n",
|
||||
" if 'max_depth' in model_params:\n",
|
||||
" self.max_depth = int(model_params['max_depth'])\n",
|
||||
" else:\n",
|
||||
" self.max_depth = 30\n",
|
||||
" if 'min_samples_split' in model_params:\n",
|
||||
" self.min_samples_split = int(model_params['min_samples_split'])\n",
|
||||
" else:\n",
|
||||
" self.min_samples_split = 30\n",
|
||||
" self.model = RandomForestRegressor(\n",
|
||||
" n_estimators=self.n_estimators, max_depth=self.max_depth, min_samples_split=self.min_samples_split)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
"class XGBoostCBUIRecommender(ContentBasedUserItemRecommender):\n",
|
||||
" \"\"\"\n",
|
||||
" XGBoost recommender class based on user and item features.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):\n",
|
||||
" \"\"\"\n",
|
||||
" Initialize base recommender params and variables.\n",
|
||||
" \"\"\"\n",
|
||||
" super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)\n",
|
||||
" if 'n_estimators' in model_params:\n",
|
||||
" self.n_estimators = int(model_params['n_estimators'])\n",
|
||||
" else:\n",
|
||||
" self.n_estimators = 100\n",
|
||||
" if 'max_depth' in model_params:\n",
|
||||
" self.max_depth = int(model_params['max_depth'])\n",
|
||||
" else:\n",
|
||||
" self.max_depth = 30\n",
|
||||
" if 'min_samples_split' in model_params:\n",
|
||||
" self.min_samples_split = int(model_params['min_samples_split'])\n",
|
||||
" else:\n",
|
||||
" self.min_samples_split = 30\n",
|
||||
" if 'learning_rate' in model_params:\n",
|
||||
" self.learning_rate = model_params['learning_rate']\n",
|
||||
" else:\n",
|
||||
" self.learning_rate = 30\n",
|
||||
" self.model = GradientBoostingRegressor(\n",
|
||||
" n_estimators=self.n_estimators, max_depth=self.max_depth, min_samples_split=self.min_samples_split,\n",
|
||||
" learning_rate=self.learning_rate) "
|
||||
]
|
||||
},
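For the recommend method, the scoring steps marked with '# Write your code here' comments above could be filled in roughly as sketched below (per user row, assuming the user features from self.users_df have already been merged into users_df and that prepare_items_df is the function defined earlier; one possible approach, not the reference solution):

```python
# Score every candidate item for a single user row and keep the top ones.
candidate_items_df, item_features = prepare_items_df(items_df)

user_vector = user[self.user_features].values.astype(float).reshape(1, -1)
# Cartesian product: repeat the user feature vector for every candidate item.
x = np.concatenate(
    [np.repeat(user_vector, len(candidate_items_df), axis=0),
     candidate_items_df[item_features].values.astype(float)],
    axis=1)

# Keyed by item_id so that scores[item_id] works in the loop that follows.
scores = dict(zip(candidate_items_df['item_id'].values, self.model.predict(x)))
chosen_ids = sorted(scores, key=scores.get, reverse=True)[:n_recommendations]
```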
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "copyrighted-relative",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Quick test of the recommender"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "greatest-canon",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"items_df = interactions_df.loc[:, ['item_id'] + base_item_features].drop_duplicates()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "initial-capital",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fit method\n",
|
||||
"cb_user_item_recommender = RandomForestCBUIRecommender()\n",
|
||||
"cb_user_item_recommender.fit(interactions_df, None, None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "digital-consolidation",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Recommender method\n",
|
||||
"\n",
|
||||
"recommendations = cb_user_item_recommender.recommend(pd.DataFrame([[1], [2], [3], [4], [5]], columns=['user_id']), interactions_df, 10)\n",
|
||||
"\n",
|
||||
"recommendations = pd.merge(recommendations, items_df, on='item_id', how='left')\n",
|
||||
"display(HTML(recommendations.to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "advanced-eleven",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tuning method"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "strange-alaska",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from evaluation_and_testing.testing import evaluate_train_test_split_implicit\n",
|
||||
"\n",
|
||||
"seed = 6789"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "stable-theta",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from hyperopt import hp, fmin, tpe, Trials\n",
|
||||
"import traceback\n",
|
||||
"\n",
|
||||
"def tune_recommender(recommender_class, interactions_df, items_df, \n",
|
||||
" param_space, max_evals=1, show_progressbar=True, seed=6789):\n",
|
||||
" # Split into train_validation and test sets\n",
|
||||
"\n",
|
||||
" shuffle = np.arange(len(interactions_df))\n",
|
||||
" rng = np.random.RandomState(seed=seed)\n",
|
||||
" rng.shuffle(shuffle)\n",
|
||||
" shuffle = list(shuffle)\n",
|
||||
"\n",
|
||||
" train_test_split = 0.8\n",
|
||||
" split_index = int(len(interactions_df) * train_test_split)\n",
|
||||
"\n",
|
||||
" train_validation = interactions_df.iloc[shuffle[:split_index]]\n",
|
||||
" test = interactions_df.iloc[shuffle[split_index:]]\n",
|
||||
"\n",
|
||||
" # Tune\n",
|
||||
"\n",
|
||||
" def loss(tuned_params):\n",
|
||||
" recommender = recommender_class(seed=seed, **tuned_params)\n",
|
||||
" hr1, hr3, hr5, hr10, ndcg1, ndcg3, ndcg5, ndcg10 = evaluate_train_test_split_implicit(\n",
|
||||
" recommender, train_validation, items_df, seed=seed)\n",
|
||||
" return -hr10\n",
|
||||
"\n",
|
||||
" n_tries = 1\n",
|
||||
" succeded = False\n",
|
||||
" try_id = 0\n",
|
||||
" while not succeded and try_id < n_tries:\n",
|
||||
" try:\n",
|
||||
" trials = Trials()\n",
|
||||
" best_param_set = fmin(loss, space=param_space, algo=tpe.suggest, \n",
|
||||
" max_evals=max_evals, show_progressbar=show_progressbar, trials=trials, verbose=True)\n",
|
||||
" succeded = True\n",
|
||||
" except:\n",
|
||||
" traceback.print_exc()\n",
|
||||
" try_id += 1\n",
|
||||
" \n",
|
||||
" if not succeded:\n",
|
||||
" return None\n",
|
||||
" \n",
|
||||
" # Validate\n",
|
||||
" \n",
|
||||
" recommender = recommender_class(seed=seed, **best_param_set)\n",
|
||||
"\n",
|
||||
" results = [[recommender_class.__name__] + list(evaluate_train_test_split_implicit(\n",
|
||||
" recommender, {'train': train_validation, 'test': test}, items_df, seed=seed))]\n",
|
||||
"\n",
|
||||
" results = pd.DataFrame(results, \n",
|
||||
" columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
|
||||
"\n",
|
||||
" display(HTML(results.to_html()))\n",
|
||||
" \n",
|
||||
" return best_param_set"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "spiritual-orbit",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tuning of the recommender\n",
|
||||
"\n",
|
||||
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
|
||||
"Tune your models using the code below. You only need to put the class name of your recommender and choose an appropriate parameter space."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dependent-capital",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"param_space = {\n",
|
||||
" 'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"best_param_set = tune_recommender(LinearRegressionCBUIRecommender, interactions_df, items_df,\n",
|
||||
" param_space, max_evals=10, show_progressbar=True, seed=seed)\n",
|
||||
"\n",
|
||||
"print(\"Best parameters:\")\n",
|
||||
"print(best_param_set)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "palestinian-clearance",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"param_space = {\n",
|
||||
" 'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),\n",
|
||||
" 'C': hp.loguniform('C', np.log(0.01), np.log(100.0))\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"best_param_set = tune_recommender(SVRCBUIRecommender, interactions_df, items_df,\n",
|
||||
" param_space, max_evals=100, show_progressbar=True, seed=seed)\n",
|
||||
"\n",
|
||||
"print(\"Best parameters:\")\n",
|
||||
"print(best_param_set)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "seasonal-header",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"param_space = {\n",
|
||||
" 'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),\n",
|
||||
" 'n_estimators': hp.quniform('n_estimators', 30, 300, 1),\n",
|
||||
" 'max_depth': hp.quniform('max_depth', 2, 10, 1),\n",
|
||||
" 'min_samples_split': hp.quniform('min_samples_split', 2, 30, 1)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"best_param_set = tune_recommender(RandomForestCBUIRecommender, interactions_df, items_df,\n",
|
||||
" param_space, max_evals=100, show_progressbar=True, seed=seed)\n",
|
||||
"\n",
|
||||
"print(\"Best parameters:\")\n",
|
||||
"print(best_param_set)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "moved-gothic",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This tuning may take around 12 hours\n",
|
||||
"\n",
|
||||
"param_space = {\n",
|
||||
" 'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),\n",
|
||||
" 'n_estimators': hp.quniform('n_estimators', 10, 300, 1),\n",
|
||||
" 'max_depth': hp.quniform('max_depth', 2, 10, 1),\n",
|
||||
" 'min_samples_split': hp.quniform('min_samples_split', 2, 30, 1),\n",
|
||||
" 'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.1))\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"best_param_set = tune_recommender(XGBoostCBUIRecommender, interactions_df, items_df,\n",
|
||||
" param_space, max_evals=300, show_progressbar=True, seed=seed)\n",
|
||||
"\n",
|
||||
"print(\"Best parameters:\")\n",
|
||||
"print(best_param_set)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "accredited-strap",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Final evaluation\n",
|
||||
"\n",
|
||||
"<span style=\"color:red\"><font size=\"4\">**Task:**</font></span><br> \n",
|
||||
"Run the final evaluation of your recommender and present its results against the Amazon recommender's results. You can present results for several of your recommenders. You just need to give the class name of your recommender and its tuned parameters below. If you present results for several recommenders, you should add a separate cell for each recommender and change the names of the DataFrames containing results."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "given-homework",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cb_user_item_recommender = LinearRegressionCBUIRecommender(\n",
|
||||
" **{'n_neg_per_pos': 7}) # Initialize your recommender here with the best params from tuning\n",
|
||||
"\n",
|
||||
"# Give the name of your recommender in the line below\n",
|
||||
"linear_cbui_tts_results = [['LinearRegressionCBUIRecommender'] + list(evaluate_train_test_split_implicit(\n",
|
||||
" cb_user_item_recommender, interactions_df, items_df))]\n",
|
||||
"\n",
|
||||
"linear_cbui_tts_results = pd.DataFrame(\n",
|
||||
" linear_cbui_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
|
||||
"\n",
|
||||
"display(HTML(linear_cbui_tts_results.to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "suited-nomination",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from recommenders.amazon_recommender import AmazonRecommender\n",
|
||||
"\n",
|
||||
"amazon_recommender = AmazonRecommender()\n",
|
||||
"\n",
|
||||
"amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(\n",
|
||||
" amazon_recommender, interactions_df, items_df))]\n",
|
||||
"\n",
|
||||
"amazon_tts_results = pd.DataFrame(\n",
|
||||
" amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
|
||||
"\n",
|
||||
"display(HTML(amazon_tts_results.to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "moderate-printing",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tts_results = pd.concat([linear_cbui_tts_results, amazon_tts_results]).reset_index(drop=True)\n",
|
||||
"display(HTML(tts_results.to_html()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "white-demographic",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
131
recommender-systems-class-master/readme.md
Normal file
131
recommender-systems-class-master/readme.md
Normal file
@ -0,0 +1,131 @@
|
||||
# Recommender Systems class
|
||||
|
||||
Department of Mathematics and Computer Science, Adam Mickiewicz University, 2021
|
||||
|
||||
Author: Piotr Zioło
|
||||
|
||||
## Preparing your computer
|
||||
|
||||
1. Install [Anaconda](https://www.anaconda.com/products/individual) with Python 3.8.
|
||||
|
||||
|
||||
2. Install [Git](https://git-scm.com/downloads).
|
||||
|
||||
|
||||
3. Install [PyCharm](https://www.jetbrains.com/pycharm/) (community version).
|
||||
|
||||
|
||||
4. Fork this repository to your GitHub account.
|
||||
|
||||
|
||||
5. Go to the folder on your machine where you want to keep a local copy of the repository. Right-click in the folder and choose "Git Bash Here" from the context menu. Run the following command to clone the forked repository from your GitHub account to your local machine:
|
||||
|
||||
<pre>git clone <i>your_repository_address_which_you'll_find_in_your_github</i></pre>
|
||||
|
||||
Alternatively, open Git Bash (installed with Git), change the directory to the folder where you want to keep a local copy of the repository, and execute the above command.
|
||||
|
||||
|
||||
6. Prepare your conda environment (instructions given for Windows, but it should be similar on other systems):
|
||||
|
||||
1. Open Anaconda Prompt as administrator.
|
||||
|
||||
2. Make sure you're in the repository's main folder. Run the following command:
|
||||
|
||||
conda env create --name rs-class-env -f environment.yml
|
||||
|
||||
You can replace *rs-class-env* with your own environment name.
|
||||
|
||||
You may need to install a C++ compiler to install certain packages.
|
||||
|
||||
|
||||
7. In Git Bash open the repository folder and activate the newly created environment with the following command:
|
||||
|
||||
conda activate rs-class-env
|
||||
|
||||
|
||||
8. In Git Bash type:
|
||||
|
||||
jupyter notebook
|
||||
|
||||
A new tab with Jupyter Notebook should open in your browser.
|
||||
|
||||
|
||||
9. In Jupyter Notebook open jupyter_test.ipynb.
|
||||
|
||||
|
||||
10. Click on the first cell and hit shift+enter. The first cell should execute properly. Do the same for all other cells (you can keep hitting shift+enter until all cells have been executed).
|
||||
|
||||
The most common error you may encounter is "ImportError: No module named...". In such a case:
|
||||
|
||||
- copy the package name,
|
||||
|
||||
- close the Jupyter tabs and press ctrl+c in the Git Bash window where you started Jupyter Notebook,
|
||||
|
||||
- run the following command:
|
||||
pip install package_name
|
||||
|
||||
- the package should install successfully,
|
||||
|
||||
- after that, open Jupyter Notebook again and check that it works now.
|
||||
|
||||
|
||||
11. After you finish a piece of code in your repository, run the following commands in Git Bash (in the repository folder):
|
||||
|
||||
git add -A
|
||||
|
||||
git commit -m "Commit message"
|
||||
|
||||
git push
|
||||
|
||||
The first command stages all changes and new files for the next commit. The second command commits your changes (a kind of checkpoint/save to which you can later return if need be). The third one sends your commit to GitHub (or any remote repository, in general).
|
||||
|
||||
**Convention:** For your commit messages use imperatives, e.g. "Do this, do that". Try to give informative one-liners.
|
||||
|
||||
|
||||
12. (Optional) Set up your Git Bash to make it look as shown below:
|
||||
|
||||
![Git Bash](img/git_bash.png)
|
||||
|
||||
Copy the .bash_profile and git-prompt.sh files from the git_configuration folder of this repository to your user folder (tested on Windows 10; on other systems they may need to go somewhere else).
|
||||
|
||||
|
||||
13. (Optional) Set up SSH on your machine for easier access to your GitHub repositories through Git. You can find tutorials on the internet explaining how to do that.
|
||||
|
||||
To additionally get an automatic prompt for your SSH key passphrase in Git Bash, copy a script similar to .bashrc from the git_configuration folder to your user folder. In the file, change the name of the key (the provided file lists two keys; you can keep just one).
|
||||
|
||||
|
||||
**In the case of any problems, consult your best friend - [StackOverflow](https://stackoverflow.com/)**.
|
||||
|
||||
|
||||
## Before every class
|
||||
|
||||
Fetch the new code from this repository and merge it into your code.
|
||||
|
||||
1. In Git Bash open your repository folder.
|
||||
|
||||
|
||||
2. Add the original repository as an upstream:
|
||||
|
||||
git remote add upstream git@github.com:PiotrZiolo/recommender-systems-class.git
|
||||
|
||||
|
||||
3. Fetch new changes from the original repository:
|
||||
|
||||
git fetch upstream
|
||||
|
||||
|
||||
4. Merge the changes into your local branch (if you don't mind your commits being interleaved with commits from the original repository) or rebase the changes onto your local branch (if you want your commits to follow all commits from the original repository):
|
||||
|
||||
git merge upstream/master
|
||||
|
||||
or
|
||||
|
||||
git rebase upstream/master
|
||||
|
||||
|
||||
5. In the case of conflicts you can resolve them manually, but it's easier to use PyCharm, especially for Jupyter Notebooks, where manual merging is extremely painful. PyCharm provides a side-by-side view of changes and allows you to accept one of the conflicting file versions in one click.
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,231 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import scipy.special as scisp
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class AmazonRecommender(Recommender):
|
||||
"""
|
||||
Basic item-to-item collaborative filtering algorithm used at Amazon.com, as described in:
|
||||
- Linden G., Smith B., York J., Amazon.com Recommendations: Item-to-Item Collaborative Filtering,
|
||||
IEEE Internet Computing, 2003,
|
||||
- Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
self.interactions_df = None
|
||||
self.item_id_mapping = None
|
||||
self.user_id_mapping = None
|
||||
self.item_id_reverse_mapping = None
|
||||
self.user_id_reverse_mapping = None
|
||||
self.e_xy = None
|
||||
self.n_xy = None
|
||||
self.scores = None
|
||||
self.most_popular_items = None
|
||||
self.should_recommend_already_bought = False
|
||||
|
||||
def initialize(self, **params):
|
||||
if 'should_recommend_already_bought' in params:
|
||||
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||
user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||
by item_id and the item feature columns.
|
||||
"""
|
||||
|
||||
# Shift item ids and user ids so that they are consecutive
|
||||
|
||||
unique_item_ids = interactions_df['item_id'].unique()
|
||||
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||
unique_user_ids = interactions_df['user_id'].unique()
|
||||
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||
|
||||
interactions_df = interactions_df.copy()
|
||||
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||
|
||||
# Get the number of items and users
|
||||
|
||||
self.interactions_df = interactions_df
|
||||
n_items = np.max(interactions_df['item_id']) + 1
|
||||
n_users = np.max(interactions_df['user_id']) + 1
|
||||
|
||||
# Get maximal number of interactions
|
||||
|
||||
n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
|
||||
# Unnecessary, but added for readability
|
||||
n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
|
||||
max_interactions = n_user_interactions['n_items'].max()
|
||||
|
||||
# Calculate P_Y's
|
||||
|
||||
n_interactions = len(interactions_df)
|
||||
p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
|
||||
p_y = p_y.rename(columns={'user_id': 'P_Y'})
|
||||
p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
|
||||
p_y = dict(zip(p_y['item_id'], p_y['P_Y']))
|
||||
|
||||
# Get the series of all items
|
||||
|
||||
# items = list(range(n_items))
|
||||
items = interactions_df['item_id'].unique()
|
||||
|
||||
# For every X calculate the E[Y|X]
|
||||
|
||||
e_xy = np.zeros(shape=(n_items, n_items))
|
||||
e_xy[:][:] = -1e100
|
||||
|
||||
p_y_powers = {}
|
||||
for y in items:
|
||||
p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
|
||||
|
||||
# In the next version calculate all alpha_k first (this works well with parallelization)
|
||||
|
||||
for x in items:
|
||||
# Get users who bought X
|
||||
c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()
|
||||
|
||||
# Get users who bought only X
|
||||
c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
|
||||
c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))
|
||||
|
||||
# Calculate the number of non-X interactions for each user who bought X
|
||||
# Include users with zero non-X interactions
|
||||
n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
|
||||
n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
|
||||
# Unnecessary, but added for readability
|
||||
n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})
|
||||
|
||||
zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x)  # Users who bought only X have zero non-X interactions
|
||||
n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
|
||||
|
||||
n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]
|
||||
|
||||
# Calculate the expected numbers of Y products bought by clients who bought X
|
||||
alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
|
||||
for abs_c in n_non_x_interactions["n_items"]])
|
||||
for k in range(1, max_interactions + 1)])
|
||||
|
||||
for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y
|
||||
if y != x:
|
||||
e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
|
||||
else:
|
||||
e_xy[x][y] = n_users * p_y[x]
|
||||
|
||||
self.e_xy = e_xy
|
||||
|
||||
# Calculate the number of users who bought both X and Y
|
||||
|
||||
# Simple and slow method (commented out)
|
||||
|
||||
# n_xy = np.zeros(shape=(n_items, n_items))
|
||||
|
||||
# for x in items:
|
||||
# for y in items:
|
||||
# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
|
||||
# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
|
||||
# users_x_and_y = users_x & users_y
|
||||
# n_xy[x][y] = len(users_x_and_y)
|
||||
|
||||
# Optimized method (can be further optimized by using sparse matrices)
|
||||
|
||||
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||
r = np.zeros(shape=(n_users, n_items))
|
||||
for idx, interaction in interactions_df.iterrows():
|
||||
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||
|
||||
# Get the number of users who bought both X and Y
|
||||
|
||||
n_xy = np.matmul(r.T, r)
|
||||
|
||||
self.n_xy = n_xy
|
||||
|
||||
self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)
|
||||
|
||||
# Find the most popular items for the cold start problem
|
||||
|
||||
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||
self.most_popular_items = offers_count.index
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||
recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
# Clean previous recommendations (iloc could be used alternatively)
|
||||
self.recommender_df = self.recommender_df[:0]
|
||||
|
||||
# Handle users not in the training data
|
||||
|
||||
# Map item ids
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||
|
||||
# Generate recommendations
|
||||
|
||||
for idx, user in users_df.iterrows():
|
||||
recommendations = []
|
||||
|
||||
user_id = user['user_id']
|
||||
|
||||
if user_id in self.user_id_mapping:
|
||||
mapped_user_id = self.user_id_mapping[user_id]
|
||||
|
||||
x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
final_scores = np.sum(self.scores[x_list], axis=0)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
final_scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-final_scores)[:n_recommendations]
|
||||
|
||||
for item_id in chosen_ids:
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||
'item_id': self.item_id_reverse_mapping[item_id],
|
||||
'score': final_scores[item_id]
|
||||
}
|
||||
)
|
||||
else: # For new users recommend most popular items
|
||||
for i in range(n_recommendations):
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': user['user_id'],
|
||||
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||
'score': 1.0
|
||||
}
|
||||
)
|
||||
|
||||
user_recommendations = pd.DataFrame(recommendations)
|
||||
|
||||
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||
|
||||
return self.recommender_df
|
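A minimal usage sketch of the AmazonRecommender class above, included only for illustration: the toy interaction data and the tiny users_df/items_df are made up, while the import path matches the one used in the notebook earlier in this commit.

# Hypothetical smoke test for AmazonRecommender; toy data made up for illustration only.
import pandas as pd

from recommenders.amazon_recommender import AmazonRecommender

interactions_df = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3],
    'item_id': [10, 20, 10, 30, 20],
})
items_df = pd.DataFrame({'item_id': [10, 20, 30]})
users_df = pd.DataFrame({'user_id': [1, 2, 3, 4]})  # user 4 is unseen and falls back to the most popular items

recommender = AmazonRecommender()
recommender.initialize(should_recommend_already_bought=False)
recommender.fit(interactions_df, users_df, items_df)
print(recommender.recommend(users_df, items_df, n_recommendations=2))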
@ -0,0 +1,233 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class NearestNeighborsRecommender(Recommender):
|
||||
"""
|
||||
Nearest neighbors recommender supporting user-based and item-based collaborative filtering.
|
||||
|
||||
Possible similarity measures:
|
||||
- 'cosine',
|
||||
- 'pearson'.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
self.interactions_df = None
|
||||
self.item_id_mapping = None
|
||||
self.user_id_mapping = None
|
||||
self.item_id_reverse_mapping = None
|
||||
self.user_id_reverse_mapping = None
|
||||
self.r = None
|
||||
self.similarities = None
|
||||
self.most_popular_items = None
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'cosine'
|
||||
self.n_neighbors = 10
|
||||
self.should_recommend_already_bought = False
|
||||
|
||||
def initialize(self, **params):
|
||||
if 'n_neighbors' in params:
|
||||
self.n_neighbors = params['n_neighbors']
|
||||
if 'should_recommend_already_bought' in params:
|
||||
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||
user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||
by item_id and the item feature columns.
|
||||
"""
|
||||
|
||||
del users_df, items_df
|
||||
|
||||
# Shift item ids and user ids so that they are consecutive
|
||||
|
||||
unique_item_ids = interactions_df['item_id'].unique()
|
||||
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||
unique_user_ids = interactions_df['user_id'].unique()
|
||||
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||
|
||||
interactions_df = interactions_df.copy()
|
||||
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||
|
||||
# Get the number of items and users
|
||||
|
||||
self.interactions_df = interactions_df
|
||||
n_items = np.max(interactions_df['item_id']) + 1
|
||||
n_users = np.max(interactions_df['user_id']) + 1
|
||||
|
||||
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||
r = np.zeros(shape=(n_users, n_items))
|
||||
for idx, interaction in interactions_df.iterrows():
|
||||
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||
|
||||
if self.collaboration_type == 'item':
|
||||
r = r.T
|
||||
|
||||
self.r = r
|
||||
|
||||
# Calculate all similarities
|
||||
|
||||
similarities = None
|
||||
if self.similarity_measure == 'cosine':
|
||||
n_uv = np.matmul(r, r.T)
|
||||
norms = np.sqrt(np.diag(n_uv))
|
||||
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
|
||||
elif self.similarity_measure == 'pearson':
|
||||
r_shifted = r - np.mean(r, axis=1).reshape(-1, 1)
|
||||
n_uv = np.matmul(r_shifted, r_shifted.T)
|
||||
norms = np.sqrt(np.diag(n_uv))
|
||||
norms[norms == 0] = 0.000001
|
||||
similarities = n_uv / norms[:, np.newaxis] / norms[np.newaxis, :]
|
||||
|
||||
np.fill_diagonal(similarities, -1000)
|
||||
|
||||
self.similarities = similarities
|
||||
|
||||
# Find the most popular items for the cold start problem
|
||||
|
||||
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||
self.most_popular_items = offers_count.index
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||
recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
# Clean previous recommendations (iloc could be used alternatively)
|
||||
self.recommender_df = self.recommender_df[:0]
|
||||
|
||||
# Handle users not in the training data
|
||||
|
||||
# Map item ids
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
|
||||
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||
|
||||
# Generate recommendations
|
||||
|
||||
for idx, user in users_df.iterrows():
|
||||
recommendations = []
|
||||
|
||||
user_id = user['user_id']
|
||||
|
||||
if user_id in self.user_id_mapping:
|
||||
chosen_ids = []
|
||||
scores = []
|
||||
mapped_user_id = self.user_id_mapping[user_id]
|
||||
|
||||
if self.collaboration_type == 'user':
|
||||
neighbor_ids = np.argsort(-self.similarities[mapped_user_id])[:self.n_neighbors]
|
||||
user_similarities = self.similarities[mapped_user_id][neighbor_ids]
|
||||
|
||||
item_ids = items_df['item_id'].tolist()
|
||||
|
||||
v_i = self.r[neighbor_ids][:, item_ids]
|
||||
|
||||
scores = np.matmul(user_similarities, v_i) / np.sum(user_similarities)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
x_list = self.interactions_df.loc[
|
||||
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-scores)[:n_recommendations]
|
||||
|
||||
elif self.collaboration_type == 'item':
|
||||
x_list = self.interactions_df.loc[
|
||||
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
scores = np.sum(self.similarities[x_list], axis=0)
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
scores[x_list] = -1e100
|
||||
|
||||
chosen_ids = np.argsort(-scores)[:n_recommendations]
|
||||
|
||||
for item_id in chosen_ids:
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||
'item_id': self.item_id_reverse_mapping[item_id],
|
||||
'score': scores[item_id]
|
||||
}
|
||||
)
|
||||
else: # For new users recommend most popular items
|
||||
for i in range(n_recommendations):
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': user['user_id'],
|
||||
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||
'score': 1.0
|
||||
}
|
||||
)
|
||||
|
||||
user_recommendations = pd.DataFrame(recommendations)
|
||||
|
||||
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||
|
||||
return self.recommender_df
|
||||
|
||||
|
||||
class UserBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'cosine'
|
||||
|
||||
|
||||
class UserBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'user'
|
||||
self.similarity_measure = 'pearson'
|
||||
|
||||
|
||||
class ItemBasedCosineNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'item'
|
||||
self.similarity_measure = 'cosine'
|
||||
|
||||
|
||||
class ItemBasedPearsonNearestNeighborsRecommender(NearestNeighborsRecommender):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self.collaboration_type = 'item'
|
||||
self.similarity_measure = 'pearson'
|
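A minimal smoke test for the nearest-neighbours classes above; the toy data is made up for illustration and the module path is an assumption, since the file name is not visible in this diff.

# Hypothetical smoke test for the user-based cosine variant; toy data for illustration only.
import pandas as pd

# Assumed module path; adjust it to wherever this class actually lives in the repository.
from recommenders.nearest_neighbors_recommender import UserBasedCosineNearestNeighborsRecommender

interactions_df = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3],
    'item_id': [10, 20, 10, 30, 20],
})
items_df = pd.DataFrame({'item_id': [10, 20, 30]})
users_df = pd.DataFrame({'user_id': [1, 2, 3]})

recommender = UserBasedCosineNearestNeighborsRecommender()
recommender.initialize(n_neighbors=2, should_recommend_already_bought=False)  # small neighbourhood, since the toy set has only three users
recommender.fit(interactions_df, users_df, items_df)
print(recommender.recommend(users_df, items_df, n_recommendations=1))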
@ -0,0 +1,305 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import scipy.special as scisp
|
||||
from livelossplot import PlotLosses
|
||||
from collections import defaultdict, deque
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class NetflixRecommender(Recommender):
|
||||
"""
|
||||
Collaborative filtering based on matrix factorization with a choice of the following optimizers:
|
||||
- Stochastic Gradient Descent (SGD),
|
||||
- Mini-Batch Gradient Descent (MBGD),
|
||||
- Alternating Least Squares (ALS).
|
||||
"""
|
||||
|
||||
def __init__(self, seed=6789, n_neg_per_pos=5, print_type=None, **params):
|
||||
super().__init__()
|
||||
self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
self.interactions_df = None
|
||||
self.item_id_mapping = None
|
||||
self.user_id_mapping = None
|
||||
self.item_id_reverse_mapping = None
|
||||
self.user_id_reverse_mapping = None
|
||||
self.r = None
|
||||
self.most_popular_items = None
|
||||
|
||||
self.n_neg_per_pos = n_neg_per_pos
|
||||
if 'optimizer' in params:
|
||||
self.optimizer = params['optimizer']
|
||||
else:
|
||||
self.optimizer = 'SGD'
|
||||
if 'n_epochs' in params: # number of epochs (each epoch goes through the entire training set)
|
||||
self.n_epochs = params['n_epochs']
|
||||
else:
|
||||
self.n_epochs = 10
|
||||
if 'lr' in params: # learning rate
|
||||
self.lr = params['lr']
|
||||
else:
|
||||
self.lr = 0.01
|
||||
if 'reg_l' in params: # regularization coefficient
|
||||
self.reg_l = params['reg_l']
|
||||
else:
|
||||
self.reg_l = 0.1
|
||||
if 'embedding_dim' in params:
|
||||
self.embedding_dim = params['embedding_dim']
|
||||
else:
|
||||
self.embedding_dim = 8
|
||||
|
||||
self.user_repr = None
|
||||
self.item_repr = None
|
||||
|
||||
if 'should_recommend_already_bought' in params:
|
||||
self.should_recommend_already_bought = params['should_recommend_already_bought']
|
||||
else:
|
||||
self.should_recommend_already_bought = False
|
||||
|
||||
self.validation_set_size = 0.2
|
||||
|
||||
self.seed = seed
|
||||
self.rng = np.random.RandomState(seed=seed)
|
||||
|
||||
self.print_type = print_type
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by
|
||||
user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined
|
||||
by item_id and the item feature columns.
|
||||
"""
|
||||
|
||||
del users_df, items_df
|
||||
|
||||
# Shift item ids and user ids so that they are consecutive
|
||||
|
||||
unique_item_ids = interactions_df['item_id'].unique()
|
||||
self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
|
||||
self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
|
||||
unique_user_ids = interactions_df['user_id'].unique()
|
||||
self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
|
||||
self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
|
||||
|
||||
interactions_df = interactions_df.copy()
|
||||
interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)
|
||||
|
||||
# Get the number of items and users
|
||||
|
||||
self.interactions_df = interactions_df
|
||||
n_users = np.max(interactions_df['user_id']) + 1
|
||||
n_items = np.max(interactions_df['item_id']) + 1
|
||||
|
||||
# Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
|
||||
r = np.zeros(shape=(n_users, n_items))
|
||||
for idx, interaction in interactions_df.iterrows():
|
||||
r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
|
||||
|
||||
self.r = r
|
||||
|
||||
# Generate negative interactions
|
||||
negative_interactions = []
|
||||
|
||||
i = 0
|
||||
while i < self.n_neg_per_pos * len(interactions_df):
|
||||
sample_size = 1000
|
||||
user_ids = self.rng.choice(np.arange(n_users), size=sample_size)
|
||||
item_ids = self.rng.choice(np.arange(n_items), size=sample_size)
|
||||
|
||||
j = 0
|
||||
while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):
|
||||
if r[user_ids[j]][item_ids[j]] == 0:
|
||||
negative_interactions.append([user_ids[j], item_ids[j], 0])
|
||||
i += 1
|
||||
j += 1
|
||||
|
||||
interactions_df = pd.concat(
|
||||
[interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])
|
||||
|
||||
# Initialize user and item embeddings as random vectors (from Gaussian distribution)
|
||||
|
||||
self.user_repr = self.rng.normal(0, 1, size=(r.shape[0], self.embedding_dim))
|
||||
self.item_repr = self.rng.normal(0, 1, size=(r.shape[1], self.embedding_dim))
|
||||
|
||||
# Initialize losses and loss visualization
|
||||
|
||||
if self.print_type is not None and self.print_type == 'live':
|
||||
liveloss = PlotLosses()
|
||||
|
||||
training_losses = deque(maxlen=50)
|
||||
training_avg_losses = []
|
||||
training_epoch_losses = []
|
||||
validation_losses = deque(maxlen=50)
|
||||
validation_avg_losses = []
|
||||
validation_epoch_losses = []
|
||||
last_training_total_loss = 0.0
|
||||
last_validation_total_loss = 0.0
|
||||
|
||||
# Split the data
|
||||
|
||||
interaction_ids = self.rng.permutation(len(interactions_df))
|
||||
train_validation_slice_idx = int(len(interactions_df) * (1 - self.validation_set_size))
|
||||
training_ids = interaction_ids[:train_validation_slice_idx]
|
||||
validation_ids = interaction_ids[train_validation_slice_idx:]
|
||||
|
||||
# Train the model
|
||||
|
||||
for epoch in range(self.n_epochs):
|
||||
if self.print_type is not None and self.print_type == 'live':
|
||||
logs = {}
|
||||
|
||||
# Train
|
||||
|
||||
training_losses.clear()
|
||||
training_total_loss = 0.0
|
||||
batch_idx = 0
|
||||
for idx in training_ids:
|
||||
user_id = int(interactions_df.iloc[idx]['user_id'])
|
||||
item_id = int(interactions_df.iloc[idx]['item_id'])
|
||||
|
||||
e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
|
||||
self.user_repr[user_id] = self.user_repr[user_id] \
|
||||
+ self.lr * (e_ui * self.item_repr[item_id] - self.reg_l * self.user_repr[user_id])
|
||||
self.item_repr[item_id] = self.item_repr[item_id] \
|
||||
+ self.lr * (e_ui * self.user_repr[user_id] - self.reg_l * self.item_repr[item_id])
|
||||
|
||||
loss = e_ui**2
|
||||
training_total_loss += loss
|
||||
|
||||
if self.print_type is not None and self.print_type == 'text':
|
||||
print("\rEpoch: {}\tBatch: {}\tLast epoch - avg training loss: {:.2f} avg validation loss: {:.2f} loss: {}".format(
|
||||
epoch, batch_idx, last_training_total_loss, last_validation_total_loss, loss), end="")
|
||||
|
||||
batch_idx += 1
|
||||
|
||||
training_losses.append(loss)
|
||||
training_avg_losses.append(np.mean(training_losses))
|
||||
|
||||
# Validate
|
||||
|
||||
validation_losses.clear()
|
||||
validation_total_loss = 0.0
|
||||
for idx in validation_ids:
|
||||
user_id = int(interactions_df.iloc[idx]['user_id'])
|
||||
item_id = int(interactions_df.iloc[idx]['item_id'])
|
||||
|
||||
e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id], self.item_repr[item_id])
|
||||
|
||||
loss = e_ui**2
|
||||
validation_total_loss += loss
|
||||
|
||||
validation_losses.append(loss)
|
||||
validation_avg_losses.append(np.mean(validation_losses))
|
||||
|
||||
# Save and print epoch losses
|
||||
|
||||
training_last_avg_loss = training_total_loss / len(training_ids)
|
||||
training_epoch_losses.append(training_last_avg_loss)
|
||||
validation_last_avg_loss = validation_total_loss / len(validation_ids)
|
||||
validation_epoch_losses.append(validation_last_avg_loss)
|
||||
|
||||
if self.print_type is not None and self.print_type == 'live' and epoch >= 3:
|
||||
# A bound on epoch prevents showing extremely high losses in the first epochs
|
||||
# noinspection PyUnboundLocalVariable
|
||||
logs['loss'] = training_last_avg_loss
|
||||
logs['val_loss'] = validation_last_avg_loss
|
||||
# noinspection PyUnboundLocalVariable
|
||||
liveloss.update(logs)
|
||||
liveloss.send()
|
||||
|
||||
# Find the most popular items for the cold start problem
|
||||
|
||||
offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
|
||||
offers_count = offers_count.sort_values('user_id', ascending=False)
|
||||
self.most_popular_items = offers_count.index
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which
|
||||
recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
# Clean previous recommendations (iloc could be used alternatively)
|
||||
self.recommender_df = self.recommender_df[:0]
|
||||
|
||||
# Handle users not in the training data
|
||||
|
||||
# Map item ids
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df = items_df.loc[items_df['item_id'].isin(self.item_id_mapping)]
|
||||
items_df.replace({'item_id': self.item_id_mapping}, inplace=True)
|
||||
|
||||
# Generate recommendations
|
||||
|
||||
for idx, user in users_df.iterrows():
|
||||
recommendations = []
|
||||
|
||||
user_id = user['user_id']
|
||||
|
||||
if user_id in self.user_id_mapping:
|
||||
mapped_user_id = self.user_id_mapping[user_id]
|
||||
|
||||
ids_list = items_df['item_id'].tolist()
|
||||
id_to_pos = np.zeros(np.max(ids_list) + 1, dtype=int)  # maps a mapped item id to its position in ids_list
|
||||
for k in range(len(ids_list)):
|
||||
id_to_pos[ids_list[k]] = k
|
||||
scores = np.matmul(self.user_repr[mapped_user_id].reshape(1, -1),
|
||||
self.item_repr[ids_list].T).flatten()
|
||||
|
||||
# Choose n recommendations based on highest scores
|
||||
if not self.should_recommend_already_bought:
|
||||
x_list = self.interactions_df.loc[
|
||||
self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
|
||||
scores[id_to_pos[x_list]] = -1e100
|
||||
|
||||
chosen_pos = np.argsort(-scores)[:n_recommendations]
|
||||
|
||||
for item_pos in chosen_pos:
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': self.user_id_reverse_mapping[mapped_user_id],
|
||||
'item_id': self.item_id_reverse_mapping[ids_list[item_pos]],
|
||||
'score': scores[item_pos]
|
||||
}
|
||||
)
|
||||
else: # For new users recommend most popular items
|
||||
for i in range(n_recommendations):
|
||||
recommendations.append(
|
||||
{
|
||||
'user_id': user['user_id'],
|
||||
'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
|
||||
'score': 1.0
|
||||
}
|
||||
)
|
||||
|
||||
user_recommendations = pd.DataFrame(recommendations)
|
||||
|
||||
self.recommender_df = pd.concat([self.recommender_df, user_recommendations])
|
||||
|
||||
return self.recommender_df
|
||||
|
||||
def get_user_repr(self, user_id):
|
||||
mapped_user_id = self.user_id_mapping[user_id]
|
||||
return self.user_repr[mapped_user_id]
|
||||
|
||||
def get_item_repr(self, item_id):
|
||||
mapped_item_id = self.item_id_mapping[item_id]
|
||||
return self.item_repr[mapped_item_id]
|
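A minimal training sketch for the NetflixRecommender class above, with made-up data and deliberately tiny hyperparameters so it finishes in seconds; the module path is an assumption, and livelossplot must be installed because the module imports it at the top.

# Hypothetical smoke test for the SGD matrix factorization recommender; toy data for illustration only.
import pandas as pd

# Assumed module path; adjust it to the actual file name in the repository.
from recommenders.netflix_recommender import NetflixRecommender

interactions_df = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3, 3],
    'item_id': [10, 20, 10, 30, 20, 30],
})
items_df = pd.DataFrame({'item_id': [10, 20, 30]})
users_df = pd.DataFrame({'user_id': [1, 2, 3]})

recommender = NetflixRecommender(seed=6789, n_neg_per_pos=3,
                                 n_epochs=5, lr=0.01, reg_l=0.1, embedding_dim=4)
recommender.fit(interactions_df, users_df, items_df)
print(recommender.recommend(users_df, items_df, n_recommendations=2))
print(recommender.get_user_repr(1))  # learned embedding of user 1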
52
recommender-systems-class-master/recommenders/recommender.py
Normal file
52
recommender-systems-class-master/recommenders/recommender.py
Normal file
@ -0,0 +1,52 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class Recommender(object):
|
||||
"""
|
||||
Base recommender class.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Initialize base recommender params and variables.
|
||||
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
|
||||
"""
|
||||
pass
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
|
||||
for ix, user in users_df.iterrows():
|
||||
user_recommendations = pd.DataFrame({'user_id': user['user_id'],
|
||||
'item_id': [-1] * n_recommendations,
|
||||
'score': [3.0] * n_recommendations})
|
||||
|
||||
recommendations = pd.concat([recommendations, user_recommendations])
|
||||
|
||||
return recommendations
|
@ -0,0 +1,102 @@
|
||||
# Load libraries ---------------------------------------------
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from collections import defaultdict
|
||||
|
||||
from recommenders.recommender import Recommender
|
||||
|
||||
# ------------------------------------------------------------
|
||||
|
||||
|
||||
class TFIDFRecommender(Recommender):
|
||||
"""
|
||||
Recommender based on the TF-IDF method.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Initialize base recommender params and variables.
|
||||
"""
|
||||
super().__init__()
|
||||
self.tfidf_scores = None
|
||||
|
||||
def fit(self, interactions_df, users_df, items_df):
|
||||
"""
|
||||
Training of the recommender.
|
||||
|
||||
:param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
|
||||
defined by user_id, item_id and features of the interaction.
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features defined by user_id
|
||||
and the user feature columns.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features defined by item_id
|
||||
and the item feature columns.
|
||||
"""
|
||||
|
||||
self.tfidf_scores = defaultdict(lambda: 0.0)
|
||||
|
||||
# Prepare the corpus for tfidf calculation
|
||||
|
||||
interactions_df = pd.merge(interactions_df, items_df, on='item_id')
|
||||
user_genres = interactions_df.loc[:, ['user_id', 'genres']]
|
||||
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("-", "_", regex=False)
|
||||
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace(" ", "_", regex=False)
|
||||
user_genres = user_genres.groupby('user_id').aggregate(lambda x: "|".join(x))
|
||||
user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("|", " ", regex=False)
|
||||
user_ids = user_genres.index.tolist()
|
||||
genres_corpus = user_genres['genres'].tolist()
|
||||
|
||||
# Calculate tf-idf scores
|
||||
|
||||
vectorizer = TfidfVectorizer()
|
||||
tfidf_scores = vectorizer.fit_transform(genres_corpus)
|
||||
|
||||
# Transform results into a dict {(user_id, genre): score}
|
||||
|
||||
for u in range(tfidf_scores.shape[0]):
|
||||
for g in range(tfidf_scores.shape[1]):
|
||||
self.tfidf_scores[(user_ids[u], vectorizer.get_feature_names()[g])] = tfidf_scores[u, g]
|
||||
|
||||
def recommend(self, users_df, items_df, n_recommendations=1):
|
||||
"""
|
||||
Serving of recommendations. Scores items in items_df for each user in users_df and returns
|
||||
top n_recommendations for each user.
|
||||
|
||||
:param pd.DataFrame users_df: DataFrame with users and their features for which recommendations
|
||||
should be generated.
|
||||
:param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
|
||||
:param int n_recommendations: Number of recommendations to be returned for each user.
|
||||
:return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
|
||||
for each user.
|
||||
:rtype: pd.DataFrame
|
||||
"""
|
||||
|
||||
recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
|
||||
|
||||
# Transform genres to a unified form used by the vectorizer
|
||||
|
||||
items_df = items_df.copy()
|
||||
items_df.loc[:, 'genres'] = items_df['genres'].str.replace("-", "_", regex=False)
|
||||
items_df.loc[:, 'genres'] = items_df['genres'].str.replace(" ", "_", regex=False)
|
||||
items_df.loc[:, 'genres'] = items_df['genres'].str.lower()
|
||||
items_df.loc[:, 'genres'] = items_df['genres'].str.split("|")
|
||||
|
||||
# Score items
|
||||
|
||||
for uix, user in users_df.iterrows():
|
||||
items = []
|
||||
for iix, item in items_df.iterrows():
|
||||
score = 0.0
|
||||
for genre in item['genres']:
|
||||
score += self.tfidf_scores[(user['user_id'], genre)]
|
||||
score /= len(item['genres'])
|
||||
items.append((item['item_id'], score))
|
||||
|
||||
items = sorted(items, key=lambda x: x[1], reverse=True)
|
||||
user_recommendations = pd.DataFrame({'user_id': user['user_id'],
|
||||
'item_id': [item[0] for item in items][:n_recommendations],
|
||||
'score': [item[1] for item in items][:n_recommendations]})
|
||||
|
||||
recommendations = pd.concat([recommendations, user_recommendations])
|
||||
|
||||
return recommendations
|
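A minimal usage sketch for the TFIDFRecommender class above; items need a pipe-separated genres column, as both fit and recommend assume. The toy data and the module path are made up for illustration.

# Hypothetical smoke test for TFIDFRecommender; toy data for illustration only.
import pandas as pd

# Assumed module path; adjust it to the actual file name in the repository.
from recommenders.tfidf_recommender import TFIDFRecommender

items_df = pd.DataFrame({
    'item_id': [10, 20, 30],
    'genres': ['Action|Sci-Fi', 'Comedy', 'Action|Comedy'],
})
interactions_df = pd.DataFrame({
    'user_id': [1, 1, 2],
    'item_id': [10, 30, 20],
})
users_df = pd.DataFrame({'user_id': [1, 2]})

recommender = TFIDFRecommender()
recommender.fit(interactions_df, users_df, items_df)
print(recommender.recommend(users_df, items_df, n_recommendations=2))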