{ "cells": [ { "cell_type": "code", "execution_count": 111, "id": "alike-morgan", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n", "Collecting package metadata (current_repodata.json): failed\n", "\n", "CondaHTTPError: HTTP 000 CONNECTION FAILED for url \n", "Elapsed: -\n", "\n", "An HTTP error occurred when trying to retrieve this URL.\n", "HTTP errors are often intermittent, and a simple retry will get you on your way.\n", "'https://conda.anaconda.org/conda-forge/linux-64'\n", "\n", "\n" ] } ], "source": [ "%matplotlib inline\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from IPython.display import Markdown, display, HTML\n", "from collections import defaultdict\n", "\n", "# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n", "import os\n", "os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'\n", "\n", "######################\n", "# I have added hyperopt package to environment.yml. \n", "######################\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "id": "blessed-knitting", "metadata": {}, "source": [ "# Load the dataset for recommenders" ] }, { "cell_type": "code", "execution_count": 52, "id": "victorian-bottom", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
      user_id  item_id  term            length_of_stay_bucket  rate_plan  room_segment  n_people_bucket  weekend_stay
0     1        0        WinterVacation  [2-3]                  Standard   [260-360]     [5-inf]          True
1     2        1        WinterVacation  [2-3]                  Standard   [160-260]     [3-4]            True
2     3        2        WinterVacation  [2-3]                  Standard   [160-260]     [2-2]            False
3     4        3        WinterVacation  [4-7]                  Standard   [160-260]     [3-4]            True
4     5        4        WinterVacation  [4-7]                  Standard   [0-160]       [2-2]            True
5     6        5        Easter          [4-7]                  Standard   [260-360]     [5-inf]          True
6     7        6        OffSeason       [2-3]                  Standard   [260-360]     [5-inf]          True
7     8        7        HighSeason      [2-3]                  Standard   [160-260]     [1-1]            True
8     9        8        HighSeason      [2-3]                  Standard   [0-160]       [1-1]            True
9     8        7        HighSeason      [2-3]                  Standard   [160-260]     [1-1]            True
10    8        7        HighSeason      [2-3]                  Standard   [160-260]     [1-1]            True
11    10       9        HighSeason      [2-3]                  Standard   [160-260]     [3-4]            True
12    11       9        HighSeason      [2-3]                  Standard   [160-260]     [3-4]            True
13    12       10       HighSeason      [8-inf]                Standard   [160-260]     [3-4]            True
14    14       11       HighSeason      [2-3]                  Standard   [0-160]       [3-4]            True
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data_path = os.path.join(\"data\", \"hotel_data\")\n", "\n", "interactions_df = pd.read_csv(os.path.join(data_path, \"hotel_data_interactions_df.csv\"), index_col=0)\n", "preprocessed_data = pd.read_csv(os.path.join(data_path, \"hotel_data_preprocessed.csv\"), index_col=0)\n", "\n", "base_item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']\n", "\n", "column_values_dict = {\n", " 'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],\n", " 'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],\n", " 'rate_plan': ['Standard', 'Nonref'],\n", " 'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],\n", " 'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],\n", " 'weekend_stay': ['True', 'False']\n", "}\n", "\n", "interactions_df.loc[:, 'term'] = pd.Categorical(\n", " interactions_df['term'], categories=column_values_dict['term'])\n", "interactions_df.loc[:, 'length_of_stay_bucket'] = pd.Categorical(\n", " interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])\n", "interactions_df.loc[:, 'rate_plan'] = pd.Categorical(\n", " interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])\n", "interactions_df.loc[:, 'room_segment'] = pd.Categorical(\n", " interactions_df['room_segment'], categories=column_values_dict['room_segment'])\n", "interactions_df.loc[:, 'n_people_bucket'] = pd.Categorical(\n", " interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])\n", "interactions_df.loc[:, 'weekend_stay'] = interactions_df['weekend_stay'].astype('str')\n", "interactions_df.loc[:, 'weekend_stay'] = pd.Categorical(\n", " interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])\n", "\n", "display(HTML(interactions_df.head(15).to_html()))" ] }, { "cell_type": "markdown", "id": "realistic-third", "metadata": {}, "source": [ "# Define user features based on reservations\n", "\n", "The content-based recommenders will be forecasting the probability of interaction between user and item based on user features vector and item features vector:\n", "\n", "
\n", "$$\n", " r_{u, i} = f(user\\_features, item\\_features)\n", "$$\n", "
\n", "\n", "**Task:**
\n", "Design numerical user features based on user reservations. Code the following method which for a given interactions DataFrame (it will be used in the fit method of the recommender) returns a DataFrame with user_id and user features as well as a list with names of user features (this will be important to select the right columns for an ML algorithm). Remember to name the columns differently than item features which you will create in the next task. Validate your features on users with several interactions (sample user ids are already given below).\n", "\n", "Ideas for user features:\n", "- Find the vector of most popular feature values from all user reservations and encode every feature with one-hot encoding.\n", "- For every reservation feature calculate the probability distribution of its values among all user's reservations.\n", "- For numerical buckets (length_of_stay, room_segment, n_people) you can calculate the average value for every user from their reservations (you will have to map the buckets back to numerical values before averaging them).\n", "\n", "Remember that you will have to select the best features (with the highest explanatory power). Using all above features at once would make the number of variables too large for this dataset and would also introduce too much correlations between features.\n", "\n", "You can also prepare several version of the prepare_users_df method and test which works best in your recommender." ] }, { "cell_type": "code", "execution_count": 87, "id": "variable-jaguar", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['user_id', 'n_people', 'length_of_stay', 'night_price']\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
      user_id  n_people  length_of_stay  night_price
0     1        2.434783  3.434783        161.930000
52    50       4.043478  2.695652        270.541739
103   96       3.000000  2.291667        154.016250
150   115      1.409091  1.909091        15.909091
700   706      3.913947  3.827893        54.159792
1758  1736     1.896552  2.655172        202.308621
8097  7779     4.259259  4.333333        33.788889
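
A sketch of the distribution-based variant from the second idea above (the function name and the selected columns are illustrative, not part of the assignment): for every categorical reservation feature it computes the share of each value among the user's reservations, prefixing the columns with "user_" so that they cannot collide with the item feature names created in the next task.

"def prepare_users_df_distributions(interactions_df):\n",
"    users_df = interactions_df[['user_id']].drop_duplicates().set_index('user_id')\n",
"    for feature in ['term', 'rate_plan', 'weekend_stay']:\n",
"        # Share of each feature value among the user's reservations (rows sum to 1)\n",
"        distribution = pd.crosstab(interactions_df['user_id'], interactions_df[feature], normalize='index')\n",
"        distribution.columns = ['user_{}_{}'.format(feature, value) for value in distribution.columns]\n",
"        users_df = users_df.join(distribution)\n",
"    users_df = users_df.fillna(0).reset_index()\n",
"    user_features = [column for column in users_df.columns if column != 'user_id']\n",
"    return users_df, user_features\n",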
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def prepare_users_df(interactions_df):\n", " users_df = interactions_df[['user_id']]\n", " users_df = users_df.drop_duplicates()\n", " \n", " users_df['n_people'] = preprocessed_data.groupby('user_id')['n_people'].transform('mean').reindex()\n", " users_df['length_of_stay'] = preprocessed_data.groupby('user_id')['length_of_stay'].transform('mean').reindex()\n", " users_df['night_price'] = preprocessed_data.groupby('user_id')['night_price'].transform('mean').reindex()\n", " \n", " user_features = list(users_df.columns)\n", " return users_df, user_features\n", " \n", "\n", "users_df, user_features = prepare_users_df(interactions_df)\n", "\n", "print(user_features)\n", "\n", "display(HTML(users_df.loc[users_df['user_id'].isin([706, 1736, 7779, 96, 1, 50, 115])].head(15).to_html()))" ] }, { "cell_type": "markdown", "id": "built-complaint", "metadata": {}, "source": [ "# Prepare numerical item features\n", "\n", "**Task:**
\n", "Code the prepare_items_df method which will be used in the recommender fit and recommend methods to map items to numerical features. This method should take the interactions_df DataFrame as input and return a DataFrame containing one record per item_id with item_id column and numerical item feature columns.\n", "\n", "You can try turning all item features into on-hot representations. You can use the get_dummies method from pandas. It will return the same columns on any dataset of interactions because of the categorical variables with all possible values have been defined in the second cell in this notebook.\n", "\n", "You are welcome to design your own numerical item features." ] }, { "cell_type": "code", "execution_count": 96, "id": "formal-munich", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   | term_WinterVacation | term_Easter | term_OffSeason | term_HighSeason | term_LowSeason | term_MayLongWeekend | term_NewYear | term_Christmas | length_of_stay_bucket_[0-1] | length_of_stay_bucket_[2-3] | length_of_stay_bucket_[4-7] | length_of_stay_bucket_[8-inf] | rate_plan_Standard | rate_plan_Nonref | room_segment_[0-160] | room_segment_[160-260] | room_segment_[260-360] | room_segment_[360-500] | room_segment_[500-900] | n_people_bucket_[1-1] | n_people_bucket_[2-2] | n_people_bucket_[3-4] | n_people_bucket_[5-inf] | weekend_stay_True | weekend_stay_False | item_id
0  | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0
1  | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1
2  | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 2
3  | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 3
4  | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 4
5  | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 5
6  | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 6
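
A quick check of the column-stability claim from the task above - because the base item features were cast to pd.Categorical with the full lists of values, get_dummies yields identical columns on any subset of interactions (a sketch, reusing the objects defined earlier in this notebook):

"subset_columns = pd.get_dummies(interactions_df[base_item_features].head(3)).columns\n",
"all_columns = pd.get_dummies(interactions_df[base_item_features]).columns\n",
"assert list(subset_columns) == list(all_columns)\n",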
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def prepare_items_df(interactions_df):\n", " items_df = interactions_df[[\n", " 'term',\n", " 'length_of_stay_bucket',\n", " 'rate_plan',\n", " 'room_segment',\n", " 'n_people_bucket',\n", " 'weekend_stay'\n", " ]]\n", " \n", " onehot_df = pd.get_dummies(items_df)\n", " onehot_df['item_id'] = interactions_df['item_id']\n", " \n", " onehot_df = onehot_df.drop_duplicates()\n", " \n", " \n", " item_features = list(onehot_df.columns)\n", " return onehot_df, item_features\n", "\n", "\n", "items_df, item_features = prepare_items_df(interactions_df)\n", "\n", "display(HTML(items_df.loc[items_df['item_id'].isin([0, 1, 2, 3, 4, 5, 6])].head(15).to_html()))" ] }, { "cell_type": "code", "execution_count": 97, "id": "agricultural-adjustment", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "interactions 16102\n", "items 772\n", "users 14188\n" ] } ], "source": [ "print(\"interactions\", len(interactions_df))\n", "print(\"items\",len(items_df))\n", "print(\"users\", len(users_df))" ] }, { "cell_type": "markdown", "id": "figured-imaging", "metadata": {}, "source": [ "# Content-based recommender\n", "\n", "**Task:**
\n", "Code the content-based recommender. User features should be calculated within the fit method based on available training data and should be saved in the object for later use in the recommend method. Overwrite the users_df variable. Item features should be calculated both in the fit method (from interactions_df) and in the recommend method (from items_df - the items to be evaluated).\n", "\n", "In the fit method you have to randomly generate non-existing interactions and add them to the training data for the regressor. You should add the target variable to interactions - equal to 1 for real interactions and equal to 0 for those newly added interactions. Generate several negative interactions per every positive interactions (n_neg_per_pos). Treat the proportion as a tunable parameter of the model.\n", "\n", "Remember to keep control over randomness - in the init method add seed as a parameter and use initialize the random seed generator with that seed:\n", "\n", "```python\n", "self.seed = seed\n", "self.rng = np.random.RandomState(seed=seed)\n", "```\n", "\n", "Below the base content-based recommender class there are several classes which inherit from the base class and use different ML models:\n", " - LinearRegressionCBUIRecommender - based on linear regression,\n", " - SVRCBUIRecommender - based on Support Vector Regressor (if you want to test it, sample the data in the fit method, as the training can take many hours on the entire dataset of interactions),\n", " - RandomForestCBUIRecommender - based on Random Forest,\n", " - XGBoostCBUIRecommender - based on XGBoost.\n", " \n", "There is no need to change anything in those inheriting classes, although you can experiment with other tunable parameters of the underlying models.\n", "\n", "You are encouraged to experiment with:\n", " - Other numerical user and item features (but always train and evaluate the model on buckets defined in the first notebook).\n", " - Other ML models, e.g. Huber regression, Lasso regression, Ridge regression, LARS regression, Linear SVR, Decision Tree, Naive Bayes, Neural Networks or any model of your choice.\n", " - A different approach where you treat each item as a class, you train directly on categorical features of items and users (you would have to design appropriate categorical features for users) and you fit classifiers (e.g. Decision Tree classifier, Naive Bayes classifier etc.) instead of regressors." 
] }, { "cell_type": "code", "execution_count": 98, "id": "unlike-recipient", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", "from sklearn.svm import SVR\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "\n", "from recommenders.recommender import Recommender\n", "\n", "\n", "class ContentBasedUserItemRecommender(Recommender):\n", " \"\"\"\n", " Linear recommender class based on user and item features.\n", " \"\"\"\n", " \n", " def __init__(self, seed=6789, n_neg_per_pos=5):\n", " \"\"\"\n", " Initialize base recommender params and variables.\n", " \"\"\"\n", " self.model = LinearRegression()\n", " self.n_neg_per_pos = n_neg_per_pos\n", " \n", " self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n", " self.users_df = None\n", " self.user_features = None\n", " \n", " self.seed = seed\n", " self.rng = np.random.RandomState(seed=seed)\n", " \n", " def gen_neg_interactions(self):\n", " user_ids = interactions_df['user_id']\n", " item_ids = interactions_df['item_id']\n", " \n", " while True:\n", " user_id = user_ids.sample().item()\n", " item_id = item_ids.sample().item()\n", " \n", " found_interaction = interactions_df[(interactions_df['item_id'] == item_id) & (interactions_df['user_id'] == user_id)]\n", " if found_interaction.empty:\n", " return (user_id, item_id, 0)\n", "\n", " \n", " def fit(self, interactions_df, users_df, items_df):\n", " \"\"\"\n", " Training of the recommender.\n", " \n", " :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items \n", " defined by user_id, item_id and features of the interaction.\n", " :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.\n", " :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.\n", " \"\"\"\n", " \n", " interactions_df = interactions_df.copy()\n", " \n", " # Prepare users_df and items_df\n", " \n", " users_df, user_features = prepare_users_df(interactions_df)\n", " \n", " self.users_df = users_df\n", " self.user_features = user_features\n", " \n", " items_df, item_features = prepare_items_df(interactions_df)\n", "# items_df = items_df.loc[:, ['item_id'] + item_features]\n", " \n", " # Generate negative interactions\n", " \n", " interactions_df = interactions_df.loc[:, ['user_id', 'item_id']]\n", " \n", " interactions_df.loc[:, 'interacted'] = 1\n", " \n", " negative_interactions = []\n", " \n", " # Write your code here\n", " # Generate tuples (user_id, item_id, 0) for pairs (user_id, item_id) which do not\n", " # appear in the interactions_df and add those tuples to the list negative_interactions.\n", " # Generate self.n_neg_per_pos * len(interactions_df) negative interactions \n", " # (self.n_neg_per_pos per one positive).\n", " # Make sure the code is efficient and runs fast, otherwise you will not be able to properly tune your model.\n", " \n", " num_of_neg = int(self.n_neg_per_pos * len(interactions_df))\n", " for i in range(num_of_neg):\n", " negative_interactions.append(self.gen_neg_interactions())\n", " \n", " interactions_df = pd.concat(\n", " [interactions_df, pd.DataFrame(negative_interactions, columns=['user_id', 'item_id', 'interacted'])])\n", " \n", " # Get the input data for the model\n", " \n", "# print(\"==================================\")\n", "# print(users_df)\n", "# 
print(\"==================================\")\n", "# print(items_df)\n", "# print(\"==================================\")\n", " \n", " interactions_df = pd.merge(interactions_df, users_df, on=['user_id'])\n", " interactions_df = pd.merge(interactions_df, items_df, on=['item_id'])\n", " \n", " x = interactions_df.loc[:, user_features + item_features].values\n", " y = interactions_df['interacted'].values\n", " \n", " self.model.fit(x, y)\n", " \n", " def recommend(self, users_df, items_df, n_recommendations=1):\n", " \"\"\"\n", " Serving of recommendations. Scores items in items_df for each user in users_df and returns \n", " top n_recommendations for each user.\n", " \n", " :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.\n", " :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.\n", " :param int n_recommendations: Number of recommendations to be returned for each user.\n", " :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations \n", " for each user.\n", " :rtype: pd.DataFrame\n", " \"\"\"\n", " \n", " # Clean previous recommendations (iloc could be used alternatively)\n", " self.recommender_df = self.recommender_df[:0]\n", " \n", " # Write your code here\n", " # Prepare users_df and items_df\n", " # For users_df you just need to merge user features from self.users_df to users_df \n", " # (the users for which you generate recommendations)\n", " users_df = pd.merge(self.users_df, users_df, on='user_id')\n", " # For items you have to apply the prepare_items_df method to items_df.\n", " items_df, item_features = prepare_items_df(items_df)\n", " \n", " # Score the items\n", " \n", " recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n", " \n", " for ix, user in users_df.iterrows():\n", " \n", " # Write your code here\n", " # Create a Carthesian product of users from users_df and items from items_df\n", " # https://stackoverflow.com/questions/13269890/cartesian-product-in-pandas\n", " carthesian_df = users_df.merge(items_df, how='cross')\n", "\n", " # Write your code here\n", " # Use self.model.predict method to calculate scores for all records in the just created DataFrame\n", " # of users and items\n", " scores = self.model.predict(carthesian_df)\n", " \n", " # Write your code here\n", " # Obtain item ids with the highest score and save those ids under the chosen_ids variable\n", " # Do not exclude already booked items.\n", " chosen_ids = np.argsort(scores)[-n_recommendations:]\n", " \n", " recommendations = []\n", " if len(chosen_ids) == 0:\n", " print(\"empty chosen_ids\")\n", " \n", " for item_id in chosen_ids:\n", " recommendations.append(\n", " {\n", " 'user_id': user['user_id'],\n", " 'item_id': item_id,\n", " 'score': scores[item_id]\n", " }\n", " )\n", " \n", " user_recommendations = pd.DataFrame(recommendations)\n", "\n", " self.recommender_df = pd.concat([self.recommender_df, user_recommendations])\n", "\n", " return self.recommender_df\n", " \n", " \n", "class LinearRegressionCBUIRecommender(ContentBasedUserItemRecommender):\n", " \"\"\"\n", " Linear regression recommender class based on user and item features.\n", " \"\"\"\n", " \n", " def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):\n", " \"\"\"\n", " Initialize base recommender params and variables.\n", " \"\"\"\n", " super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)\n", " self.model = LinearRegression()\n", " \n", " \n", 
"class SVRCBUIRecommender(ContentBasedUserItemRecommender):\n", " \"\"\"\n", " SVR recommender class based on user and item features.\n", " \"\"\"\n", " \n", " def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):\n", " \"\"\"\n", " Initialize base recommender params and variables.\n", " \"\"\"\n", " super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)\n", " if 'kernel' in model_params:\n", " self.kernel = model_params['kernel']\n", " else:\n", " self.kernel = 'rbf'\n", " if 'C' in model_params:\n", " self.C = model_params['C']\n", " else:\n", " self.C = 1.0\n", " if 'epsilon' in model_params:\n", " self.epsilon = model_params['epsilon']\n", " else:\n", " self.epsilon = 0.1\n", " self.model = SVR(kernel=self.kernel, C=self.C, epsilon=self.epsilon)\n", " \n", " \n", "class RandomForestCBUIRecommender(ContentBasedUserItemRecommender):\n", " \"\"\"\n", " Random forest recommender class based on user and item features.\n", " \"\"\"\n", " \n", " def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):\n", " \"\"\"\n", " Initialize base recommender params and variables.\n", " \"\"\"\n", " super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)\n", " if 'n_estimators' in model_params:\n", " self.n_estimators = int(model_params['n_estimators'])\n", " else:\n", " self.n_estimators = 100\n", " if 'max_depth' in model_params:\n", " self.max_depth = int(model_params['max_depth'])\n", " else:\n", " self.max_depth = 30\n", " if 'min_samples_split' in model_params:\n", " self.min_samples_split = int(model_params['min_samples_split'])\n", " else:\n", " self.min_samples_split = 30\n", " self.model = RandomForestRegressor(\n", " n_estimators=self.n_estimators, max_depth=self.max_depth, min_samples_split=self.min_samples_split)\n", " \n", " \n", "class XGBoostCBUIRecommender(ContentBasedUserItemRecommender):\n", " \"\"\"\n", " XGBoost recommender class based on user and item features.\n", " \"\"\"\n", " \n", " def __init__(self, seed=6789, n_neg_per_pos=5, **model_params):\n", " \"\"\"\n", " Initialize base recommender params and variables.\n", " \"\"\"\n", " super().__init__(seed=seed, n_neg_per_pos=n_neg_per_pos)\n", " if 'n_estimators' in model_params:\n", " self.n_estimators = int(model_params['n_estimators'])\n", " else:\n", " self.n_estimators = 100\n", " if 'max_depth' in model_params:\n", " self.max_depth = int(model_params['max_depth'])\n", " else:\n", " self.max_depth = 30\n", " if 'min_samples_split' in model_params:\n", " self.min_samples_split = int(model_params['min_samples_split'])\n", " else:\n", " self.min_samples_split = 30\n", " if 'learning_rate' in model_params:\n", " self.learning_rate = model_params['learning_rate']\n", " else:\n", " self.learning_rate = 30\n", " self.model = GradientBoostingRegressor(\n", " n_estimators=self.n_estimators, max_depth=self.max_depth, min_samples_split=self.min_samples_split,\n", " learning_rate=self.learning_rate) \n", " \n", " \n", "recommender = ContentBasedUserItemRecommender()\n", "# print(recommender.gen_neg_interactions())\n", "# items_df = interactions_df.loc[:, ['item_id'] + base_item_features].drop_duplicates()\n", "# recommender.fit(interactions_df.copy(), users_df.copy(), items_df.copy())\n", "# recommender.recommend(users_df.copy(), items_df.copy())\n" ] }, { "cell_type": "markdown", "id": "copyrighted-relative", "metadata": {}, "source": [ "# Quick test of the recommender" ] }, { "cell_type": "code", "execution_count": 99, "id": "greatest-canon", "metadata": {}, "outputs": [], "source": [ "items_df = 
interactions_df.loc[:, ['item_id'] + base_item_features].drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 76, "id": "initial-capital", "metadata": {}, "outputs": [], "source": [
"# Fit method\n",
"cb_user_item_recommender = RandomForestCBUIRecommender()\n",
"cb_user_item_recommender.fit(interactions_df, None, None)" ] }, { "cell_type": "code", "execution_count": null, "id": "digital-consolidation", "metadata": { "scrolled": false }, "outputs": [], "source": [
"# Recommend method\n",
"\n",
"recommendations = cb_user_item_recommender.recommend(pd.DataFrame([[1], [2], [3], [4], [5]], columns=['user_id']), interactions_df, 10)\n",
"\n",
"recommendations = pd.merge(recommendations, items_df, on='item_id', how='left')\n",
"display(HTML(recommendations.to_html()))" ] }, { "cell_type": "markdown", "id": "advanced-eleven", "metadata": {}, "source": [ "# Tuning method" ] }, { "cell_type": "code", "execution_count": 100, "id": "strange-alaska", "metadata": {}, "outputs": [], "source": [
"from evaluation_and_testing.testing import evaluate_train_test_split_implicit\n",
"\n",
"seed = 6789" ] }, { "cell_type": "code", "execution_count": 101, "id": "stable-theta", "metadata": {}, "outputs": [], "source": [
"from hyperopt import hp, fmin, tpe, Trials\n",
"import traceback\n",
"\n",
"def tune_recommender(recommender_class, interactions_df, items_df, \n",
"                     param_space, max_evals=1, show_progressbar=True, seed=6789):\n",
"    # Split into train_validation and test sets\n",
"\n",
"    shuffle = np.arange(len(interactions_df))\n",
"    rng = np.random.RandomState(seed=seed)\n",
"    rng.shuffle(shuffle)\n",
"    shuffle = list(shuffle)\n",
"\n",
"    train_test_split = 0.8\n",
"    split_index = int(len(interactions_df) * train_test_split)\n",
"\n",
"    train_validation = interactions_df.iloc[shuffle[:split_index]]\n",
"    test = interactions_df.iloc[shuffle[split_index:]]\n",
"\n",
"    # Tune\n",
"\n",
"    def loss(tuned_params):\n",
"        recommender = recommender_class(seed=seed, **tuned_params)\n",
"        hr1, hr3, hr5, hr10, ndcg1, ndcg3, ndcg5, ndcg10 = evaluate_train_test_split_implicit(\n",
"            recommender, train_validation, items_df, seed=seed)\n",
"        return -hr10\n",
"\n",
"    n_tries = 1\n",
"    succeeded = False\n",
"    try_id = 0\n",
"    while not succeeded and try_id < n_tries:\n",
"        try:\n",
"            trials = Trials()\n",
"            best_param_set = fmin(loss, space=param_space, algo=tpe.suggest, \n",
"                                  max_evals=max_evals, show_progressbar=show_progressbar, trials=trials, verbose=True)\n",
"            succeeded = True\n",
"        except Exception:\n",
"            traceback.print_exc()\n",
"            try_id += 1\n",
"    \n",
"    if not succeeded:\n",
"        return None\n",
"    \n",
"    # Validate\n",
"    \n",
"    recommender = recommender_class(seed=seed, **best_param_set)\n",
"\n",
"    results = [[recommender_class.__name__] + list(evaluate_train_test_split_implicit(\n",
"        recommender, {'train': train_validation, 'test': test}, items_df, seed=seed))]\n",
"\n",
"    results = pd.DataFrame(results, \n",
"        columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
"\n",
"    display(HTML(results.to_html()))\n",
"    \n",
"    return best_param_set" ] }, { "cell_type": "markdown", "id": "spiritual-orbit", "metadata": {}, "source": [ "## Tuning of the recommender\n", "\n", "**Task:**
\n", "Tune your models using the code below. You only need to put the class name of your recommender and choose an appropriate parameter space." ] }, { "cell_type": "code", "execution_count": 94, "id": "dependent-capital", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 10/10 [35:55<00:00, 215.58s/trial, best loss: -0.0024580090126997134]\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender                      HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  LinearRegressionCBUIRecommender  0.000329  0.000329  0.000329  0.001645  0.000329  0.000329  0.000329  0.000756
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Best parameters:\n", "{'n_neg_per_pos': 4.0}\n" ] } ], "source": [ "param_space = {\n", " 'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1)\n", "}\n", "\n", "best_param_set_LinearRegressionCBUIRecommender = tune_recommender(LinearRegressionCBUIRecommender, interactions_df, items_df,\n", " param_space, max_evals=10, show_progressbar=True, seed=seed)\n", "\n", "print(\"Best parameters:\")\n", "print(best_param_set_LinearRegressionCBUIRecommender)" ] }, { "cell_type": "code", "execution_count": 103, "id": "palestinian-clearance", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 10/10 [2:03:32<00:00, 741.28s/trial, best loss: -0.0061450225317492835]\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender         HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  SVRCBUIRecommender  0.000658  0.001645  0.001645  0.002962  0.000658  0.001195  0.001195  0.001608
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Best parameters:\n", "{'C': 0.21020345682666736, 'n_neg_per_pos': 7.0}\n" ] } ], "source": [ "param_space = {\n", " 'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),\n", " 'C': hp.loguniform('C', np.log(0.01), np.log(100.0))\n", "}\n", "\n", "best_param_set_SVRCBUIRecommender = tune_recommender(SVRCBUIRecommender, interactions_df, items_df,\n", " param_space, max_evals=10, show_progressbar=True, seed=seed)\n", "\n", "print(\"Best parameters:\")\n", "print(best_param_set_SVRCBUIRecommender)" ] }, { "cell_type": "code", "execution_count": 104, "id": "seasonal-header", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 100/100 [5:28:02<00:00, 196.83s/trial, best loss: -0.04629250307251127] \n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender                  HR@1      HR@3      HR@5      HR@10   NDCG@1    NDCG@3    NDCG@5  NDCG@10
0  RandomForestCBUIRecommender  0.002632  0.007897  0.014478  0.0487  0.002632  0.005653  0.0084  0.019035
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Best parameters:\n", "{'max_depth': 10.0, 'min_samples_split': 11.0, 'n_estimators': 277.0, 'n_neg_per_pos': 1.0}\n" ] } ], "source": [ "param_space = {\n", " 'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),\n", " 'n_estimators': hp.quniform('n_estimators', 30, 300, 1),\n", " 'max_depth': hp.quniform('max_depth', 2, 10, 1),\n", " 'min_samples_split': hp.quniform('min_samples_split', 2, 30, 1)\n", "}\n", "\n", "best_param_set_RandomForestCBUIRecommender = tune_recommender(RandomForestCBUIRecommender, interactions_df, items_df,\n", " param_space, max_evals=100, show_progressbar=True, seed=seed)\n", "\n", "print(\"Best parameters:\")\n", "print(best_param_set_RandomForestCBUIRecommender)" ] }, { "cell_type": "code", "execution_count": 42, "id": "moved-gothic", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 20/20 [1:24:13<00:00, 252.67s/trial, best loss: -0.039737812371978695]\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender             HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  XGBoostCBUIRecommender  0.002962  0.006252  0.014149  0.043435  0.002962  0.004779  0.007993  0.017166
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Best parameters:\n", "{'learning_rate': 0.03474119783812193, 'max_depth': 8.0, 'min_samples_split': 12.0, 'n_estimators': 71.0, 'n_neg_per_pos': 4.0}\n" ] } ], "source": [ "# This tuning may take around 12 hours\n", "\n", "param_space = {\n", " 'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),\n", " 'n_estimators': hp.quniform('n_estimators', 10, 300, 1),\n", " 'max_depth': hp.quniform('max_depth', 2, 10, 1),\n", " 'min_samples_split': hp.quniform('min_samples_split', 2, 30, 1),\n", " 'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.1))\n", "}\n", "\n", "best_param_set_XGBoostCBUIRecommender = tune_recommender(XGBoostCBUIRecommender, interactions_df, items_df,\n", " param_space, max_evals=20, show_progressbar=True, seed=seed)\n", "\n", "print(\"Best parameters:\")\n", "print(best_param_set_XGBoostCBUIRecommender)" ] }, { "cell_type": "markdown", "id": "accredited-strap", "metadata": {}, "source": [ "# Final evaluation\n", "\n", "**Task:**
\n", "Run the final evaluation of your recommender and present its results against the Amazon recommender's results. You can present results for several of your recommenders. You just need to give the class name of your recommender and its tuned parameters below. If you present results for several recommenders, you should add a separate cell for each recommender and change the names of the DataFrames containing results." ] }, { "cell_type": "code", "execution_count": 108, "id": "given-homework", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender     HR@1      HR@3     HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  My recommender  0.002632  0.01053  0.019085  0.052978  0.002632  0.006926  0.010409  0.020894
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# cb_user_item_recommender = LinearRegressionCBUIRecommender(**best_param_set_LinearRegressionCBUIRecommender)\n", "# cb_user_item_recommender = SVRCBUIRecommender(**best_param_set_SVRCBUIRecommender)\n", "# cb_user_item_recommender = RandomForestCBUIRecommender(**best_param_set_RandomForestCBUIRecommender)\n", "cb_user_item_recommender = XGBoostCBUIRecommender(**best_param_set_XGBoostCBUIRecommender)\n", "\n", "# Give the name of your recommender in the line below\n", "linear_cbui_tts_results = [['My recomender'] + list(evaluate_train_test_split_implicit(\n", " cb_user_item_recommender, interactions_df, items_df))]\n", "\n", "linear_cbui_tts_results = pd.DataFrame(\n", " linear_cbui_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(linear_cbui_tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 106, "id": "suited-nomination", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender        HR@1      HR@3     HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  AmazonRecommender  0.042119  0.10464  0.140507  0.199408  0.042119  0.076826  0.091797  0.110705
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from recommenders.amazon_recommender import AmazonRecommender\n", "\n", "amazon_recommender = AmazonRecommender()\n", "\n", "amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(\n", " amazon_recommender, interactions_df, items_df))]\n", "\n", "amazon_tts_results = pd.DataFrame(\n", " amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(amazon_tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 109, "id": "moderate-printing", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender        HR@1      HR@3     HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  My recommender     0.002632  0.01053  0.019085  0.052978  0.002632  0.006926  0.010409  0.020894
1  AmazonRecommender  0.042119  0.10464  0.140507  0.199408  0.042119  0.076826  0.091797  0.110705
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tts_results = pd.concat([linear_cbui_tts_results, amazon_tts_results]).reset_index(drop=True)\n", "display(HTML(tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": null, "id": "white-demographic", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "REK", "language": "python", "name": "rek" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }