meh/recommender-systems-class-master/class_5_amazon_recommender.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "verified-accommodation",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from IPython.display import Markdown, display, HTML\n",
    "from collections import defaultdict\n",
    "from sklearn.model_selection import KFold\n",
    "import scipy.special as scisp\n",
    "\n",
    "# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n",
    "import os\n",
    "os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "educated-tourist",
   "metadata": {},
   "source": [
    "# Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "looking-feeling",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>item_id</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>Heat (1995)</td>\n",
       "      <td>Action|Crime|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>Sabrina (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>Tom and Huck (1995)</td>\n",
       "      <td>Adventure|Children</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>Sudden Death (1995)</td>\n",
       "      <td>Action</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>GoldenEye (1995)</td>\n",
       "      <td>Action|Adventure|Thriller</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of interactions left: 1170\n"
     ]
    }
   ],
   "source": [
    "ml_ratings_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"ratings.csv\")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})\n",
    "ml_movies_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"movies.csv\")).rename(columns={'movieId': 'item_id'})\n",
    "ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')\n",
    "ml_df.head(10)\n",
    "\n",
    "display(HTML(ml_movies_df.head(10).to_html()))\n",
    "\n",
    "# Filter the data to reduce the number of movies\n",
    "seed = 6789\n",
    "rng = np.random.RandomState(seed=seed)\n",
    "left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)\n",
    "\n",
    "ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]\n",
    "ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]\n",
    "ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]\n",
    "\n",
    "print(\"Number of interactions left: {}\".format(len(ml_ratings_df)))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "protecting-recognition",
   "metadata": {},
   "source": [
    "# Inner workings of the Amazon recommender fit method"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "plastic-brooklyn",
   "metadata": {},
   "source": [
    "## Shift item ids and user ids so that they are consecutive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "valuable-modem",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Item mapping\n",
      "{780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, 145: 6, 267: 7, 355: 8, 435: 9, 6502: 10, 73323: 11, 112421: 12, 1783: 13, 2806: 14, 3040: 15, 3551: 16, 2135: 17, 39715: 18, 41566: 19, 5673: 20, 7064: 21, 481: 22, 6537: 23, 44761: 24, 2690: 25, 228: 26, 4890: 27, 3614: 28, 3507: 29, 3628: 30, 5954: 31, 8605: 32, 3786: 33, 6755: 34, 3468: 35, 50601: 36, 3089: 37, 55444: 38, 118270: 39, 124404: 40, 3768: 41, 233: 42, 3687: 43, 171749: 44, 104218: 45, 182749: 46, 3342: 47, 65130: 48, 84952: 49, 152970: 50, 3067: 51, 4031: 52, 1107: 53, 47382: 54, 3801: 55, 5155: 56, 5612: 57, 5214: 58, 67295: 59, 3165: 60, 1752: 61, 31223: 62, 6713: 63, 66783: 64, 2043: 65, 2903: 66, 3313: 67, 4009: 68, 91842: 69, 2190: 70, 7282: 71, 4483: 72, 2275: 73, 3567: 74, 190207: 75, 4505: 76, 95147: 77, 4552: 78, 6033: 79, 2521: 80, 4397: 81, 151315: 82, 156706: 83, 151311: 84, 959: 85, 3714: 86, 4164: 87, 4796: 88, 31260: 89, 6927: 90, 126142: 91, 73804: 92, 26357: 93, 82684: 94, 6342: 95, 32799: 96, 31921: 97, 2892: 98, 2737: 99}\n",
      "\n",
      "Item reverse mapping\n",
      "{0: 780, 1: 1500, 2: 3479, 3: 171, 4: 1914, 5: 4896, 6: 145, 7: 267, 8: 355, 9: 435, 10: 6502, 11: 73323, 12: 112421, 13: 1783, 14: 2806, 15: 3040, 16: 3551, 17: 2135, 18: 39715, 19: 41566, 20: 5673, 21: 7064, 22: 481, 23: 6537, 24: 44761, 25: 2690, 26: 228, 27: 4890, 28: 3614, 29: 3507, 30: 3628, 31: 5954, 32: 8605, 33: 3786, 34: 6755, 35: 3468, 36: 50601, 37: 3089, 38: 55444, 39: 118270, 40: 124404, 41: 3768, 42: 233, 43: 3687, 44: 171749, 45: 104218, 46: 182749, 47: 3342, 48: 65130, 49: 84952, 50: 152970, 51: 3067, 52: 4031, 53: 1107, 54: 47382, 55: 3801, 56: 5155, 57: 5612, 58: 5214, 59: 67295, 60: 3165, 61: 1752, 62: 31223, 63: 6713, 64: 66783, 65: 2043, 66: 2903, 67: 3313, 68: 4009, 69: 91842, 70: 2190, 71: 7282, 72: 4483, 73: 2275, 74: 3567, 75: 190207, 76: 4505, 77: 95147, 78: 4552, 79: 6033, 80: 2521, 81: 4397, 82: 151315, 83: 156706, 84: 151311, 85: 959, 86: 3714, 87: 4164, 88: 4796, 89: 31260, 90: 6927, 91: 126142, 92: 73804, 93: 26357, 94: 82684, 95: 6342, 96: 32799, 97: 31921, 98: 2892, 99: 2737}\n",
      "\n",
      "User mapping\n",
      "{1: 0, 4: 1, 6: 2, 7: 3, 11: 4, 15: 5, 17: 6, 18: 7, 19: 8, 20: 9, 21: 10, 22: 11, 23: 12, 24: 13, 27: 14, 28: 15, 29: 16, 31: 17, 32: 18, 33: 19, 34: 20, 36: 21, 38: 22, 39: 23, 40: 24, 41: 25, 42: 26, 43: 27, 44: 28, 45: 29, 46: 30, 48: 31, 50: 32, 51: 33, 53: 34, 57: 35, 58: 36, 59: 37, 61: 38, 62: 39, 63: 40, 64: 41, 66: 42, 67: 43, 68: 44, 70: 45, 71: 46, 72: 47, 73: 48, 74: 49, 75: 50, 76: 51, 78: 52, 80: 53, 82: 54, 83: 55, 84: 56, 86: 57, 88: 58, 89: 59, 90: 60, 91: 61, 94: 62, 95: 63, 96: 64, 99: 65, 100: 66, 101: 67, 103: 68, 104: 69, 105: 70, 106: 71, 108: 72, 109: 73, 111: 74, 112: 75, 113: 76, 114: 77, 115: 78, 116: 79, 117: 80, 120: 81, 121: 82, 122: 83, 125: 84, 129: 85, 132: 86, 133: 87, 135: 88, 136: 89, 137: 90, 139: 91, 140: 92, 141: 93, 142: 94, 144: 95, 148: 96, 149: 97, 150: 98, 151: 99, 153: 100, 154: 101, 156: 102, 158: 103, 160: 104, 161: 105, 162: 106, 164: 107, 165: 108, 166: 109, 167: 110, 169: 111, 170: 112, 171: 113, 173: 114, 174: 115, 175: 116, 176: 117, 177: 118, 178: 119, 179: 120, 181: 121, 182: 122, 184: 123, 186: 124, 187: 125, 190: 126, 194: 127, 195: 128, 198: 129, 199: 130, 200: 131, 201: 132, 202: 133, 203: 134, 204: 135, 205: 136, 206: 137, 210: 138, 212: 139, 213: 140, 214: 141, 215: 142, 216: 143, 217: 144, 219: 145, 220: 146, 221: 147, 222: 148, 223: 149, 226: 150, 229: 151, 230: 152, 232: 153, 233: 154, 234: 155, 235: 156, 236: 157, 239: 158, 240: 159, 243: 160, 244: 161, 246: 162, 247: 163, 249: 164, 254: 165, 256: 166, 257: 167, 260: 168, 262: 169, 263: 170, 264: 171, 265: 172, 266: 173, 269: 174, 270: 175, 271: 176, 273: 177, 274: 178, 275: 179, 276: 180, 277: 181, 279: 182, 280: 183, 282: 184, 283: 185, 284: 186, 287: 187, 288: 188, 290: 189, 291: 190, 292: 191, 294: 192, 297: 193, 298: 194, 301: 195, 302: 196, 303: 197, 304: 198, 305: 199, 306: 200, 307: 201, 308: 202, 310: 203, 312: 204, 313: 205, 314: 206, 318: 207, 321: 208, 322: 209, 325: 210, 328: 211, 330: 212, 331: 213, 332: 214, 333: 215, 334: 216, 
335: 217, 337: 218, 338: 219, 339: 220, 340: 221, 341: 222, 345: 223, 347: 224, 349: 225, 352: 226, 353: 227, 354: 228, 356: 229, 357: 230, 359: 231, 361: 232, 364: 233, 365: 234, 366: 235, 367: 236, 368: 237, 369: 238, 370: 239, 373: 240, 374: 241, 376: 242, 380: 243, 381: 244, 382: 245, 383: 246, 384: 247, 385: 248, 386: 249, 387: 250, 389: 251, 391: 252, 395: 253, 399: 254, 402: 255, 408: 256, 409: 257, 410: 258, 411: 259, 412: 260, 413: 261, 414: 262, 415: 263, 417: 264, 419: 265, 420: 266, 422: 267, 423: 268, 425: 269, 426: 270, 427: 271, 428: 272, 431: 273, 432: 274, 434: 275, 436: 276, 437: 277, 438: 278, 440: 279, 445: 280, 446: 281, 447: 282, 448: 283, 451: 284, 452: 285, 453: 286, 455: 287, 456: 288, 460: 289, 462: 290, 463: 291, 464: 292, 465: 293, 466: 294, 467: 295, 469: 296, 474: 297, 475: 298, 477: 299, 479: 300, 480: 301, 482: 302, 483: 303, 484: 304, 486: 305, 489: 306, 490: 307, 491: 308, 492: 309, 495: 310, 500: 311, 501: 312, 503: 313, 504: 314, 505: 315, 509: 316, 510: 317, 511: 318, 513: 319, 514: 320, 517: 321, 521: 322, 522: 323, 524: 324, 525: 325, 527: 326, 529: 327, 533: 328, 534: 329, 536: 330, 537: 331, 540: 332, 542: 333, 543: 334, 544: 335, 552: 336, 553: 337, 555: 338, 556: 339, 557: 340, 558: 341, 559: 342, 560: 343, 561: 344, 562: 345, 563: 346, 564: 347, 566: 348, 567: 349, 570: 350, 573: 351, 577: 352, 579: 353, 580: 354, 581: 355, 584: 356, 585: 357, 586: 358, 587: 359, 589: 360, 590: 361, 592: 362, 593: 363, 594: 364, 595: 365, 596: 366, 597: 367, 599: 368, 600: 369, 602: 370, 603: 371, 604: 372, 605: 373, 606: 374, 607: 375, 608: 376, 610: 377}\n",
      "\n",
      "User reverse mapping\n",
      "{0: 1, 1: 4, 2: 6, 3: 7, 4: 11, 5: 15, 6: 17, 7: 18, 8: 19, 9: 20, 10: 21, 11: 22, 12: 23, 13: 24, 14: 27, 15: 28, 16: 29, 17: 31, 18: 32, 19: 33, 20: 34, 21: 36, 22: 38, 23: 39, 24: 40, 25: 41, 26: 42, 27: 43, 28: 44, 29: 45, 30: 46, 31: 48, 32: 50, 33: 51, 34: 53, 35: 57, 36: 58, 37: 59, 38: 61, 39: 62, 40: 63, 41: 64, 42: 66, 43: 67, 44: 68, 45: 70, 46: 71, 47: 72, 48: 73, 49: 74, 50: 75, 51: 76, 52: 78, 53: 80, 54: 82, 55: 83, 56: 84, 57: 86, 58: 88, 59: 89, 60: 90, 61: 91, 62: 94, 63: 95, 64: 96, 65: 99, 66: 100, 67: 101, 68: 103, 69: 104, 70: 105, 71: 106, 72: 108, 73: 109, 74: 111, 75: 112, 76: 113, 77: 114, 78: 115, 79: 116, 80: 117, 81: 120, 82: 121, 83: 122, 84: 125, 85: 129, 86: 132, 87: 133, 88: 135, 89: 136, 90: 137, 91: 139, 92: 140, 93: 141, 94: 142, 95: 144, 96: 148, 97: 149, 98: 150, 99: 151, 100: 153, 101: 154, 102: 156, 103: 158, 104: 160, 105: 161, 106: 162, 107: 164, 108: 165, 109: 166, 110: 167, 111: 169, 112: 170, 113: 171, 114: 173, 115: 174, 116: 175, 117: 176, 118: 177, 119: 178, 120: 179, 121: 181, 122: 182, 123: 184, 124: 186, 125: 187, 126: 190, 127: 194, 128: 195, 129: 198, 130: 199, 131: 200, 132: 201, 133: 202, 134: 203, 135: 204, 136: 205, 137: 206, 138: 210, 139: 212, 140: 213, 141: 214, 142: 215, 143: 216, 144: 217, 145: 219, 146: 220, 147: 221, 148: 222, 149: 223, 150: 226, 151: 229, 152: 230, 153: 232, 154: 233, 155: 234, 156: 235, 157: 236, 158: 239, 159: 240, 160: 243, 161: 244, 162: 246, 163: 247, 164: 249, 165: 254, 166: 256, 167: 257, 168: 260, 169: 262, 170: 263, 171: 264, 172: 265, 173: 266, 174: 269, 175: 270, 176: 271, 177: 273, 178: 274, 179: 275, 180: 276, 181: 277, 182: 279, 183: 280, 184: 282, 185: 283, 186: 284, 187: 287, 188: 288, 189: 290, 190: 291, 191: 292, 192: 294, 193: 297, 194: 298, 195: 301, 196: 302, 197: 303, 198: 304, 199: 305, 200: 306, 201: 307, 202: 308, 203: 310, 204: 312, 205: 313, 206: 314, 207: 318, 208: 321, 209: 322, 210: 325, 211: 328, 212: 330, 213: 331, 214: 332, 215: 333, 216: 334, 
217: 335, 218: 337, 219: 338, 220: 339, 221: 340, 222: 341, 223: 345, 224: 347, 225: 349, 226: 352, 227: 353, 228: 354, 229: 356, 230: 357, 231: 359, 232: 361, 233: 364, 234: 365, 235: 366, 236: 367, 237: 368, 238: 369, 239: 370, 240: 373, 241: 374, 242: 376, 243: 380, 244: 381, 245: 382, 246: 383, 247: 384, 248: 385, 249: 386, 250: 387, 251: 389, 252: 391, 253: 395, 254: 399, 255: 402, 256: 408, 257: 409, 258: 410, 259: 411, 260: 412, 261: 413, 262: 414, 263: 415, 264: 417, 265: 419, 266: 420, 267: 422, 268: 423, 269: 425, 270: 426, 271: 427, 272: 428, 273: 431, 274: 432, 275: 434, 276: 436, 277: 437, 278: 438, 279: 440, 280: 445, 281: 446, 282: 447, 283: 448, 284: 451, 285: 452, 286: 453, 287: 455, 288: 456, 289: 460, 290: 462, 291: 463, 292: 464, 293: 465, 294: 466, 295: 467, 296: 469, 297: 474, 298: 475, 299: 477, 300: 479, 301: 480, 302: 482, 303: 483, 304: 484, 305: 486, 306: 489, 307: 490, 308: 491, 309: 492, 310: 495, 311: 500, 312: 501, 313: 503, 314: 504, 315: 505, 316: 509, 317: 510, 318: 511, 319: 513, 320: 514, 321: 517, 322: 521, 323: 522, 324: 524, 325: 525, 326: 527, 327: 529, 328: 533, 329: 534, 330: 536, 331: 537, 332: 540, 333: 542, 334: 543, 335: 544, 336: 552, 337: 553, 338: 555, 339: 556, 340: 557, 341: 558, 342: 559, 343: 560, 344: 561, 345: 562, 346: 563, 347: 564, 348: 566, 349: 567, 350: 570, 351: 573, 352: 577, 353: 579, 354: 580, 355: 581, 356: 584, 357: 585, 358: 586, 359: 587, 360: 589, 361: 590, 362: 592, 363: 593, 364: 594, 365: 595, 366: 596, 367: 597, 368: 599, 369: 600, 370: 602, 371: 603, 372: 604, 373: 605, 374: 606, 375: 607, 376: 608, 377: 610}\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>964984086</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964980985</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>216</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964981725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>310</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3.0</td>\n",
       "      <td>945078428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>398</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964622830</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>416</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964622714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>513</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1007574532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>616</th>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>4.0</td>\n",
       "      <td>845553966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>629</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3.0</td>\n",
       "      <td>845555402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>677</th>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>3.0</td>\n",
       "      <td>845554376</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "interactions_df = ml_ratings_df.copy()\n",
    "\n",
    "unique_item_ids = interactions_df['item_id'].unique()\n",
    "item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))\n",
    "item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))\n",
    "unique_user_ids = interactions_df['user_id'].unique()\n",
    "user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))\n",
    "user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))\n",
    "\n",
    "interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)\n",
    "\n",
    "print(\"Item mapping\")\n",
    "print(item_id_mapping)\n",
    "print()\n",
    "\n",
    "print(\"Item reverse mapping\")\n",
    "print(item_id_reverse_mapping)\n",
    "print()\n",
    "\n",
    "print(\"User mapping\")\n",
    "print(user_id_mapping)\n",
    "print()\n",
    "\n",
    "print(\"User reverse mapping\")\n",
    "print(user_id_reverse_mapping)\n",
    "print()\n",
    "\n",
    "display(HTML(interactions_df.head(10).to_html()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "basic-meeting",
   "metadata": {},
   "source": [
    "## Get the number of items and users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "close-massachusetts",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n_items=100\n",
      "n_users=378\n"
     ]
    }
   ],
   "source": [
    "n_items = np.max(interactions_df['item_id']) + 1\n",
    "n_users = np.max(interactions_df['user_id']) + 1\n",
    "\n",
    "print(\"n_items={}\\nn_users={}\".format(n_items, n_users))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "permanent-corrections",
   "metadata": {},
   "source": [
    "## Get the maximal number of interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "peripheral-natural",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max_interaction=31\n"
     ]
    }
   ],
   "source": [
    "n_user_interactions = interactions_df[['user_id', 'item_id']].groupby(\"user_id\").count()\n",
    "# Unnecessary, but added for readability\n",
    "n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})\n",
    "max_interactions = n_user_interactions['n_items'].max()\n",
    "\n",
    "print(\"max_interaction={}\".format(max_interactions))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "basic-production",
   "metadata": {},
   "source": [
    "## Calculate P_Y's"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "concrete-transparency",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, 3: 0.005128205128205128, 4: 0.007692307692307693, 5: 0.09145299145299145, 6: 0.04358974358974359, 7: 0.01452991452991453, 8: 0.035897435897435895, 9: 0.05384615384615385, 10: 0.04957264957264957, 11: 0.004273504273504274, 12: 0.002564102564102564, 13: 0.004273504273504274, 14: 0.007692307692307693, 15: 0.007692307692307693, 16: 0.011111111111111112, 17: 0.009401709401709401, 18: 0.005982905982905983, 19: 0.05299145299145299, 20: 0.028205128205128206, 21: 0.005128205128205128, 22: 0.01623931623931624, 23: 0.038461538461538464, 24: 0.010256410256410256, 25: 0.008547008547008548, 26: 0.002564102564102564, 27: 0.026495726495726495, 28: 0.006837606837606838, 29: 0.01282051282051282, 30: 0.0017094017094017094, 31: 0.018803418803418803, 32: 0.0017094017094017094, 33: 0.003418803418803419, 34: 0.011965811965811967, 35: 0.015384615384615385, 36: 0.007692307692307693, 37: 0.013675213675213675, 38: 0.002564102564102564, 39: 0.0008547008547008547, 40: 0.0008547008547008547, 41: 0.0017094017094017094, 42: 0.010256410256410256, 43: 0.0008547008547008547, 44: 0.0008547008547008547, 45: 0.004273504273504274, 46: 0.0008547008547008547, 47: 0.004273504273504274, 48: 0.004273504273504274, 49: 0.0008547008547008547, 50: 0.003418803418803419, 51: 0.008547008547008548, 52: 0.0017094017094017094, 53: 0.0017094017094017094, 54: 0.003418803418803419, 55: 0.003418803418803419, 56: 0.0008547008547008547, 57: 0.0008547008547008547, 58: 0.003418803418803419, 59: 0.003418803418803419, 60: 0.0017094017094017094, 61: 0.003418803418803419, 62: 0.0008547008547008547, 63: 0.004273504273504274, 64: 0.0017094017094017094, 65: 0.003418803418803419, 66: 0.0017094017094017094, 67: 0.0017094017094017094, 68: 0.0017094017094017094, 69: 0.0017094017094017094, 70: 0.0008547008547008547, 71: 0.0008547008547008547, 72: 0.002564102564102564, 73: 0.004273504273504274, 74: 0.0008547008547008547, 75: 0.0008547008547008547, 76: 
0.0008547008547008547, 77: 0.0017094017094017094, 78: 0.002564102564102564, 79: 0.0008547008547008547, 80: 0.0017094017094017094, 81: 0.0017094017094017094, 82: 0.002564102564102564, 83: 0.0008547008547008547, 84: 0.0008547008547008547, 85: 0.0008547008547008547, 86: 0.0008547008547008547, 87: 0.0017094017094017094, 88: 0.0017094017094017094, 89: 0.0008547008547008547, 90: 0.0008547008547008547, 91: 0.0008547008547008547, 92: 0.0008547008547008547, 93: 0.0008547008547008547, 94: 0.0008547008547008547, 95: 0.0008547008547008547, 96: 0.0008547008547008547, 97: 0.0008547008547008547, 98: 0.0008547008547008547, 99: 0.0008547008547008547}\n"
     ]
    }
   ],
   "source": [
    "n_interactions = len(interactions_df)\n",
    "p_y = interactions_df[['item_id', 'user_id']].groupby(\"item_id\").count().reset_index()\n",
    "p_y = p_y.rename(columns={'user_id': 'P_Y'})\n",
    "p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions\n",
    "p_y = dict(zip(p_y['item_id'], p_y['P_Y']))\n",
    "\n",
    "print(p_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "consolidated-constant",
   "metadata": {},
   "source": [
    "## For every X calculate the E[Y|X]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "alive-cameroon",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p_y_powers for the first item\n",
      "[1.726e-01 2.981e-02 5.146e-03 8.885e-04 1.534e-04 2.648e-05 4.573e-06\n",
      " 7.894e-07 1.363e-07 2.353e-08 4.063e-09 7.014e-10 1.211e-10 2.091e-11\n",
      " 3.610e-12 6.232e-13 1.076e-13 1.858e-14 3.207e-15 5.537e-16 9.560e-17\n",
      " 1.651e-17 2.850e-18 4.920e-19 8.494e-20 1.467e-20 2.532e-21 4.372e-22\n",
      " 7.547e-23 1.303e-23 2.250e-24]\n",
      "alpha_k\n",
      "[ 6.290e+02 -2.785e+03  1.408e+04 -6.937e+04  3.018e+05 -1.120e+06\n",
      "  3.530e+06 -9.507e+06  2.202e+07 -4.418e+07  7.716e+07 -1.179e+08\n",
      "  1.579e+08 -1.860e+08  1.928e+08 -1.759e+08  1.413e+08 -9.962e+07\n",
      "  6.154e+07 -3.315e+07  1.549e+07 -6.230e+06  2.134e+06 -6.142e+05\n",
      "  1.458e+05 -2.778e+04  4.088e+03 -4.360e+02  3.000e+01 -1.000e+00\n",
      "  0.000e+00]\n",
      "\n",
      "E[Y|X]\n",
      "[[65.262 26.076  9.065  3.154  4.68 ]\n",
      " [28.303 19.062  4.288  1.5    2.223]\n",
      " [10.216  5.074  5.815  0.712  1.046]\n",
      " [ 2.315  0.859  0.283  1.938  0.144]\n",
      " [ 4.526  2.47   0.999  0.366  2.908]]\n"
     ]
    }
   ],
   "source": [
    "e_xy = np.zeros(shape=(n_items, n_items))\n",
    "e_xy[:][:] = -1e100\n",
    "    \n",
    "items = interactions_df['item_id'].unique()\n",
    "    \n",
    "p_y_powers = {}\n",
    "for y in items:\n",
    "    p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])\n",
    "    \n",
    "print(\"p_y_powers for the first item\")\n",
    "print(p_y_powers[0])\n",
    "\n",
    "for x in items:\n",
    "    # Get users who bought X\n",
    "    c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()\n",
    "\n",
    "    # Get users who bought only X\n",
    "    c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()\n",
    "    c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))\n",
    "\n",
    "    # Calculate the number of non-X interactions for each user who bought X\n",
    "    # Include users with zero non-X interactions\n",
    "    n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]\n",
    "    n_non_x_interactions = n_non_x_interactions.groupby(\"user_id\").count()\n",
    "    # Unnecessary, but added for readability\n",
    "    n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})\n",
    "\n",
    "    zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=[\"n_items\"], index=c_only_x)  # Remove\n",
    "    n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])\n",
    "\n",
    "    n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]\n",
    "\n",
    "    # Calculate the expected numbers of Y products bought by clients who bought X\n",
    "    alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)\n",
    "                                for abs_c in n_non_x_interactions[\"n_items\"]])\n",
    "                        for k in range(1, max_interactions + 1)])\n",
    "    \n",
    "    if x == 0:\n",
    "        print(\"alpha_k\")\n",
    "        print(alpha_k)\n",
    "        print()\n",
    "\n",
    "    for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y\n",
    "        if y != x:\n",
    "            e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])\n",
    "        else:\n",
    "            e_xy[x][y] = n_users * p_y[x]\n",
    "\n",
    "print(\"E[Y|X]\")\n",
    "print(np.around(e_xy[:10, :10], 3))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "acknowledged-threshold",
   "metadata": {},
   "source": [
    "## Get the user-item interaction matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "extraordinary-mexico",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 1. 0. 1. 1. 1. 0. 0. 0. 0.]\n",
      " [1. 0. 0. 1. 0. 0. 1. 1. 1. 1.]\n",
      " [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
      " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
      " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]\n",
      " [0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]\n",
      " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]\n"
     ]
    }
   ],
   "source": [
    "# mapping to int is necessary because of how iterrows works\n",
    "r = np.zeros(shape=(n_users, n_items))\n",
    "for idx, interaction in interactions_df.iterrows():\n",
    "    r[int(interaction['user_id'])][int(interaction['item_id'])] = 1\n",
    "    \n",
    "print(r[:10, :10])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "lovely-password",
   "metadata": {},
   "source": [
    "## Calculate the number of users who bought both X and Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "rubber-detector",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[202.  34.  15.   3.   3.  66.  36.  10.  25.  34.]\n",
      " [ 34.  59.   6.   2.   5.  24.  12.   4.   8.  12.]\n",
      " [ 15.   6.  18.   1.   2.   7.   3.   4.   6.   5.]\n",
      " [  3.   2.   1.   6.   1.   1.   1.   1.   2.   2.]\n",
      " [  3.   5.   2.   1.   9.   3.   2.   1.   1.   0.]\n",
      " [ 66.  24.   7.   1.   3. 107.  20.   5.  16.  18.]\n",
      " [ 36.  12.   3.   1.   2.  20.  51.   8.  16.  17.]\n",
      " [ 10.   4.   4.   1.   1.   5.   8.  17.   8.  10.]\n",
      " [ 25.   8.   6.   2.   1.  16.  16.   8.  42.  23.]\n",
      " [ 34.  12.   5.   2.   0.  18.  17.  10.  23.  63.]]\n"
     ]
    }
   ],
   "source": [
    "# Simple and slow method (commented out)\n",
    "\n",
    "# n_xy = np.zeros(shape=(n_items, n_items))\n",
    "\n",
    "# for x in items:\n",
    "#     for y in items:\n",
    "#         users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())\n",
    "#         users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())\n",
    "#         users_x_and_y = users_x & users_y\n",
    "#         n_xy[x][y] = len(users_x_and_y)\n",
    "\n",
    "# Optimized method (can be further optimized by using sparse matrices)\n",
    "\n",
    "n_xy = np.matmul(r.T, r)\n",
    "\n",
    "print(n_xy[:10, :10])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "distinguished-consequence",
   "metadata": {},
   "source": [
    "## Calculate the scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "pointed-deputy",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[16.926  1.552  1.971 -0.087 -0.777  3.789  2.689  0.48   1.235  1.235]\n",
      " [ 1.071  9.148  0.827  0.408  1.863  1.15   0.376 -0.033 -0.38  -0.218]\n",
      " [ 1.497  0.411  5.053  0.341  0.932 -0.142 -0.737  1.555  1.023 -0.134]\n",
      " [ 0.451  1.23   1.349  2.917  2.259 -0.361  0.284  1.417  1.724  1.141]\n",
      " [-0.717  1.61   1.002  1.048  3.573 -0.244 -0.164  0.051 -0.687 -1.604]\n",
      " [ 2.601  0.765 -0.103 -0.97  -0.399 12.319  0.412 -0.724  0.125 -0.782]\n",
      " [ 2.127  0.237 -0.522 -0.359 -0.077  0.658  8.505  2.121  2.561  1.518]\n",
      " [ 0.3   -0.061  1.952  0.585  0.192 -0.484  2.235  4.91   2.697  2.728]\n",
      " [ 0.724 -0.582  1.265  0.641 -0.644  0.27   2.439  2.479  7.718  3.946]\n",
      " [ 1.793  0.544  0.756  0.679 -1.358  0.413  2.627  3.596  5.52   9.453]]\n"
     ]
    }
   ],
   "source": [
    "scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)\n",
    "\n",
    "print(np.around(scores[:10, :10], 3))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "endangered-stomach",
   "metadata": {},
   "source": [
    "## Final comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "prepared-fraction",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "E[Y|X]\n",
      "[[65.262 26.076  9.065  3.154  4.68  41.571 23.082  8.592 19.542 27.522]\n",
      " [28.303 19.062  4.288  1.5    2.223 18.99  10.768  4.066  9.15  12.778]\n",
      " [10.216  5.074  5.815  0.712  1.046  7.386  4.577  1.872  3.964  5.308]\n",
      " [ 2.315  0.859  0.283  1.938  0.144  1.433  0.754  0.267  0.631  0.911]\n",
      " [ 4.526  2.47   0.999  0.366  2.908  3.453  2.245  0.951  1.962  2.574]\n",
      " [47.984 20.534  7.279  2.549  3.776 34.569 18.241  6.902 15.507 21.636]\n",
      " [25.303 11.206  4.05   1.429  2.112 17.265 16.477  3.843  8.524 11.789]\n",
      " [ 9.094  4.124  1.561  0.561  0.826  6.205  3.701  5.492  3.186  4.326]\n",
      " [21.633  9.823  3.601  1.276  1.884 14.955  8.776  3.417 13.569 10.322]\n",
      " [25.03  10.257  3.571  1.243  1.844 16.332  9.082  3.385  7.691 20.354]]\n",
      "\n",
      "N(X, Y)\n",
      "[[202.  34.  15.   3.   3.  66.  36.  10.  25.  34.]\n",
      " [ 34.  59.   6.   2.   5.  24.  12.   4.   8.  12.]\n",
      " [ 15.   6.  18.   1.   2.   7.   3.   4.   6.   5.]\n",
      " [  3.   2.   1.   6.   1.   1.   1.   1.   2.   2.]\n",
      " [  3.   5.   2.   1.   9.   3.   2.   1.   1.   0.]\n",
      " [ 66.  24.   7.   1.   3. 107.  20.   5.  16.  18.]\n",
      " [ 36.  12.   3.   1.   2.  20.  51.   8.  16.  17.]\n",
      " [ 10.   4.   4.   1.   1.   5.   8.  17.   8.  10.]\n",
      " [ 25.   8.   6.   2.   1.  16.  16.   8.  42.  23.]\n",
      " [ 34.  12.   5.   2.   0.  18.  17.  10.  23.  63.]]\n",
      "\n",
      "Scores\n",
      "[[16.926  1.552  1.971 -0.087 -0.777  3.789  2.689  0.48   1.235  1.235]\n",
      " [ 1.071  9.148  0.827  0.408  1.863  1.15   0.376 -0.033 -0.38  -0.218]\n",
      " [ 1.497  0.411  5.053  0.341  0.932 -0.142 -0.737  1.555  1.023 -0.134]\n",
      " [ 0.451  1.23   1.349  2.917  2.259 -0.361  0.284  1.417  1.724  1.141]\n",
      " [-0.717  1.61   1.002  1.048  3.573 -0.244 -0.164  0.051 -0.687 -1.604]\n",
      " [ 2.601  0.765 -0.103 -0.97  -0.399 12.319  0.412 -0.724  0.125 -0.782]\n",
      " [ 2.127  0.237 -0.522 -0.359 -0.077  0.658  8.505  2.121  2.561  1.518]\n",
      " [ 0.3   -0.061  1.952  0.585  0.192 -0.484  2.235  4.91   2.697  2.728]\n",
      " [ 0.724 -0.582  1.265  0.641 -0.644  0.27   2.439  2.479  7.718  3.946]\n",
      " [ 1.793  0.544  0.756  0.679 -1.358  0.413  2.627  3.596  5.52   9.453]]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Top-left 10x10 corner of each matrix; the trailing empty string reproduces the blank separator line\n",
    "print(\"E[Y|X]\", np.around(e_xy[:10, :10], 3), \"\", sep=\"\\n\")\n",
    "print(\"N(X, Y)\", n_xy[:10, :10], \"\", sep=\"\\n\")\n",
    "print(\"Scores\", np.around(scores[:10, :10], 3), \"\", sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "distant-archive",
   "metadata": {},
   "source": [
    "# Inner workings of the Amazon recommender recommend method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "aerial-shipping",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recommendation: 1, Brick (2005), 6.122652596595853\n",
      "Recommendation: 1, Oh, God! (1977), 5.908857666844879\n",
      "Recommendation: 1, Bubba Ho-tep (2002), 5.830666625469312\n",
      "Recommendation: 1, Meatballs (1979), 5.56930833865894\n",
      "Recommendation: 1, Millennium Actress (Sennen joyû) (2001), 5.502504256363742\n",
      "Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393\n",
      "Recommendation: 1, Six-String Samurai (1998), 5.225652131462832\n",
      "Recommendation: 1, Grass Is Greener, The (1960), 5.144470412494206\n",
      "Recommendation: 1, Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), 4.796473011676857\n",
      "Recommendation: 1, Clara's Heart (1988), 4.608515964550741\n"
     ]
    }
   ],
   "source": [
    "user_id = 1\n",
    "should_recommend_already_bought = False\n",
    "n_recommendations = 10\n",
    "\n",
    "mapped_user_id = user_id_mapping[user_id]\n",
    "\n",
    "# Add up the score rows of every item the user interacted with\n",
    "x_list = interactions_df.loc[interactions_df['user_id'] == mapped_user_id, 'item_id'].tolist()\n",
    "final_scores = scores[x_list].sum(axis=0)\n",
    "\n",
    "# Unless explicitly allowed, push the already bought items to the very end of the ranking\n",
    "if not should_recommend_already_bought:\n",
    "    final_scores[x_list] = -1e100\n",
    "\n",
    "# Indices of the n highest final scores, best first\n",
    "chosen_ids = (-final_scores).argsort()[:n_recommendations]\n",
    "\n",
    "for item_id in chosen_ids:\n",
    "    print(\"Recommendation: {}, {}, {}\".format(\n",
    "        user_id_reverse_mapping[mapped_user_id],\n",
    "        ml_movies_df.loc[ml_movies_df['item_id'] == item_id_reverse_mapping[item_id], 'title'].iloc[0],\n",
    "        final_scores[item_id]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "opponent-prediction",
   "metadata": {},
   "source": [
    "# Amazon recommender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "fancy-return",
   "metadata": {},
   "outputs": [],
   "source": [
    "from recommenders.recommender import Recommender\n",
    "\n",
    "class AmazonRecommender(Recommender):\n",
    "    \"\"\"\n",
    "    Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:\n",
    "    - Linden G., Smith B., York Y., Amazon.com Recommendations. Item-to-Item Collaborative Filtering,\n",
    "        IEEE Internet Computing, 2003,\n",
    "    - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.\n",
    "\n",
    "    For every ordered item pair (X, Y) the model stores the score\n",
    "    (N(X, Y) - E[Y|X]) / sqrt(E[Y|X]), where N(X, Y) is the number of users who interacted with both\n",
    "    items and E[Y|X] is the expectation computed in fit. Recommendations for a user are the items\n",
    "    with the highest sum of scores over the items the user interacted with.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n",
    "        self.interactions_df = None  # Training interactions with ids mapped to consecutive integers\n",
    "        self.item_id_mapping = None  # Original item id -> internal index\n",
    "        self.user_id_mapping = None  # Original user id -> internal index\n",
    "        self.item_id_reverse_mapping = None  # Internal index -> original item id\n",
    "        self.user_id_reverse_mapping = None  # Internal index -> original user id\n",
    "        self.e_xy = None  # E[Y|X] matrix indexed by internal item indices\n",
    "        self.n_xy = None  # N(X, Y) matrix indexed by internal item indices\n",
    "        self.scores = None  # Score matrix indexed by internal item indices\n",
    "        self.most_popular_items = None  # Internal item indices sorted by number of interactions\n",
    "        self.should_recommend_already_bought = False\n",
    "\n",
    "    def initialize(self, **params):\n",
    "        \"\"\"\n",
    "        Sets the parameters of the recommender.\n",
    "\n",
    "        :param params: Keyword parameters. Only should_recommend_already_bought is read,\n",
    "            every other key is ignored.\n",
    "        \"\"\"\n",
    "        if 'should_recommend_already_bought' in params:\n",
    "            self.should_recommend_already_bought = params['should_recommend_already_bought']\n",
    "\n",
    "    def fit(self, interactions_df, users_df, items_df):\n",
    "        \"\"\"\n",
    "        Training of the recommender.\n",
    "\n",
    "        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items\n",
    "            defined by user_id, item_id and features of the interaction.\n",
    "        :param pd.DataFrame users_df: DataFrame with users and their features defined by\n",
    "            user_id and the user feature columns. Not used by this recommender.\n",
    "        :param pd.DataFrame items_df: DataFrame with items and their features defined\n",
    "            by item_id and the item feature columns. Not used by this recommender.\n",
    "        \"\"\"\n",
    "\n",
    "        # Shift item ids and user ids so that they are consecutive\n",
    "\n",
    "        unique_item_ids = interactions_df['item_id'].unique()\n",
    "        self.item_id_mapping = {item_id: idx for idx, item_id in enumerate(unique_item_ids)}\n",
    "        self.item_id_reverse_mapping = dict(enumerate(unique_item_ids))\n",
    "        unique_user_ids = interactions_df['user_id'].unique()\n",
    "        self.user_id_mapping = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}\n",
    "        self.user_id_reverse_mapping = dict(enumerate(unique_user_ids))\n",
    "\n",
    "        # Series.map does one dictionary lookup per row; every id is a key of its mapping by\n",
    "        # construction, so no value is left unmapped\n",
    "        interactions_df = interactions_df.copy()\n",
    "        interactions_df['item_id'] = interactions_df['item_id'].map(self.item_id_mapping)\n",
    "        interactions_df['user_id'] = interactions_df['user_id'].map(self.user_id_mapping)\n",
    "        self.interactions_df = interactions_df\n",
    "\n",
    "        # Get the number of items and users\n",
    "\n",
    "        n_items = len(unique_item_ids)\n",
    "        n_users = len(unique_user_ids)\n",
    "\n",
    "        # Get maximal number of interactions of a single user\n",
    "\n",
    "        max_interactions = interactions_df.groupby('user_id')['item_id'].count().max()\n",
    "\n",
    "        # Calculate P_Y's: the share of all interactions that involve item Y\n",
    "\n",
    "        n_interactions = len(interactions_df)\n",
    "        p_y = (interactions_df.groupby('item_id')['user_id'].count() / n_interactions).to_dict()\n",
    "\n",
    "        # Get the series of all items\n",
    "\n",
    "        items = interactions_df['item_id'].unique()\n",
    "\n",
    "        # For every X calculate the E[Y|X]\n",
    "\n",
    "        e_xy = np.full((n_items, n_items), -1e100)  # Sentinel, every cell is overwritten below\n",
    "\n",
    "        p_y_powers = {}\n",
    "        for y in items:\n",
    "            p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])\n",
    "\n",
    "        for x in items:\n",
    "            bought_x = interactions_df['item_id'] == x\n",
    "\n",
    "            # Get users who bought X\n",
    "            c_x = interactions_df.loc[bought_x, 'user_id'].unique()\n",
    "\n",
    "            # Calculate the number of non-X interactions for each user who bought X\n",
    "            # (reindex adds the users who bought only X with a count of zero)\n",
    "            n_non_x_interactions = (interactions_df.loc[~bought_x]\n",
    "                                    .groupby('user_id')['item_id'].count()\n",
    "                                    .reindex(c_x, fill_value=0)\n",
    "                                    .to_numpy())\n",
    "\n",
    "            # Calculate the expected numbers of Y products bought by clients who bought X:\n",
    "            # alpha_k = sum over those clients of (-1)^(k + 1) * binom(n_non_x_interactions, k), so that\n",
    "            # sum_k alpha_k * P_Y^k equals the sum over the clients of 1 - (1 - P_Y)^n_non_x_interactions\n",
    "            alpha_k = np.array([(-1)**(k + 1) * np.sum(scisp.binom(n_non_x_interactions, k))\n",
    "                                for k in range(1, max_interactions + 1)])\n",
    "\n",
    "            for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y\n",
    "                if y != x:\n",
    "                    e_xy[x, y] = np.sum(alpha_k * p_y_powers[y])\n",
    "                else:\n",
    "                    e_xy[x, y] = n_users * p_y[x]\n",
    "\n",
    "        self.e_xy = e_xy\n",
    "\n",
    "        # Calculate the number of users who bought both X and Y\n",
    "        # (can be further optimized by using sparse matrices)\n",
    "\n",
    "        # Get the user-item interaction matrix with one vectorized assignment instead of a row loop\n",
    "        r = np.zeros(shape=(n_users, n_items))\n",
    "        r[interactions_df['user_id'].to_numpy(), interactions_df['item_id'].to_numpy()] = 1\n",
    "\n",
    "        n_xy = np.matmul(r.T, r)\n",
    "\n",
    "        self.n_xy = n_xy\n",
    "\n",
    "        # Calculate the scores\n",
    "\n",
    "        # Only cells with a positive expectation get a score. Zero, negative or NaN expectations\n",
    "        # (possible through floating point cancellation or overflow in the alternating sum above)\n",
    "        # keep a score of 0 instead of producing NaN, which would poison the per-user sums\n",
    "        has_expectation = e_xy > 0\n",
    "        root_e_xy = np.sqrt(e_xy, out=np.zeros_like(e_xy), where=has_expectation)\n",
    "        self.scores = np.divide(n_xy - e_xy, root_e_xy, out=np.zeros_like(n_xy), where=has_expectation)\n",
    "\n",
    "        # Find the most popular items for the cold start problem\n",
    "\n",
    "        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()\n",
    "        offers_count = offers_count.sort_values('user_id', ascending=False)\n",
    "        self.most_popular_items = offers_count.index\n",
    "\n",
    "    def recommend(self, users_df, items_df, n_recommendations=1):\n",
    "        \"\"\"\n",
    "        Serving of recommendations. Scores items in items_df for each user in users_df and returns\n",
    "        top n_recommendations for each user. Only items which are present both in items_df and in the\n",
    "        training interactions can be recommended, so fewer than n_recommendations rows may be returned\n",
    "        for a user.\n",
    "\n",
    "        :param pd.DataFrame users_df: DataFrame with users and their features for which\n",
    "            recommendations should be generated.\n",
    "        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.\n",
    "        :param int n_recommendations: Number of recommendations to be returned for each user.\n",
    "        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations\n",
    "            for each user.\n",
    "        :rtype: pd.DataFrame\n",
    "        \"\"\"\n",
    "\n",
    "        # Map item ids (with the item mapping) and mark the items which may be recommended;\n",
    "        # items unknown to the model map to NaN and are dropped\n",
    "\n",
    "        candidate_ids = items_df['item_id'].map(self.item_id_mapping).dropna().astype(int).to_numpy()\n",
    "        is_candidate = np.zeros(self.scores.shape[0], dtype=bool)\n",
    "        is_candidate[candidate_ids] = True\n",
    "\n",
    "        # Most popular candidate items, used for users not in the training data\n",
    "        popular_candidates = [item_id for item_id in self.most_popular_items if is_candidate[item_id]]\n",
    "\n",
    "        # Generate recommendations\n",
    "\n",
    "        recommendations = []\n",
    "\n",
    "        for user_id in users_df['user_id']:\n",
    "            if user_id in self.user_id_mapping:\n",
    "                mapped_user_id = self.user_id_mapping[user_id]\n",
    "\n",
    "                user_rows = self.interactions_df['user_id'] == mapped_user_id\n",
    "                x_list = self.interactions_df.loc[user_rows, 'item_id'].tolist()\n",
    "                final_scores = self.scores[x_list].sum(axis=0)\n",
    "\n",
    "                # Choose n recommendations based on highest scores\n",
    "                eligible = is_candidate\n",
    "                if not self.should_recommend_already_bought:\n",
    "                    final_scores[x_list] = -1e100\n",
    "                    eligible = is_candidate.copy()\n",
    "                    eligible[x_list] = False\n",
    "\n",
    "                # Rank all items first and filter afterwards, so that ineligible items are never\n",
    "                # returned even when n_recommendations exceeds the number of eligible items\n",
    "                ranking = np.argsort(-final_scores)\n",
    "                chosen_ids = ranking[eligible[ranking]][:n_recommendations]\n",
    "\n",
    "                for item_id in chosen_ids:\n",
    "                    recommendations.append(\n",
    "                        {\n",
    "                            'user_id': self.user_id_reverse_mapping[mapped_user_id],\n",
    "                            'item_id': self.item_id_reverse_mapping[item_id],\n",
    "                            'score': final_scores[item_id]\n",
    "                        }\n",
    "                    )\n",
    "            else:  # For new users recommend most popular items\n",
    "                # Slicing never raises when fewer than n_recommendations items are available\n",
    "                for item_id in popular_candidates[:n_recommendations]:\n",
    "                    recommendations.append(\n",
    "                        {\n",
    "                            'user_id': user_id,\n",
    "                            'item_id': self.item_id_reverse_mapping[item_id],\n",
    "                            'score': 1.0\n",
    "                        }\n",
    "                    )\n",
    "\n",
    "        # Build the result once instead of concatenating a DataFrame per user; this also replaces\n",
    "        # the recommendations of any previous call\n",
    "        self.recommender_df = pd.DataFrame(recommendations, columns=['user_id', 'item_id', 'score'])\n",
    "\n",
    "        return self.recommender_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "nonprofit-roads",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recommendations\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>score</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>44761</td>\n",
       "      <td>6.122653</td>\n",
       "      <td>Brick (2005)</td>\n",
       "      <td>Crime|Drama|Film-Noir|Mystery</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>5214</td>\n",
       "      <td>5.908858</td>\n",
       "      <td>Oh, God! (1977)</td>\n",
       "      <td>Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>6755</td>\n",
       "      <td>5.830667</td>\n",
       "      <td>Bubba Ho-tep (2002)</td>\n",
       "      <td>Comedy|Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>3040</td>\n",
       "      <td>5.569308</td>\n",
       "      <td>Meatballs (1979)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>6713</td>\n",
       "      <td>5.502504</td>\n",
       "      <td>Millennium Actress (Sennen joyû) (2001)</td>\n",
       "      <td>Animation|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>3614</td>\n",
       "      <td>5.387478</td>\n",
       "      <td>Honeymoon in Vegas (1992)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1</td>\n",
       "      <td>2275</td>\n",
       "      <td>5.225652</td>\n",
       "      <td>Six-String Samurai (1998)</td>\n",
       "      <td>Action|Adventure|Sci-Fi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>4796</td>\n",
       "      <td>5.144470</td>\n",
       "      <td>Grass Is Greener, The (1960)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1</td>\n",
       "      <td>4896</td>\n",
       "      <td>4.796473</td>\n",
       "      <td>Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1</td>\n",
       "      <td>3714</td>\n",
       "      <td>4.608516</td>\n",
       "      <td>Clara's Heart (1988)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>4</td>\n",
       "      <td>3614</td>\n",
       "      <td>7.825335</td>\n",
       "      <td>Honeymoon in Vegas (1992)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>4</td>\n",
       "      <td>6713</td>\n",
       "      <td>7.407051</td>\n",
       "      <td>Millennium Actress (Sennen joyû) (2001)</td>\n",
       "      <td>Animation|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>4</td>\n",
       "      <td>2690</td>\n",
       "      <td>6.599105</td>\n",
       "      <td>Ideal Husband, An (1999)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>4</td>\n",
       "      <td>44761</td>\n",
       "      <td>6.205835</td>\n",
       "      <td>Brick (2005)</td>\n",
       "      <td>Crime|Drama|Film-Noir|Mystery</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>4</td>\n",
       "      <td>3628</td>\n",
       "      <td>6.186298</td>\n",
       "      <td>Flying Tigers (1942)</td>\n",
       "      <td>Action|Drama|Romance|War</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>4</td>\n",
       "      <td>6755</td>\n",
       "      <td>5.977848</td>\n",
       "      <td>Bubba Ho-tep (2002)</td>\n",
       "      <td>Comedy|Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>4</td>\n",
       "      <td>959</td>\n",
       "      <td>5.919668</td>\n",
       "      <td>Of Human Bondage (1934)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>4</td>\n",
       "      <td>31260</td>\n",
       "      <td>5.919668</td>\n",
       "      <td>Boys Town (1938)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>4</td>\n",
       "      <td>6033</td>\n",
       "      <td>5.919668</td>\n",
       "      <td>Mystery Date (1991)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>4</td>\n",
       "      <td>3714</td>\n",
       "      <td>5.919668</td>\n",
       "      <td>Clara's Heart (1988)</td>\n",
       "      <td>Drama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>6</td>\n",
       "      <td>3614</td>\n",
       "      <td>11.392962</td>\n",
       "      <td>Honeymoon in Vegas (1992)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>6</td>\n",
       "      <td>31921</td>\n",
       "      <td>8.329693</td>\n",
       "      <td>Seven-Per-Cent Solution, The (1976)</td>\n",
       "      <td>Adventure|Comedy|Crime|Drama|Mystery|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>6</td>\n",
       "      <td>1752</td>\n",
       "      <td>8.236954</td>\n",
       "      <td>Hard Rain (1998)</td>\n",
       "      <td>Action|Crime|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>6</td>\n",
       "      <td>95147</td>\n",
       "      <td>8.006113</td>\n",
       "      <td>Dragon Ball: Sleeping Princess in Devil's Castle (Doragon bôru: Majinjô no nemuri hime) (1987)</td>\n",
       "      <td>Action|Adventure|Animation|Children</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>6</td>\n",
       "      <td>2275</td>\n",
       "      <td>6.941940</td>\n",
       "      <td>Six-String Samurai (1998)</td>\n",
       "      <td>Action|Adventure|Sci-Fi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>6</td>\n",
       "      <td>3479</td>\n",
       "      <td>6.771276</td>\n",
       "      <td>Ladyhawke (1985)</td>\n",
       "      <td>Adventure|Fantasy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>6</td>\n",
       "      <td>6755</td>\n",
       "      <td>6.520369</td>\n",
       "      <td>Bubba Ho-tep (2002)</td>\n",
       "      <td>Comedy|Horror</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>6</td>\n",
       "      <td>6537</td>\n",
       "      <td>6.454421</td>\n",
       "      <td>Terminator 3: Rise of the Machines (2003)</td>\n",
       "      <td>Action|Adventure|Sci-Fi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>6</td>\n",
       "      <td>4483</td>\n",
       "      <td>6.339894</td>\n",
       "      <td>Caddyshack II (1988)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>6</td>\n",
       "      <td>228</td>\n",
       "      <td>6.174734</td>\n",
       "      <td>Destiny Turns on the Radio (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoke test: fit on ml_ratings_df and show the top 10 recommendations for users 1, 4 and 6\n",
    "\n",
    "amazon_recommender = AmazonRecommender()\n",
    "amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)\n",
    "\n",
    "recommendations = amazon_recommender.recommend(\n",
    "    users_df=pd.DataFrame({'user_id': [1, 4, 6]}),\n",
    "    items_df=ml_movies_df,\n",
    "    n_recommendations=10)\n",
    "\n",
    "# Attach the movie columns so that the table is readable\n",
    "recommendations = recommendations.merge(ml_movies_df, on='item_id', how='left')\n",
    "\n",
    "print(\"Recommendations\")\n",
    "display(HTML(recommendations.to_html()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "framed-negative",
   "metadata": {},
   "source": [
    "# Training-test split evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "romantic-music",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Recommender</th>\n",
       "      <th>HR@1</th>\n",
       "      <th>HR@3</th>\n",
       "      <th>HR@5</th>\n",
       "      <th>HR@10</th>\n",
       "      <th>NDCG@1</th>\n",
       "      <th>NDCG@3</th>\n",
       "      <th>NDCG@5</th>\n",
       "      <th>NDCG@10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AmazonRecommender</td>\n",
       "      <td>0.181818</td>\n",
       "      <td>0.311688</td>\n",
       "      <td>0.402597</td>\n",
       "      <td>0.551948</td>\n",
       "      <td>0.181818</td>\n",
       "      <td>0.257806</td>\n",
       "      <td>0.294682</td>\n",
       "      <td>0.34147</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from evaluation_and_testing.testing import evaluate_train_test_split_implicit\n",
    "\n",
    "amazon_recommender = AmazonRecommender()\n",
    "\n",
    "# Single result row: the recommender name followed by the metrics returned by the evaluation\n",
    "amazon_tts_results = pd.DataFrame(\n",
    "    [['AmazonRecommender', *evaluate_train_test_split_implicit(\n",
    "        amazon_recommender, ml_ratings_df[['user_id', 'item_id']], ml_movies_df)]],\n",
    "    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
    "\n",
    "display(HTML(amazon_tts_results.to_html()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "saving-harrison",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Recommender</th>\n",
       "      <th>HR@1</th>\n",
       "      <th>HR@3</th>\n",
       "      <th>HR@5</th>\n",
       "      <th>HR@10</th>\n",
       "      <th>NDCG@1</th>\n",
       "      <th>NDCG@3</th>\n",
       "      <th>NDCG@5</th>\n",
       "      <th>NDCG@10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TFIDFRecommender</td>\n",
       "      <td>0.025974</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.136364</td>\n",
       "      <td>0.318182</td>\n",
       "      <td>0.025974</td>\n",
       "      <td>0.064393</td>\n",
       "      <td>0.083685</td>\n",
       "      <td>0.140799</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from recommenders.tfidf_recommender import TFIDFRecommender\n",
    "\n",
    "tfidf_recommender = TFIDFRecommender()\n",
    "\n",
    "# Single result row: the recommender name followed by the metrics returned by the evaluation\n",
    "tfidf_tts_results = pd.DataFrame(\n",
    "    [['TFIDFRecommender', *evaluate_train_test_split_implicit(\n",
    "        tfidf_recommender, ml_ratings_df[['user_id', 'item_id']], ml_movies_df)]],\n",
    "    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
    "\n",
    "display(HTML(tfidf_tts_results.to_html()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "random-source",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Recommender</th>\n",
       "      <th>HR@1</th>\n",
       "      <th>HR@3</th>\n",
       "      <th>HR@5</th>\n",
       "      <th>HR@10</th>\n",
       "      <th>NDCG@1</th>\n",
       "      <th>NDCG@3</th>\n",
       "      <th>NDCG@5</th>\n",
       "      <th>NDCG@10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AmazonRecommender</td>\n",
       "      <td>0.181818</td>\n",
       "      <td>0.311688</td>\n",
       "      <td>0.402597</td>\n",
       "      <td>0.551948</td>\n",
       "      <td>0.181818</td>\n",
       "      <td>0.257806</td>\n",
       "      <td>0.294682</td>\n",
       "      <td>0.341470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TFIDFRecommender</td>\n",
       "      <td>0.025974</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.136364</td>\n",
       "      <td>0.318182</td>\n",
       "      <td>0.025974</td>\n",
       "      <td>0.064393</td>\n",
       "      <td>0.083685</td>\n",
       "      <td>0.140799</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Stack both recommenders' train/test-split results into one table with a fresh 0..n-1 index.\n",
    "tts_results = pd.concat([amazon_tts_results, tfidf_tts_results], ignore_index=True)\n",
    "display(HTML(tts_results.to_html()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "continued-harassment",
   "metadata": {},
   "source": [
    "# Leave-one-out evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "prerequisite-lounge",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Recommender</th>\n",
       "      <th>HR@1</th>\n",
       "      <th>HR@3</th>\n",
       "      <th>HR@5</th>\n",
       "      <th>HR@10</th>\n",
       "      <th>NDCG@1</th>\n",
       "      <th>NDCG@3</th>\n",
       "      <th>NDCG@5</th>\n",
       "      <th>NDCG@10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AmazonRecommender</td>\n",
       "      <td>0.166667</td>\n",
       "      <td>0.256667</td>\n",
       "      <td>0.32</td>\n",
       "      <td>0.426667</td>\n",
       "      <td>0.166667</td>\n",
       "      <td>0.219086</td>\n",
       "      <td>0.245486</td>\n",
       "      <td>0.279978</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from evaluation_and_testing.testing import evaluate_leave_one_out_implicit\n",
    "\n",
    "# Evaluate the Amazon recommender with evaluate_leave_one_out_implicit,\n",
    "# passing only the user_id and item_id columns of the ratings.\n",
    "amazon_recommender = AmazonRecommender()\n",
    "\n",
    "# max_evals and seed are the same values used in the TFIDFRecommender leave-one-out cell.\n",
    "amazon_loo_results = pd.DataFrame(\n",
    "    [['AmazonRecommender', *evaluate_leave_one_out_implicit(\n",
    "        amazon_recommender,\n",
    "        ml_ratings_df.loc[:, ['user_id', 'item_id']],\n",
    "        ml_movies_df,\n",
    "        max_evals=300,\n",
    "        seed=6789)]],\n",
    "    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
    "\n",
    "display(HTML(amazon_loo_results.to_html()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "behind-cambodia",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Recommender</th>\n",
       "      <th>HR@1</th>\n",
       "      <th>HR@3</th>\n",
       "      <th>HR@5</th>\n",
       "      <th>HR@10</th>\n",
       "      <th>NDCG@1</th>\n",
       "      <th>NDCG@3</th>\n",
       "      <th>NDCG@5</th>\n",
       "      <th>NDCG@10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TFIDFRecommender</td>\n",
       "      <td>0.006667</td>\n",
       "      <td>0.053333</td>\n",
       "      <td>0.123333</td>\n",
       "      <td>0.233333</td>\n",
       "      <td>0.006667</td>\n",
       "      <td>0.033491</td>\n",
       "      <td>0.062178</td>\n",
       "      <td>0.096151</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Evaluate the TF-IDF recommender with evaluate_leave_one_out_implicit,\n",
    "# passing only the user_id and item_id columns of the ratings.\n",
    "tfidf_recommender = TFIDFRecommender()\n",
    "\n",
    "# max_evals and seed are the same values used in the AmazonRecommender leave-one-out cell.\n",
    "tfidf_loo_results = pd.DataFrame(\n",
    "    [['TFIDFRecommender', *evaluate_leave_one_out_implicit(\n",
    "        tfidf_recommender,\n",
    "        ml_ratings_df.loc[:, ['user_id', 'item_id']],\n",
    "        ml_movies_df,\n",
    "        max_evals=300,\n",
    "        seed=6789)]],\n",
    "    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
    "\n",
    "display(HTML(tfidf_loo_results.to_html()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "lightweight-password",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Recommender</th>\n",
       "      <th>HR@1</th>\n",
       "      <th>HR@3</th>\n",
       "      <th>HR@5</th>\n",
       "      <th>HR@10</th>\n",
       "      <th>NDCG@1</th>\n",
       "      <th>NDCG@3</th>\n",
       "      <th>NDCG@5</th>\n",
       "      <th>NDCG@10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AmazonRecommender</td>\n",
       "      <td>0.166667</td>\n",
       "      <td>0.256667</td>\n",
       "      <td>0.320000</td>\n",
       "      <td>0.426667</td>\n",
       "      <td>0.166667</td>\n",
       "      <td>0.219086</td>\n",
       "      <td>0.245486</td>\n",
       "      <td>0.279978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TFIDFRecommender</td>\n",
       "      <td>0.006667</td>\n",
       "      <td>0.053333</td>\n",
       "      <td>0.123333</td>\n",
       "      <td>0.233333</td>\n",
       "      <td>0.006667</td>\n",
       "      <td>0.033491</td>\n",
       "      <td>0.062178</td>\n",
       "      <td>0.096151</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Stack both recommenders' leave-one-out results into one table with a fresh 0..n-1 index.\n",
    "loo_results = pd.concat([amazon_loo_results, tfidf_loo_results], ignore_index=True)\n",
    "display(HTML(loo_results.to_html()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mediterranean-residence",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}