{
"cells": [
{
"cell_type": "code",
"execution_count": 112,
"id": "verified-accommodation",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%matplotlib inline\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from IPython.display import Markdown, display, HTML\n",
"from collections import defaultdict\n",
"from sklearn.model_selection import KFold\n",
"import scipy.special as scisp\n",
"\n",
"# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n",
"import os\n",
"os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'"
]
},
{
"cell_type": "markdown",
"id": "educated-tourist",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": 113,
"id": "looking-feeling",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>item_id</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Jumanji (1995)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Waiting to Exhale (1995)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Father of the Bride Part II (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>Heat (1995)</td>\n",
" <td>Action|Crime|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>Sabrina (1995)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>Tom and Huck (1995)</td>\n",
" <td>Adventure|Children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>Sudden Death (1995)</td>\n",
" <td>Action</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>Action|Adventure|Thriller</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of interactions left: 1170\n"
]
}
],
"source": [
"ml_ratings_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"ratings.csv\")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})\n",
"ml_movies_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"movies.csv\")).rename(columns={'movieId': 'item_id'})\n",
"ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')\n",
"ml_df.head(10)\n",
"\n",
"display(HTML(ml_movies_df.head(10).to_html()))\n",
"\n",
"# Filter the data to reduce the number of movies\n",
"seed = 6789\n",
"rng = np.random.RandomState(seed=seed)\n",
"left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)\n",
"\n",
"ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]\n",
"ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]\n",
"ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]\n",
"\n",
"print(\"Number of interactions left: {}\".format(len(ml_ratings_df)))"
]
},
{
"cell_type": "markdown",
"id": "protecting-recognition",
"metadata": {},
"source": [
"# Inner workings of the Amazon recommender fit method"
]
},
{
"cell_type": "markdown",
"id": "plastic-brooklyn",
"metadata": {},
"source": [
"## Shift item ids and user ids so that they are consecutive"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "valuable-modem",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Item mapping\n",
"{780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, 145: 6, 267: 7, 355: 8, 435: 9, 6502: 10, 73323: 11, 112421: 12, 1783: 13, 2806: 14, 3040: 15, 3551: 16, 2135: 17, 39715: 18, 41566: 19, 5673: 20, 7064: 21, 481: 22, 6537: 23, 44761: 24, 2690: 25, 228: 26, 4890: 27, 3614: 28, 3507: 29, 3628: 30, 5954: 31, 8605: 32, 3786: 33, 6755: 34, 3468: 35, 50601: 36, 3089: 37, 55444: 38, 118270: 39, 124404: 40, 3768: 41, 233: 42, 3687: 43, 171749: 44, 104218: 45, 182749: 46, 3342: 47, 65130: 48, 84952: 49, 152970: 50, 3067: 51, 4031: 52, 1107: 53, 47382: 54, 3801: 55, 5155: 56, 5612: 57, 5214: 58, 67295: 59, 3165: 60, 1752: 61, 31223: 62, 6713: 63, 66783: 64, 2043: 65, 2903: 66, 3313: 67, 4009: 68, 91842: 69, 2190: 70, 7282: 71, 4483: 72, 2275: 73, 3567: 74, 190207: 75, 4505: 76, 95147: 77, 4552: 78, 6033: 79, 2521: 80, 4397: 81, 151315: 82, 156706: 83, 151311: 84, 959: 85, 3714: 86, 4164: 87, 4796: 88, 31260: 89, 6927: 90, 126142: 91, 73804: 92, 26357: 93, 82684: 94, 6342: 95, 32799: 96, 31921: 97, 2892: 98, 2737: 99}\n",
"\n",
"Item reverse mapping\n",
"{0: 780, 1: 1500, 2: 3479, 3: 171, 4: 1914, 5: 4896, 6: 145, 7: 267, 8: 355, 9: 435, 10: 6502, 11: 73323, 12: 112421, 13: 1783, 14: 2806, 15: 3040, 16: 3551, 17: 2135, 18: 39715, 19: 41566, 20: 5673, 21: 7064, 22: 481, 23: 6537, 24: 44761, 25: 2690, 26: 228, 27: 4890, 28: 3614, 29: 3507, 30: 3628, 31: 5954, 32: 8605, 33: 3786, 34: 6755, 35: 3468, 36: 50601, 37: 3089, 38: 55444, 39: 118270, 40: 124404, 41: 3768, 42: 233, 43: 3687, 44: 171749, 45: 104218, 46: 182749, 47: 3342, 48: 65130, 49: 84952, 50: 152970, 51: 3067, 52: 4031, 53: 1107, 54: 47382, 55: 3801, 56: 5155, 57: 5612, 58: 5214, 59: 67295, 60: 3165, 61: 1752, 62: 31223, 63: 6713, 64: 66783, 65: 2043, 66: 2903, 67: 3313, 68: 4009, 69: 91842, 70: 2190, 71: 7282, 72: 4483, 73: 2275, 74: 3567, 75: 190207, 76: 4505, 77: 95147, 78: 4552, 79: 6033, 80: 2521, 81: 4397, 82: 151315, 83: 156706, 84: 151311, 85: 959, 86: 3714, 87: 4164, 88: 4796, 89: 31260, 90: 6927, 91: 126142, 92: 73804, 93: 26357, 94: 82684, 95: 6342, 96: 32799, 97: 31921, 98: 2892, 99: 2737}\n",
"\n",
"User mapping\n",
"{1: 0, 4: 1, 6: 2, 7: 3, 11: 4, 15: 5, 17: 6, 18: 7, 19: 8, 20: 9, 21: 10, 22: 11, 23: 12, 24: 13, 27: 14, 28: 15, 29: 16, 31: 17, 32: 18, 33: 19, 34: 20, 36: 21, 38: 22, 39: 23, 40: 24, 41: 25, 42: 26, 43: 27, 44: 28, 45: 29, 46: 30, 48: 31, 50: 32, 51: 33, 53: 34, 57: 35, 58: 36, 59: 37, 61: 38, 62: 39, 63: 40, 64: 41, 66: 42, 67: 43, 68: 44, 70: 45, 71: 46, 72: 47, 73: 48, 74: 49, 75: 50, 76: 51, 78: 52, 80: 53, 82: 54, 83: 55, 84: 56, 86: 57, 88: 58, 89: 59, 90: 60, 91: 61, 94: 62, 95: 63, 96: 64, 99: 65, 100: 66, 101: 67, 103: 68, 104: 69, 105: 70, 106: 71, 108: 72, 109: 73, 111: 74, 112: 75, 113: 76, 114: 77, 115: 78, 116: 79, 117: 80, 120: 81, 121: 82, 122: 83, 125: 84, 129: 85, 132: 86, 133: 87, 135: 88, 136: 89, 137: 90, 139: 91, 140: 92, 141: 93, 142: 94, 144: 95, 148: 96, 149: 97, 150: 98, 151: 99, 153: 100, 154: 101, 156: 102, 158: 103, 160: 104, 161: 105, 162: 106, 164: 107, 165: 108, 166: 109, 167: 110, 169: 111, 170: 112, 171: 113, 173: 114, 174: 115, 175: 116, 176: 117, 177: 118, 178: 119, 179: 120, 181: 121, 182: 122, 184: 123, 186: 124, 187: 125, 190: 126, 194: 127, 195: 128, 198: 129, 199: 130, 200: 131, 201: 132, 202: 133, 203: 134, 204: 135, 205: 136, 206: 137, 210: 138, 212: 139, 213: 140, 214: 141, 215: 142, 216: 143, 217: 144, 219: 145, 220: 146, 221: 147, 222: 148, 223: 149, 226: 150, 229: 151, 230: 152, 232: 153, 233: 154, 234: 155, 235: 156, 236: 157, 239: 158, 240: 159, 243: 160, 244: 161, 246: 162, 247: 163, 249: 164, 254: 165, 256: 166, 257: 167, 260: 168, 262: 169, 263: 170, 264: 171, 265: 172, 266: 173, 269: 174, 270: 175, 271: 176, 273: 177, 274: 178, 275: 179, 276: 180, 277: 181, 279: 182, 280: 183, 282: 184, 283: 185, 284: 186, 287: 187, 288: 188, 290: 189, 291: 190, 292: 191, 294: 192, 297: 193, 298: 194, 301: 195, 302: 196, 303: 197, 304: 198, 305: 199, 306: 200, 307: 201, 308: 202, 310: 203, 312: 204, 313: 205, 314: 206, 318: 207, 321: 208, 322: 209, 325: 210, 328: 211, 330: 212, 331: 213, 332: 214, 333: 215, 334: 216, 335: 217, 337: 218, 338: 219, 339: 220, 340: 221, 341: 222, 345: 223, 347: 224, 349: 225, 352: 226, 353: 227, 354: 228, 356: 229, 357: 230, 359: 231, 361: 232, 364: 233, 365: 234, 366: 235, 367: 236, 368: 237, 369: 238, 370: 239, 373: 240, 374: 241, 376: 242, 380: 243, 381: 244, 382: 245, 383: 246, 384: 247, 385: 248, 386: 249, 387: 250, 389: 251, 391: 252, 395: 253, 399: 254, 402: 255, 408: 256, 409: 257, 410: 258, 411: 259, 412: 260, 413: 261, 414: 262, 415: 263, 417: 264, 419: 265, 420: 266, 422: 267, 423: 268, 425: 269, 426: 270, 427: 271, 428: 272, 431: 273, 432: 274, 434: 275, 436: 276, 437: 277, 438: 278, 440: 279, 445: 280, 446: 281, 447: 282, 448: 283, 451: 284, 452: 285, 453: 286, 455: 287, 456: 288, 460: 289, 462: 290, 463: 291, 464: 292, 465: 293, 466: 294, 467: 295, 469: 296, 474: 297, 475: 298, 477: 299, 479: 300, 480: 301, 482: 302, 483: 303, 484: 304, 486: 305, 489: 306, 490: 307, 491: 308, 492: 309, 495: 310, 500: 311, 501: 312, 503: 313, 504: 314, 505: 315, 509: 316, 510: 317, 511: 318, 513: 319, 514: 320, 517: 321, 521: 322, 522: 323, 524: 324, 525: 325, 527: 326, 529: 327, 533: 328, 534: 329, 536: 330, 537: 331, 540: 332, 542: 333, 543: 334, 544: 335, 552: 336, 553: 337, 555: 338, 556: 339, 557: 340, 558: 341, 559: 342, 560: 343, 561: 344, 562: 345, 563: 346, 564: 347, 566: 348, 567: 349, 570: 350, 573: 351, 577: 352, 579: 353, 580: 354, 581: 355, 584: 356, 585: 357, 586: 358, 587: 359, 589: 360, 590: 361, 592: 362, 593: 363, 594: 364, 595: 365, 596: 366, 597: 367, 599: 368, 600: 369, 602: 370, 603: 371, 604: 372, 
605: 373, 606: 374, 607: 375, 608: 376, 610: 377}\n",
"\n",
"User reverse mapping\n",
"{0: 1, 1: 4, 2: 6, 3: 7, 4: 11, 5: 15, 6: 17, 7: 18, 8: 19, 9: 20, 10: 21, 11: 22, 12: 23, 13: 24, 14: 27, 15: 28, 16: 29, 17: 31, 18: 32, 19: 33, 20: 34, 21: 36, 22: 38, 23: 39, 24: 40, 25: 41, 26: 42, 27: 43, 28: 44, 29: 45, 30: 46, 31: 48, 32: 50, 33: 51, 34: 53, 35: 57, 36: 58, 37: 59, 38: 61, 39: 62, 40: 63, 41: 64, 42: 66, 43: 67, 44: 68, 45: 70, 46: 71, 47: 72, 48: 73, 49: 74, 50: 75, 51: 76, 52: 78, 53: 80, 54: 82, 55: 83, 56: 84, 57: 86, 58: 88, 59: 89, 60: 90, 61: 91, 62: 94, 63: 95, 64: 96, 65: 99, 66: 100, 67: 101, 68: 103, 69: 104, 70: 105, 71: 106, 72: 108, 73: 109, 74: 111, 75: 112, 76: 113, 77: 114, 78: 115, 79: 116, 80: 117, 81: 120, 82: 121, 83: 122, 84: 125, 85: 129, 86: 132, 87: 133, 88: 135, 89: 136, 90: 137, 91: 139, 92: 140, 93: 141, 94: 142, 95: 144, 96: 148, 97: 149, 98: 150, 99: 151, 100: 153, 101: 154, 102: 156, 103: 158, 104: 160, 105: 161, 106: 162, 107: 164, 108: 165, 109: 166, 110: 167, 111: 169, 112: 170, 113: 171, 114: 173, 115: 174, 116: 175, 117: 176, 118: 177, 119: 178, 120: 179, 121: 181, 122: 182, 123: 184, 124: 186, 125: 187, 126: 190, 127: 194, 128: 195, 129: 198, 130: 199, 131: 200, 132: 201, 133: 202, 134: 203, 135: 204, 136: 205, 137: 206, 138: 210, 139: 212, 140: 213, 141: 214, 142: 215, 143: 216, 144: 217, 145: 219, 146: 220, 147: 221, 148: 222, 149: 223, 150: 226, 151: 229, 152: 230, 153: 232, 154: 233, 155: 234, 156: 235, 157: 236, 158: 239, 159: 240, 160: 243, 161: 244, 162: 246, 163: 247, 164: 249, 165: 254, 166: 256, 167: 257, 168: 260, 169: 262, 170: 263, 171: 264, 172: 265, 173: 266, 174: 269, 175: 270, 176: 271, 177: 273, 178: 274, 179: 275, 180: 276, 181: 277, 182: 279, 183: 280, 184: 282, 185: 283, 186: 284, 187: 287, 188: 288, 189: 290, 190: 291, 191: 292, 192: 294, 193: 297, 194: 298, 195: 301, 196: 302, 197: 303, 198: 304, 199: 305, 200: 306, 201: 307, 202: 308, 203: 310, 204: 312, 205: 313, 206: 314, 207: 318, 208: 321, 209: 322, 210: 325, 211: 328, 212: 330, 213: 331, 214: 332, 215: 333, 216: 334, 217: 335, 218: 337, 219: 338, 220: 339, 221: 340, 222: 341, 223: 345, 224: 347, 225: 349, 226: 352, 227: 353, 228: 354, 229: 356, 230: 357, 231: 359, 232: 361, 233: 364, 234: 365, 235: 366, 236: 367, 237: 368, 238: 369, 239: 370, 240: 373, 241: 374, 242: 376, 243: 380, 244: 381, 245: 382, 246: 383, 247: 384, 248: 385, 249: 386, 250: 387, 251: 389, 252: 391, 253: 395, 254: 399, 255: 402, 256: 408, 257: 409, 258: 410, 259: 411, 260: 412, 261: 413, 262: 414, 263: 415, 264: 417, 265: 419, 266: 420, 267: 422, 268: 423, 269: 425, 270: 426, 271: 427, 272: 428, 273: 431, 274: 432, 275: 434, 276: 436, 277: 437, 278: 438, 279: 440, 280: 445, 281: 446, 282: 447, 283: 448, 284: 451, 285: 452, 286: 453, 287: 455, 288: 456, 289: 460, 290: 462, 291: 463, 292: 464, 293: 465, 294: 466, 295: 467, 296: 469, 297: 474, 298: 475, 299: 477, 300: 479, 301: 480, 302: 482, 303: 483, 304: 484, 305: 486, 306: 489, 307: 490, 308: 491, 309: 492, 310: 495, 311: 500, 312: 501, 313: 503, 314: 504, 315: 505, 316: 509, 317: 510, 318: 511, 319: 513, 320: 514, 321: 517, 322: 521, 323: 522, 324: 524, 325: 525, 326: 527, 327: 529, 328: 533, 329: 534, 330: 536, 331: 537, 332: 540, 333: 542, 334: 543, 335: 544, 336: 552, 337: 553, 338: 555, 339: 556, 340: 557, 341: 558, 342: 559, 343: 560, 344: 561, 345: 562, 346: 563, 347: 564, 348: 566, 349: 567, 350: 570, 351: 573, 352: 577, 353: 579, 354: 580, 355: 581, 356: 584, 357: 585, 358: 586, 359: 587, 360: 589, 361: 590, 362: 592, 363: 593, 364: 594, 365: 595, 366: 596, 367: 597, 368: 599, 369: 600, 370: 602, 371: 603, 372: 604, 
373: 605, 374: 606, 375: 607, 376: 608, 377: 610}\n",
"\n"
]
},
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3.0</td>\n",
" <td>964984086</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964980985</td>\n",
" </tr>\n",
" <tr>\n",
" <th>216</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>4.0</td>\n",
" <td>964981725</td>\n",
" </tr>\n",
" <tr>\n",
" <th>310</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3.0</td>\n",
" <td>945078428</td>\n",
" </tr>\n",
" <tr>\n",
" <th>398</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964622830</td>\n",
" </tr>\n",
" <tr>\n",
" <th>416</th>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>4.0</td>\n",
" <td>964622714</td>\n",
" </tr>\n",
" <tr>\n",
" <th>513</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>4.0</td>\n",
" <td>1007574532</td>\n",
" </tr>\n",
" <tr>\n",
" <th>616</th>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>845553966</td>\n",
" </tr>\n",
" <tr>\n",
" <th>629</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>3.0</td>\n",
" <td>845555402</td>\n",
" </tr>\n",
" <tr>\n",
" <th>677</th>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>3.0</td>\n",
" <td>845554376</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"interactions_df = ml_ratings_df.copy()\n",
"\n",
"unique_item_ids = interactions_df['item_id'].unique()\n",
"item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))\n",
"item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))\n",
"unique_user_ids = interactions_df['user_id'].unique()\n",
"user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))\n",
"user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))\n",
"\n",
"interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)\n",
"\n",
"print(\"Item mapping\")\n",
"print(item_id_mapping)\n",
"print()\n",
"\n",
"print(\"Item reverse mapping\")\n",
"print(item_id_reverse_mapping)\n",
"print()\n",
"\n",
"print(\"User mapping\")\n",
"print(user_id_mapping)\n",
"print()\n",
"\n",
"print(\"User reverse mapping\")\n",
"print(user_id_reverse_mapping)\n",
"print()\n",
"\n",
"display(HTML(interactions_df.head(10).to_html()))"
]
},
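{
"cell_type": "markdown",
"id": "consistent-indices",
"metadata": {},
"source": [
"After the remapping, item and user ids are consecutive integers starting at 0, so they can be used directly as indices into the numpy arrays built below (`e_xy`, `n_xy`, `scores` and the interaction matrix). A minimal round-trip sanity check, using the mappings printed above (raw item id 780 maps to 0):\n",
"\n",
"```python\n",
"raw_item_id = 780\n",
"mapped_id = item_id_mapping[raw_item_id]\n",
"assert mapped_id == 0\n",
"assert item_id_reverse_mapping[mapped_id] == raw_item_id\n",
"```"
]
},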
{
"cell_type": "markdown",
"id": "basic-meeting",
"metadata": {},
"source": [
"## Get the number of items and users"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "close-massachusetts",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"n_items=100\n",
"n_users=378\n"
]
}
],
"source": [
"n_items = np.max(interactions_df['item_id']) + 1\n",
"n_users = np.max(interactions_df['user_id']) + 1\n",
"\n",
"print(\"n_items={}\\nn_users={}\".format(n_items, n_users))"
]
},
{
"cell_type": "markdown",
"id": "permanent-corrections",
"metadata": {},
"source": [
"## Get the maximal number of interactions"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "peripheral-natural",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"max_interaction=31\n"
]
}
],
"source": [
"n_user_interactions = interactions_df[['user_id', 'item_id']].groupby(\"user_id\").count()\n",
"# Unnecessary, but added for readability\n",
"n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})\n",
"max_interactions = n_user_interactions['n_items'].max()\n",
"\n",
"print(\"max_interaction={}\".format(max_interactions))"
]
},
{
"cell_type": "markdown",
"id": "basic-production",
"metadata": {},
"source": [
"## Calculate P_Y's"
]
},
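{
"cell_type": "markdown",
"id": "empirical-probability",
"metadata": {},
"source": [
"In the cell below, `P_Y` is the empirical probability that a randomly chosen interaction involves item $Y$:\n",
"\n",
"$$P(Y) = \\frac{N_Y}{N},$$\n",
"\n",
"where $N_Y$ is the number of interactions with item $Y$ and $N$ is the total number of interactions."
]
},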
{
"cell_type": "code",
"execution_count": 76,
"id": "concrete-transparency",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, 3: 0.005128205128205128, 4: 0.007692307692307693, 5: 0.09145299145299145, 6: 0.04358974358974359, 7: 0.01452991452991453, 8: 0.035897435897435895, 9: 0.05384615384615385, 10: 0.04957264957264957, 11: 0.004273504273504274, 12: 0.002564102564102564, 13: 0.004273504273504274, 14: 0.007692307692307693, 15: 0.007692307692307693, 16: 0.011111111111111112, 17: 0.009401709401709401, 18: 0.005982905982905983, 19: 0.05299145299145299, 20: 0.028205128205128206, 21: 0.005128205128205128, 22: 0.01623931623931624, 23: 0.038461538461538464, 24: 0.010256410256410256, 25: 0.008547008547008548, 26: 0.002564102564102564, 27: 0.026495726495726495, 28: 0.006837606837606838, 29: 0.01282051282051282, 30: 0.0017094017094017094, 31: 0.018803418803418803, 32: 0.0017094017094017094, 33: 0.003418803418803419, 34: 0.011965811965811967, 35: 0.015384615384615385, 36: 0.007692307692307693, 37: 0.013675213675213675, 38: 0.002564102564102564, 39: 0.0008547008547008547, 40: 0.0008547008547008547, 41: 0.0017094017094017094, 42: 0.010256410256410256, 43: 0.0008547008547008547, 44: 0.0008547008547008547, 45: 0.004273504273504274, 46: 0.0008547008547008547, 47: 0.004273504273504274, 48: 0.004273504273504274, 49: 0.0008547008547008547, 50: 0.003418803418803419, 51: 0.008547008547008548, 52: 0.0017094017094017094, 53: 0.0017094017094017094, 54: 0.003418803418803419, 55: 0.003418803418803419, 56: 0.0008547008547008547, 57: 0.0008547008547008547, 58: 0.003418803418803419, 59: 0.003418803418803419, 60: 0.0017094017094017094, 61: 0.003418803418803419, 62: 0.0008547008547008547, 63: 0.004273504273504274, 64: 0.0017094017094017094, 65: 0.003418803418803419, 66: 0.0017094017094017094, 67: 0.0017094017094017094, 68: 0.0017094017094017094, 69: 0.0017094017094017094, 70: 0.0008547008547008547, 71: 0.0008547008547008547, 72: 0.002564102564102564, 73: 0.004273504273504274, 74: 0.0008547008547008547, 75: 0.0008547008547008547, 76: 0.0008547008547008547, 77: 0.0017094017094017094, 78: 0.002564102564102564, 79: 0.0008547008547008547, 80: 0.0017094017094017094, 81: 0.0017094017094017094, 82: 0.002564102564102564, 83: 0.0008547008547008547, 84: 0.0008547008547008547, 85: 0.0008547008547008547, 86: 0.0008547008547008547, 87: 0.0017094017094017094, 88: 0.0017094017094017094, 89: 0.0008547008547008547, 90: 0.0008547008547008547, 91: 0.0008547008547008547, 92: 0.0008547008547008547, 93: 0.0008547008547008547, 94: 0.0008547008547008547, 95: 0.0008547008547008547, 96: 0.0008547008547008547, 97: 0.0008547008547008547, 98: 0.0008547008547008547, 99: 0.0008547008547008547}\n"
]
}
],
"source": [
"n_interactions = len(interactions_df)\n",
"p_y = interactions_df[['item_id', 'user_id']].groupby(\"item_id\").count().reset_index()\n",
"p_y = p_y.rename(columns={'user_id': 'P_Y'})\n",
"p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions\n",
"p_y = dict(zip(p_y['item_id'], p_y['P_Y']))\n",
"\n",
"print(p_y)"
]
},
{
"cell_type": "markdown",
"id": "consolidated-constant",
"metadata": {},
"source": [
"## For every X calculate the E[Y|X]"
]
},
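{
"cell_type": "markdown",
"id": "binomial-derivation",
"metadata": {},
"source": [
"The cell below estimates, for every pair of items, how many of the customers who bought $X$ would be expected to also buy $Y$ by pure chance. For a customer $c$ who bought $X$, let $|c|$ denote the number of their non-$X$ interactions. If each of those interactions independently falls on item $Y$ with probability $P(Y)$, the probability that $c$ bought $Y$ at least once is $1 - (1 - P(Y))^{|c|}$. Summing over the set $C_X$ of customers who bought $X$ and expanding with the binomial theorem gives\n",
"\n",
"$$E[Y|X] = \\sum_{c \\in C_X} \\left(1 - (1 - P(Y))^{|c|}\\right) = \\sum_{k \\geq 1} \\underbrace{\\left(\\sum_{c \\in C_X} (-1)^{k+1} \\binom{|c|}{k}\\right)}_{\\alpha_k} P(Y)^k.$$\n",
"\n",
"The coefficients $\\alpha_k$ depend only on $X$, so they are computed once per $X$ and combined with the precomputed powers `p_y_powers` for every $Y$. For $Y = X$ the code falls back to $E[X|X] = n_{\\text{users}} \\cdot P(X)$."
]
},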
{
"cell_type": "code",
"execution_count": 99,
"id": "alive-cameroon",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"p_y_powers for the first item\n",
"[1.726e-01 2.981e-02 5.146e-03 8.885e-04 1.534e-04 2.648e-05 4.573e-06\n",
" 7.894e-07 1.363e-07 2.353e-08 4.063e-09 7.014e-10 1.211e-10 2.091e-11\n",
" 3.610e-12 6.232e-13 1.076e-13 1.858e-14 3.207e-15 5.537e-16 9.560e-17\n",
" 1.651e-17 2.850e-18 4.920e-19 8.494e-20 1.467e-20 2.532e-21 4.372e-22\n",
" 7.547e-23 1.303e-23 2.250e-24]\n",
"alpha_k\n",
"[ 6.290e+02 -2.785e+03 1.408e+04 -6.937e+04 3.018e+05 -1.120e+06\n",
" 3.530e+06 -9.507e+06 2.202e+07 -4.418e+07 7.716e+07 -1.179e+08\n",
" 1.579e+08 -1.860e+08 1.928e+08 -1.759e+08 1.413e+08 -9.962e+07\n",
" 6.154e+07 -3.315e+07 1.549e+07 -6.230e+06 2.134e+06 -6.142e+05\n",
" 1.458e+05 -2.778e+04 4.088e+03 -4.360e+02 3.000e+01 -1.000e+00\n",
" 0.000e+00]\n",
"\n",
"E[Y|X]\n",
"[[65.262 26.076 9.065 3.154 4.68 ]\n",
" [28.303 19.062 4.288 1.5 2.223]\n",
" [10.216 5.074 5.815 0.712 1.046]\n",
" [ 2.315 0.859 0.283 1.938 0.144]\n",
" [ 4.526 2.47 0.999 0.366 2.908]]\n"
]
}
],
"source": [
"e_xy = np.zeros(shape=(n_items, n_items))\n",
"e_xy[:][:] = -1e100\n",
" \n",
"items = interactions_df['item_id'].unique()\n",
" \n",
"p_y_powers = {}\n",
"for y in items:\n",
" p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])\n",
" \n",
"print(\"p_y_powers for the first item\")\n",
"print(p_y_powers[0])\n",
"\n",
"for x in items:\n",
" # Get users who bought X\n",
" c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()\n",
"\n",
" # Get users who bought only X\n",
" c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()\n",
" c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))\n",
"\n",
" # Calculate the number of non-X interactions for each user who bought X\n",
" # Include users with zero non-X interactions\n",
" n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]\n",
" n_non_x_interactions = n_non_x_interactions.groupby(\"user_id\").count()\n",
" # Unnecessary, but added for readability\n",
" n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})\n",
"\n",
" zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=[\"n_items\"], index=c_only_x) # Remove\n",
" n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])\n",
"\n",
" n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]\n",
"\n",
" # Calculate the expected numbers of Y products bought by clients who bought X\n",
" alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)\n",
" for abs_c in n_non_x_interactions[\"n_items\"]])\n",
" for k in range(1, max_interactions + 1)])\n",
" \n",
" if x == 0:\n",
" print(\"alpha_k\")\n",
" print(alpha_k)\n",
" print()\n",
"\n",
" for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y\n",
" if y != x:\n",
" e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])\n",
" else:\n",
" e_xy[x][y] = n_users * p_y[x]\n",
"\n",
"print(\"E[Y|X]\")\n",
"print(np.around(e_xy[:10, :10], 3))"
]
},
{
"cell_type": "markdown",
"id": "acknowledged-threshold",
"metadata": {},
"source": [
"## Get the user-item interaction matrix"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "extraordinary-mexico",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
" [0. 1. 0. 1. 1. 1. 0. 0. 0. 0.]\n",
" [1. 0. 0. 1. 0. 0. 1. 1. 1. 1.]\n",
" [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
" [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
" [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
" [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
" [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]\n",
" [0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]\n",
" [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]\n"
]
}
],
"source": [
"# mapping to int is necessary because of how iterrows works\n",
"r = np.zeros(shape=(n_users, n_items))\n",
"for idx, interaction in interactions_df.iterrows():\n",
" r[int(interaction['user_id'])][int(interaction['item_id'])] = 1\n",
" \n",
"print(r[:10, :10])"
]
},
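{
"cell_type": "markdown",
"id": "vectorized-alternative",
"metadata": {},
"source": [
"The `iterrows` loop above is easy to follow but slow on large datasets. A vectorized sketch of the same matrix construction (assuming the `interactions_df`, `n_users` and `n_items` defined earlier):\n",
"\n",
"```python\n",
"r = np.zeros(shape=(n_users, n_items))\n",
"user_idx = interactions_df['user_id'].to_numpy(dtype=int)\n",
"item_idx = interactions_df['item_id'].to_numpy(dtype=int)\n",
"r[user_idx, item_idx] = 1  # fancy indexing sets all interactions at once\n",
"```"
]
},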
{
"cell_type": "markdown",
"id": "lovely-password",
"metadata": {},
"source": [
"## Calculate the number of users who bought both X and Y"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "rubber-detector",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]\n",
" [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]\n",
" [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]\n",
" [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]\n",
" [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]\n",
" [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]\n",
" [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]\n",
" [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]\n",
" [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]\n",
" [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 63.]]\n"
]
}
],
"source": [
"# Simple and slow method (commented out)\n",
"\n",
"# n_xy = np.zeros(shape=(n_items, n_items))\n",
"\n",
"# for x in items:\n",
"# for y in items:\n",
"# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())\n",
"# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())\n",
"# users_x_and_y = users_x & users_y\n",
"# n_xy[x][y] = len(users_x_and_y)\n",
"\n",
"# Optimized method (can be further optimized by using sparse matrices)\n",
"\n",
"n_xy = np.matmul(r.T, r)\n",
"\n",
"print(n_xy[:10, :10])"
]
},
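{
"cell_type": "markdown",
"id": "sparse-alternative",
"metadata": {},
"source": [
"As the comment in the cell above notes, this step can be optimized further with sparse matrices. A minimal sketch using `scipy.sparse` (an extra import - the notebook itself only uses `scipy.special`):\n",
"\n",
"```python\n",
"import scipy.sparse as sp\n",
"\n",
"r_sparse = sp.csr_matrix(r)  # most entries of r are zero\n",
"n_xy = (r_sparse.T @ r_sparse).toarray()\n",
"```"
]
},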
{
"cell_type": "markdown",
"id": "distinguished-consequence",
"metadata": {},
"source": [
"## Calculate the scores"
]
},
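{
"cell_type": "markdown",
"id": "standardized-score",
"metadata": {},
"source": [
"The score is a standardized difference between the observed and the expected number of co-purchases:\n",
"\n",
"$$\\mathrm{score}(X, Y) = \\frac{N_{XY} - E[Y|X]}{\\sqrt{E[Y|X]}}.$$\n",
"\n",
"Here $\\sqrt{E[Y|X]}$ plays the role of a standard deviation (as it would for a Poisson-distributed count), so the score roughly measures how many standard deviations the observed number of customers who bought both $X$ and $Y$ lies above what chance alone would predict."
]
},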
{
"cell_type": "code",
"execution_count": 97,
"id": "pointed-deputy",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]\n",
" [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]\n",
" [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]\n",
" [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]\n",
" [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]\n",
" [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]\n",
" [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]\n",
" [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]\n",
" [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]\n",
" [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]\n"
]
}
],
"source": [
"scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)\n",
"\n",
"print(np.around(scores[:10, :10], 3))"
]
},
{
"cell_type": "markdown",
"id": "endangered-stomach",
"metadata": {},
"source": [
"## Final comparison"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "prepared-fraction",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"E[Y|X]\n",
"[[65.262 26.076 9.065 3.154 4.68 41.571 23.082 8.592 19.542 27.522]\n",
" [28.303 19.062 4.288 1.5 2.223 18.99 10.768 4.066 9.15 12.778]\n",
" [10.216 5.074 5.815 0.712 1.046 7.386 4.577 1.872 3.964 5.308]\n",
" [ 2.315 0.859 0.283 1.938 0.144 1.433 0.754 0.267 0.631 0.911]\n",
" [ 4.526 2.47 0.999 0.366 2.908 3.453 2.245 0.951 1.962 2.574]\n",
" [47.984 20.534 7.279 2.549 3.776 34.569 18.241 6.902 15.507 21.636]\n",
" [25.303 11.206 4.05 1.429 2.112 17.265 16.477 3.843 8.524 11.789]\n",
" [ 9.094 4.124 1.561 0.561 0.826 6.205 3.701 5.492 3.186 4.326]\n",
" [21.633 9.823 3.601 1.276 1.884 14.955 8.776 3.417 13.569 10.322]\n",
" [25.03 10.257 3.571 1.243 1.844 16.332 9.082 3.385 7.691 20.354]]\n",
"\n",
"N(X, Y)\n",
"[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]\n",
" [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]\n",
" [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]\n",
" [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]\n",
" [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]\n",
" [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]\n",
" [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]\n",
" [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]\n",
" [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]\n",
" [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 63.]]\n",
"\n",
"Scores\n",
"[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]\n",
" [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]\n",
" [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]\n",
" [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]\n",
" [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]\n",
" [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]\n",
" [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]\n",
" [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]\n",
" [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]\n",
" [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]\n",
"\n"
]
}
],
"source": [
"print(\"E[Y|X]\")\n",
"print(np.around(e_xy[:10, :10], 3))\n",
"print()\n",
"\n",
"print(\"N(X, Y)\")\n",
"print(n_xy[:10, :10])\n",
"print()\n",
"\n",
"print(\"Scores\")\n",
"print(np.around(scores[:10, :10], 3))\n",
"print()"
]
},
{
"cell_type": "markdown",
"id": "distant-archive",
"metadata": {},
"source": [
"# Inner workings of the Amazon recommender recommend method"
]
},
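{
"cell_type": "markdown",
"id": "score-aggregation",
"metadata": {},
"source": [
"To score candidate items for a user $u$, the recommender sums the item-to-item scores over the set $I_u$ of items the user has already bought:\n",
"\n",
"$$\\mathrm{finalscore}(u, Y) = \\sum_{X \\in I_u} \\mathrm{score}(X, Y),$$\n",
"\n",
"and returns the items with the highest totals. Already-bought items are optionally excluded by masking their scores with a large negative constant, as in the cell below."
]
},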
{
"cell_type": "code",
"execution_count": 111,
"id": "aerial-shipping",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Recommendation: 1, Brick (2005), 6.122652596595853\n",
"Recommendation: 1, Oh, God! (1977), 5.908857666844879\n",
"Recommendation: 1, Bubba Ho-tep (2002), 5.830666625469312\n",
"Recommendation: 1, Meatballs (1979), 5.56930833865894\n",
"Recommendation: 1, Millennium Actress (Sennen joyû) (2001), 5.502504256363742\n",
"Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393\n",
"Recommendation: 1, Six-String Samurai (1998), 5.225652131462832\n",
"Recommendation: 1, Grass Is Greener, The (1960), 5.144470412494206\n",
"Recommendation: 1, Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), 4.796473011676857\n",
"Recommendation: 1, Clara's Heart (1988), 4.608515964550741\n"
]
}
],
"source": [
"user_id = 1\n",
"should_recommend_already_bought = False\n",
"n_recommendations = 10\n",
"\n",
"mapped_user_id = user_id_mapping[user_id]\n",
"\n",
"x_list = interactions_df.loc[interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()\n",
"final_scores = np.sum(scores[x_list], axis=0)\n",
"\n",
"# Choose n recommendations based on highest scores\n",
"if not should_recommend_already_bought:\n",
" final_scores[x_list] = -1e100\n",
"\n",
"chosen_ids = np.argsort(-final_scores)[:n_recommendations]\n",
"\n",
"for item_id in chosen_ids:\n",
" print(\"Recommendation: {}, {}, {}\".format(user_id_reverse_mapping[mapped_user_id],\n",
" ml_movies_df.loc[ml_movies_df['item_id'] == item_id_reverse_mapping[item_id], \n",
" 'title'].iloc[0],\n",
" final_scores[item_id]))"
]
},
{
"cell_type": "markdown",
"id": "opponent-prediction",
"metadata": {},
"source": [
"# Amazon recommder"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "fancy-return",
"metadata": {},
"outputs": [],
"source": [
"from recommenders.recommender import Recommender\n",
"\n",
"class AmazonRecommender(Recommender):\n",
" \"\"\"\n",
" Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:\n",
" - Linden G., Smith B., York Y., Amazon.com Recommendations. Item-to-Item Collaborative Filtering,\n",
" IEEE Internet Computing, 2003,\n",
" - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.\n",
" \"\"\"\n",
"\n",
" def __init__(self):\n",
" super().__init__()\n",
" self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n",
" self.interactions_df = None\n",
" self.item_id_mapping = None\n",
" self.user_id_mapping = None\n",
" self.item_id_reverse_mapping = None\n",
" self.user_id_reverse_mapping = None\n",
" self.e_xy = None\n",
" self.n_xy = None\n",
" self.scores = None\n",
" self.most_popular_items = None\n",
" self.should_recommend_already_bought = False\n",
"\n",
" def initialize(self, **params):\n",
" if 'should_recommend_already_bought' in params:\n",
" self.should_recommend_already_bought = params['should_recommend_already_bought']\n",
"\n",
" def fit(self, interactions_df, users_df, items_df):\n",
" \"\"\"\n",
" Training of the recommender.\n",
"\n",
" :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items\n",
" defined by user_id, item_id and features of the interaction.\n",
" :param pd.DataFrame users_df: DataFrame with users and their features defined by\n",
" user_id and the user feature columns.\n",
" :param pd.DataFrame items_df: DataFrame with items and their features defined\n",
" by item_id and the item feature columns.\n",
" \"\"\"\n",
"\n",
" # Shift item ids and user ids so that they are consecutive\n",
"\n",
" unique_item_ids = interactions_df['item_id'].unique()\n",
" self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))\n",
" self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))\n",
" unique_user_ids = interactions_df['user_id'].unique()\n",
" self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))\n",
" self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))\n",
" \n",
" interactions_df = interactions_df.copy()\n",
" interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)\n",
"\n",
" # Get the number of items and users\n",
"\n",
" self.interactions_df = interactions_df\n",
" n_items = np.max(interactions_df['item_id']) + 1\n",
" n_users = np.max(interactions_df['user_id']) + 1\n",
"\n",
" # Get maximal number of interactions\n",
"\n",
" n_user_interactions = interactions_df[['user_id', 'item_id']].groupby(\"user_id\").count()\n",
" # Unnecessary, but added for readability\n",
" n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})\n",
" max_interactions = n_user_interactions['n_items'].max()\n",
"\n",
" # Calculate P_Y's\n",
"\n",
" n_interactions = len(interactions_df)\n",
" p_y = interactions_df[['item_id', 'user_id']].groupby(\"item_id\").count().reset_index()\n",
" p_y = p_y.rename(columns={'user_id': 'P_Y'})\n",
" p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions\n",
" p_y = dict(zip(p_y['item_id'], p_y['P_Y']))\n",
"\n",
" # Get the series of all items\n",
"\n",
" # items = list(range(n_items))\n",
" items = interactions_df['item_id'].unique()\n",
"\n",
" # For every X calculate the E[Y|X]\n",
"\n",
" e_xy = np.zeros(shape=(n_items, n_items))\n",
" e_xy[:][:] = -1e100\n",
"\n",
" p_y_powers = {}\n",
" for y in items:\n",
" p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])\n",
"\n",
" for x in items:\n",
" # Get users who bought X\n",
" c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()\n",
"\n",
" # Get users who bought only X\n",
" c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()\n",
" c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))\n",
"\n",
" # Calculate the number of non-X interactions for each user who bought X\n",
" # Include users with zero non-X interactions\n",
" n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]\n",
" n_non_x_interactions = n_non_x_interactions.groupby(\"user_id\").count()\n",
" # Unnecessary, but added for readability\n",
" n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})\n",
"\n",
" zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=[\"n_items\"], index=c_only_x) # Remove\n",
" n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])\n",
"\n",
" n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]\n",
"\n",
" # Calculate the expected numbers of Y products bought by clients who bought X\n",
" alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)\n",
" for abs_c in n_non_x_interactions[\"n_items\"]])\n",
" for k in range(1, max_interactions + 1)])\n",
"\n",
" for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y\n",
" if y != x:\n",
" e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])\n",
" else:\n",
" e_xy[x][y] = n_users * p_y[x]\n",
"\n",
" self.e_xy = e_xy\n",
"\n",
" # Calculate the number of users who bought both X and Y\n",
"\n",
" # Simple and slow method (commented out)\n",
"\n",
" # n_xy = np.zeros(shape=(n_items, n_items))\n",
"\n",
" # for x in items:\n",
" # for y in items:\n",
" # users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())\n",
" # users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())\n",
" # users_x_and_y = users_x & users_y\n",
" # n_xy[x][y] = len(users_x_and_y)\n",
"\n",
" # Optimized method (can be further optimized by using sparse matrices)\n",
"\n",
" # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)\n",
" r = np.zeros(shape=(n_users, n_items))\n",
" for idx, interaction in interactions_df.iterrows():\n",
" r[int(interaction['user_id'])][int(interaction['item_id'])] = 1\n",
"\n",
" # Get the number of users who bought both X and Y\n",
"\n",
" n_xy = np.matmul(r.T, r)\n",
"\n",
" self.n_xy = n_xy\n",
" \n",
" # Calculate the scores\n",
"\n",
" self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)\n",
" \n",
" # Find the most popular items for the cold start problem\n",
" \n",
" offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()\n",
" offers_count = offers_count.sort_values('user_id', ascending=False)\n",
" self.most_popular_items = offers_count.index\n",
"\n",
" def recommend(self, users_df, items_df, n_recommendations=1):\n",
" \"\"\"\n",
" Serving of recommendations. Scores items in items_df for each user in users_df and returns\n",
" top n_recommendations for each user.\n",
"\n",
" :param pd.DataFrame users_df: DataFrame with users and their features for which\n",
" recommendations should be generated.\n",
" :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.\n",
" :param int n_recommendations: Number of recommendations to be returned for each user.\n",
" :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations\n",
" for each user.\n",
" :rtype: pd.DataFrame\n",
" \"\"\"\n",
"\n",
" # Clean previous recommendations (iloc could be used alternatively)\n",
" self.recommender_df = self.recommender_df[:0]\n",
" \n",
" # Handle users not in the training data\n",
"\n",
" # Map item ids\n",
" \n",
" items_df = items_df.copy()\n",
" items_df.replace({'item_id': self.user_id_mapping}, inplace=True)\n",
"\n",
" # Generate recommendations\n",
"\n",
" for idx, user in users_df.iterrows():\n",
" recommendations = []\n",
" \n",
" user_id = user['user_id']\n",
" \n",
" if user_id in self.user_id_mapping:\n",
" mapped_user_id = self.user_id_mapping[user_id]\n",
" \n",
" x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()\n",
" final_scores = np.sum(self.scores[x_list], axis=0)\n",
"\n",
" # Choose n recommendations based on highest scores\n",
" if not self.should_recommend_already_bought:\n",
" final_scores[x_list] = -1e100\n",
"\n",
" chosen_ids = np.argsort(-final_scores)[:n_recommendations]\n",
"\n",
" for item_id in chosen_ids:\n",
" recommendations.append(\n",
" {\n",
" 'user_id': self.user_id_reverse_mapping[mapped_user_id],\n",
" 'item_id': self.item_id_reverse_mapping[item_id],\n",
" 'score': final_scores[item_id]\n",
" }\n",
" )\n",
" else: # For new users recommend most popular items\n",
" for i in range(n_recommendations):\n",
" recommendations.append(\n",
" {\n",
" 'user_id': user['user_id'],\n",
" 'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],\n",
" 'score': 1.0\n",
" }\n",
" )\n",
"\n",
" user_recommendations = pd.DataFrame(recommendations)\n",
"\n",
" self.recommender_df = pd.concat([self.recommender_df, user_recommendations])\n",
"\n",
" return self.recommender_df"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "nonprofit-roads",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Recommendations\n"
]
},
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_id</th>\n",
" <th>item_id</th>\n",
" <th>score</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>44761</td>\n",
" <td>6.122653</td>\n",
" <td>Brick (2005)</td>\n",
" <td>Crime|Drama|Film-Noir|Mystery</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>5214</td>\n",
" <td>5.908858</td>\n",
" <td>Oh, God! (1977)</td>\n",
" <td>Comedy|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6755</td>\n",
" <td>5.830667</td>\n",
" <td>Bubba Ho-tep (2002)</td>\n",
" <td>Comedy|Horror</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>3040</td>\n",
" <td>5.569308</td>\n",
" <td>Meatballs (1979)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>6713</td>\n",
" <td>5.502504</td>\n",
" <td>Millennium Actress (Sennen joyû) (2001)</td>\n",
" <td>Animation|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>3614</td>\n",
" <td>5.387478</td>\n",
" <td>Honeymoon in Vegas (1992)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>2275</td>\n",
" <td>5.225652</td>\n",
" <td>Six-String Samurai (1998)</td>\n",
" <td>Action|Adventure|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>4796</td>\n",
" <td>5.144470</td>\n",
" <td>Grass Is Greener, The (1960)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>4896</td>\n",
" <td>4.796473</td>\n",
" <td>Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)</td>\n",
" <td>Adventure|Children|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>3714</td>\n",
" <td>4.608516</td>\n",
" <td>Clara's Heart (1988)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>4</td>\n",
" <td>3614</td>\n",
" <td>7.825335</td>\n",
" <td>Honeymoon in Vegas (1992)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4</td>\n",
" <td>6713</td>\n",
" <td>7.407051</td>\n",
" <td>Millennium Actress (Sennen joyû) (2001)</td>\n",
" <td>Animation|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>2690</td>\n",
" <td>6.599105</td>\n",
" <td>Ideal Husband, An (1999)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>4</td>\n",
" <td>44761</td>\n",
" <td>6.205835</td>\n",
" <td>Brick (2005)</td>\n",
" <td>Crime|Drama|Film-Noir|Mystery</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>4</td>\n",
" <td>3628</td>\n",
" <td>6.186298</td>\n",
" <td>Flying Tigers (1942)</td>\n",
" <td>Action|Drama|Romance|War</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>4</td>\n",
" <td>6755</td>\n",
" <td>5.977848</td>\n",
" <td>Bubba Ho-tep (2002)</td>\n",
" <td>Comedy|Horror</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>4</td>\n",
" <td>959</td>\n",
" <td>5.919668</td>\n",
" <td>Of Human Bondage (1934)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>4</td>\n",
" <td>31260</td>\n",
" <td>5.919668</td>\n",
" <td>Boys Town (1938)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>4</td>\n",
" <td>6033</td>\n",
" <td>5.919668</td>\n",
" <td>Mystery Date (1991)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>4</td>\n",
" <td>3714</td>\n",
" <td>5.919668</td>\n",
" <td>Clara's Heart (1988)</td>\n",
" <td>Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>6</td>\n",
" <td>3614</td>\n",
" <td>11.392962</td>\n",
" <td>Honeymoon in Vegas (1992)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>6</td>\n",
" <td>31921</td>\n",
" <td>8.329693</td>\n",
" <td>Seven-Per-Cent Solution, The (1976)</td>\n",
" <td>Adventure|Comedy|Crime|Drama|Mystery|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>6</td>\n",
" <td>1752</td>\n",
" <td>8.236954</td>\n",
" <td>Hard Rain (1998)</td>\n",
" <td>Action|Crime|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>6</td>\n",
" <td>95147</td>\n",
" <td>8.006113</td>\n",
" <td>Dragon Ball: Sleeping Princess in Devil's Castle (Doragon bôru: Majinjô no nemuri hime) (1987)</td>\n",
" <td>Action|Adventure|Animation|Children</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>6</td>\n",
" <td>2275</td>\n",
" <td>6.941940</td>\n",
" <td>Six-String Samurai (1998)</td>\n",
" <td>Action|Adventure|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>6</td>\n",
" <td>3479</td>\n",
" <td>6.771276</td>\n",
" <td>Ladyhawke (1985)</td>\n",
" <td>Adventure|Fantasy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>6</td>\n",
" <td>6755</td>\n",
" <td>6.520369</td>\n",
" <td>Bubba Ho-tep (2002)</td>\n",
" <td>Comedy|Horror</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>6</td>\n",
" <td>6537</td>\n",
" <td>6.454421</td>\n",
" <td>Terminator 3: Rise of the Machines (2003)</td>\n",
" <td>Action|Adventure|Sci-Fi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>6</td>\n",
" <td>4483</td>\n",
" <td>6.339894</td>\n",
" <td>Caddyshack II (1988)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>6</td>\n",
" <td>228</td>\n",
" <td>6.174734</td>\n",
" <td>Destiny Turns on the Radio (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Quick test of the recommender\n",
"\n",
"amazon_recommender = AmazonRecommender()\n",
"amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)\n",
"recommendations = amazon_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)\n",
"\n",
"recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')\n",
"print(\"Recommendations\")\n",
"display(HTML(recommendations.to_html()))"
]
},
{
"cell_type": "markdown",
"id": "framed-negative",
"metadata": {},
"source": [
"# Training-test split evaluation"
]
},
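{
"cell_type": "markdown",
"id": "metric-definitions",
"metadata": {},
"source": [
"`HR@k` (hit ratio) and `NDCG@k` are standard top-$k$ ranking metrics for implicit feedback; the exact implementation lives in `evaluation_and_testing.testing`. As typically defined, for a test case whose held-out relevant item is ranked at position $p$ (1-based) by the recommender:\n",
"\n",
"$$\\mathrm{HR@}k = \\frac{\\#\\{\\text{test cases with } p \\leq k\\}}{\\#\\{\\text{test cases}\\}}, \\qquad \\mathrm{NDCG@}k = \\mathrm{mean}\\left(\\frac{\\mathbb{1}[p \\leq k]}{\\log_2(p + 1)}\\right).$$\n",
"\n",
"With a single relevant item per test case the ideal DCG equals 1, so DCG and NDCG coincide."
]
},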
{
"cell_type": "code",
"execution_count": 55,
"id": "romantic-music",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AmazonRecommender</td>\n",
" <td>0.181818</td>\n",
" <td>0.311688</td>\n",
" <td>0.402597</td>\n",
" <td>0.551948</td>\n",
" <td>0.181818</td>\n",
" <td>0.257806</td>\n",
" <td>0.294682</td>\n",
" <td>0.34147</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from evaluation_and_testing.testing import evaluate_train_test_split_implicit\n",
"\n",
"amazon_recommender = AmazonRecommender()\n",
"\n",
"amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(\n",
" amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n",
"\n",
"amazon_tts_results = pd.DataFrame(\n",
" amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
"\n",
"display(HTML(amazon_tts_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "saving-harrison",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TFIDFRecommender</td>\n",
" <td>0.025974</td>\n",
" <td>0.090909</td>\n",
" <td>0.136364</td>\n",
" <td>0.318182</td>\n",
" <td>0.025974</td>\n",
" <td>0.064393</td>\n",
" <td>0.083685</td>\n",
" <td>0.140799</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from recommenders.tfidf_recommender import TFIDFRecommender\n",
"\n",
"tfidf_recommender = TFIDFRecommender()\n",
"\n",
"tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(\n",
" tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n",
"\n",
"tfidf_tts_results = pd.DataFrame(\n",
" tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
"\n",
"display(HTML(tfidf_tts_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "random-source",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AmazonRecommender</td>\n",
" <td>0.181818</td>\n",
" <td>0.311688</td>\n",
" <td>0.402597</td>\n",
" <td>0.551948</td>\n",
" <td>0.181818</td>\n",
" <td>0.257806</td>\n",
" <td>0.294682</td>\n",
" <td>0.341470</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TFIDFRecommender</td>\n",
" <td>0.025974</td>\n",
" <td>0.090909</td>\n",
" <td>0.136364</td>\n",
" <td>0.318182</td>\n",
" <td>0.025974</td>\n",
" <td>0.064393</td>\n",
" <td>0.083685</td>\n",
" <td>0.140799</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"tts_results = pd.concat([amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)\n",
"display(HTML(tts_results.to_html()))"
]
},
{
"cell_type": "markdown",
"id": "continued-harassment",
"metadata": {},
"source": [
"# Leave-one-out evaluation"
]
},
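{
"cell_type": "markdown",
"id": "holdout-protocol",
"metadata": {},
"source": [
"Leave-one-out evaluation typically holds out a single interaction per user, trains on the remaining interactions, and checks whether the held-out item shows up in the top-$k$ recommendations; here `max_evals=300` caps the number of evaluated cases and `seed` makes the sampling reproducible. A purely illustrative sketch of such a split (the actual logic lives in `evaluate_leave_one_out_implicit`):\n",
"\n",
"```python\n",
"# Illustrative only - hold out one random interaction per user\n",
"held_out = ml_ratings_df.groupby('user_id').sample(n=1, random_state=6789)\n",
"train_df = ml_ratings_df.drop(held_out.index)\n",
"```"
]
},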
{
"cell_type": "code",
"execution_count": 62,
"id": "prerequisite-lounge",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AmazonRecommender</td>\n",
" <td>0.166667</td>\n",
" <td>0.256667</td>\n",
" <td>0.32</td>\n",
" <td>0.426667</td>\n",
" <td>0.166667</td>\n",
" <td>0.219086</td>\n",
" <td>0.245486</td>\n",
" <td>0.279978</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from evaluation_and_testing.testing import evaluate_leave_one_out_implicit\n",
"\n",
"amazon_recommender = AmazonRecommender()\n",
"\n",
"amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(\n",
" amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n",
"\n",
"amazon_loo_results = pd.DataFrame(\n",
" amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
"\n",
"display(HTML(amazon_loo_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "behind-cambodia",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TFIDFRecommender</td>\n",
" <td>0.006667</td>\n",
" <td>0.053333</td>\n",
" <td>0.123333</td>\n",
" <td>0.233333</td>\n",
" <td>0.006667</td>\n",
" <td>0.033491</td>\n",
" <td>0.062178</td>\n",
" <td>0.096151</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"tfidf_recommender = TFIDFRecommender()\n",
"\n",
"tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(\n",
" tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n",
"\n",
"tfidf_loo_results = pd.DataFrame(\n",
" tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
"\n",
"display(HTML(tfidf_loo_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "lightweight-password",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recommender</th>\n",
" <th>HR@1</th>\n",
" <th>HR@3</th>\n",
" <th>HR@5</th>\n",
" <th>HR@10</th>\n",
" <th>NDCG@1</th>\n",
" <th>NDCG@3</th>\n",
" <th>NDCG@5</th>\n",
" <th>NDCG@10</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AmazonRecommender</td>\n",
" <td>0.166667</td>\n",
" <td>0.256667</td>\n",
" <td>0.320000</td>\n",
" <td>0.426667</td>\n",
" <td>0.166667</td>\n",
" <td>0.219086</td>\n",
" <td>0.245486</td>\n",
" <td>0.279978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TFIDFRecommender</td>\n",
" <td>0.006667</td>\n",
" <td>0.053333</td>\n",
" <td>0.123333</td>\n",
" <td>0.233333</td>\n",
" <td>0.006667</td>\n",
" <td>0.033491</td>\n",
" <td>0.062178</td>\n",
" <td>0.096151</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"loo_results = pd.concat([amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)\n",
"display(HTML(loo_results.to_html()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "mediterranean-residence",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}