{ "cells": [ { "cell_type": "code", "execution_count": 112, "id": "verified-accommodation", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "%matplotlib inline\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from IPython.display import Markdown, display, HTML\n", "from collections import defaultdict\n", "from sklearn.model_selection import KFold\n", "import scipy.special as scisp\n", "\n", "# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n", "import os\n", "os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'" ] }, { "cell_type": "markdown", "id": "educated-tourist", "metadata": {}, "source": [ "# Load data" ] }, { "cell_type": "code", "execution_count": 113, "id": "looking-feeling", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
    item_id                               title                                        genres
0         1                    Toy Story (1995)   Adventure|Animation|Children|Comedy|Fantasy
1         2                      Jumanji (1995)                    Adventure|Children|Fantasy
2         3             Grumpier Old Men (1995)                                Comedy|Romance
3         4            Waiting to Exhale (1995)                          Comedy|Drama|Romance
4         5  Father of the Bride Part II (1995)                                        Comedy
5         6                         Heat (1995)                         Action|Crime|Thriller
6         7                      Sabrina (1995)                                Comedy|Romance
7         8                 Tom and Huck (1995)                            Adventure|Children
8         9                 Sudden Death (1995)                                        Action
9        10                    GoldenEye (1995)                     Action|Adventure|Thriller
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Number of interactions left: 1170\n" ] } ], "source": [ "ml_ratings_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"ratings.csv\")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})\n", "ml_movies_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"movies.csv\")).rename(columns={'movieId': 'item_id'})\n", "ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')\n", "ml_df.head(10)\n", "\n", "display(HTML(ml_movies_df.head(10).to_html()))\n", "\n", "# Filter the data to reduce the number of movies\n", "seed = 6789\n", "rng = np.random.RandomState(seed=seed)\n", "left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)\n", "\n", "ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]\n", "ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]\n", "ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]\n", "\n", "print(\"Number of interactions left: {}\".format(len(ml_ratings_df)))" ] }, { "cell_type": "markdown", "id": "protecting-recognition", "metadata": {}, "source": [ "# Inner workings of the Amazon recommender fit method" ] }, { "cell_type": "markdown", "id": "plastic-brooklyn", "metadata": {}, "source": [ "## Shift item ids and user ids so that they are consecutive" ] }, { "cell_type": "code", "execution_count": 71, "id": "valuable-modem", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Item mapping\n", "{780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, 145: 6, 267: 7, 355: 8, 435: 9, 6502: 10, 73323: 11, 112421: 12, 1783: 13, 2806: 14, 3040: 15, 3551: 16, 2135: 17, 39715: 18, 41566: 19, 5673: 20, 7064: 21, 481: 22, 6537: 23, 44761: 24, 2690: 25, 228: 26, 4890: 27, 3614: 28, 3507: 29, 3628: 30, 5954: 31, 8605: 32, 3786: 33, 6755: 34, 3468: 35, 50601: 36, 3089: 37, 55444: 38, 118270: 39, 124404: 40, 3768: 41, 233: 42, 3687: 43, 171749: 44, 104218: 45, 182749: 46, 3342: 47, 65130: 48, 84952: 49, 152970: 50, 3067: 51, 4031: 52, 1107: 53, 47382: 54, 3801: 55, 5155: 56, 5612: 57, 5214: 58, 67295: 59, 3165: 60, 1752: 61, 31223: 62, 6713: 63, 66783: 64, 2043: 65, 2903: 66, 3313: 67, 4009: 68, 91842: 69, 2190: 70, 7282: 71, 4483: 72, 2275: 73, 3567: 74, 190207: 75, 4505: 76, 95147: 77, 4552: 78, 6033: 79, 2521: 80, 4397: 81, 151315: 82, 156706: 83, 151311: 84, 959: 85, 3714: 86, 4164: 87, 4796: 88, 31260: 89, 6927: 90, 126142: 91, 73804: 92, 26357: 93, 82684: 94, 6342: 95, 32799: 96, 31921: 97, 2892: 98, 2737: 99}\n", "\n", "Item reverse mapping\n", "{0: 780, 1: 1500, 2: 3479, 3: 171, 4: 1914, 5: 4896, 6: 145, 7: 267, 8: 355, 9: 435, 10: 6502, 11: 73323, 12: 112421, 13: 1783, 14: 2806, 15: 3040, 16: 3551, 17: 2135, 18: 39715, 19: 41566, 20: 5673, 21: 7064, 22: 481, 23: 6537, 24: 44761, 25: 2690, 26: 228, 27: 4890, 28: 3614, 29: 3507, 30: 3628, 31: 5954, 32: 8605, 33: 3786, 34: 6755, 35: 3468, 36: 50601, 37: 3089, 38: 55444, 39: 118270, 40: 124404, 41: 3768, 42: 233, 43: 3687, 44: 171749, 45: 104218, 46: 182749, 47: 3342, 48: 65130, 49: 84952, 50: 152970, 51: 3067, 52: 4031, 53: 1107, 54: 47382, 55: 3801, 56: 5155, 57: 5612, 58: 5214, 59: 67295, 60: 3165, 61: 1752, 62: 31223, 63: 6713, 64: 66783, 65: 2043, 66: 2903, 67: 3313, 68: 4009, 69: 91842, 70: 2190, 71: 7282, 72: 4483, 73: 2275, 74: 3567, 75: 190207, 76: 4505, 77: 95147, 78: 4552, 79: 6033, 80: 2521, 81: 4397, 82: 151315, 83: 156706, 84: 151311, 85: 959, 86: 3714, 87: 4164, 88: 
4796, 89: 31260, 90: 6927, 91: 126142, 92: 73804, 93: 26357, 94: 82684, 95: 6342, 96: 32799, 97: 31921, 98: 2892, 99: 2737}\n", "\n", "User mapping\n", "{1: 0, 4: 1, 6: 2, 7: 3, 11: 4, 15: 5, 17: 6, 18: 7, 19: 8, 20: 9, 21: 10, 22: 11, 23: 12, 24: 13, 27: 14, 28: 15, 29: 16, 31: 17, 32: 18, 33: 19, 34: 20, 36: 21, 38: 22, 39: 23, 40: 24, 41: 25, 42: 26, 43: 27, 44: 28, 45: 29, 46: 30, 48: 31, 50: 32, 51: 33, 53: 34, 57: 35, 58: 36, 59: 37, 61: 38, 62: 39, 63: 40, 64: 41, 66: 42, 67: 43, 68: 44, 70: 45, 71: 46, 72: 47, 73: 48, 74: 49, 75: 50, 76: 51, 78: 52, 80: 53, 82: 54, 83: 55, 84: 56, 86: 57, 88: 58, 89: 59, 90: 60, 91: 61, 94: 62, 95: 63, 96: 64, 99: 65, 100: 66, 101: 67, 103: 68, 104: 69, 105: 70, 106: 71, 108: 72, 109: 73, 111: 74, 112: 75, 113: 76, 114: 77, 115: 78, 116: 79, 117: 80, 120: 81, 121: 82, 122: 83, 125: 84, 129: 85, 132: 86, 133: 87, 135: 88, 136: 89, 137: 90, 139: 91, 140: 92, 141: 93, 142: 94, 144: 95, 148: 96, 149: 97, 150: 98, 151: 99, 153: 100, 154: 101, 156: 102, 158: 103, 160: 104, 161: 105, 162: 106, 164: 107, 165: 108, 166: 109, 167: 110, 169: 111, 170: 112, 171: 113, 173: 114, 174: 115, 175: 116, 176: 117, 177: 118, 178: 119, 179: 120, 181: 121, 182: 122, 184: 123, 186: 124, 187: 125, 190: 126, 194: 127, 195: 128, 198: 129, 199: 130, 200: 131, 201: 132, 202: 133, 203: 134, 204: 135, 205: 136, 206: 137, 210: 138, 212: 139, 213: 140, 214: 141, 215: 142, 216: 143, 217: 144, 219: 145, 220: 146, 221: 147, 222: 148, 223: 149, 226: 150, 229: 151, 230: 152, 232: 153, 233: 154, 234: 155, 235: 156, 236: 157, 239: 158, 240: 159, 243: 160, 244: 161, 246: 162, 247: 163, 249: 164, 254: 165, 256: 166, 257: 167, 260: 168, 262: 169, 263: 170, 264: 171, 265: 172, 266: 173, 269: 174, 270: 175, 271: 176, 273: 177, 274: 178, 275: 179, 276: 180, 277: 181, 279: 182, 280: 183, 282: 184, 283: 185, 284: 186, 287: 187, 288: 188, 290: 189, 291: 190, 292: 191, 294: 192, 297: 193, 298: 194, 301: 195, 302: 196, 303: 197, 304: 198, 305: 199, 306: 200, 307: 201, 308: 202, 310: 203, 312: 204, 313: 205, 314: 206, 318: 207, 321: 208, 322: 209, 325: 210, 328: 211, 330: 212, 331: 213, 332: 214, 333: 215, 334: 216, 335: 217, 337: 218, 338: 219, 339: 220, 340: 221, 341: 222, 345: 223, 347: 224, 349: 225, 352: 226, 353: 227, 354: 228, 356: 229, 357: 230, 359: 231, 361: 232, 364: 233, 365: 234, 366: 235, 367: 236, 368: 237, 369: 238, 370: 239, 373: 240, 374: 241, 376: 242, 380: 243, 381: 244, 382: 245, 383: 246, 384: 247, 385: 248, 386: 249, 387: 250, 389: 251, 391: 252, 395: 253, 399: 254, 402: 255, 408: 256, 409: 257, 410: 258, 411: 259, 412: 260, 413: 261, 414: 262, 415: 263, 417: 264, 419: 265, 420: 266, 422: 267, 423: 268, 425: 269, 426: 270, 427: 271, 428: 272, 431: 273, 432: 274, 434: 275, 436: 276, 437: 277, 438: 278, 440: 279, 445: 280, 446: 281, 447: 282, 448: 283, 451: 284, 452: 285, 453: 286, 455: 287, 456: 288, 460: 289, 462: 290, 463: 291, 464: 292, 465: 293, 466: 294, 467: 295, 469: 296, 474: 297, 475: 298, 477: 299, 479: 300, 480: 301, 482: 302, 483: 303, 484: 304, 486: 305, 489: 306, 490: 307, 491: 308, 492: 309, 495: 310, 500: 311, 501: 312, 503: 313, 504: 314, 505: 315, 509: 316, 510: 317, 511: 318, 513: 319, 514: 320, 517: 321, 521: 322, 522: 323, 524: 324, 525: 325, 527: 326, 529: 327, 533: 328, 534: 329, 536: 330, 537: 331, 540: 332, 542: 333, 543: 334, 544: 335, 552: 336, 553: 337, 555: 338, 556: 339, 557: 340, 558: 341, 559: 342, 560: 343, 561: 344, 562: 345, 563: 346, 564: 347, 566: 348, 567: 349, 570: 350, 573: 351, 577: 352, 579: 353, 580: 354, 581: 355, 584: 356, 585: 357, 
586: 358, 587: 359, 589: 360, 590: 361, 592: 362, 593: 363, 594: 364, 595: 365, 596: 366, 597: 367, 599: 368, 600: 369, 602: 370, 603: 371, 604: 372, 605: 373, 606: 374, 607: 375, 608: 376, 610: 377}\n", "\n", "User reverse mapping\n", "{0: 1, 1: 4, 2: 6, 3: 7, 4: 11, 5: 15, 6: 17, 7: 18, 8: 19, 9: 20, 10: 21, 11: 22, 12: 23, 13: 24, 14: 27, 15: 28, 16: 29, 17: 31, 18: 32, 19: 33, 20: 34, 21: 36, 22: 38, 23: 39, 24: 40, 25: 41, 26: 42, 27: 43, 28: 44, 29: 45, 30: 46, 31: 48, 32: 50, 33: 51, 34: 53, 35: 57, 36: 58, 37: 59, 38: 61, 39: 62, 40: 63, 41: 64, 42: 66, 43: 67, 44: 68, 45: 70, 46: 71, 47: 72, 48: 73, 49: 74, 50: 75, 51: 76, 52: 78, 53: 80, 54: 82, 55: 83, 56: 84, 57: 86, 58: 88, 59: 89, 60: 90, 61: 91, 62: 94, 63: 95, 64: 96, 65: 99, 66: 100, 67: 101, 68: 103, 69: 104, 70: 105, 71: 106, 72: 108, 73: 109, 74: 111, 75: 112, 76: 113, 77: 114, 78: 115, 79: 116, 80: 117, 81: 120, 82: 121, 83: 122, 84: 125, 85: 129, 86: 132, 87: 133, 88: 135, 89: 136, 90: 137, 91: 139, 92: 140, 93: 141, 94: 142, 95: 144, 96: 148, 97: 149, 98: 150, 99: 151, 100: 153, 101: 154, 102: 156, 103: 158, 104: 160, 105: 161, 106: 162, 107: 164, 108: 165, 109: 166, 110: 167, 111: 169, 112: 170, 113: 171, 114: 173, 115: 174, 116: 175, 117: 176, 118: 177, 119: 178, 120: 179, 121: 181, 122: 182, 123: 184, 124: 186, 125: 187, 126: 190, 127: 194, 128: 195, 129: 198, 130: 199, 131: 200, 132: 201, 133: 202, 134: 203, 135: 204, 136: 205, 137: 206, 138: 210, 139: 212, 140: 213, 141: 214, 142: 215, 143: 216, 144: 217, 145: 219, 146: 220, 147: 221, 148: 222, 149: 223, 150: 226, 151: 229, 152: 230, 153: 232, 154: 233, 155: 234, 156: 235, 157: 236, 158: 239, 159: 240, 160: 243, 161: 244, 162: 246, 163: 247, 164: 249, 165: 254, 166: 256, 167: 257, 168: 260, 169: 262, 170: 263, 171: 264, 172: 265, 173: 266, 174: 269, 175: 270, 176: 271, 177: 273, 178: 274, 179: 275, 180: 276, 181: 277, 182: 279, 183: 280, 184: 282, 185: 283, 186: 284, 187: 287, 188: 288, 189: 290, 190: 291, 191: 292, 192: 294, 193: 297, 194: 298, 195: 301, 196: 302, 197: 303, 198: 304, 199: 305, 200: 306, 201: 307, 202: 308, 203: 310, 204: 312, 205: 313, 206: 314, 207: 318, 208: 321, 209: 322, 210: 325, 211: 328, 212: 330, 213: 331, 214: 332, 215: 333, 216: 334, 217: 335, 218: 337, 219: 338, 220: 339, 221: 340, 222: 341, 223: 345, 224: 347, 225: 349, 226: 352, 227: 353, 228: 354, 229: 356, 230: 357, 231: 359, 232: 361, 233: 364, 234: 365, 235: 366, 236: 367, 237: 368, 238: 369, 239: 370, 240: 373, 241: 374, 242: 376, 243: 380, 244: 381, 245: 382, 246: 383, 247: 384, 248: 385, 249: 386, 250: 387, 251: 389, 252: 391, 253: 395, 254: 399, 255: 402, 256: 408, 257: 409, 258: 410, 259: 411, 260: 412, 261: 413, 262: 414, 263: 415, 264: 417, 265: 419, 266: 420, 267: 422, 268: 423, 269: 425, 270: 426, 271: 427, 272: 428, 273: 431, 274: 432, 275: 434, 276: 436, 277: 437, 278: 438, 279: 440, 280: 445, 281: 446, 282: 447, 283: 448, 284: 451, 285: 452, 286: 453, 287: 455, 288: 456, 289: 460, 290: 462, 291: 463, 292: 464, 293: 465, 294: 466, 295: 467, 296: 469, 297: 474, 298: 475, 299: 477, 300: 479, 301: 480, 302: 482, 303: 483, 304: 484, 305: 486, 306: 489, 307: 490, 308: 491, 309: 492, 310: 495, 311: 500, 312: 501, 313: 503, 314: 504, 315: 505, 316: 509, 317: 510, 318: 511, 319: 513, 320: 514, 321: 517, 322: 521, 323: 522, 324: 524, 325: 525, 326: 527, 327: 529, 328: 533, 329: 534, 330: 536, 331: 537, 332: 540, 333: 542, 334: 543, 335: 544, 336: 552, 337: 553, 338: 555, 339: 556, 340: 557, 341: 558, 342: 559, 343: 560, 344: 561, 345: 562, 346: 563, 347: 564, 348: 566, 349: 
567, 350: 570, 351: 573, 352: 577, 353: 579, 354: 580, 355: 581, 356: 584, 357: 585, 358: 586, 359: 587, 360: 589, 361: 590, 362: 592, 363: 593, 364: 594, 365: 595, 366: 596, 367: 597, 368: 599, 369: 600, 370: 602, 371: 603, 372: 604, 373: 605, 374: 606, 375: 607, 376: 608, 377: 610}\n", "\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
     user_id  item_id  rating   timestamp
42         0        0     3.0   964984086
97         0        1     4.0   964980985
216        0        2     4.0   964981725
310        1        3     3.0   945078428
398        1        1     4.0   964622830
416        1        4     4.0   964622714
513        1        5     4.0  1007574532
616        2        6     4.0   845553966
629        2        3     3.0   845555402
677        2        7     3.0   845554376
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "interactions_df = ml_ratings_df.copy()\n", "\n", "unique_item_ids = interactions_df['item_id'].unique()\n", "item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))\n", "item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))\n", "unique_user_ids = interactions_df['user_id'].unique()\n", "user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))\n", "user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))\n", "\n", "interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)\n", "\n", "print(\"Item mapping\")\n", "print(item_id_mapping)\n", "print()\n", "\n", "print(\"Item reverse mapping\")\n", "print(item_id_reverse_mapping)\n", "print()\n", "\n", "print(\"User mapping\")\n", "print(user_id_mapping)\n", "print()\n", "\n", "print(\"User reverse mapping\")\n", "print(user_id_reverse_mapping)\n", "print()\n", "\n", "display(HTML(interactions_df.head(10).to_html()))" ] }, { "cell_type": "markdown", "id": "basic-meeting", "metadata": {}, "source": [ "## Get the number of items and users" ] }, { "cell_type": "code", "execution_count": 75, "id": "close-massachusetts", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "n_items=100\n", "n_users=378\n" ] } ], "source": [ "n_items = np.max(interactions_df['item_id']) + 1\n", "n_users = np.max(interactions_df['user_id']) + 1\n", "\n", "print(\"n_items={}\\nn_users={}\".format(n_items, n_users))" ] }, { "cell_type": "markdown", "id": "permanent-corrections", "metadata": {}, "source": [ "## Get the maximal number of interactions" ] }, { "cell_type": "code", "execution_count": 73, "id": "peripheral-natural", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "max_interaction=31\n" ] } ], "source": [ "n_user_interactions = interactions_df[['user_id', 'item_id']].groupby(\"user_id\").count()\n", "# Unnecessary, but added for readability\n", "n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})\n", "max_interactions = n_user_interactions['n_items'].max()\n", "\n", "print(\"max_interaction={}\".format(max_interactions))" ] }, { "cell_type": "markdown", "id": "basic-production", "metadata": {}, "source": [ "## Calculate P_Y's" ] }, { "cell_type": "code", "execution_count": 76, "id": "concrete-transparency", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, 3: 0.005128205128205128, 4: 0.007692307692307693, 5: 0.09145299145299145, 6: 0.04358974358974359, 7: 0.01452991452991453, 8: 0.035897435897435895, 9: 0.05384615384615385, 10: 0.04957264957264957, 11: 0.004273504273504274, 12: 0.002564102564102564, 13: 0.004273504273504274, 14: 0.007692307692307693, 15: 0.007692307692307693, 16: 0.011111111111111112, 17: 0.009401709401709401, 18: 0.005982905982905983, 19: 0.05299145299145299, 20: 0.028205128205128206, 21: 0.005128205128205128, 22: 0.01623931623931624, 23: 0.038461538461538464, 24: 0.010256410256410256, 25: 0.008547008547008548, 26: 0.002564102564102564, 27: 0.026495726495726495, 28: 0.006837606837606838, 29: 0.01282051282051282, 30: 0.0017094017094017094, 31: 0.018803418803418803, 32: 0.0017094017094017094, 33: 0.003418803418803419, 34: 0.011965811965811967, 35: 0.015384615384615385, 36: 0.007692307692307693, 37: 
0.013675213675213675, 38: 0.002564102564102564, 39: 0.0008547008547008547, 40: 0.0008547008547008547, 41: 0.0017094017094017094, 42: 0.010256410256410256, 43: 0.0008547008547008547, 44: 0.0008547008547008547, 45: 0.004273504273504274, 46: 0.0008547008547008547, 47: 0.004273504273504274, 48: 0.004273504273504274, 49: 0.0008547008547008547, 50: 0.003418803418803419, 51: 0.008547008547008548, 52: 0.0017094017094017094, 53: 0.0017094017094017094, 54: 0.003418803418803419, 55: 0.003418803418803419, 56: 0.0008547008547008547, 57: 0.0008547008547008547, 58: 0.003418803418803419, 59: 0.003418803418803419, 60: 0.0017094017094017094, 61: 0.003418803418803419, 62: 0.0008547008547008547, 63: 0.004273504273504274, 64: 0.0017094017094017094, 65: 0.003418803418803419, 66: 0.0017094017094017094, 67: 0.0017094017094017094, 68: 0.0017094017094017094, 69: 0.0017094017094017094, 70: 0.0008547008547008547, 71: 0.0008547008547008547, 72: 0.002564102564102564, 73: 0.004273504273504274, 74: 0.0008547008547008547, 75: 0.0008547008547008547, 76: 0.0008547008547008547, 77: 0.0017094017094017094, 78: 0.002564102564102564, 79: 0.0008547008547008547, 80: 0.0017094017094017094, 81: 0.0017094017094017094, 82: 0.002564102564102564, 83: 0.0008547008547008547, 84: 0.0008547008547008547, 85: 0.0008547008547008547, 86: 0.0008547008547008547, 87: 0.0017094017094017094, 88: 0.0017094017094017094, 89: 0.0008547008547008547, 90: 0.0008547008547008547, 91: 0.0008547008547008547, 92: 0.0008547008547008547, 93: 0.0008547008547008547, 94: 0.0008547008547008547, 95: 0.0008547008547008547, 96: 0.0008547008547008547, 97: 0.0008547008547008547, 98: 0.0008547008547008547, 99: 0.0008547008547008547}\n" ] } ], "source": [ "n_interactions = len(interactions_df)\n", "p_y = interactions_df[['item_id', 'user_id']].groupby(\"item_id\").count().reset_index()\n", "p_y = p_y.rename(columns={'user_id': 'P_Y'})\n", "p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions\n", "p_y = dict(zip(p_y['item_id'], p_y['P_Y']))\n", "\n", "print(p_y)" ] }, { "cell_type": "markdown", "id": "consolidated-constant", "metadata": {}, "source": [ "## For every X calculate the E[Y|X]" ] }, { "cell_type": "code", "execution_count": 99, "id": "alive-cameroon", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "p_y_powers for the first item\n", "[1.726e-01 2.981e-02 5.146e-03 8.885e-04 1.534e-04 2.648e-05 4.573e-06\n", " 7.894e-07 1.363e-07 2.353e-08 4.063e-09 7.014e-10 1.211e-10 2.091e-11\n", " 3.610e-12 6.232e-13 1.076e-13 1.858e-14 3.207e-15 5.537e-16 9.560e-17\n", " 1.651e-17 2.850e-18 4.920e-19 8.494e-20 1.467e-20 2.532e-21 4.372e-22\n", " 7.547e-23 1.303e-23 2.250e-24]\n", "alpha_k\n", "[ 6.290e+02 -2.785e+03 1.408e+04 -6.937e+04 3.018e+05 -1.120e+06\n", " 3.530e+06 -9.507e+06 2.202e+07 -4.418e+07 7.716e+07 -1.179e+08\n", " 1.579e+08 -1.860e+08 1.928e+08 -1.759e+08 1.413e+08 -9.962e+07\n", " 6.154e+07 -3.315e+07 1.549e+07 -6.230e+06 2.134e+06 -6.142e+05\n", " 1.458e+05 -2.778e+04 4.088e+03 -4.360e+02 3.000e+01 -1.000e+00\n", " 0.000e+00]\n", "\n", "E[Y|X]\n", "[[65.262 26.076 9.065 3.154 4.68 ]\n", " [28.303 19.062 4.288 1.5 2.223]\n", " [10.216 5.074 5.815 0.712 1.046]\n", " [ 2.315 0.859 0.283 1.938 0.144]\n", " [ 4.526 2.47 0.999 0.366 2.908]]\n" ] } ], "source": [ "e_xy = np.zeros(shape=(n_items, n_items))\n", "e_xy[:][:] = -1e100\n", " \n", "items = interactions_df['item_id'].unique()\n", " \n", "p_y_powers = {}\n", "for y in items:\n", " p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])\n", " \n", 
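"# Illustrative check (extra sanity-check code added here, assuming the mapped item ids above):\n", "# p_y_powers[y][k - 1] == P_Y(y)**k for k = 1..max_interactions. These cached powers feed\n", "# the binomial expansion of 1 - (1 - P_Y(y))**|c| used below, which gives\n", "# E[Y|X] = sum_k alpha_k * P_Y(y)**k with alpha_k = sum_c (-1)**(k + 1) * binom(|c|, k),\n", "# where |c| is the number of non-X interactions of a user c who bought X.\n", "assert np.allclose(p_y_powers[0], p_y[0] ** np.arange(1, max_interactions + 1))\n", "\n",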
"print(\"p_y_powers for the first item\")\n", "print(p_y_powers[0])\n", "\n", "for x in items:\n", " # Get users who bought X\n", " c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()\n", "\n", " # Get users who bought only X\n", " c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()\n", " c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))\n", "\n", " # Calculate the number of non-X interactions for each user who bought X\n", " # Include users with zero non-X interactions\n", " n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]\n", " n_non_x_interactions = n_non_x_interactions.groupby(\"user_id\").count()\n", " # Unnecessary, but added for readability\n", " n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})\n", "\n", " zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=[\"n_items\"], index=c_only_x) # Remove\n", " n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])\n", "\n", " n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]\n", "\n", " # Calculate the expected numbers of Y products bought by clients who bought X\n", " alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)\n", " for abs_c in n_non_x_interactions[\"n_items\"]])\n", " for k in range(1, max_interactions + 1)])\n", " \n", " if x == 0:\n", " print(\"alpha_k\")\n", " print(alpha_k)\n", " print()\n", "\n", " for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y\n", " if y != x:\n", " e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])\n", " else:\n", " e_xy[x][y] = n_users * p_y[x]\n", "\n", "print(\"E[Y|X]\")\n", "print(np.around(e_xy[:10, :10], 3))" ] }, { "cell_type": "markdown", "id": "acknowledged-threshold", "metadata": {}, "source": [ "## Get the user-item interaction matrix" ] }, { "cell_type": "code", "execution_count": 89, "id": "extraordinary-mexico", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 1. 0. 1. 1. 1. 0. 0. 0. 0.]\n", " [1. 0. 0. 1. 0. 0. 1. 1. 1. 1.]\n", " [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n", " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", " [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n", " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", " [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]\n", " [0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]\n", " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]\n" ] } ], "source": [ "# mapping to int is necessary because of how iterrows works\n", "r = np.zeros(shape=(n_users, n_items))\n", "for idx, interaction in interactions_df.iterrows():\n", " r[int(interaction['user_id'])][int(interaction['item_id'])] = 1\n", " \n", "print(r[:10, :10])" ] }, { "cell_type": "markdown", "id": "lovely-password", "metadata": {}, "source": [ "## Calculate the number of users who bought both X and Y" ] }, { "cell_type": "code", "execution_count": 91, "id": "rubber-detector", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]\n", " [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]\n", " [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]\n", " [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]\n", " [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]\n", " [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]\n", " [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]\n", " [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]\n", " [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]\n", " [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 
63.]]\n" ] } ], "source": [ "# Simple and slow method (commented out)\n", "\n", "# n_xy = np.zeros(shape=(n_items, n_items))\n", "\n", "# for x in items:\n", "# for y in items:\n", "# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())\n", "# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())\n", "# users_x_and_y = users_x & users_y\n", "# n_xy[x][y] = len(users_x_and_y)\n", "\n", "# Optimized method (can be further optimized by using sparse matrices)\n", "\n", "n_xy = np.matmul(r.T, r)\n", "\n", "print(n_xy[:10, :10])" ] }, { "cell_type": "markdown", "id": "distinguished-consequence", "metadata": {}, "source": [ "## Calculate the scores" ] }, { "cell_type": "code", "execution_count": 97, "id": "pointed-deputy", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]\n", " [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]\n", " [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]\n", " [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]\n", " [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]\n", " [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]\n", " [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]\n", " [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]\n", " [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]\n", " [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]\n" ] } ], "source": [ "scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)\n", "\n", "print(np.around(scores[:10, :10], 3))" ] }, { "cell_type": "markdown", "id": "endangered-stomach", "metadata": {}, "source": [ "## Final comparison" ] }, { "cell_type": "code", "execution_count": 103, "id": "prepared-fraction", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "E[Y|X]\n", "[[65.262 26.076 9.065 3.154 4.68 41.571 23.082 8.592 19.542 27.522]\n", " [28.303 19.062 4.288 1.5 2.223 18.99 10.768 4.066 9.15 12.778]\n", " [10.216 5.074 5.815 0.712 1.046 7.386 4.577 1.872 3.964 5.308]\n", " [ 2.315 0.859 0.283 1.938 0.144 1.433 0.754 0.267 0.631 0.911]\n", " [ 4.526 2.47 0.999 0.366 2.908 3.453 2.245 0.951 1.962 2.574]\n", " [47.984 20.534 7.279 2.549 3.776 34.569 18.241 6.902 15.507 21.636]\n", " [25.303 11.206 4.05 1.429 2.112 17.265 16.477 3.843 8.524 11.789]\n", " [ 9.094 4.124 1.561 0.561 0.826 6.205 3.701 5.492 3.186 4.326]\n", " [21.633 9.823 3.601 1.276 1.884 14.955 8.776 3.417 13.569 10.322]\n", " [25.03 10.257 3.571 1.243 1.844 16.332 9.082 3.385 7.691 20.354]]\n", "\n", "N(X, Y)\n", "[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]\n", " [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]\n", " [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]\n", " [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]\n", " [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]\n", " [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]\n", " [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]\n", " [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]\n", " [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]\n", " [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 
63.]]\n", "\n", "Scores\n", "[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]\n", " [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]\n", " [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]\n", " [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]\n", " [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]\n", " [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]\n", " [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]\n", " [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]\n", " [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]\n", " [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]\n", "\n" ] } ], "source": [ "print(\"E[Y|X]\")\n", "print(np.around(e_xy[:10, :10], 3))\n", "print()\n", "\n", "print(\"N(X, Y)\")\n", "print(n_xy[:10, :10])\n", "print()\n", "\n", "print(\"Scores\")\n", "print(np.around(scores[:10, :10], 3))\n", "print()" ] }, { "cell_type": "markdown", "id": "distant-archive", "metadata": {}, "source": [ "# Inner workings of the Amazon recommender recommend method" ] }, { "cell_type": "code", "execution_count": 111, "id": "aerial-shipping", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recommendation: 1, Brick (2005), 6.122652596595853\n", "Recommendation: 1, Oh, God! (1977), 5.908857666844879\n", "Recommendation: 1, Bubba Ho-tep (2002), 5.830666625469312\n", "Recommendation: 1, Meatballs (1979), 5.56930833865894\n", "Recommendation: 1, Millennium Actress (Sennen joyû) (2001), 5.502504256363742\n", "Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393\n", "Recommendation: 1, Six-String Samurai (1998), 5.225652131462832\n", "Recommendation: 1, Grass Is Greener, The (1960), 5.144470412494206\n", "Recommendation: 1, Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), 4.796473011676857\n", "Recommendation: 1, Clara's Heart (1988), 4.608515964550741\n" ] } ], "source": [ "user_id = 1\n", "should_recommend_already_bought = False\n", "n_recommendations = 10\n", "\n", "mapped_user_id = user_id_mapping[user_id]\n", "\n", "x_list = interactions_df.loc[interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()\n", "final_scores = np.sum(scores[x_list], axis=0)\n", "\n", "# Choose n recommendations based on highest scores\n", "if not should_recommend_already_bought:\n", " final_scores[x_list] = -1e100\n", "\n", "chosen_ids = np.argsort(-final_scores)[:n_recommendations]\n", "\n", "for item_id in chosen_ids:\n", " print(\"Recommendation: {}, {}, {}\".format(user_id_reverse_mapping[mapped_user_id],\n", " ml_movies_df.loc[ml_movies_df['item_id'] == item_id_reverse_mapping[item_id], \n", " 'title'].iloc[0],\n", " final_scores[item_id]))" ] }, { "cell_type": "markdown", "id": "opponent-prediction", "metadata": {}, "source": [ "# Amazon recommder" ] }, { "cell_type": "code", "execution_count": 48, "id": "fancy-return", "metadata": {}, "outputs": [], "source": [ "from recommenders.recommender import Recommender\n", "\n", "class AmazonRecommender(Recommender):\n", " \"\"\"\n", " Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:\n", " - Linden G., Smith B., York Y., Amazon.com Recommendations. 
Item-to-Item Collaborative Filtering,\n", " IEEE Internet Computing, 2003,\n", " - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.\n", " \"\"\"\n", "\n", " def __init__(self):\n", " super().__init__()\n", " self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n", " self.interactions_df = None\n", " self.item_id_mapping = None\n", " self.user_id_mapping = None\n", " self.item_id_reverse_mapping = None\n", " self.user_id_reverse_mapping = None\n", " self.e_xy = None\n", " self.n_xy = None\n", " self.scores = None\n", " self.most_popular_items = None\n", " self.should_recommend_already_bought = False\n", "\n", " def initialize(self, **params):\n", " if 'should_recommend_already_bought' in params:\n", " self.should_recommend_already_bought = params['should_recommend_already_bought']\n", "\n", " def fit(self, interactions_df, users_df, items_df):\n", " \"\"\"\n", " Training of the recommender.\n", "\n", " :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items\n", " defined by user_id, item_id and features of the interaction.\n", " :param pd.DataFrame users_df: DataFrame with users and their features defined by\n", " user_id and the user feature columns.\n", " :param pd.DataFrame items_df: DataFrame with items and their features defined\n", " by item_id and the item feature columns.\n", " \"\"\"\n", "\n", " # Shift item ids and user ids so that they are consecutive\n", "\n", " unique_item_ids = interactions_df['item_id'].unique()\n", " self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))\n", " self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))\n", " unique_user_ids = interactions_df['user_id'].unique()\n", " self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))\n", " self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))\n", " \n", " interactions_df = interactions_df.copy()\n", " interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)\n", "\n", " # Get the number of items and users\n", "\n", " self.interactions_df = interactions_df\n", " n_items = np.max(interactions_df['item_id']) + 1\n", " n_users = np.max(interactions_df['user_id']) + 1\n", "\n", " # Get maximal number of interactions\n", "\n", " n_user_interactions = interactions_df[['user_id', 'item_id']].groupby(\"user_id\").count()\n", " # Unnecessary, but added for readability\n", " n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})\n", " max_interactions = n_user_interactions['n_items'].max()\n", "\n", " # Calculate P_Y's\n", "\n", " n_interactions = len(interactions_df)\n", " p_y = interactions_df[['item_id', 'user_id']].groupby(\"item_id\").count().reset_index()\n", " p_y = p_y.rename(columns={'user_id': 'P_Y'})\n", " p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions\n", " p_y = dict(zip(p_y['item_id'], p_y['P_Y']))\n", "\n", " # Get the series of all items\n", "\n", " # items = list(range(n_items))\n", " items = interactions_df['item_id'].unique()\n", "\n", " # For every X calculate the E[Y|X]\n", "\n", " e_xy = np.zeros(shape=(n_items, n_items))\n", " e_xy[:][:] = -1e100\n", "\n", " p_y_powers = {}\n", " for y in items:\n", " p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])\n", "\n", " for x in items:\n", " # Get users who bought X\n", " c_x = 
interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()\n", "\n", " # Get users who bought only X\n", " c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()\n", " c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))\n", "\n", " # Calculate the number of non-X interactions for each user who bought X\n", " # Include users with zero non-X interactions\n", " n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]\n", " n_non_x_interactions = n_non_x_interactions.groupby(\"user_id\").count()\n", " # Unnecessary, but added for readability\n", " n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})\n", "\n", " zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=[\"n_items\"], index=c_only_x) # Remove\n", " n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])\n", "\n", " n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]\n", "\n", " # Calculate the expected numbers of Y products bought by clients who bought X\n", " alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)\n", " for abs_c in n_non_x_interactions[\"n_items\"]])\n", " for k in range(1, max_interactions + 1)])\n", "\n", " for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y\n", " if y != x:\n", " e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])\n", " else:\n", " e_xy[x][y] = n_users * p_y[x]\n", "\n", " self.e_xy = e_xy\n", "\n", " # Calculate the number of users who bought both X and Y\n", "\n", " # Simple and slow method (commented out)\n", "\n", " # n_xy = np.zeros(shape=(n_items, n_items))\n", "\n", " # for x in items:\n", " # for y in items:\n", " # users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())\n", " # users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())\n", " # users_x_and_y = users_x & users_y\n", " # n_xy[x][y] = len(users_x_and_y)\n", "\n", " # Optimized method (can be further optimized by using sparse matrices)\n", "\n", " # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)\n", " r = np.zeros(shape=(n_users, n_items))\n", " for idx, interaction in interactions_df.iterrows():\n", " r[int(interaction['user_id'])][int(interaction['item_id'])] = 1\n", "\n", " # Get the number of users who bought both X and Y\n", "\n", " n_xy = np.matmul(r.T, r)\n", "\n", " self.n_xy = n_xy\n", " \n", " # Calculate the scores\n", "\n", " self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)\n", " \n", " # Find the most popular items for the cold start problem\n", " \n", " offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()\n", " offers_count = offers_count.sort_values('user_id', ascending=False)\n", " self.most_popular_items = offers_count.index\n", "\n", " def recommend(self, users_df, items_df, n_recommendations=1):\n", " \"\"\"\n", " Serving of recommendations. 
Scores items in items_df for each user in users_df and returns\n", " top n_recommendations for each user.\n", "\n", " :param pd.DataFrame users_df: DataFrame with users and their features for which\n", " recommendations should be generated.\n", " :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.\n", " :param int n_recommendations: Number of recommendations to be returned for each user.\n", " :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations\n", " for each user.\n", " :rtype: pd.DataFrame\n", " \"\"\"\n", "\n", " # Clean previous recommendations (iloc could be used alternatively)\n", " self.recommender_df = self.recommender_df[:0]\n", " \n", " # Map item ids to the consecutive internal ids\n", " \n", " items_df = items_df.copy()\n", " items_df.replace({'item_id': self.item_id_mapping}, inplace=True)\n", "\n", " # Generate recommendations\n", "\n", " for idx, user in users_df.iterrows():\n", " recommendations = []\n", " \n", " user_id = user['user_id']\n", " \n", " if user_id in self.user_id_mapping:\n", " mapped_user_id = self.user_id_mapping[user_id]\n", " \n", " x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()\n", " final_scores = np.sum(self.scores[x_list], axis=0)\n", "\n", " # Choose n recommendations based on highest scores\n", " if not self.should_recommend_already_bought:\n", " final_scores[x_list] = -1e100\n", "\n", " chosen_ids = np.argsort(-final_scores)[:n_recommendations]\n", "\n", " for item_id in chosen_ids:\n", " recommendations.append(\n", " {\n", " 'user_id': self.user_id_reverse_mapping[mapped_user_id],\n", " 'item_id': self.item_id_reverse_mapping[item_id],\n", " 'score': final_scores[item_id]\n", " }\n", " )\n", " else: # Handle users not in the training data - recommend the most popular items to new users\n", " for i in range(n_recommendations):\n", " recommendations.append(\n", " {\n", " 'user_id': user['user_id'],\n", " 'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],\n", " 'score': 1.0\n", " }\n", " )\n", "\n", " user_recommendations = pd.DataFrame(recommendations)\n", "\n", " self.recommender_df = pd.concat([self.recommender_df, user_recommendations])\n", "\n", " return self.recommender_df" ] }, { "cell_type": "code", "execution_count": 49, "id": "nonprofit-roads", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recommendations\n" ] }, { "data": { "text/html": [ "\n",
    user_id  item_id      score  title                                                                                            genres
0         1    44761   6.122653  Brick (2005)                                                                                     Crime|Drama|Film-Noir|Mystery
1         1     5214   5.908858  Oh, God! (1977)                                                                                  Comedy|Fantasy
2         1     6755   5.830667  Bubba Ho-tep (2002)                                                                              Comedy|Horror
3         1     3040   5.569308  Meatballs (1979)                                                                                 Comedy
4         1     6713   5.502504  Millennium Actress (Sennen joyû) (2001)                                                          Animation|Drama|Romance
5         1     3614   5.387478  Honeymoon in Vegas (1992)                                                                        Comedy|Romance
6         1     2275   5.225652  Six-String Samurai (1998)                                                                        Action|Adventure|Sci-Fi
7         1     4796   5.144470  Grass Is Greener, The (1960)                                                                     Comedy|Romance
8         1     4896   4.796473  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)  Adventure|Children|Fantasy
9         1     3714   4.608516  Clara's Heart (1988)                                                                             Drama
10        4     3614   7.825335  Honeymoon in Vegas (1992)                                                                        Comedy|Romance
11        4     6713   7.407051  Millennium Actress (Sennen joyû) (2001)                                                          Animation|Drama|Romance
12        4     2690   6.599105  Ideal Husband, An (1999)                                                                         Comedy|Romance
13        4    44761   6.205835  Brick (2005)                                                                                     Crime|Drama|Film-Noir|Mystery
14        4     3628   6.186298  Flying Tigers (1942)                                                                             Action|Drama|Romance|War
15        4     6755   5.977848  Bubba Ho-tep (2002)                                                                              Comedy|Horror
16        4      959   5.919668  Of Human Bondage (1934)                                                                          Drama
17        4    31260   5.919668  Boys Town (1938)                                                                                 Drama
18        4     6033   5.919668  Mystery Date (1991)                                                                              Comedy
19        4     3714   5.919668  Clara's Heart (1988)                                                                             Drama
20        6     3614  11.392962  Honeymoon in Vegas (1992)                                                                        Comedy|Romance
21        6    31921   8.329693  Seven-Per-Cent Solution, The (1976)                                                              Adventure|Comedy|Crime|Drama|Mystery|Thriller
22        6     1752   8.236954  Hard Rain (1998)                                                                                 Action|Crime|Thriller
23        6    95147   8.006113  Dragon Ball: Sleeping Princess in Devil's Castle (Doragon bôru: Majinjô no nemuri hime) (1987)  Action|Adventure|Animation|Children
24        6     2275   6.941940  Six-String Samurai (1998)                                                                        Action|Adventure|Sci-Fi
25        6     3479   6.771276  Ladyhawke (1985)                                                                                 Adventure|Fantasy|Romance
26        6     6755   6.520369  Bubba Ho-tep (2002)                                                                              Comedy|Horror
27        6     6537   6.454421  Terminator 3: Rise of the Machines (2003)                                                        Action|Adventure|Sci-Fi
28        6     4483   6.339894  Caddyshack II (1988)                                                                             Comedy
29        6      228   6.174734  Destiny Turns on the Radio (1995)                                                                Comedy
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Quick test of the recommender\n", "\n", "amazon_recommender = AmazonRecommender()\n", "amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)\n", "recommendations = amazon_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)\n", "\n", "recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')\n", "print(\"Recommendations\")\n", "display(HTML(recommendations.to_html()))" ] }, { "cell_type": "markdown", "id": "framed-negative", "metadata": {}, "source": [ "# Training-test split evaluation" ] }, { "cell_type": "code", "execution_count": 55, "id": "romantic-music", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender        HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  AmazonRecommender  0.181818  0.311688  0.402597  0.551948  0.181818  0.257806  0.294682  0.341470
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from evaluation_and_testing.testing import evaluate_train_test_split_implicit\n", "\n", "amazon_recommender = AmazonRecommender()\n", "\n", "amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(\n", " amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n", "\n", "amazon_tts_results = pd.DataFrame(\n", " amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(amazon_tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 57, "id": "saving-harrison", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender       HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  TFIDFRecommender  0.025974  0.090909  0.136364  0.318182  0.025974  0.064393  0.083685  0.140799
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from recommenders.tfidf_recommender import TFIDFRecommender\n", "\n", "tfidf_recommender = TFIDFRecommender()\n", "\n", "tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(\n", " tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n", "\n", "tfidf_tts_results = pd.DataFrame(\n", " tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(tfidf_tts_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 59, "id": "random-source", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender        HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  AmazonRecommender  0.181818  0.311688  0.402597  0.551948  0.181818  0.257806  0.294682  0.341470
1  TFIDFRecommender   0.025974  0.090909  0.136364  0.318182  0.025974  0.064393  0.083685  0.140799
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tts_results = pd.concat([amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)\n", "display(HTML(tts_results.to_html()))" ] }, { "cell_type": "markdown", "id": "continued-harassment", "metadata": {}, "source": [ "# Leave-one-out evaluation" ] }, { "cell_type": "code", "execution_count": 62, "id": "prerequisite-lounge", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender        HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  AmazonRecommender  0.166667  0.256667  0.320000  0.426667  0.166667  0.219086  0.245486  0.279978
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from evaluation_and_testing.testing import evaluate_leave_one_out_implicit\n", "\n", "amazon_recommender = AmazonRecommender()\n", "\n", "amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(\n", " amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n", "\n", "amazon_loo_results = pd.DataFrame(\n", " amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(amazon_loo_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 60, "id": "behind-cambodia", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender       HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  TFIDFRecommender  0.006667  0.053333  0.123333  0.233333  0.006667  0.033491  0.062178  0.096151
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tfidf_recommender = TFIDFRecommender()\n", "\n", "tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(\n", " tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n", "\n", "tfidf_loo_results = pd.DataFrame(\n", " tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n", "\n", "display(HTML(tfidf_loo_results.to_html()))" ] }, { "cell_type": "code", "execution_count": 63, "id": "lightweight-password", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
   Recommender        HR@1      HR@3      HR@5      HR@10     NDCG@1    NDCG@3    NDCG@5    NDCG@10
0  AmazonRecommender  0.166667  0.256667  0.320000  0.426667  0.166667  0.219086  0.245486  0.279978
1  TFIDFRecommender   0.006667  0.053333  0.123333  0.233333  0.006667  0.033491  0.062178  0.096151
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "loo_results = pd.concat([amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)\n", "display(HTML(loo_results.to_html()))" ] }, { "cell_type": "code", "execution_count": null, "id": "mediterranean-residence", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }