meh/recommender-systems-class-master/class_5_amazon_recommender.ipynb

%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold
import scipy.special as scisp

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Load data

ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')
ml_df.head(10)

display(HTML(ml_movies_df.head(10).to_html()))

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

print("Number of interactions left: {}".format(len(ml_ratings_df)))
item_id title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 2 Jumanji (1995) Adventure|Children|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance
4 5 Father of the Bride Part II (1995) Comedy
5 6 Heat (1995) Action|Crime|Thriller
6 7 Sabrina (1995) Comedy|Romance
7 8 Tom and Huck (1995) Adventure|Children
8 9 Sudden Death (1995) Action
9 10 GoldenEye (1995) Action|Adventure|Thriller
Number of interactions left: 1170

Inner workings of the Amazon recommender fit method

Shift item ids and user ids so that they are consecutive (and can therefore be used directly as indices into numpy arrays)

interactions_df = ml_ratings_df.copy()

unique_item_ids = interactions_df['item_id'].unique()
item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)

print("Item mapping")
print(item_id_mapping)
print()

print("Item reverse mapping")
print(item_id_reverse_mapping)
print()

print("User mapping")
print(user_id_mapping)
print()

print("User reverse mapping")
print(user_id_reverse_mapping)
print()

display(HTML(interactions_df.head(10).to_html()))
Item mapping
{780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, 145: 6, 267: 7, 355: 8, 435: 9, 6502: 10, 73323: 11, 112421: 12, 1783: 13, 2806: 14, 3040: 15, 3551: 16, 2135: 17, 39715: 18, 41566: 19, 5673: 20, 7064: 21, 481: 22, 6537: 23, 44761: 24, 2690: 25, 228: 26, 4890: 27, 3614: 28, 3507: 29, 3628: 30, 5954: 31, 8605: 32, 3786: 33, 6755: 34, 3468: 35, 50601: 36, 3089: 37, 55444: 38, 118270: 39, 124404: 40, 3768: 41, 233: 42, 3687: 43, 171749: 44, 104218: 45, 182749: 46, 3342: 47, 65130: 48, 84952: 49, 152970: 50, 3067: 51, 4031: 52, 1107: 53, 47382: 54, 3801: 55, 5155: 56, 5612: 57, 5214: 58, 67295: 59, 3165: 60, 1752: 61, 31223: 62, 6713: 63, 66783: 64, 2043: 65, 2903: 66, 3313: 67, 4009: 68, 91842: 69, 2190: 70, 7282: 71, 4483: 72, 2275: 73, 3567: 74, 190207: 75, 4505: 76, 95147: 77, 4552: 78, 6033: 79, 2521: 80, 4397: 81, 151315: 82, 156706: 83, 151311: 84, 959: 85, 3714: 86, 4164: 87, 4796: 88, 31260: 89, 6927: 90, 126142: 91, 73804: 92, 26357: 93, 82684: 94, 6342: 95, 32799: 96, 31921: 97, 2892: 98, 2737: 99}

Item reverse mapping
{0: 780, 1: 1500, 2: 3479, 3: 171, 4: 1914, 5: 4896, 6: 145, 7: 267, 8: 355, 9: 435, 10: 6502, 11: 73323, 12: 112421, 13: 1783, 14: 2806, 15: 3040, 16: 3551, 17: 2135, 18: 39715, 19: 41566, 20: 5673, 21: 7064, 22: 481, 23: 6537, 24: 44761, 25: 2690, 26: 228, 27: 4890, 28: 3614, 29: 3507, 30: 3628, 31: 5954, 32: 8605, 33: 3786, 34: 6755, 35: 3468, 36: 50601, 37: 3089, 38: 55444, 39: 118270, 40: 124404, 41: 3768, 42: 233, 43: 3687, 44: 171749, 45: 104218, 46: 182749, 47: 3342, 48: 65130, 49: 84952, 50: 152970, 51: 3067, 52: 4031, 53: 1107, 54: 47382, 55: 3801, 56: 5155, 57: 5612, 58: 5214, 59: 67295, 60: 3165, 61: 1752, 62: 31223, 63: 6713, 64: 66783, 65: 2043, 66: 2903, 67: 3313, 68: 4009, 69: 91842, 70: 2190, 71: 7282, 72: 4483, 73: 2275, 74: 3567, 75: 190207, 76: 4505, 77: 95147, 78: 4552, 79: 6033, 80: 2521, 81: 4397, 82: 151315, 83: 156706, 84: 151311, 85: 959, 86: 3714, 87: 4164, 88: 4796, 89: 31260, 90: 6927, 91: 126142, 92: 73804, 93: 26357, 94: 82684, 95: 6342, 96: 32799, 97: 31921, 98: 2892, 99: 2737}

User mapping
{1: 0, 4: 1, 6: 2, 7: 3, 11: 4, 15: 5, 17: 6, 18: 7, 19: 8, 20: 9, 21: 10, 22: 11, 23: 12, 24: 13, 27: 14, 28: 15, 29: 16, 31: 17, 32: 18, 33: 19, 34: 20, 36: 21, 38: 22, 39: 23, 40: 24, 41: 25, 42: 26, 43: 27, 44: 28, 45: 29, 46: 30, 48: 31, 50: 32, 51: 33, 53: 34, 57: 35, 58: 36, 59: 37, 61: 38, 62: 39, 63: 40, 64: 41, 66: 42, 67: 43, 68: 44, 70: 45, 71: 46, 72: 47, 73: 48, 74: 49, 75: 50, 76: 51, 78: 52, 80: 53, 82: 54, 83: 55, 84: 56, 86: 57, 88: 58, 89: 59, 90: 60, 91: 61, 94: 62, 95: 63, 96: 64, 99: 65, 100: 66, 101: 67, 103: 68, 104: 69, 105: 70, 106: 71, 108: 72, 109: 73, 111: 74, 112: 75, 113: 76, 114: 77, 115: 78, 116: 79, 117: 80, 120: 81, 121: 82, 122: 83, 125: 84, 129: 85, 132: 86, 133: 87, 135: 88, 136: 89, 137: 90, 139: 91, 140: 92, 141: 93, 142: 94, 144: 95, 148: 96, 149: 97, 150: 98, 151: 99, 153: 100, 154: 101, 156: 102, 158: 103, 160: 104, 161: 105, 162: 106, 164: 107, 165: 108, 166: 109, 167: 110, 169: 111, 170: 112, 171: 113, 173: 114, 174: 115, 175: 116, 176: 117, 177: 118, 178: 119, 179: 120, 181: 121, 182: 122, 184: 123, 186: 124, 187: 125, 190: 126, 194: 127, 195: 128, 198: 129, 199: 130, 200: 131, 201: 132, 202: 133, 203: 134, 204: 135, 205: 136, 206: 137, 210: 138, 212: 139, 213: 140, 214: 141, 215: 142, 216: 143, 217: 144, 219: 145, 220: 146, 221: 147, 222: 148, 223: 149, 226: 150, 229: 151, 230: 152, 232: 153, 233: 154, 234: 155, 235: 156, 236: 157, 239: 158, 240: 159, 243: 160, 244: 161, 246: 162, 247: 163, 249: 164, 254: 165, 256: 166, 257: 167, 260: 168, 262: 169, 263: 170, 264: 171, 265: 172, 266: 173, 269: 174, 270: 175, 271: 176, 273: 177, 274: 178, 275: 179, 276: 180, 277: 181, 279: 182, 280: 183, 282: 184, 283: 185, 284: 186, 287: 187, 288: 188, 290: 189, 291: 190, 292: 191, 294: 192, 297: 193, 298: 194, 301: 195, 302: 196, 303: 197, 304: 198, 305: 199, 306: 200, 307: 201, 308: 202, 310: 203, 312: 204, 313: 205, 314: 206, 318: 207, 321: 208, 322: 209, 325: 210, 328: 211, 330: 212, 331: 213, 332: 214, 333: 215, 334: 216, 335: 217, 337: 218, 338: 219, 339: 220, 340: 221, 341: 222, 345: 223, 347: 224, 349: 225, 352: 226, 353: 227, 354: 228, 356: 229, 357: 230, 359: 231, 361: 232, 364: 233, 365: 234, 366: 235, 367: 236, 368: 237, 369: 238, 370: 239, 373: 240, 374: 241, 376: 242, 380: 243, 381: 244, 382: 245, 383: 246, 384: 247, 385: 248, 386: 249, 387: 250, 389: 251, 391: 252, 395: 253, 399: 254, 402: 255, 408: 256, 409: 257, 410: 258, 411: 259, 412: 260, 413: 261, 414: 262, 415: 263, 417: 264, 419: 265, 420: 266, 422: 267, 423: 268, 425: 269, 426: 270, 427: 271, 428: 272, 431: 273, 432: 274, 434: 275, 436: 276, 437: 277, 438: 278, 440: 279, 445: 280, 446: 281, 447: 282, 448: 283, 451: 284, 452: 285, 453: 286, 455: 287, 456: 288, 460: 289, 462: 290, 463: 291, 464: 292, 465: 293, 466: 294, 467: 295, 469: 296, 474: 297, 475: 298, 477: 299, 479: 300, 480: 301, 482: 302, 483: 303, 484: 304, 486: 305, 489: 306, 490: 307, 491: 308, 492: 309, 495: 310, 500: 311, 501: 312, 503: 313, 504: 314, 505: 315, 509: 316, 510: 317, 511: 318, 513: 319, 514: 320, 517: 321, 521: 322, 522: 323, 524: 324, 525: 325, 527: 326, 529: 327, 533: 328, 534: 329, 536: 330, 537: 331, 540: 332, 542: 333, 543: 334, 544: 335, 552: 336, 553: 337, 555: 338, 556: 339, 557: 340, 558: 341, 559: 342, 560: 343, 561: 344, 562: 345, 563: 346, 564: 347, 566: 348, 567: 349, 570: 350, 573: 351, 577: 352, 579: 353, 580: 354, 581: 355, 584: 356, 585: 357, 586: 358, 587: 359, 589: 360, 590: 361, 592: 362, 593: 363, 594: 364, 595: 365, 596: 366, 597: 367, 599: 368, 600: 369, 602: 370, 603: 371, 604: 372, 
605: 373, 606: 374, 607: 375, 608: 376, 610: 377}

User reverse mapping
{0: 1, 1: 4, 2: 6, 3: 7, 4: 11, 5: 15, 6: 17, 7: 18, 8: 19, 9: 20, 10: 21, 11: 22, 12: 23, 13: 24, 14: 27, 15: 28, 16: 29, 17: 31, 18: 32, 19: 33, 20: 34, 21: 36, 22: 38, 23: 39, 24: 40, 25: 41, 26: 42, 27: 43, 28: 44, 29: 45, 30: 46, 31: 48, 32: 50, 33: 51, 34: 53, 35: 57, 36: 58, 37: 59, 38: 61, 39: 62, 40: 63, 41: 64, 42: 66, 43: 67, 44: 68, 45: 70, 46: 71, 47: 72, 48: 73, 49: 74, 50: 75, 51: 76, 52: 78, 53: 80, 54: 82, 55: 83, 56: 84, 57: 86, 58: 88, 59: 89, 60: 90, 61: 91, 62: 94, 63: 95, 64: 96, 65: 99, 66: 100, 67: 101, 68: 103, 69: 104, 70: 105, 71: 106, 72: 108, 73: 109, 74: 111, 75: 112, 76: 113, 77: 114, 78: 115, 79: 116, 80: 117, 81: 120, 82: 121, 83: 122, 84: 125, 85: 129, 86: 132, 87: 133, 88: 135, 89: 136, 90: 137, 91: 139, 92: 140, 93: 141, 94: 142, 95: 144, 96: 148, 97: 149, 98: 150, 99: 151, 100: 153, 101: 154, 102: 156, 103: 158, 104: 160, 105: 161, 106: 162, 107: 164, 108: 165, 109: 166, 110: 167, 111: 169, 112: 170, 113: 171, 114: 173, 115: 174, 116: 175, 117: 176, 118: 177, 119: 178, 120: 179, 121: 181, 122: 182, 123: 184, 124: 186, 125: 187, 126: 190, 127: 194, 128: 195, 129: 198, 130: 199, 131: 200, 132: 201, 133: 202, 134: 203, 135: 204, 136: 205, 137: 206, 138: 210, 139: 212, 140: 213, 141: 214, 142: 215, 143: 216, 144: 217, 145: 219, 146: 220, 147: 221, 148: 222, 149: 223, 150: 226, 151: 229, 152: 230, 153: 232, 154: 233, 155: 234, 156: 235, 157: 236, 158: 239, 159: 240, 160: 243, 161: 244, 162: 246, 163: 247, 164: 249, 165: 254, 166: 256, 167: 257, 168: 260, 169: 262, 170: 263, 171: 264, 172: 265, 173: 266, 174: 269, 175: 270, 176: 271, 177: 273, 178: 274, 179: 275, 180: 276, 181: 277, 182: 279, 183: 280, 184: 282, 185: 283, 186: 284, 187: 287, 188: 288, 189: 290, 190: 291, 191: 292, 192: 294, 193: 297, 194: 298, 195: 301, 196: 302, 197: 303, 198: 304, 199: 305, 200: 306, 201: 307, 202: 308, 203: 310, 204: 312, 205: 313, 206: 314, 207: 318, 208: 321, 209: 322, 210: 325, 211: 328, 212: 330, 213: 331, 214: 332, 215: 333, 216: 334, 217: 335, 218: 337, 219: 338, 220: 339, 221: 340, 222: 341, 223: 345, 224: 347, 225: 349, 226: 352, 227: 353, 228: 354, 229: 356, 230: 357, 231: 359, 232: 361, 233: 364, 234: 365, 235: 366, 236: 367, 237: 368, 238: 369, 239: 370, 240: 373, 241: 374, 242: 376, 243: 380, 244: 381, 245: 382, 246: 383, 247: 384, 248: 385, 249: 386, 250: 387, 251: 389, 252: 391, 253: 395, 254: 399, 255: 402, 256: 408, 257: 409, 258: 410, 259: 411, 260: 412, 261: 413, 262: 414, 263: 415, 264: 417, 265: 419, 266: 420, 267: 422, 268: 423, 269: 425, 270: 426, 271: 427, 272: 428, 273: 431, 274: 432, 275: 434, 276: 436, 277: 437, 278: 438, 279: 440, 280: 445, 281: 446, 282: 447, 283: 448, 284: 451, 285: 452, 286: 453, 287: 455, 288: 456, 289: 460, 290: 462, 291: 463, 292: 464, 293: 465, 294: 466, 295: 467, 296: 469, 297: 474, 298: 475, 299: 477, 300: 479, 301: 480, 302: 482, 303: 483, 304: 484, 305: 486, 306: 489, 307: 490, 308: 491, 309: 492, 310: 495, 311: 500, 312: 501, 313: 503, 314: 504, 315: 505, 316: 509, 317: 510, 318: 511, 319: 513, 320: 514, 321: 517, 322: 521, 323: 522, 324: 524, 325: 525, 326: 527, 327: 529, 328: 533, 329: 534, 330: 536, 331: 537, 332: 540, 333: 542, 334: 543, 335: 544, 336: 552, 337: 553, 338: 555, 339: 556, 340: 557, 341: 558, 342: 559, 343: 560, 344: 561, 345: 562, 346: 563, 347: 564, 348: 566, 349: 567, 350: 570, 351: 573, 352: 577, 353: 579, 354: 580, 355: 581, 356: 584, 357: 585, 358: 586, 359: 587, 360: 589, 361: 590, 362: 592, 363: 593, 364: 594, 365: 595, 366: 596, 367: 597, 368: 599, 369: 600, 370: 602, 371: 603, 372: 604, 
373: 605, 374: 606, 375: 607, 376: 608, 377: 610}

user_id item_id rating timestamp
42 0 0 3.0 964984086
97 0 1 4.0 964980985
216 0 2 4.0 964981725
310 1 3 3.0 945078428
398 1 1 4.0 964622830
416 1 4 4.0 964622714
513 1 5 4.0 1007574532
616 2 6 4.0 845553966
629 2 3 3.0 845555402
677 2 7 3.0 845554376

Get the number of items and users

n_items = np.max(interactions_df['item_id']) + 1
n_users = np.max(interactions_df['user_id']) + 1

print("n_items={}\nn_users={}".format(n_items, n_users))
n_items=100
n_users=378

Get the maximal number of interactions

n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
# Unnecessary, but added for readability
n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
max_interactions = n_user_interactions['n_items'].max()

print("max_interaction={}".format(max_interactions))
max_interaction=31

Calculate P_Y's
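
Here $P_Y$ is the empirical probability that a randomly chosen interaction involves item $Y$; the code below estimates it as the item's share of all recorded interactions:

$$P_Y = \frac{\text{number of interactions with } Y}{\text{total number of interactions}}$$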

n_interactions = len(interactions_df)
p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
p_y = p_y.rename(columns={'user_id': 'P_Y'})
p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
p_y = dict(zip(p_y['item_id'], p_y['P_Y']))

print(p_y)
{0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, 3: 0.005128205128205128, 4: 0.007692307692307693, 5: 0.09145299145299145, 6: 0.04358974358974359, 7: 0.01452991452991453, 8: 0.035897435897435895, 9: 0.05384615384615385, 10: 0.04957264957264957, 11: 0.004273504273504274, 12: 0.002564102564102564, 13: 0.004273504273504274, 14: 0.007692307692307693, 15: 0.007692307692307693, 16: 0.011111111111111112, 17: 0.009401709401709401, 18: 0.005982905982905983, 19: 0.05299145299145299, 20: 0.028205128205128206, 21: 0.005128205128205128, 22: 0.01623931623931624, 23: 0.038461538461538464, 24: 0.010256410256410256, 25: 0.008547008547008548, 26: 0.002564102564102564, 27: 0.026495726495726495, 28: 0.006837606837606838, 29: 0.01282051282051282, 30: 0.0017094017094017094, 31: 0.018803418803418803, 32: 0.0017094017094017094, 33: 0.003418803418803419, 34: 0.011965811965811967, 35: 0.015384615384615385, 36: 0.007692307692307693, 37: 0.013675213675213675, 38: 0.002564102564102564, 39: 0.0008547008547008547, 40: 0.0008547008547008547, 41: 0.0017094017094017094, 42: 0.010256410256410256, 43: 0.0008547008547008547, 44: 0.0008547008547008547, 45: 0.004273504273504274, 46: 0.0008547008547008547, 47: 0.004273504273504274, 48: 0.004273504273504274, 49: 0.0008547008547008547, 50: 0.003418803418803419, 51: 0.008547008547008548, 52: 0.0017094017094017094, 53: 0.0017094017094017094, 54: 0.003418803418803419, 55: 0.003418803418803419, 56: 0.0008547008547008547, 57: 0.0008547008547008547, 58: 0.003418803418803419, 59: 0.003418803418803419, 60: 0.0017094017094017094, 61: 0.003418803418803419, 62: 0.0008547008547008547, 63: 0.004273504273504274, 64: 0.0017094017094017094, 65: 0.003418803418803419, 66: 0.0017094017094017094, 67: 0.0017094017094017094, 68: 0.0017094017094017094, 69: 0.0017094017094017094, 70: 0.0008547008547008547, 71: 0.0008547008547008547, 72: 0.002564102564102564, 73: 0.004273504273504274, 74: 0.0008547008547008547, 75: 0.0008547008547008547, 76: 0.0008547008547008547, 77: 0.0017094017094017094, 78: 0.002564102564102564, 79: 0.0008547008547008547, 80: 0.0017094017094017094, 81: 0.0017094017094017094, 82: 0.002564102564102564, 83: 0.0008547008547008547, 84: 0.0008547008547008547, 85: 0.0008547008547008547, 86: 0.0008547008547008547, 87: 0.0017094017094017094, 88: 0.0017094017094017094, 89: 0.0008547008547008547, 90: 0.0008547008547008547, 91: 0.0008547008547008547, 92: 0.0008547008547008547, 93: 0.0008547008547008547, 94: 0.0008547008547008547, 95: 0.0008547008547008547, 96: 0.0008547008547008547, 97: 0.0008547008547008547, 98: 0.0008547008547008547, 99: 0.0008547008547008547}

For every X calculate the E[Y|X]
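
The code below computes $E[Y|X]$: the expected number of customers who bought $X$ that would also have bought $Y$ if purchases of $Y$ happened independently of $X$. For a single customer $c$ who bought $X$ and made $|c|$ non-$X$ purchases, the probability that at least one of those purchases is $Y$ equals $1 - (1 - P_Y)^{|c|}$. Summing over the set $C_X$ of customers who bought $X$ and expanding with the binomial theorem gives

$$E[Y|X] = \sum_{c \in C_X} \left(1 - (1 - P_Y)^{|c|}\right) = \sum_{k \ge 1} \underbrace{\left[\sum_{c \in C_X} (-1)^{k+1} \binom{|c|}{k}\right]}_{\alpha_k} P_Y^{\,k}.$$

This is exactly the $\alpha_k$ / p_y_powers decomposition in the code: the coefficients $\alpha_k$ depend only on $X$, so they are computed once per $X$ and reused for every $Y$. Each customer contributes at most $|c|$ terms, so $k$ only needs to run up to the maximal number of interactions computed above. The diagonal entry $E[X|X]$ is set separately to $n\_users \cdot P_X$.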

e_xy = np.zeros(shape=(n_items, n_items))
e_xy[:][:] = -1e100
    
items = interactions_df['item_id'].unique()
    
p_y_powers = {}
for y in items:
    p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
    
print("p_y_powers for the first item")
print(p_y_powers[0])

for x in items:
    # Get users who bought X
    c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()

    # Get users who bought only X
    c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
    c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))

    # Calculate the number of non-X interactions for each user who bought X
    # Include users with zero non-X interactions
    n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
    n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
    # Unnecessary, but added for readability
    n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})

    zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x)  # Users who bought only X have zero non-X interactions
    n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])

    n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]

    # Calculate the expected numbers of Y products bought by clients who bought X
    alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
                                for abs_c in n_non_x_interactions["n_items"]])
                        for k in range(1, max_interactions + 1)])
    
    if x == 0:
        print("alpha_k")
        print(alpha_k)
        print()

    for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y
        if y != x:
            e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
        else:
            e_xy[x][y] = n_users * p_y[x]

print("E[Y|X]")
print(np.around(e_xy[:10, :10], 3))
p_y_powers for the first item
[1.726e-01 2.981e-02 5.146e-03 8.885e-04 1.534e-04 2.648e-05 4.573e-06
 7.894e-07 1.363e-07 2.353e-08 4.063e-09 7.014e-10 1.211e-10 2.091e-11
 3.610e-12 6.232e-13 1.076e-13 1.858e-14 3.207e-15 5.537e-16 9.560e-17
 1.651e-17 2.850e-18 4.920e-19 8.494e-20 1.467e-20 2.532e-21 4.372e-22
 7.547e-23 1.303e-23 2.250e-24]
alpha_k
[ 6.290e+02 -2.785e+03  1.408e+04 -6.937e+04  3.018e+05 -1.120e+06
  3.530e+06 -9.507e+06  2.202e+07 -4.418e+07  7.716e+07 -1.179e+08
  1.579e+08 -1.860e+08  1.928e+08 -1.759e+08  1.413e+08 -9.962e+07
  6.154e+07 -3.315e+07  1.549e+07 -6.230e+06  2.134e+06 -6.142e+05
  1.458e+05 -2.778e+04  4.088e+03 -4.360e+02  3.000e+01 -1.000e+00
  0.000e+00]

E[Y|X]
[[65.262 26.076  9.065  3.154  4.68 ]
 [28.303 19.062  4.288  1.5    2.223]
 [10.216  5.074  5.815  0.712  1.046]
 [ 2.315  0.859  0.283  1.938  0.144]
 [ 4.526  2.47   0.999  0.366  2.908]]

Get the user-item interaction matrix

# mapping to int is necessary because of how iterrows works
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
    r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
    
print(r[:10, :10])
[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 1. 1. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 1. 1. 1. 1.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]
 [0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]

Calculate the number of users who bought both X and Y
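
With $r$ being the binary user-item matrix built above, a single matrix product yields all co-purchase counts at once:

$$N_{XY} = (R^T R)_{XY} = \sum_{u} r_{uX} \, r_{uY},$$

i.e. the number of users who interacted with both $X$ and $Y$ (the diagonal holds the number of users who interacted with each single item).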

# Simple and slow method (commented out)

# n_xy = np.zeros(shape=(n_items, n_items))

# for x in items:
#     for y in items:
#         users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
#         users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
#         users_x_and_y = users_x & users_y
#         n_xy[x][y] = len(users_x_and_y)

# Optimized method (can be further optimized by using sparse matrices)

n_xy = np.matmul(r.T, r)

print(n_xy[:10, :10])
[[202.  34.  15.   3.   3.  66.  36.  10.  25.  34.]
 [ 34.  59.   6.   2.   5.  24.  12.   4.   8.  12.]
 [ 15.   6.  18.   1.   2.   7.   3.   4.   6.   5.]
 [  3.   2.   1.   6.   1.   1.   1.   1.   2.   2.]
 [  3.   5.   2.   1.   9.   3.   2.   1.   1.   0.]
 [ 66.  24.   7.   1.   3. 107.  20.   5.  16.  18.]
 [ 36.  12.   3.   1.   2.  20.  51.   8.  16.  17.]
 [ 10.   4.   4.   1.   1.   5.   8.  17.   8.  10.]
 [ 25.   8.   6.   2.   1.  16.  16.   8.  42.  23.]
 [ 34.  12.   5.   2.   0.  18.  17.  10.  23.  63.]]
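
The comment above notes that this step could be optimized further with sparse matrices. A minimal sketch of that variant, assuming scipy.sparse is acceptable here (scipy is already a dependency of this notebook); the variable names are illustrative only:

import scipy.sparse as sparse

# Build the user-item matrix directly in sparse form from the interactions
# (each (user_id, item_id) pair occurs at most once here, so entries are 0/1)
r_sparse = sparse.csr_matrix(
    (np.ones(len(interactions_df)),
     (interactions_df['user_id'], interactions_df['item_id'])),
    shape=(n_users, n_items))

# Co-purchase counts without materializing the dense r matrix
n_xy_sparse = (r_sparse.T @ r_sparse).toarray()

# Sanity check against the dense result computed above
print(np.allclose(n_xy, n_xy_sparse))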

Calculate the scores
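
The score below measures how strongly the observed co-purchase count deviates from the expectation under independence, normalized by the scale of that expectation (akin to the per-cell contribution of a chi-squared statistic):

$$\text{score}(X, Y) = \frac{N_{XY} - E[Y|X]}{\sqrt{E[Y|X]}}$$

Large positive values indicate that $Y$ is bought together with $X$ far more often than chance alone would explain.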

scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)

print(np.around(scores[:10, :10], 3))
[[16.926  1.552  1.971 -0.087 -0.777  3.789  2.689  0.48   1.235  1.235]
 [ 1.071  9.148  0.827  0.408  1.863  1.15   0.376 -0.033 -0.38  -0.218]
 [ 1.497  0.411  5.053  0.341  0.932 -0.142 -0.737  1.555  1.023 -0.134]
 [ 0.451  1.23   1.349  2.917  2.259 -0.361  0.284  1.417  1.724  1.141]
 [-0.717  1.61   1.002  1.048  3.573 -0.244 -0.164  0.051 -0.687 -1.604]
 [ 2.601  0.765 -0.103 -0.97  -0.399 12.319  0.412 -0.724  0.125 -0.782]
 [ 2.127  0.237 -0.522 -0.359 -0.077  0.658  8.505  2.121  2.561  1.518]
 [ 0.3   -0.061  1.952  0.585  0.192 -0.484  2.235  4.91   2.697  2.728]
 [ 0.724 -0.582  1.265  0.641 -0.644  0.27   2.439  2.479  7.718  3.946]
 [ 1.793  0.544  0.756  0.679 -1.358  0.413  2.627  3.596  5.52   9.453]]

Final comparison

print("E[Y|X]")
print(np.around(e_xy[:10, :10], 3))
print()

print("N(X, Y)")
print(n_xy[:10, :10])
print()

print("Scores")
print(np.around(scores[:10, :10], 3))
print()
E[Y|X]
[[65.262 26.076  9.065  3.154  4.68  41.571 23.082  8.592 19.542 27.522]
 [28.303 19.062  4.288  1.5    2.223 18.99  10.768  4.066  9.15  12.778]
 [10.216  5.074  5.815  0.712  1.046  7.386  4.577  1.872  3.964  5.308]
 [ 2.315  0.859  0.283  1.938  0.144  1.433  0.754  0.267  0.631  0.911]
 [ 4.526  2.47   0.999  0.366  2.908  3.453  2.245  0.951  1.962  2.574]
 [47.984 20.534  7.279  2.549  3.776 34.569 18.241  6.902 15.507 21.636]
 [25.303 11.206  4.05   1.429  2.112 17.265 16.477  3.843  8.524 11.789]
 [ 9.094  4.124  1.561  0.561  0.826  6.205  3.701  5.492  3.186  4.326]
 [21.633  9.823  3.601  1.276  1.884 14.955  8.776  3.417 13.569 10.322]
 [25.03  10.257  3.571  1.243  1.844 16.332  9.082  3.385  7.691 20.354]]

N(X, Y)
[[202.  34.  15.   3.   3.  66.  36.  10.  25.  34.]
 [ 34.  59.   6.   2.   5.  24.  12.   4.   8.  12.]
 [ 15.   6.  18.   1.   2.   7.   3.   4.   6.   5.]
 [  3.   2.   1.   6.   1.   1.   1.   1.   2.   2.]
 [  3.   5.   2.   1.   9.   3.   2.   1.   1.   0.]
 [ 66.  24.   7.   1.   3. 107.  20.   5.  16.  18.]
 [ 36.  12.   3.   1.   2.  20.  51.   8.  16.  17.]
 [ 10.   4.   4.   1.   1.   5.   8.  17.   8.  10.]
 [ 25.   8.   6.   2.   1.  16.  16.   8.  42.  23.]
 [ 34.  12.   5.   2.   0.  18.  17.  10.  23.  63.]]

Scores
[[16.926  1.552  1.971 -0.087 -0.777  3.789  2.689  0.48   1.235  1.235]
 [ 1.071  9.148  0.827  0.408  1.863  1.15   0.376 -0.033 -0.38  -0.218]
 [ 1.497  0.411  5.053  0.341  0.932 -0.142 -0.737  1.555  1.023 -0.134]
 [ 0.451  1.23   1.349  2.917  2.259 -0.361  0.284  1.417  1.724  1.141]
 [-0.717  1.61   1.002  1.048  3.573 -0.244 -0.164  0.051 -0.687 -1.604]
 [ 2.601  0.765 -0.103 -0.97  -0.399 12.319  0.412 -0.724  0.125 -0.782]
 [ 2.127  0.237 -0.522 -0.359 -0.077  0.658  8.505  2.121  2.561  1.518]
 [ 0.3   -0.061  1.952  0.585  0.192 -0.484  2.235  4.91   2.697  2.728]
 [ 0.724 -0.582  1.265  0.641 -0.644  0.27   2.439  2.479  7.718  3.946]
 [ 1.793  0.544  0.756  0.679 -1.358  0.413  2.627  3.596  5.52   9.453]]

Inner workings of the Amazon recommender recommend method
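
To score candidate items for a user, the rows of the score matrix corresponding to all items the user has already interacted with are summed:

$$\text{final\_score}(Y) = \sum_{X \in I_u} \text{score}(X, Y),$$

where $I_u$ is the set of the user's items. Already-bought items are then masked out (unless should_recommend_already_bought is set) and the n_recommendations items with the highest aggregated scores are returned.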

user_id = 1
should_recommend_already_bought = False
n_recommendations = 10

mapped_user_id = user_id_mapping[user_id]

x_list = interactions_df.loc[interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
final_scores = np.sum(scores[x_list], axis=0)

# Choose n recommendations based on highest scores
if not should_recommend_already_bought:
    final_scores[x_list] = -1e100

chosen_ids = np.argsort(-final_scores)[:n_recommendations]

for item_id in chosen_ids:
    print("Recommendation: {}, {}, {}".format(user_id_reverse_mapping[mapped_user_id],
                                              ml_movies_df.loc[ml_movies_df['item_id'] == item_id_reverse_mapping[item_id], 
                                                            'title'].iloc[0],
                                              final_scores[item_id]))
Recommendation: 1, Brick (2005), 6.122652596595853
Recommendation: 1, Oh, God! (1977), 5.908857666844879
Recommendation: 1, Bubba Ho-tep (2002), 5.830666625469312
Recommendation: 1, Meatballs (1979), 5.56930833865894
Recommendation: 1, Millennium Actress (Sennen joyû) (2001), 5.502504256363742
Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393
Recommendation: 1, Six-String Samurai (1998), 5.225652131462832
Recommendation: 1, Grass Is Greener, The (1960), 5.144470412494206
Recommendation: 1, Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), 4.796473011676857
Recommendation: 1, Clara's Heart (1988), 4.608515964550741

Amazon recommender

from recommenders.recommender import Recommender

class AmazonRecommender(Recommender):
    """
    Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:
    - Linden G., Smith B., York J., Amazon.com Recommendations: Item-to-Item Collaborative Filtering,
        IEEE Internet Computing, 2003,
    - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.
    """

    def __init__(self):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.e_xy = None
        self.n_xy = None
        self.scores = None
        self.most_popular_items = None
        self.should_recommend_already_bought = False

    def initialize(self, **params):
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns.
        """

        # Shift item ids and user ids so that they are consecutive

        unique_item_ids = interactions_df['item_id'].unique()
        self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
        self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
        self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
        
        interactions_df = interactions_df.copy()
        interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)

        # Get the number of items and users

        self.interactions_df = interactions_df
        n_items = np.max(interactions_df['item_id']) + 1
        n_users = np.max(interactions_df['user_id']) + 1

        # Get maximal number of interactions

        n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
        # Unnecessary, but added for readability
        n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
        max_interactions = n_user_interactions['n_items'].max()

        # Calculate P_Y's

        n_interactions = len(interactions_df)
        p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
        p_y = p_y.rename(columns={'user_id': 'P_Y'})
        p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
        p_y = dict(zip(p_y['item_id'], p_y['P_Y']))

        # Get the series of all items

        # items = list(range(n_items))
        items = interactions_df['item_id'].unique()

        # For every X calculate the E[Y|X]

        e_xy = np.zeros(shape=(n_items, n_items))
        e_xy[:][:] = -1e100

        p_y_powers = {}
        for y in items:
            p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])

        for x in items:
            # Get users who bought X
            c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()

            # Get users who bought only X
            c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
            c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))

            # Calculate the number of non-X interactions for each user who bought X
            # Include users with zero non-X interactions
            n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
            n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
            # Unnecessary, but added for readability
            n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})

            zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x)  # Users who bought only X have zero non-X interactions
            n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])

            n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]

            # Calculate the expected numbers of Y products bought by clients who bought X
            alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
                                        for abs_c in n_non_x_interactions["n_items"]])
                                for k in range(1, max_interactions + 1)])

            for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y
                if y != x:
                    e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
                else:
                    e_xy[x][y] = n_users * p_y[x]

        self.e_xy = e_xy

        # Calculate the number of users who bought both X and Y

        # Simple and slow method (commented out)

        # n_xy = np.zeros(shape=(n_items, n_items))

        # for x in items:
        #     for y in items:
        #         users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
        #         users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
        #         users_x_and_y = users_x & users_y
        #         n_xy[x][y] = len(users_x_and_y)

        # Optimized method (can be further optimized by using sparse matrices)

        # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
        r = np.zeros(shape=(n_users, n_items))
        for idx, interaction in interactions_df.iterrows():
            r[int(interaction['user_id'])][int(interaction['item_id'])] = 1

        # Get the number of users who bought both X and Y

        n_xy = np.matmul(r.T, r)

        self.n_xy = n_xy
        
        # Calculate the scores

        self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)
        
        # Find the most popular items for the cold start problem
        
        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]
        
        # Handle users not in the training data

        # Map item ids
        
        items_df = items_df.copy()
        items_df.replace({'item_id': self.item_id_mapping}, inplace=True)

        # Generate recommendations

        for idx, user in users_df.iterrows():
            recommendations = []
            
            user_id = user['user_id']
            
            if user_id in self.user_id_mapping:
                mapped_user_id = self.user_id_mapping[user_id]
            
                x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                final_scores = np.sum(self.scores[x_list], axis=0)

                # Choose n recommendations based on highest scores
                if not self.should_recommend_already_bought:
                    final_scores[x_list] = -1e100

                chosen_ids = np.argsort(-final_scores)[:n_recommendations]

                for item_id in chosen_ids:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[item_id],
                            'score': final_scores[item_id]
                        }
                    )
            else:  # For new users recommend most popular items
                for i in range(n_recommendations):
                    recommendations.append(
                        {
                            'user_id': user['user_id'],
                            'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
                            'score': 1.0
                        }
                    )

            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df
# Quick test of the recommender

amazon_recommender = AmazonRecommender()
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = amazon_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))
Recommendations
user_id item_id score title genres
0 1 44761 6.122653 Brick (2005) Crime|Drama|Film-Noir|Mystery
1 1 5214 5.908858 Oh, God! (1977) Comedy|Fantasy
2 1 6755 5.830667 Bubba Ho-tep (2002) Comedy|Horror
3 1 3040 5.569308 Meatballs (1979) Comedy
4 1 6713 5.502504 Millennium Actress (Sennen joyû) (2001) Animation|Drama|Romance
5 1 3614 5.387478 Honeymoon in Vegas (1992) Comedy|Romance
6 1 2275 5.225652 Six-String Samurai (1998) Action|Adventure|Sci-Fi
7 1 4796 5.144470 Grass Is Greener, The (1960) Comedy|Romance
8 1 4896 4.796473 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) Adventure|Children|Fantasy
9 1 3714 4.608516 Clara's Heart (1988) Drama
10 4 3614 7.825335 Honeymoon in Vegas (1992) Comedy|Romance
11 4 6713 7.407051 Millennium Actress (Sennen joyû) (2001) Animation|Drama|Romance
12 4 2690 6.599105 Ideal Husband, An (1999) Comedy|Romance
13 4 44761 6.205835 Brick (2005) Crime|Drama|Film-Noir|Mystery
14 4 3628 6.186298 Flying Tigers (1942) Action|Drama|Romance|War
15 4 6755 5.977848 Bubba Ho-tep (2002) Comedy|Horror
16 4 959 5.919668 Of Human Bondage (1934) Drama
17 4 31260 5.919668 Boys Town (1938) Drama
18 4 6033 5.919668 Mystery Date (1991) Comedy
19 4 3714 5.919668 Clara's Heart (1988) Drama
20 6 3614 11.392962 Honeymoon in Vegas (1992) Comedy|Romance
21 6 31921 8.329693 Seven-Per-Cent Solution, The (1976) Adventure|Comedy|Crime|Drama|Mystery|Thriller
22 6 1752 8.236954 Hard Rain (1998) Action|Crime|Thriller
23 6 95147 8.006113 Dragon Ball: Sleeping Princess in Devil's Castle (Doragon bôru: Majinjô no nemuri hime) (1987) Action|Adventure|Animation|Children
24 6 2275 6.941940 Six-String Samurai (1998) Action|Adventure|Sci-Fi
25 6 3479 6.771276 Ladyhawke (1985) Adventure|Fantasy|Romance
26 6 6755 6.520369 Bubba Ho-tep (2002) Comedy|Horror
27 6 6537 6.454421 Terminator 3: Rise of the Machines (2003) Action|Adventure|Sci-Fi
28 6 4483 6.339894 Caddyshack II (1988) Comedy
29 6 228 6.174734 Destiny Turns on the Radio (1995) Comedy
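
A small usage sketch of the initialize method (not part of the evaluation below): it switches optional behaviour before fitting, e.g. allowing already-bought items to be recommended again.

amazon_recommender = AmazonRecommender()
amazon_recommender.initialize(should_recommend_already_bought=True)
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = amazon_recommender.recommend(pd.DataFrame([[1]], columns=['user_id']), ml_movies_df, 10)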

Training-test split evaluation
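
The metrics below are produced by the helpers in evaluation_and_testing.testing; assuming they follow the usual implicit-feedback conventions, each held-out interaction $t$ is scored by the rank at which the held-out item appears in the user's recommendation list:

$$\mathrm{HR}@k = \frac{1}{|T|} \sum_{t \in T} \mathbb{1}\left[\mathrm{rank}_t \le k\right], \qquad \mathrm{NDCG}@k = \frac{1}{|T|} \sum_{t \in T} \frac{\mathbb{1}\left[\mathrm{rank}_t \le k\right]}{\log_2(\mathrm{rank}_t + 1)}.$$

HR@k counts how often the held-out item makes it into the top $k$; NDCG@k additionally rewards placing it closer to the top of the list.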

from evaluation_and_testing.testing import evaluate_train_test_split_implicit

amazon_recommender = AmazonRecommender()

amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

amazon_tts_results = pd.DataFrame(
    amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.181818 0.311688 0.402597 0.551948 0.181818 0.257806 0.294682 0.34147
from recommenders.tfidf_recommender import TFIDFRecommender

tfidf_recommender = TFIDFRecommender()

tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

tfidf_tts_results = pd.DataFrame(
    tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 TFIDFRecommender 0.025974 0.090909 0.136364 0.318182 0.025974 0.064393 0.083685 0.140799
tts_results = pd.concat([amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)
display(HTML(tts_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.181818 0.311688 0.402597 0.551948 0.181818 0.257806 0.294682 0.341470
1 TFIDFRecommender 0.025974 0.090909 0.136364 0.318182 0.025974 0.064393 0.083685 0.140799

Leave-one-out evaluation

from evaluation_and_testing.testing import evaluate_leave_one_out_implicit

amazon_recommender = AmazonRecommender()

amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(
    amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

amazon_loo_results = pd.DataFrame(
    amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.166667 0.256667 0.32 0.426667 0.166667 0.219086 0.245486 0.279978
tfidf_recommender = TFIDFRecommender()

tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

tfidf_loo_results = pd.DataFrame(
    tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(tfidf_loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 TFIDFRecommender 0.006667 0.053333 0.123333 0.233333 0.006667 0.033491 0.062178 0.096151
loo_results = pd.concat([amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)
display(HTML(loo_results.to_html()))
Recommender HR@1 HR@3 HR@5 HR@10 NDCG@1 NDCG@3 NDCG@5 NDCG@10
0 AmazonRecommender 0.166667 0.256667 0.320000 0.426667 0.166667 0.219086 0.245486 0.279978
1 TFIDFRecommender 0.006667 0.053333 0.123333 0.233333 0.006667 0.033491 0.062178 0.096151