%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold
import scipy.special as scisp
# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
Load data
ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')
ml_df.head(10)
display(HTML(ml_movies_df.head(10).to_html()))
# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)
ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]
print("Number of interactions left: {}".format(len(ml_ratings_df)))
 | item_id | title | genres |
---|---|---|---|
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
5 | 6 | Heat (1995) | Action|Crime|Thriller |
6 | 7 | Sabrina (1995) | Comedy|Romance |
7 | 8 | Tom and Huck (1995) | Adventure|Children |
8 | 9 | Sudden Death (1995) | Action |
9 | 10 | GoldenEye (1995) | Action|Adventure|Thriller |
Number of interactions left: 1170
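A quick sanity check (a sketch, not part of the original notebook) that the filter behaved as intended: every remaining rating should refer to one of the 100 sampled movies.
# Illustrative check only: all remaining ratings point at the sampled movie ids
assert ml_ratings_df['item_id'].isin(left_ids).all()
assert ml_movies_df['item_id'].nunique() <= 100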
Inner workings of the Amazon recommender fit method
Shift item ids and user ids so that they are consecutive
interactions_df = ml_ratings_df.copy()
unique_item_ids = interactions_df['item_id'].unique()
item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)
print("Item mapping")
print(item_id_mapping)
print()
print("Item reverse mapping")
print(item_id_reverse_mapping)
print()
print("User mapping")
print(user_id_mapping)
print()
print("User reverse mapping")
print(user_id_reverse_mapping)
print()
display(HTML(interactions_df.head(10).to_html()))
Item mapping {780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, 145: 6, 267: 7, 355: 8, 435: 9, 6502: 10, 73323: 11, 112421: 12, 1783: 13, 2806: 14, 3040: 15, 3551: 16, 2135: 17, 39715: 18, 41566: 19, 5673: 20, 7064: 21, 481: 22, 6537: 23, 44761: 24, 2690: 25, 228: 26, 4890: 27, 3614: 28, 3507: 29, 3628: 30, 5954: 31, 8605: 32, 3786: 33, 6755: 34, 3468: 35, 50601: 36, 3089: 37, 55444: 38, 118270: 39, 124404: 40, 3768: 41, 233: 42, 3687: 43, 171749: 44, 104218: 45, 182749: 46, 3342: 47, 65130: 48, 84952: 49, 152970: 50, 3067: 51, 4031: 52, 1107: 53, 47382: 54, 3801: 55, 5155: 56, 5612: 57, 5214: 58, 67295: 59, 3165: 60, 1752: 61, 31223: 62, 6713: 63, 66783: 64, 2043: 65, 2903: 66, 3313: 67, 4009: 68, 91842: 69, 2190: 70, 7282: 71, 4483: 72, 2275: 73, 3567: 74, 190207: 75, 4505: 76, 95147: 77, 4552: 78, 6033: 79, 2521: 80, 4397: 81, 151315: 82, 156706: 83, 151311: 84, 959: 85, 3714: 86, 4164: 87, 4796: 88, 31260: 89, 6927: 90, 126142: 91, 73804: 92, 26357: 93, 82684: 94, 6342: 95, 32799: 96, 31921: 97, 2892: 98, 2737: 99} Item reverse mapping {0: 780, 1: 1500, 2: 3479, 3: 171, 4: 1914, 5: 4896, 6: 145, 7: 267, 8: 355, 9: 435, 10: 6502, 11: 73323, 12: 112421, 13: 1783, 14: 2806, 15: 3040, 16: 3551, 17: 2135, 18: 39715, 19: 41566, 20: 5673, 21: 7064, 22: 481, 23: 6537, 24: 44761, 25: 2690, 26: 228, 27: 4890, 28: 3614, 29: 3507, 30: 3628, 31: 5954, 32: 8605, 33: 3786, 34: 6755, 35: 3468, 36: 50601, 37: 3089, 38: 55444, 39: 118270, 40: 124404, 41: 3768, 42: 233, 43: 3687, 44: 171749, 45: 104218, 46: 182749, 47: 3342, 48: 65130, 49: 84952, 50: 152970, 51: 3067, 52: 4031, 53: 1107, 54: 47382, 55: 3801, 56: 5155, 57: 5612, 58: 5214, 59: 67295, 60: 3165, 61: 1752, 62: 31223, 63: 6713, 64: 66783, 65: 2043, 66: 2903, 67: 3313, 68: 4009, 69: 91842, 70: 2190, 71: 7282, 72: 4483, 73: 2275, 74: 3567, 75: 190207, 76: 4505, 77: 95147, 78: 4552, 79: 6033, 80: 2521, 81: 4397, 82: 151315, 83: 156706, 84: 151311, 85: 959, 86: 3714, 87: 4164, 88: 4796, 89: 31260, 90: 6927, 91: 126142, 92: 73804, 93: 26357, 94: 82684, 95: 6342, 96: 32799, 97: 31921, 98: 2892, 99: 2737} User mapping {1: 0, 4: 1, 6: 2, 7: 3, 11: 4, 15: 5, 17: 6, 18: 7, 19: 8, 20: 9, 21: 10, 22: 11, 23: 12, 24: 13, 27: 14, 28: 15, 29: 16, 31: 17, 32: 18, 33: 19, 34: 20, 36: 21, 38: 22, 39: 23, 40: 24, 41: 25, 42: 26, 43: 27, 44: 28, 45: 29, 46: 30, 48: 31, 50: 32, 51: 33, 53: 34, 57: 35, 58: 36, 59: 37, 61: 38, 62: 39, 63: 40, 64: 41, 66: 42, 67: 43, 68: 44, 70: 45, 71: 46, 72: 47, 73: 48, 74: 49, 75: 50, 76: 51, 78: 52, 80: 53, 82: 54, 83: 55, 84: 56, 86: 57, 88: 58, 89: 59, 90: 60, 91: 61, 94: 62, 95: 63, 96: 64, 99: 65, 100: 66, 101: 67, 103: 68, 104: 69, 105: 70, 106: 71, 108: 72, 109: 73, 111: 74, 112: 75, 113: 76, 114: 77, 115: 78, 116: 79, 117: 80, 120: 81, 121: 82, 122: 83, 125: 84, 129: 85, 132: 86, 133: 87, 135: 88, 136: 89, 137: 90, 139: 91, 140: 92, 141: 93, 142: 94, 144: 95, 148: 96, 149: 97, 150: 98, 151: 99, 153: 100, 154: 101, 156: 102, 158: 103, 160: 104, 161: 105, 162: 106, 164: 107, 165: 108, 166: 109, 167: 110, 169: 111, 170: 112, 171: 113, 173: 114, 174: 115, 175: 116, 176: 117, 177: 118, 178: 119, 179: 120, 181: 121, 182: 122, 184: 123, 186: 124, 187: 125, 190: 126, 194: 127, 195: 128, 198: 129, 199: 130, 200: 131, 201: 132, 202: 133, 203: 134, 204: 135, 205: 136, 206: 137, 210: 138, 212: 139, 213: 140, 214: 141, 215: 142, 216: 143, 217: 144, 219: 145, 220: 146, 221: 147, 222: 148, 223: 149, 226: 150, 229: 151, 230: 152, 232: 153, 233: 154, 234: 155, 235: 156, 236: 157, 239: 158, 240: 159, 243: 160, 244: 161, 246: 162, 247: 
163, 249: 164, 254: 165, 256: 166, 257: 167, 260: 168, 262: 169, 263: 170, 264: 171, 265: 172, 266: 173, 269: 174, 270: 175, 271: 176, 273: 177, 274: 178, 275: 179, 276: 180, 277: 181, 279: 182, 280: 183, 282: 184, 283: 185, 284: 186, 287: 187, 288: 188, 290: 189, 291: 190, 292: 191, 294: 192, 297: 193, 298: 194, 301: 195, 302: 196, 303: 197, 304: 198, 305: 199, 306: 200, 307: 201, 308: 202, 310: 203, 312: 204, 313: 205, 314: 206, 318: 207, 321: 208, 322: 209, 325: 210, 328: 211, 330: 212, 331: 213, 332: 214, 333: 215, 334: 216, 335: 217, 337: 218, 338: 219, 339: 220, 340: 221, 341: 222, 345: 223, 347: 224, 349: 225, 352: 226, 353: 227, 354: 228, 356: 229, 357: 230, 359: 231, 361: 232, 364: 233, 365: 234, 366: 235, 367: 236, 368: 237, 369: 238, 370: 239, 373: 240, 374: 241, 376: 242, 380: 243, 381: 244, 382: 245, 383: 246, 384: 247, 385: 248, 386: 249, 387: 250, 389: 251, 391: 252, 395: 253, 399: 254, 402: 255, 408: 256, 409: 257, 410: 258, 411: 259, 412: 260, 413: 261, 414: 262, 415: 263, 417: 264, 419: 265, 420: 266, 422: 267, 423: 268, 425: 269, 426: 270, 427: 271, 428: 272, 431: 273, 432: 274, 434: 275, 436: 276, 437: 277, 438: 278, 440: 279, 445: 280, 446: 281, 447: 282, 448: 283, 451: 284, 452: 285, 453: 286, 455: 287, 456: 288, 460: 289, 462: 290, 463: 291, 464: 292, 465: 293, 466: 294, 467: 295, 469: 296, 474: 297, 475: 298, 477: 299, 479: 300, 480: 301, 482: 302, 483: 303, 484: 304, 486: 305, 489: 306, 490: 307, 491: 308, 492: 309, 495: 310, 500: 311, 501: 312, 503: 313, 504: 314, 505: 315, 509: 316, 510: 317, 511: 318, 513: 319, 514: 320, 517: 321, 521: 322, 522: 323, 524: 324, 525: 325, 527: 326, 529: 327, 533: 328, 534: 329, 536: 330, 537: 331, 540: 332, 542: 333, 543: 334, 544: 335, 552: 336, 553: 337, 555: 338, 556: 339, 557: 340, 558: 341, 559: 342, 560: 343, 561: 344, 562: 345, 563: 346, 564: 347, 566: 348, 567: 349, 570: 350, 573: 351, 577: 352, 579: 353, 580: 354, 581: 355, 584: 356, 585: 357, 586: 358, 587: 359, 589: 360, 590: 361, 592: 362, 593: 363, 594: 364, 595: 365, 596: 366, 597: 367, 599: 368, 600: 369, 602: 370, 603: 371, 604: 372, 605: 373, 606: 374, 607: 375, 608: 376, 610: 377} User reverse mapping {0: 1, 1: 4, 2: 6, 3: 7, 4: 11, 5: 15, 6: 17, 7: 18, 8: 19, 9: 20, 10: 21, 11: 22, 12: 23, 13: 24, 14: 27, 15: 28, 16: 29, 17: 31, 18: 32, 19: 33, 20: 34, 21: 36, 22: 38, 23: 39, 24: 40, 25: 41, 26: 42, 27: 43, 28: 44, 29: 45, 30: 46, 31: 48, 32: 50, 33: 51, 34: 53, 35: 57, 36: 58, 37: 59, 38: 61, 39: 62, 40: 63, 41: 64, 42: 66, 43: 67, 44: 68, 45: 70, 46: 71, 47: 72, 48: 73, 49: 74, 50: 75, 51: 76, 52: 78, 53: 80, 54: 82, 55: 83, 56: 84, 57: 86, 58: 88, 59: 89, 60: 90, 61: 91, 62: 94, 63: 95, 64: 96, 65: 99, 66: 100, 67: 101, 68: 103, 69: 104, 70: 105, 71: 106, 72: 108, 73: 109, 74: 111, 75: 112, 76: 113, 77: 114, 78: 115, 79: 116, 80: 117, 81: 120, 82: 121, 83: 122, 84: 125, 85: 129, 86: 132, 87: 133, 88: 135, 89: 136, 90: 137, 91: 139, 92: 140, 93: 141, 94: 142, 95: 144, 96: 148, 97: 149, 98: 150, 99: 151, 100: 153, 101: 154, 102: 156, 103: 158, 104: 160, 105: 161, 106: 162, 107: 164, 108: 165, 109: 166, 110: 167, 111: 169, 112: 170, 113: 171, 114: 173, 115: 174, 116: 175, 117: 176, 118: 177, 119: 178, 120: 179, 121: 181, 122: 182, 123: 184, 124: 186, 125: 187, 126: 190, 127: 194, 128: 195, 129: 198, 130: 199, 131: 200, 132: 201, 133: 202, 134: 203, 135: 204, 136: 205, 137: 206, 138: 210, 139: 212, 140: 213, 141: 214, 142: 215, 143: 216, 144: 217, 145: 219, 146: 220, 147: 221, 148: 222, 149: 223, 150: 226, 151: 229, 152: 230, 153: 232, 154: 233, 155: 234, 156: 
235, 157: 236, 158: 239, 159: 240, 160: 243, 161: 244, 162: 246, 163: 247, 164: 249, 165: 254, 166: 256, 167: 257, 168: 260, 169: 262, 170: 263, 171: 264, 172: 265, 173: 266, 174: 269, 175: 270, 176: 271, 177: 273, 178: 274, 179: 275, 180: 276, 181: 277, 182: 279, 183: 280, 184: 282, 185: 283, 186: 284, 187: 287, 188: 288, 189: 290, 190: 291, 191: 292, 192: 294, 193: 297, 194: 298, 195: 301, 196: 302, 197: 303, 198: 304, 199: 305, 200: 306, 201: 307, 202: 308, 203: 310, 204: 312, 205: 313, 206: 314, 207: 318, 208: 321, 209: 322, 210: 325, 211: 328, 212: 330, 213: 331, 214: 332, 215: 333, 216: 334, 217: 335, 218: 337, 219: 338, 220: 339, 221: 340, 222: 341, 223: 345, 224: 347, 225: 349, 226: 352, 227: 353, 228: 354, 229: 356, 230: 357, 231: 359, 232: 361, 233: 364, 234: 365, 235: 366, 236: 367, 237: 368, 238: 369, 239: 370, 240: 373, 241: 374, 242: 376, 243: 380, 244: 381, 245: 382, 246: 383, 247: 384, 248: 385, 249: 386, 250: 387, 251: 389, 252: 391, 253: 395, 254: 399, 255: 402, 256: 408, 257: 409, 258: 410, 259: 411, 260: 412, 261: 413, 262: 414, 263: 415, 264: 417, 265: 419, 266: 420, 267: 422, 268: 423, 269: 425, 270: 426, 271: 427, 272: 428, 273: 431, 274: 432, 275: 434, 276: 436, 277: 437, 278: 438, 279: 440, 280: 445, 281: 446, 282: 447, 283: 448, 284: 451, 285: 452, 286: 453, 287: 455, 288: 456, 289: 460, 290: 462, 291: 463, 292: 464, 293: 465, 294: 466, 295: 467, 296: 469, 297: 474, 298: 475, 299: 477, 300: 479, 301: 480, 302: 482, 303: 483, 304: 484, 305: 486, 306: 489, 307: 490, 308: 491, 309: 492, 310: 495, 311: 500, 312: 501, 313: 503, 314: 504, 315: 505, 316: 509, 317: 510, 318: 511, 319: 513, 320: 514, 321: 517, 322: 521, 323: 522, 324: 524, 325: 525, 326: 527, 327: 529, 328: 533, 329: 534, 330: 536, 331: 537, 332: 540, 333: 542, 334: 543, 335: 544, 336: 552, 337: 553, 338: 555, 339: 556, 340: 557, 341: 558, 342: 559, 343: 560, 344: 561, 345: 562, 346: 563, 347: 564, 348: 566, 349: 567, 350: 570, 351: 573, 352: 577, 353: 579, 354: 580, 355: 581, 356: 584, 357: 585, 358: 586, 359: 587, 360: 589, 361: 590, 362: 592, 363: 593, 364: 594, 365: 595, 366: 596, 367: 597, 368: 599, 369: 600, 370: 602, 371: 603, 372: 604, 373: 605, 374: 606, 375: 607, 376: 608, 377: 610}
 | user_id | item_id | rating | timestamp |
---|---|---|---|---|
42 | 0 | 0 | 3.0 | 964984086 |
97 | 0 | 1 | 4.0 | 964980985 |
216 | 0 | 2 | 4.0 | 964981725 |
310 | 1 | 3 | 3.0 | 945078428 |
398 | 1 | 1 | 4.0 | 964622830 |
416 | 1 | 4 | 4.0 | 964622714 |
513 | 1 | 5 | 4.0 | 1007574532 |
616 | 2 | 6 | 4.0 | 845553966 |
629 | 2 | 3 | 3.0 | 845555402 |
677 | 2 | 7 | 3.0 | 845554376 |
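The explicit dictionary construction above can also be written more compactly with pandas' factorize, which assigns consecutive codes in order of first appearance. A sketch for the item ids only; the alt_* names are illustrative and not used later in the notebook.
# Compact alternative (sketch): factorize yields the same consecutive re-indexing
item_codes, item_uniques = pd.factorize(ml_ratings_df['item_id'])
alt_item_id_mapping = dict(zip(item_uniques, range(len(item_uniques))))  # original -> consecutive
alt_item_id_reverse_mapping = dict(enumerate(item_uniques))              # consecutive -> original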
Get the number of items and users
n_items = np.max(interactions_df['item_id']) + 1
n_users = np.max(interactions_df['user_id']) + 1
print("n_items={}\nn_users={}".format(n_items, n_users))
n_items=100
n_users=378
Get the maximal number of interactions
n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
# Unnecessary, but added for readability
n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
max_interactions = n_user_interactions['n_items'].max()
print("max_interaction={}".format(max_interactions))
max_interaction=31
Calculate P_Y's
n_interactions = len(interactions_df)
p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
p_y = p_y.rename(columns={'user_id': 'P_Y'})
p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
p_y = dict(zip(p_y['item_id'], p_y['P_Y']))
print(p_y)
{0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, 3: 0.005128205128205128, 4: 0.007692307692307693, 5: 0.09145299145299145, 6: 0.04358974358974359, 7: 0.01452991452991453, 8: 0.035897435897435895, 9: 0.05384615384615385, 10: 0.04957264957264957, 11: 0.004273504273504274, 12: 0.002564102564102564, 13: 0.004273504273504274, 14: 0.007692307692307693, 15: 0.007692307692307693, 16: 0.011111111111111112, 17: 0.009401709401709401, 18: 0.005982905982905983, 19: 0.05299145299145299, 20: 0.028205128205128206, 21: 0.005128205128205128, 22: 0.01623931623931624, 23: 0.038461538461538464, 24: 0.010256410256410256, 25: 0.008547008547008548, 26: 0.002564102564102564, 27: 0.026495726495726495, 28: 0.006837606837606838, 29: 0.01282051282051282, 30: 0.0017094017094017094, 31: 0.018803418803418803, 32: 0.0017094017094017094, 33: 0.003418803418803419, 34: 0.011965811965811967, 35: 0.015384615384615385, 36: 0.007692307692307693, 37: 0.013675213675213675, 38: 0.002564102564102564, 39: 0.0008547008547008547, 40: 0.0008547008547008547, 41: 0.0017094017094017094, 42: 0.010256410256410256, 43: 0.0008547008547008547, 44: 0.0008547008547008547, 45: 0.004273504273504274, 46: 0.0008547008547008547, 47: 0.004273504273504274, 48: 0.004273504273504274, 49: 0.0008547008547008547, 50: 0.003418803418803419, 51: 0.008547008547008548, 52: 0.0017094017094017094, 53: 0.0017094017094017094, 54: 0.003418803418803419, 55: 0.003418803418803419, 56: 0.0008547008547008547, 57: 0.0008547008547008547, 58: 0.003418803418803419, 59: 0.003418803418803419, 60: 0.0017094017094017094, 61: 0.003418803418803419, 62: 0.0008547008547008547, 63: 0.004273504273504274, 64: 0.0017094017094017094, 65: 0.003418803418803419, 66: 0.0017094017094017094, 67: 0.0017094017094017094, 68: 0.0017094017094017094, 69: 0.0017094017094017094, 70: 0.0008547008547008547, 71: 0.0008547008547008547, 72: 0.002564102564102564, 73: 0.004273504273504274, 74: 0.0008547008547008547, 75: 0.0008547008547008547, 76: 0.0008547008547008547, 77: 0.0017094017094017094, 78: 0.002564102564102564, 79: 0.0008547008547008547, 80: 0.0017094017094017094, 81: 0.0017094017094017094, 82: 0.002564102564102564, 83: 0.0008547008547008547, 84: 0.0008547008547008547, 85: 0.0008547008547008547, 86: 0.0008547008547008547, 87: 0.0017094017094017094, 88: 0.0017094017094017094, 89: 0.0008547008547008547, 90: 0.0008547008547008547, 91: 0.0008547008547008547, 92: 0.0008547008547008547, 93: 0.0008547008547008547, 94: 0.0008547008547008547, 95: 0.0008547008547008547, 96: 0.0008547008547008547, 97: 0.0008547008547008547, 98: 0.0008547008547008547, 99: 0.0008547008547008547}
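P_Y is simply item Y's share of all interactions. As a sanity check (each user rates a movie at most once here, so interactions and distinct buyers coincide), the value for the remapped item 0 can be reproduced directly:
# Illustrative check: item 0 (original id 780) accounts for 202 of the 1170 interactions
n_item_0 = (interactions_df['item_id'] == 0).sum()
print(n_item_0 / n_interactions)  # 0.17264957... == p_y[0]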
For every X calculate the E[Y|X]
e_xy = np.zeros(shape=(n_items, n_items))
e_xy[:][:] = -1e100
items = interactions_df['item_id'].unique()
p_y_powers = {}
for y in items:
    p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
print("p_y_powers for the first item")
print(p_y_powers[0])
for x in items:
    # Get users who bought X
    c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()

    # Get users who bought only X
    c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
    c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))

    # Calculate the number of non-X interactions for each user who bought X
    # Include users with zero non-X interactions
    n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
    n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
    # Unnecessary, but added for readability
    n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})

    zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x)  # Remove
    n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
    n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]

    # Calculate the expected numbers of Y products bought by clients who bought X
    alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
                                for abs_c in n_non_x_interactions["n_items"]])
                        for k in range(1, max_interactions + 1)])

    if x == 0:
        print("alpha_k")
        print(alpha_k)
        print()

    for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y
        if y != x:
            e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
        else:
            e_xy[x][y] = n_users * p_y[x]
print("E[Y|X]")
print(np.around(e_xy[:10, :10], 3))
p_y_powers for the first item
[1.726e-01 2.981e-02 5.146e-03 8.885e-04 1.534e-04 2.648e-05 4.573e-06 7.894e-07 1.363e-07 2.353e-08 4.063e-09 7.014e-10 1.211e-10 2.091e-11 3.610e-12 6.232e-13 1.076e-13 1.858e-14 3.207e-15 5.537e-16 9.560e-17 1.651e-17 2.850e-18 4.920e-19 8.494e-20 1.467e-20 2.532e-21 4.372e-22 7.547e-23 1.303e-23 2.250e-24]

alpha_k
[ 6.290e+02 -2.785e+03 1.408e+04 -6.937e+04 3.018e+05 -1.120e+06 3.530e+06 -9.507e+06 2.202e+07 -4.418e+07 7.716e+07 -1.179e+08 1.579e+08 -1.860e+08 1.928e+08 -1.759e+08 1.413e+08 -9.962e+07 6.154e+07 -3.315e+07 1.549e+07 -6.230e+06 2.134e+06 -6.142e+05 1.458e+05 -2.778e+04 4.088e+03 -4.360e+02 3.000e+01 -1.000e+00 0.000e+00]

E[Y|X]
[[65.262 26.076 9.065 3.154 4.68 ]
 [28.303 19.062 4.288 1.5 2.223]
 [10.216 5.074 5.815 0.712 1.046]
 [ 2.315 0.859 0.283 1.938 0.144]
 [ 4.526 2.47 0.999 0.366 2.908]]
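Reading the cell above, e_xy[x][y] accumulates the expected number of customers who bought X that would also buy Y if each of their other purchases were drawn independently with probability P_Y. With C_X the set of customers who bought X and |c| the number of non-X interactions of customer c, the binomial expansion below is exactly what alpha_k and p_y_powers implement:

$$E[Y|X] \;=\; \sum_{c \in C_X} \left(1 - (1 - P_Y)^{|c|}\right) \;=\; \sum_{k=1}^{k_{\max}} \underbrace{\left(\sum_{c \in C_X} (-1)^{k+1} \binom{|c|}{k}\right)}_{\alpha_k} P_Y^{\,k}$$

where k_max is the maximal number of interactions per user (31 here). The diagonal entries are instead set to n_users * P_X, i.e. the expected number of buyers of X among all users.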
Get the user-item interaction matrix
# mapping to int is necessary because of how iterrows works
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
    r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
print(r[:10, :10])
[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 1. 1. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 1. 1. 1. 1.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]
 [0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
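Iterating with iterrows is the slowest way to fill this matrix; an equivalent vectorized fill (a sketch, assuming the remapped ids are non-negative integers, which they are after the mapping step) gives the same result:
# Vectorized construction of the same binary user-item matrix
r = np.zeros(shape=(n_users, n_items))
r[interactions_df['user_id'].to_numpy(dtype=int),
  interactions_df['item_id'].to_numpy(dtype=int)] = 1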
Calculate the number of users who bought both X and Y
# Simple and slow method (commented out)
# n_xy = np.zeros(shape=(n_items, n_items))
# for x in items:
# for y in items:
# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
# users_x_and_y = users_x & users_y
# n_xy[x][y] = len(users_x_and_y)
# Optimized method (can be further optimized by using sparse matrices)
n_xy = np.matmul(r.T, r)
print(n_xy[:10, :10])
[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]
 [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]
 [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]
 [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]
 [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]
 [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]
 [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]
 [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]
 [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]
 [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 63.]]
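As the comment suggests, the co-occurrence counts can also be obtained without materializing the dense matrix. A minimal sparse sketch (assuming scipy is available and, as here, at most one interaction per (user, item) pair):
import scipy.sparse as sp

# Sparse variant: same values as np.matmul(r.T, r)
r_sparse = sp.csr_matrix(
    (np.ones(len(interactions_df)),
     (interactions_df['user_id'].to_numpy(dtype=int),
      interactions_df['item_id'].to_numpy(dtype=int))),
    shape=(n_users, n_items))
n_xy_sparse = (r_sparse.T @ r_sparse).toarray()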
Calculate the scores
scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)
print(np.around(scores[:10, :10], 3))
[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]
 [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]
 [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]
 [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]
 [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]
 [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]
 [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]
 [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]
 [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]
 [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]
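Each score is the standardized difference between the observed and expected co-purchase counts, (N(X, Y) - E[Y|X]) / sqrt(E[Y|X]). For example, the printed 1.552 for X=0, Y=1 can be reproduced from its ingredients:
# Reproduce scores[0, 1]: (34 - 26.076) / sqrt(26.076) ~= 1.552
print((n_xy[0, 1] - e_xy[0, 1]) / np.sqrt(e_xy[0, 1]))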
Final comparison
print("E[Y|X]")
print(np.around(e_xy[:10, :10], 3))
print()
print("N(X, Y)")
print(n_xy[:10, :10])
print()
print("Scores")
print(np.around(scores[:10, :10], 3))
print()
E[Y|X]
[[65.262 26.076 9.065 3.154 4.68 41.571 23.082 8.592 19.542 27.522]
 [28.303 19.062 4.288 1.5 2.223 18.99 10.768 4.066 9.15 12.778]
 [10.216 5.074 5.815 0.712 1.046 7.386 4.577 1.872 3.964 5.308]
 [ 2.315 0.859 0.283 1.938 0.144 1.433 0.754 0.267 0.631 0.911]
 [ 4.526 2.47 0.999 0.366 2.908 3.453 2.245 0.951 1.962 2.574]
 [47.984 20.534 7.279 2.549 3.776 34.569 18.241 6.902 15.507 21.636]
 [25.303 11.206 4.05 1.429 2.112 17.265 16.477 3.843 8.524 11.789]
 [ 9.094 4.124 1.561 0.561 0.826 6.205 3.701 5.492 3.186 4.326]
 [21.633 9.823 3.601 1.276 1.884 14.955 8.776 3.417 13.569 10.322]
 [25.03 10.257 3.571 1.243 1.844 16.332 9.082 3.385 7.691 20.354]]

N(X, Y)
[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]
 [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]
 [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]
 [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]
 [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]
 [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]
 [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]
 [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]
 [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]
 [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 63.]]

Scores
[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]
 [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]
 [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]
 [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]
 [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]
 [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]
 [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]
 [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]
 [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]
 [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]
Inner workings of the Amazon recommender recommend method
user_id = 1
should_recommend_already_bought = False
n_recommendations = 10
mapped_user_id = user_id_mapping[user_id]
x_list = interactions_df.loc[interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
final_scores = np.sum(scores[x_list], axis=0)
# Choose n recommendations based on highest scores
if not should_recommend_already_bought:
    final_scores[x_list] = -1e100

chosen_ids = np.argsort(-final_scores)[:n_recommendations]

for item_id in chosen_ids:
    print("Recommendation: {}, {}, {}".format(user_id_reverse_mapping[mapped_user_id],
                                              ml_movies_df.loc[ml_movies_df['item_id'] == item_id_reverse_mapping[item_id],
                                                               'title'].iloc[0],
                                              final_scores[item_id]))
Recommendation: 1, Brick (2005), 6.122652596595853
Recommendation: 1, Oh, God! (1977), 5.908857666844879
Recommendation: 1, Bubba Ho-tep (2002), 5.830666625469312
Recommendation: 1, Meatballs (1979), 5.56930833865894
Recommendation: 1, Millennium Actress (Sennen joyû) (2001), 5.502504256363742
Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393
Recommendation: 1, Six-String Samurai (1998), 5.225652131462832
Recommendation: 1, Grass Is Greener, The (1960), 5.144470412494206
Recommendation: 1, Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), 4.796473011676857
Recommendation: 1, Clara's Heart (1988), 4.608515964550741
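A full argsort over all item scores is fine for 100 items; for larger catalogs the same top-n can be retrieved with a partial sort. A sketch of the equivalent selection:
# Same top-n as np.argsort(-final_scores)[:n_recommendations], without sorting the whole array
top_unsorted = np.argpartition(-final_scores, n_recommendations)[:n_recommendations]
chosen_ids = top_unsorted[np.argsort(-final_scores[top_unsorted])]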
Amazon recommender
from recommenders.recommender import Recommender
class AmazonRecommender(Recommender):
    """
    Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:
    - Linden G., Smith B., York J., Amazon.com Recommendations: Item-to-Item Collaborative Filtering,
      IEEE Internet Computing, 2003,
    - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.
    """

    def __init__(self):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.e_xy = None
        self.n_xy = None
        self.scores = None
        self.most_popular_items = None
        self.should_recommend_already_bought = False

    def initialize(self, **params):
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns.
        """

        # Shift item ids and user ids so that they are consecutive
        unique_item_ids = interactions_df['item_id'].unique()
        self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
        self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
        self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

        interactions_df = interactions_df.copy()
        interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)

        # Get the number of items and users
        self.interactions_df = interactions_df
        n_items = np.max(interactions_df['item_id']) + 1
        n_users = np.max(interactions_df['user_id']) + 1

        # Get maximal number of interactions
        n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
        # Unnecessary, but added for readability
        n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
        max_interactions = n_user_interactions['n_items'].max()

        # Calculate P_Y's
        n_interactions = len(interactions_df)
        p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
        p_y = p_y.rename(columns={'user_id': 'P_Y'})
        p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
        p_y = dict(zip(p_y['item_id'], p_y['P_Y']))

        # Get the series of all items
        # items = list(range(n_items))
        items = interactions_df['item_id'].unique()

        # For every X calculate the E[Y|X]
        e_xy = np.zeros(shape=(n_items, n_items))
        e_xy[:][:] = -1e100

        p_y_powers = {}
        for y in items:
            p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])

        for x in items:
            # Get users who bought X
            c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()

            # Get users who bought only X
            c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
            c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))

            # Calculate the number of non-X interactions for each user who bought X
            # Include users with zero non-X interactions
            n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
            n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
            # Unnecessary, but added for readability
            n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})

            zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x)  # Remove
            n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
            n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]

            # Calculate the expected numbers of Y products bought by clients who bought X
            alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
                                        for abs_c in n_non_x_interactions["n_items"]])
                                for k in range(1, max_interactions + 1)])

            for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y
                if y != x:
                    e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
                else:
                    e_xy[x][y] = n_users * p_y[x]

        self.e_xy = e_xy

        # Calculate the number of users who bought both X and Y

        # Simple and slow method (commented out)
        # n_xy = np.zeros(shape=(n_items, n_items))
        # for x in items:
        #     for y in items:
        #         users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
        #         users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
        #         users_x_and_y = users_x & users_y
        #         n_xy[x][y] = len(users_x_and_y)

        # Optimized method (can be further optimized by using sparse matrices)
        # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
        r = np.zeros(shape=(n_users, n_items))
        for idx, interaction in interactions_df.iterrows():
            r[int(interaction['user_id'])][int(interaction['item_id'])] = 1

        # Get the number of users who bought both X and Y
        n_xy = np.matmul(r.T, r)
        self.n_xy = n_xy

        # Calculate the scores
        self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)

        # Find the most popular items for the cold start problem
        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]

        # Handle users not in the training data

        # Map item ids
        items_df = items_df.copy()
        items_df.replace({'item_id': self.item_id_mapping}, inplace=True)

        # Generate recommendations
        for idx, user in users_df.iterrows():
            recommendations = []

            user_id = user['user_id']

            if user_id in self.user_id_mapping:
                mapped_user_id = self.user_id_mapping[user_id]

                x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                final_scores = np.sum(self.scores[x_list], axis=0)

                # Choose n recommendations based on highest scores
                if not self.should_recommend_already_bought:
                    final_scores[x_list] = -1e100

                chosen_ids = np.argsort(-final_scores)[:n_recommendations]

                for item_id in chosen_ids:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[item_id],
                            'score': final_scores[item_id]
                        }
                    )
            else:  # For new users recommend most popular items
                for i in range(n_recommendations):
                    recommendations.append(
                        {
                            'user_id': user['user_id'],
                            'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
                            'score': 1.0
                        }
                    )

            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df
# Quick test of the recommender
amazon_recommender = AmazonRecommender()
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = amazon_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)
recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))
Recommendations
 | user_id | item_id | score | title | genres |
---|---|---|---|---|---|
0 | 1 | 44761 | 6.122653 | Brick (2005) | Crime|Drama|Film-Noir|Mystery |
1 | 1 | 5214 | 5.908858 | Oh, God! (1977) | Comedy|Fantasy |
2 | 1 | 6755 | 5.830667 | Bubba Ho-tep (2002) | Comedy|Horror |
3 | 1 | 3040 | 5.569308 | Meatballs (1979) | Comedy |
4 | 1 | 6713 | 5.502504 | Millennium Actress (Sennen joyû) (2001) | Animation|Drama|Romance |
5 | 1 | 3614 | 5.387478 | Honeymoon in Vegas (1992) | Comedy|Romance |
6 | 1 | 2275 | 5.225652 | Six-String Samurai (1998) | Action|Adventure|Sci-Fi |
7 | 1 | 4796 | 5.144470 | Grass Is Greener, The (1960) | Comedy|Romance |
8 | 1 | 4896 | 4.796473 | Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) | Adventure|Children|Fantasy |
9 | 1 | 3714 | 4.608516 | Clara's Heart (1988) | Drama |
10 | 4 | 3614 | 7.825335 | Honeymoon in Vegas (1992) | Comedy|Romance |
11 | 4 | 6713 | 7.407051 | Millennium Actress (Sennen joyû) (2001) | Animation|Drama|Romance |
12 | 4 | 2690 | 6.599105 | Ideal Husband, An (1999) | Comedy|Romance |
13 | 4 | 44761 | 6.205835 | Brick (2005) | Crime|Drama|Film-Noir|Mystery |
14 | 4 | 3628 | 6.186298 | Flying Tigers (1942) | Action|Drama|Romance|War |
15 | 4 | 6755 | 5.977848 | Bubba Ho-tep (2002) | Comedy|Horror |
16 | 4 | 959 | 5.919668 | Of Human Bondage (1934) | Drama |
17 | 4 | 31260 | 5.919668 | Boys Town (1938) | Drama |
18 | 4 | 6033 | 5.919668 | Mystery Date (1991) | Comedy |
19 | 4 | 3714 | 5.919668 | Clara's Heart (1988) | Drama |
20 | 6 | 3614 | 11.392962 | Honeymoon in Vegas (1992) | Comedy|Romance |
21 | 6 | 31921 | 8.329693 | Seven-Per-Cent Solution, The (1976) | Adventure|Comedy|Crime|Drama|Mystery|Thriller |
22 | 6 | 1752 | 8.236954 | Hard Rain (1998) | Action|Crime|Thriller |
23 | 6 | 95147 | 8.006113 | Dragon Ball: Sleeping Princess in Devil's Castle (Doragon bôru: Majinjô no nemuri hime) (1987) | Action|Adventure|Animation|Children |
24 | 6 | 2275 | 6.941940 | Six-String Samurai (1998) | Action|Adventure|Sci-Fi |
25 | 6 | 3479 | 6.771276 | Ladyhawke (1985) | Adventure|Fantasy|Romance |
26 | 6 | 6755 | 6.520369 | Bubba Ho-tep (2002) | Comedy|Horror |
27 | 6 | 6537 | 6.454421 | Terminator 3: Rise of the Machines (2003) | Action|Adventure|Sci-Fi |
28 | 6 | 4483 | 6.339894 | Caddyshack II (1988) | Comedy |
29 | 6 | 228 | 6.174734 | Destiny Turns on the Radio (1995) | Comedy |
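The initialize method is the class's only configuration hook. For instance, to let already-rated movies back into the ranking (a sketch reusing the objects defined above):
amazon_recommender = AmazonRecommender()
amazon_recommender.initialize(should_recommend_already_bought=True)
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = amazon_recommender.recommend(pd.DataFrame([[1]], columns=['user_id']), ml_movies_df, 10)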
Training-test split evaluation
from evaluation_and_testing.testing import evaluate_train_test_split_implicit
amazon_recommender = AmazonRecommender()
amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]
amazon_tts_results = pd.DataFrame(
amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(amazon_tts_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.181818 | 0.311688 | 0.402597 | 0.551948 | 0.181818 | 0.257806 | 0.294682 | 0.34147 |
from recommenders.tfidf_recommender import TFIDFRecommender
tfidf_recommender = TFIDFRecommender()
tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]
tfidf_tts_results = pd.DataFrame(
tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(tfidf_tts_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | TFIDFRecommender | 0.025974 | 0.090909 | 0.136364 | 0.318182 | 0.025974 | 0.064393 | 0.083685 | 0.140799 |
tts_results = pd.concat([amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)
display(HTML(tts_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.181818 | 0.311688 | 0.402597 | 0.551948 | 0.181818 | 0.257806 | 0.294682 | 0.341470 |
1 | TFIDFRecommender | 0.025974 | 0.090909 | 0.136364 | 0.318182 | 0.025974 | 0.064393 | 0.083685 | 0.140799 |
Leave-one-out evaluation
from evaluation_and_testing.testing import evaluate_leave_one_out_implicit
amazon_recommender = AmazonRecommender()
amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(
amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]
amazon_loo_results = pd.DataFrame(
amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(amazon_loo_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.166667 | 0.256667 | 0.32 | 0.426667 | 0.166667 | 0.219086 | 0.245486 | 0.279978 |
tfidf_recommender = TFIDFRecommender()
tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]
tfidf_loo_results = pd.DataFrame(
tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(tfidf_loo_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | TFIDFRecommender | 0.006667 | 0.053333 | 0.123333 | 0.233333 | 0.006667 | 0.033491 | 0.062178 | 0.096151 |
loo_results = pd.concat([amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)
display(HTML(loo_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.166667 | 0.256667 | 0.320000 | 0.426667 | 0.166667 | 0.219086 | 0.245486 | 0.279978 |
1 | TFIDFRecommender | 0.006667 | 0.053333 | 0.123333 | 0.233333 | 0.006667 | 0.033491 | 0.062178 | 0.096151 |
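For reference, HR@k is the fraction of evaluation cases whose held-out item appears in the top k recommendations, and NDCG@k additionally rewards ranking it near the top. A minimal per-case sketch (the project's evaluate_* functions may differ in details such as negative sampling and aggregation):
def hr_at_k(ranked_item_ids, held_out_item_id, k):
    # 1 if the held-out item is among the top-k recommendations, else 0
    return int(held_out_item_id in ranked_item_ids[:k])

def ndcg_at_k(ranked_item_ids, held_out_item_id, k):
    # With a single relevant item, IDCG = 1, so NDCG reduces to 1 / log2(rank + 1)
    for rank, item_id in enumerate(ranked_item_ids[:k], start=1):
        if item_id == held_out_item_id:
            return 1.0 / np.log2(rank + 1)
    return 0.0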