%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold
import scipy.special as scisp
# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
Load data
ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')
ml_df.head(10)
display(HTML(ml_movies_df.head(10).to_html()))
# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)
ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]
print("Number of interactions left: {}".format(len(ml_ratings_df)))
 | item_id | title | genres |
---|---|---|---|
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
5 | 6 | Heat (1995) | Action|Crime|Thriller |
6 | 7 | Sabrina (1995) | Comedy|Romance |
7 | 8 | Tom and Huck (1995) | Adventure|Children |
8 | 9 | Sudden Death (1995) | Action |
9 | 10 | GoldenEye (1995) | Action|Adventure|Thriller |
Number of interactions left: 1170
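A quick sanity check (a sketch, not part of the original notebook) that the filter behaved as intended: every remaining rating should refer to one of the 100 sampled movies.
# Illustrative check only: all remaining ratings point at the sampled movie ids
assert ml_ratings_df['item_id'].isin(left_ids).all()
assert ml_movies_df['item_id'].nunique() <= 100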
Inner workings of the Amazon recommender fit method
Shift item ids and user ids so that they are consecutive
interactions_df = ml_ratings_df.copy()
unique_item_ids = interactions_df['item_id'].unique()
item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
unique_user_ids = interactions_df['user_id'].unique()
user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))
interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)
print("Item mapping")
print(item_id_mapping)
print()
print("Item reverse mapping")
print(item_id_reverse_mapping)
print()
print("User mapping")
print(user_id_mapping)
print()
print("User reverse mapping")
print(user_id_reverse_mapping)
print()
display(HTML(interactions_df.head(10).to_html()))
Item mapping {780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, 145: 6, 267: 7, 355: 8, 435: 9, 6502: 10, 73323: 11, 112421: 12, 1783: 13, 2806: 14, 3040: 15, 3551: 16, 2135: 17, 39715: 18, 41566: 19, 5673: 20, 7064: 21, 481: 22, 6537: 23, 44761: 24, 2690: 25, 228: 26, 4890: 27, 3614: 28, 3507: 29, 3628: 30, 5954: 31, 8605: 32, 3786: 33, 6755: 34, 3468: 35, 50601: 36, 3089: 37, 55444: 38, 118270: 39, 124404: 40, 3768: 41, 233: 42, 3687: 43, 171749: 44, 104218: 45, 182749: 46, 3342: 47, 65130: 48, 84952: 49, 152970: 50, 3067: 51, 4031: 52, 1107: 53, 47382: 54, 3801: 55, 5155: 56, 5612: 57, 5214: 58, 67295: 59, 3165: 60, 1752: 61, 31223: 62, 6713: 63, 66783: 64, 2043: 65, 2903: 66, 3313: 67, 4009: 68, 91842: 69, 2190: 70, 7282: 71, 4483: 72, 2275: 73, 3567: 74, 190207: 75, 4505: 76, 95147: 77, 4552: 78, 6033: 79, 2521: 80, 4397: 81, 151315: 82, 156706: 83, 151311: 84, 959: 85, 3714: 86, 4164: 87, 4796: 88, 31260: 89, 6927: 90, 126142: 91, 73804: 92, 26357: 93, 82684: 94, 6342: 95, 32799: 96, 31921: 97, 2892: 98, 2737: 99} Item reverse mapping {0: 780, 1: 1500, 2: 3479, 3: 171, 4: 1914, 5: 4896, 6: 145, 7: 267, 8: 355, 9: 435, 10: 6502, 11: 73323, 12: 112421, 13: 1783, 14: 2806, 15: 3040, 16: 3551, 17: 2135, 18: 39715, 19: 41566, 20: 5673, 21: 7064, 22: 481, 23: 6537, 24: 44761, 25: 2690, 26: 228, 27: 4890, 28: 3614, 29: 3507, 30: 3628, 31: 5954, 32: 8605, 33: 3786, 34: 6755, 35: 3468, 36: 50601, 37: 3089, 38: 55444, 39: 118270, 40: 124404, 41: 3768, 42: 233, 43: 3687, 44: 171749, 45: 104218, 46: 182749, 47: 3342, 48: 65130, 49: 84952, 50: 152970, 51: 3067, 52: 4031, 53: 1107, 54: 47382, 55: 3801, 56: 5155, 57: 5612, 58: 5214, 59: 67295, 60: 3165, 61: 1752, 62: 31223, 63: 6713, 64: 66783, 65: 2043, 66: 2903, 67: 3313, 68: 4009, 69: 91842, 70: 2190, 71: 7282, 72: 4483, 73: 2275, 74: 3567, 75: 190207, 76: 4505, 77: 95147, 78: 4552, 79: 6033, 80: 2521, 81: 4397, 82: 151315, 83: 156706, 84: 151311, 85: 959, 86: 3714, 87: 4164, 88: 4796, 89: 31260, 90: 6927, 91: 126142, 92: 73804, 93: 26357, 94: 82684, 95: 6342, 96: 32799, 97: 31921, 98: 2892, 99: 2737} User mapping {1: 0, 4: 1, 6: 2, 7: 3, 11: 4, 15: 5, 17: 6, 18: 7, 19: 8, 20: 9, 21: 10, 22: 11, 23: 12, 24: 13, 27: 14, 28: 15, 29: 16, 31: 17, 32: 18, 33: 19, 34: 20, 36: 21, 38: 22, 39: 23, 40: 24, 41: 25, 42: 26, 43: 27, 44: 28, 45: 29, 46: 30, 48: 31, 50: 32, 51: 33, 53: 34, 57: 35, 58: 36, 59: 37, 61: 38, 62: 39, 63: 40, 64: 41, 66: 42, 67: 43, 68: 44, 70: 45, 71: 46, 72: 47, 73: 48, 74: 49, 75: 50, 76: 51, 78: 52, 80: 53, 82: 54, 83: 55, 84: 56, 86: 57, 88: 58, 89: 59, 90: 60, 91: 61, 94: 62, 95: 63, 96: 64, 99: 65, 100: 66, 101: 67, 103: 68, 104: 69, 105: 70, 106: 71, 108: 72, 109: 73, 111: 74, 112: 75, 113: 76, 114: 77, 115: 78, 116: 79, 117: 80, 120: 81, 121: 82, 122: 83, 125: 84, 129: 85, 132: 86, 133: 87, 135: 88, 136: 89, 137: 90, 139: 91, 140: 92, 141: 93, 142: 94, 144: 95, 148: 96, 149: 97, 150: 98, 151: 99, 153: 100, 154: 101, 156: 102, 158: 103, 160: 104, 161: 105, 162: 106, 164: 107, 165: 108, 166: 109, 167: 110, 169: 111, 170: 112, 171: 113, 173: 114, 174: 115, 175: 116, 176: 117, 177: 118, 178: 119, 179: 120, 181: 121, 182: 122, 184: 123, 186: 124, 187: 125, 190: 126, 194: 127, 195: 128, 198: 129, 199: 130, 200: 131, 201: 132, 202: 133, 203: 134, 204: 135, 205: 136, 206: 137, 210: 138, 212: 139, 213: 140, 214: 141, 215: 142, 216: 143, 217: 144, 219: 145, 220: 146, 221: 147, 222: 148, 223: 149, 226: 150, 229: 151, 230: 152, 232: 153, 233: 154, 234: 155, 235: 156, 236: 157, 239: 158, 240: 159, 243: 160, 244: 161, 246: 162, 247: 
163, 249: 164, 254: 165, 256: 166, 257: 167, 260: 168, 262: 169, 263: 170, 264: 171, 265: 172, 266: 173, 269: 174, 270: 175, 271: 176, 273: 177, 274: 178, 275: 179, 276: 180, 277: 181, 279: 182, 280: 183, 282: 184, 283: 185, 284: 186, 287: 187, 288: 188, 290: 189, 291: 190, 292: 191, 294: 192, 297: 193, 298: 194, 301: 195, 302: 196, 303: 197, 304: 198, 305: 199, 306: 200, 307: 201, 308: 202, 310: 203, 312: 204, 313: 205, 314: 206, 318: 207, 321: 208, 322: 209, 325: 210, 328: 211, 330: 212, 331: 213, 332: 214, 333: 215, 334: 216, 335: 217, 337: 218, 338: 219, 339: 220, 340: 221, 341: 222, 345: 223, 347: 224, 349: 225, 352: 226, 353: 227, 354: 228, 356: 229, 357: 230, 359: 231, 361: 232, 364: 233, 365: 234, 366: 235, 367: 236, 368: 237, 369: 238, 370: 239, 373: 240, 374: 241, 376: 242, 380: 243, 381: 244, 382: 245, 383: 246, 384: 247, 385: 248, 386: 249, 387: 250, 389: 251, 391: 252, 395: 253, 399: 254, 402: 255, 408: 256, 409: 257, 410: 258, 411: 259, 412: 260, 413: 261, 414: 262, 415: 263, 417: 264, 419: 265, 420: 266, 422: 267, 423: 268, 425: 269, 426: 270, 427: 271, 428: 272, 431: 273, 432: 274, 434: 275, 436: 276, 437: 277, 438: 278, 440: 279, 445: 280, 446: 281, 447: 282, 448: 283, 451: 284, 452: 285, 453: 286, 455: 287, 456: 288, 460: 289, 462: 290, 463: 291, 464: 292, 465: 293, 466: 294, 467: 295, 469: 296, 474: 297, 475: 298, 477: 299, 479: 300, 480: 301, 482: 302, 483: 303, 484: 304, 486: 305, 489: 306, 490: 307, 491: 308, 492: 309, 495: 310, 500: 311, 501: 312, 503: 313, 504: 314, 505: 315, 509: 316, 510: 317, 511: 318, 513: 319, 514: 320, 517: 321, 521: 322, 522: 323, 524: 324, 525: 325, 527: 326, 529: 327, 533: 328, 534: 329, 536: 330, 537: 331, 540: 332, 542: 333, 543: 334, 544: 335, 552: 336, 553: 337, 555: 338, 556: 339, 557: 340, 558: 341, 559: 342, 560: 343, 561: 344, 562: 345, 563: 346, 564: 347, 566: 348, 567: 349, 570: 350, 573: 351, 577: 352, 579: 353, 580: 354, 581: 355, 584: 356, 585: 357, 586: 358, 587: 359, 589: 360, 590: 361, 592: 362, 593: 363, 594: 364, 595: 365, 596: 366, 597: 367, 599: 368, 600: 369, 602: 370, 603: 371, 604: 372, 605: 373, 606: 374, 607: 375, 608: 376, 610: 377} User reverse mapping {0: 1, 1: 4, 2: 6, 3: 7, 4: 11, 5: 15, 6: 17, 7: 18, 8: 19, 9: 20, 10: 21, 11: 22, 12: 23, 13: 24, 14: 27, 15: 28, 16: 29, 17: 31, 18: 32, 19: 33, 20: 34, 21: 36, 22: 38, 23: 39, 24: 40, 25: 41, 26: 42, 27: 43, 28: 44, 29: 45, 30: 46, 31: 48, 32: 50, 33: 51, 34: 53, 35: 57, 36: 58, 37: 59, 38: 61, 39: 62, 40: 63, 41: 64, 42: 66, 43: 67, 44: 68, 45: 70, 46: 71, 47: 72, 48: 73, 49: 74, 50: 75, 51: 76, 52: 78, 53: 80, 54: 82, 55: 83, 56: 84, 57: 86, 58: 88, 59: 89, 60: 90, 61: 91, 62: 94, 63: 95, 64: 96, 65: 99, 66: 100, 67: 101, 68: 103, 69: 104, 70: 105, 71: 106, 72: 108, 73: 109, 74: 111, 75: 112, 76: 113, 77: 114, 78: 115, 79: 116, 80: 117, 81: 120, 82: 121, 83: 122, 84: 125, 85: 129, 86: 132, 87: 133, 88: 135, 89: 136, 90: 137, 91: 139, 92: 140, 93: 141, 94: 142, 95: 144, 96: 148, 97: 149, 98: 150, 99: 151, 100: 153, 101: 154, 102: 156, 103: 158, 104: 160, 105: 161, 106: 162, 107: 164, 108: 165, 109: 166, 110: 167, 111: 169, 112: 170, 113: 171, 114: 173, 115: 174, 116: 175, 117: 176, 118: 177, 119: 178, 120: 179, 121: 181, 122: 182, 123: 184, 124: 186, 125: 187, 126: 190, 127: 194, 128: 195, 129: 198, 130: 199, 131: 200, 132: 201, 133: 202, 134: 203, 135: 204, 136: 205, 137: 206, 138: 210, 139: 212, 140: 213, 141: 214, 142: 215, 143: 216, 144: 217, 145: 219, 146: 220, 147: 221, 148: 222, 149: 223, 150: 226, 151: 229, 152: 230, 153: 232, 154: 233, 155: 234, 156: 
235, 157: 236, 158: 239, 159: 240, 160: 243, 161: 244, 162: 246, 163: 247, 164: 249, 165: 254, 166: 256, 167: 257, 168: 260, 169: 262, 170: 263, 171: 264, 172: 265, 173: 266, 174: 269, 175: 270, 176: 271, 177: 273, 178: 274, 179: 275, 180: 276, 181: 277, 182: 279, 183: 280, 184: 282, 185: 283, 186: 284, 187: 287, 188: 288, 189: 290, 190: 291, 191: 292, 192: 294, 193: 297, 194: 298, 195: 301, 196: 302, 197: 303, 198: 304, 199: 305, 200: 306, 201: 307, 202: 308, 203: 310, 204: 312, 205: 313, 206: 314, 207: 318, 208: 321, 209: 322, 210: 325, 211: 328, 212: 330, 213: 331, 214: 332, 215: 333, 216: 334, 217: 335, 218: 337, 219: 338, 220: 339, 221: 340, 222: 341, 223: 345, 224: 347, 225: 349, 226: 352, 227: 353, 228: 354, 229: 356, 230: 357, 231: 359, 232: 361, 233: 364, 234: 365, 235: 366, 236: 367, 237: 368, 238: 369, 239: 370, 240: 373, 241: 374, 242: 376, 243: 380, 244: 381, 245: 382, 246: 383, 247: 384, 248: 385, 249: 386, 250: 387, 251: 389, 252: 391, 253: 395, 254: 399, 255: 402, 256: 408, 257: 409, 258: 410, 259: 411, 260: 412, 261: 413, 262: 414, 263: 415, 264: 417, 265: 419, 266: 420, 267: 422, 268: 423, 269: 425, 270: 426, 271: 427, 272: 428, 273: 431, 274: 432, 275: 434, 276: 436, 277: 437, 278: 438, 279: 440, 280: 445, 281: 446, 282: 447, 283: 448, 284: 451, 285: 452, 286: 453, 287: 455, 288: 456, 289: 460, 290: 462, 291: 463, 292: 464, 293: 465, 294: 466, 295: 467, 296: 469, 297: 474, 298: 475, 299: 477, 300: 479, 301: 480, 302: 482, 303: 483, 304: 484, 305: 486, 306: 489, 307: 490, 308: 491, 309: 492, 310: 495, 311: 500, 312: 501, 313: 503, 314: 504, 315: 505, 316: 509, 317: 510, 318: 511, 319: 513, 320: 514, 321: 517, 322: 521, 323: 522, 324: 524, 325: 525, 326: 527, 327: 529, 328: 533, 329: 534, 330: 536, 331: 537, 332: 540, 333: 542, 334: 543, 335: 544, 336: 552, 337: 553, 338: 555, 339: 556, 340: 557, 341: 558, 342: 559, 343: 560, 344: 561, 345: 562, 346: 563, 347: 564, 348: 566, 349: 567, 350: 570, 351: 573, 352: 577, 353: 579, 354: 580, 355: 581, 356: 584, 357: 585, 358: 586, 359: 587, 360: 589, 361: 590, 362: 592, 363: 593, 364: 594, 365: 595, 366: 596, 367: 597, 368: 599, 369: 600, 370: 602, 371: 603, 372: 604, 373: 605, 374: 606, 375: 607, 376: 608, 377: 610}
 | user_id | item_id | rating | timestamp |
---|---|---|---|---|
42 | 0 | 0 | 3.0 | 964984086 |
97 | 0 | 1 | 4.0 | 964980985 |
216 | 0 | 2 | 4.0 | 964981725 |
310 | 1 | 3 | 3.0 | 945078428 |
398 | 1 | 1 | 4.0 | 964622830 |
416 | 1 | 4 | 4.0 | 964622714 |
513 | 1 | 5 | 4.0 | 1007574532 |
616 | 2 | 6 | 4.0 | 845553966 |
629 | 2 | 3 | 3.0 | 845555402 |
677 | 2 | 7 | 3.0 | 845554376 |
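The explicit dictionary construction above can also be written more compactly with pandas' factorize, which assigns consecutive codes in order of first appearance. A sketch for the item ids only; the alt_* names are illustrative and not used later in the notebook.
# Compact alternative (sketch): factorize yields the same consecutive re-indexing
item_codes, item_uniques = pd.factorize(ml_ratings_df['item_id'])
alt_item_id_mapping = dict(zip(item_uniques, range(len(item_uniques))))  # original -> consecutive
alt_item_id_reverse_mapping = dict(enumerate(item_uniques))              # consecutive -> original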
Get the number of items and users
n_items = np.max(interactions_df['item_id']) + 1
n_users = np.max(interactions_df['user_id']) + 1
print("n_items={}\nn_users={}".format(n_items, n_users))
n_items=100
n_users=378
Get the maximal number of interactions
n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
# Unnecessary, but added for readability
n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
max_interactions = n_user_interactions['n_items'].max()
print("max_interaction={}".format(max_interactions))
max_interaction=31
Calculate P_Y's
n_interactions = len(interactions_df)
p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
p_y = p_y.rename(columns={'user_id': 'P_Y'})
p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
p_y = dict(zip(p_y['item_id'], p_y['P_Y']))
print(p_y)
{0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, 3: 0.005128205128205128, 4: 0.007692307692307693, 5: 0.09145299145299145, 6: 0.04358974358974359, 7: 0.01452991452991453, 8: 0.035897435897435895, 9: 0.05384615384615385, 10: 0.04957264957264957, 11: 0.004273504273504274, 12: 0.002564102564102564, 13: 0.004273504273504274, 14: 0.007692307692307693, 15: 0.007692307692307693, 16: 0.011111111111111112, 17: 0.009401709401709401, 18: 0.005982905982905983, 19: 0.05299145299145299, 20: 0.028205128205128206, 21: 0.005128205128205128, 22: 0.01623931623931624, 23: 0.038461538461538464, 24: 0.010256410256410256, 25: 0.008547008547008548, 26: 0.002564102564102564, 27: 0.026495726495726495, 28: 0.006837606837606838, 29: 0.01282051282051282, 30: 0.0017094017094017094, 31: 0.018803418803418803, 32: 0.0017094017094017094, 33: 0.003418803418803419, 34: 0.011965811965811967, 35: 0.015384615384615385, 36: 0.007692307692307693, 37: 0.013675213675213675, 38: 0.002564102564102564, 39: 0.0008547008547008547, 40: 0.0008547008547008547, 41: 0.0017094017094017094, 42: 0.010256410256410256, 43: 0.0008547008547008547, 44: 0.0008547008547008547, 45: 0.004273504273504274, 46: 0.0008547008547008547, 47: 0.004273504273504274, 48: 0.004273504273504274, 49: 0.0008547008547008547, 50: 0.003418803418803419, 51: 0.008547008547008548, 52: 0.0017094017094017094, 53: 0.0017094017094017094, 54: 0.003418803418803419, 55: 0.003418803418803419, 56: 0.0008547008547008547, 57: 0.0008547008547008547, 58: 0.003418803418803419, 59: 0.003418803418803419, 60: 0.0017094017094017094, 61: 0.003418803418803419, 62: 0.0008547008547008547, 63: 0.004273504273504274, 64: 0.0017094017094017094, 65: 0.003418803418803419, 66: 0.0017094017094017094, 67: 0.0017094017094017094, 68: 0.0017094017094017094, 69: 0.0017094017094017094, 70: 0.0008547008547008547, 71: 0.0008547008547008547, 72: 0.002564102564102564, 73: 0.004273504273504274, 74: 0.0008547008547008547, 75: 0.0008547008547008547, 76: 0.0008547008547008547, 77: 0.0017094017094017094, 78: 0.002564102564102564, 79: 0.0008547008547008547, 80: 0.0017094017094017094, 81: 0.0017094017094017094, 82: 0.002564102564102564, 83: 0.0008547008547008547, 84: 0.0008547008547008547, 85: 0.0008547008547008547, 86: 0.0008547008547008547, 87: 0.0017094017094017094, 88: 0.0017094017094017094, 89: 0.0008547008547008547, 90: 0.0008547008547008547, 91: 0.0008547008547008547, 92: 0.0008547008547008547, 93: 0.0008547008547008547, 94: 0.0008547008547008547, 95: 0.0008547008547008547, 96: 0.0008547008547008547, 97: 0.0008547008547008547, 98: 0.0008547008547008547, 99: 0.0008547008547008547}
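P_Y is simply item Y's share of all interactions. As a sanity check (each user rates a movie at most once here, so interactions and distinct buyers coincide), the value for the remapped item 0 can be reproduced directly:
# Illustrative check: item 0 (original id 780) accounts for 202 of the 1170 interactions
n_item_0 = (interactions_df['item_id'] == 0).sum()
print(n_item_0 / n_interactions)  # 0.17264957... == p_y[0]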
For every X calculate the E[Y|X]
e_xy = np.zeros(shape=(n_items, n_items))
e_xy[:][:] = -1e100
items = interactions_df['item_id'].unique()
p_y_powers = {}
for y in items:
    p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])
print("p_y_powers for the first item")
print(p_y_powers[0])
for x in items:
    # Get users who bought X
    c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()

    # Get users who bought only X
    c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
    c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))

    # Calculate the number of non-X interactions for each user who bought X
    # Include users with zero non-X interactions
    n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
    n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
    # Unnecessary, but added for readability
    n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})

    zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x)  # Remove
    n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
    n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]

    # Calculate the expected numbers of Y products bought by clients who bought X
    alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
                                for abs_c in n_non_x_interactions["n_items"]])
                        for k in range(1, max_interactions + 1)])

    if x == 0:
        print("alpha_k")
        print(alpha_k)
        print()

    for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y
        if y != x:
            e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
        else:
            e_xy[x][y] = n_users * p_y[x]
print("E[Y|X]")
print(np.around(e_xy[:10, :10], 3))
p_y_powers for the first item
[1.726e-01 2.981e-02 5.146e-03 8.885e-04 1.534e-04 2.648e-05 4.573e-06 7.894e-07 1.363e-07 2.353e-08 4.063e-09 7.014e-10 1.211e-10 2.091e-11 3.610e-12 6.232e-13 1.076e-13 1.858e-14 3.207e-15 5.537e-16 9.560e-17 1.651e-17 2.850e-18 4.920e-19 8.494e-20 1.467e-20 2.532e-21 4.372e-22 7.547e-23 1.303e-23 2.250e-24]

alpha_k
[ 6.290e+02 -2.785e+03 1.408e+04 -6.937e+04 3.018e+05 -1.120e+06 3.530e+06 -9.507e+06 2.202e+07 -4.418e+07 7.716e+07 -1.179e+08 1.579e+08 -1.860e+08 1.928e+08 -1.759e+08 1.413e+08 -9.962e+07 6.154e+07 -3.315e+07 1.549e+07 -6.230e+06 2.134e+06 -6.142e+05 1.458e+05 -2.778e+04 4.088e+03 -4.360e+02 3.000e+01 -1.000e+00 0.000e+00]

E[Y|X]
[[65.262 26.076 9.065 3.154 4.68 ]
 [28.303 19.062 4.288 1.5 2.223]
 [10.216 5.074 5.815 0.712 1.046]
 [ 2.315 0.859 0.283 1.938 0.144]
 [ 4.526 2.47 0.999 0.366 2.908]]
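Reading the cell above, e_xy[x][y] accumulates the expected number of customers who bought X that would also buy Y if each of their other purchases were drawn independently with probability P_Y. With C_X the set of customers who bought X and |c| the number of non-X interactions of customer c, the binomial expansion below is exactly what alpha_k and p_y_powers implement:

$$E[Y|X] \;=\; \sum_{c \in C_X} \left(1 - (1 - P_Y)^{|c|}\right) \;=\; \sum_{k=1}^{k_{\max}} \underbrace{\left(\sum_{c \in C_X} (-1)^{k+1} \binom{|c|}{k}\right)}_{\alpha_k} P_Y^{\,k}$$

where k_max is the maximal number of interactions per user (31 here). The diagonal entries are instead set to n_users * P_X, i.e. the expected number of buyers of X among all users.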
Get the user-item interaction matrix
# mapping to int is necessary because of how iterrows works
r = np.zeros(shape=(n_users, n_items))
for idx, interaction in interactions_df.iterrows():
    r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
print(r[:10, :10])
[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 1. 1. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 1. 1. 1. 1.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]
 [0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
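Iterating with iterrows is the slowest way to fill this matrix; an equivalent vectorized fill (a sketch, assuming the remapped ids are non-negative integers, which they are after the mapping step) gives the same result:
# Vectorized construction of the same binary user-item matrix
r = np.zeros(shape=(n_users, n_items))
r[interactions_df['user_id'].to_numpy(dtype=int),
  interactions_df['item_id'].to_numpy(dtype=int)] = 1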
Calculate the number of users who bought both X and Y
# Simple and slow method (commented out)
# n_xy = np.zeros(shape=(n_items, n_items))
# for x in items:
# for y in items:
# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
# users_x_and_y = users_x & users_y
# n_xy[x][y] = len(users_x_and_y)
# Optimized method (can be further optimized by using sparse matrices)
n_xy = np.matmul(r.T, r)
print(n_xy[:10, :10])
[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]
 [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]
 [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]
 [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]
 [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]
 [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]
 [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]
 [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]
 [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]
 [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 63.]]
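As the comment suggests, the co-occurrence counts can also be obtained without materializing the dense matrix. A minimal sparse sketch (assuming scipy is available and, as here, at most one interaction per (user, item) pair):
import scipy.sparse as sp

# Sparse variant: same values as np.matmul(r.T, r)
r_sparse = sp.csr_matrix(
    (np.ones(len(interactions_df)),
     (interactions_df['user_id'].to_numpy(dtype=int),
      interactions_df['item_id'].to_numpy(dtype=int))),
    shape=(n_users, n_items))
n_xy_sparse = (r_sparse.T @ r_sparse).toarray()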
Calculate the scores
scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)
print(np.around(scores[:10, :10], 3))
[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]
 [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]
 [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]
 [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]
 [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]
 [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]
 [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]
 [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]
 [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]
 [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]
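Each score is the standardized difference between the observed and expected co-purchase counts, (N(X, Y) - E[Y|X]) / sqrt(E[Y|X]). For example, the printed 1.552 for X=0, Y=1 can be reproduced from its ingredients:
# Reproduce scores[0, 1]: (34 - 26.076) / sqrt(26.076) ~= 1.552
print((n_xy[0, 1] - e_xy[0, 1]) / np.sqrt(e_xy[0, 1]))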
Final comparison
print("E[Y|X]")
print(np.around(e_xy[:10, :10], 3))
print()
print("N(X, Y)")
print(n_xy[:10, :10])
print()
print("Scores")
print(np.around(scores[:10, :10], 3))
print()
E[Y|X]
[[65.262 26.076 9.065 3.154 4.68 41.571 23.082 8.592 19.542 27.522]
 [28.303 19.062 4.288 1.5 2.223 18.99 10.768 4.066 9.15 12.778]
 [10.216 5.074 5.815 0.712 1.046 7.386 4.577 1.872 3.964 5.308]
 [ 2.315 0.859 0.283 1.938 0.144 1.433 0.754 0.267 0.631 0.911]
 [ 4.526 2.47 0.999 0.366 2.908 3.453 2.245 0.951 1.962 2.574]
 [47.984 20.534 7.279 2.549 3.776 34.569 18.241 6.902 15.507 21.636]
 [25.303 11.206 4.05 1.429 2.112 17.265 16.477 3.843 8.524 11.789]
 [ 9.094 4.124 1.561 0.561 0.826 6.205 3.701 5.492 3.186 4.326]
 [21.633 9.823 3.601 1.276 1.884 14.955 8.776 3.417 13.569 10.322]
 [25.03 10.257 3.571 1.243 1.844 16.332 9.082 3.385 7.691 20.354]]

N(X, Y)
[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]
 [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]
 [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]
 [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]
 [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]
 [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]
 [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]
 [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]
 [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]
 [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 63.]]

Scores
[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]
 [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]
 [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]
 [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]
 [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]
 [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]
 [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]
 [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]
 [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]
 [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]
Inner workings of the Amazon recommender recommend method
user_id = 1
should_recommend_already_bought = False
n_recommendations = 10
mapped_user_id = user_id_mapping[user_id]
x_list = interactions_df.loc[interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
final_scores = np.sum(scores[x_list], axis=0)
# Choose n recommendations based on highest scores
if not should_recommend_already_bought:
    final_scores[x_list] = -1e100

chosen_ids = np.argsort(-final_scores)[:n_recommendations]

for item_id in chosen_ids:
    print("Recommendation: {}, {}, {}".format(user_id_reverse_mapping[mapped_user_id],
                                              ml_movies_df.loc[ml_movies_df['item_id'] == item_id_reverse_mapping[item_id],
                                                               'title'].iloc[0],
                                              final_scores[item_id]))
Recommendation: 1, Brick (2005), 6.122652596595853
Recommendation: 1, Oh, God! (1977), 5.908857666844879
Recommendation: 1, Bubba Ho-tep (2002), 5.830666625469312
Recommendation: 1, Meatballs (1979), 5.56930833865894
Recommendation: 1, Millennium Actress (Sennen joyû) (2001), 5.502504256363742
Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393
Recommendation: 1, Six-String Samurai (1998), 5.225652131462832
Recommendation: 1, Grass Is Greener, The (1960), 5.144470412494206
Recommendation: 1, Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), 4.796473011676857
Recommendation: 1, Clara's Heart (1988), 4.608515964550741
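A full argsort over all item scores is fine for 100 items; for larger catalogs the same top-n can be retrieved with a partial sort. A sketch of the equivalent selection:
# Same top-n as np.argsort(-final_scores)[:n_recommendations], without sorting the whole array
top_unsorted = np.argpartition(-final_scores, n_recommendations)[:n_recommendations]
chosen_ids = top_unsorted[np.argsort(-final_scores[top_unsorted])]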
Amazon recommender
from recommenders.recommender import Recommender
class AmazonRecommender(Recommender):
    """
    Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:
    - Linden G., Smith B., York J., Amazon.com Recommendations: Item-to-Item Collaborative Filtering,
      IEEE Internet Computing, 2003,
    - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.
    """

    def __init__(self):
        super().__init__()
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.interactions_df = None
        self.item_id_mapping = None
        self.user_id_mapping = None
        self.item_id_reverse_mapping = None
        self.user_id_reverse_mapping = None
        self.e_xy = None
        self.n_xy = None
        self.scores = None
        self.most_popular_items = None
        self.should_recommend_already_bought = False

    def initialize(self, **params):
        if 'should_recommend_already_bought' in params:
            self.should_recommend_already_bought = params['should_recommend_already_bought']
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by
            user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined
            by item_id and the item feature columns.
        """

        # Shift item ids and user ids so that they are consecutive
        unique_item_ids = interactions_df['item_id'].unique()
        self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))
        self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))
        unique_user_ids = interactions_df['user_id'].unique()
        self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))
        self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))

        interactions_df = interactions_df.copy()
        interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)

        # Get the number of items and users
        self.interactions_df = interactions_df
        n_items = np.max(interactions_df['item_id']) + 1
        n_users = np.max(interactions_df['user_id']) + 1

        # Get maximal number of interactions
        n_user_interactions = interactions_df[['user_id', 'item_id']].groupby("user_id").count()
        # Unnecessary, but added for readability
        n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})
        max_interactions = n_user_interactions['n_items'].max()

        # Calculate P_Y's
        n_interactions = len(interactions_df)
        p_y = interactions_df[['item_id', 'user_id']].groupby("item_id").count().reset_index()
        p_y = p_y.rename(columns={'user_id': 'P_Y'})
        p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions
        p_y = dict(zip(p_y['item_id'], p_y['P_Y']))

        # Get the series of all items
        # items = list(range(n_items))
        items = interactions_df['item_id'].unique()

        # For every X calculate the E[Y|X]
        e_xy = np.zeros(shape=(n_items, n_items))
        e_xy[:][:] = -1e100

        p_y_powers = {}
        for y in items:
            p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])

        for x in items:
            # Get users who bought X
            c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()

            # Get users who bought only X
            c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()
            c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))

            # Calculate the number of non-X interactions for each user who bought X
            # Include users with zero non-X interactions
            n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]
            n_non_x_interactions = n_non_x_interactions.groupby("user_id").count()
            # Unnecessary, but added for readability
            n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})

            zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=["n_items"], index=c_only_x)  # Remove
            n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])
            n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]

            # Calculate the expected numbers of Y products bought by clients who bought X
            alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)
                                        for abs_c in n_non_x_interactions["n_items"]])
                                for k in range(1, max_interactions + 1)])

            for y in items:  # Optimize to use only those Y's which have at least one client who bought both X and Y
                if y != x:
                    e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])
                else:
                    e_xy[x][y] = n_users * p_y[x]

        self.e_xy = e_xy

        # Calculate the number of users who bought both X and Y

        # Simple and slow method (commented out)
        # n_xy = np.zeros(shape=(n_items, n_items))
        # for x in items:
        #     for y in items:
        #         users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())
        #         users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())
        #         users_x_and_y = users_x & users_y
        #         n_xy[x][y] = len(users_x_and_y)

        # Optimized method (can be further optimized by using sparse matrices)
        # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
        r = np.zeros(shape=(n_users, n_items))
        for idx, interaction in interactions_df.iterrows():
            r[int(interaction['user_id'])][int(interaction['item_id'])] = 1

        # Get the number of users who bought both X and Y
        n_xy = np.matmul(r.T, r)
        self.n_xy = n_xy

        # Calculate the scores
        self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)

        # Find the most popular items for the cold start problem
        offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()
        offers_count = offers_count.sort_values('user_id', ascending=False)
        self.most_popular_items = offers_count.index
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which
            recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Clean previous recommendations (iloc could be used alternatively)
        self.recommender_df = self.recommender_df[:0]

        # Handle users not in the training data

        # Map item ids
        items_df = items_df.copy()
        items_df.replace({'item_id': self.item_id_mapping}, inplace=True)

        # Generate recommendations
        for idx, user in users_df.iterrows():
            recommendations = []

            user_id = user['user_id']

            if user_id in self.user_id_mapping:
                mapped_user_id = self.user_id_mapping[user_id]

                x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()
                final_scores = np.sum(self.scores[x_list], axis=0)

                # Choose n recommendations based on highest scores
                if not self.should_recommend_already_bought:
                    final_scores[x_list] = -1e100

                chosen_ids = np.argsort(-final_scores)[:n_recommendations]

                for item_id in chosen_ids:
                    recommendations.append(
                        {
                            'user_id': self.user_id_reverse_mapping[mapped_user_id],
                            'item_id': self.item_id_reverse_mapping[item_id],
                            'score': final_scores[item_id]
                        }
                    )
            else:  # For new users recommend most popular items
                for i in range(n_recommendations):
                    recommendations.append(
                        {
                            'user_id': user['user_id'],
                            'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],
                            'score': 1.0
                        }
                    )

            user_recommendations = pd.DataFrame(recommendations)

            self.recommender_df = pd.concat([self.recommender_df, user_recommendations])

        return self.recommender_df
# Quick test of the recommender
amazon_recommender = AmazonRecommender()
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = amazon_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)
recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))
Recommendations
 | user_id | item_id | score | title | genres |
---|---|---|---|---|---|
0 | 1 | 44761 | 6.122653 | Brick (2005) | Crime|Drama|Film-Noir|Mystery |
1 | 1 | 5214 | 5.908858 | Oh, God! (1977) | Comedy|Fantasy |
2 | 1 | 6755 | 5.830667 | Bubba Ho-tep (2002) | Comedy|Horror |
3 | 1 | 3040 | 5.569308 | Meatballs (1979) | Comedy |
4 | 1 | 6713 | 5.502504 | Millennium Actress (Sennen joyû) (2001) | Animation|Drama|Romance |
5 | 1 | 3614 | 5.387478 | Honeymoon in Vegas (1992) | Comedy|Romance |
6 | 1 | 2275 | 5.225652 | Six-String Samurai (1998) | Action|Adventure|Sci-Fi |
7 | 1 | 4796 | 5.144470 | Grass Is Greener, The (1960) | Comedy|Romance |
8 | 1 | 4896 | 4.796473 | Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) | Adventure|Children|Fantasy |
9 | 1 | 3714 | 4.608516 | Clara's Heart (1988) | Drama |
10 | 4 | 3614 | 7.825335 | Honeymoon in Vegas (1992) | Comedy|Romance |
11 | 4 | 6713 | 7.407051 | Millennium Actress (Sennen joyû) (2001) | Animation|Drama|Romance |
12 | 4 | 2690 | 6.599105 | Ideal Husband, An (1999) | Comedy|Romance |
13 | 4 | 44761 | 6.205835 | Brick (2005) | Crime|Drama|Film-Noir|Mystery |
14 | 4 | 3628 | 6.186298 | Flying Tigers (1942) | Action|Drama|Romance|War |
15 | 4 | 6755 | 5.977848 | Bubba Ho-tep (2002) | Comedy|Horror |
16 | 4 | 959 | 5.919668 | Of Human Bondage (1934) | Drama |
17 | 4 | 31260 | 5.919668 | Boys Town (1938) | Drama |
18 | 4 | 6033 | 5.919668 | Mystery Date (1991) | Comedy |
19 | 4 | 3714 | 5.919668 | Clara's Heart (1988) | Drama |
20 | 6 | 3614 | 11.392962 | Honeymoon in Vegas (1992) | Comedy|Romance |
21 | 6 | 31921 | 8.329693 | Seven-Per-Cent Solution, The (1976) | Adventure|Comedy|Crime|Drama|Mystery|Thriller |
22 | 6 | 1752 | 8.236954 | Hard Rain (1998) | Action|Crime|Thriller |
23 | 6 | 95147 | 8.006113 | Dragon Ball: Sleeping Princess in Devil's Castle (Doragon bôru: Majinjô no nemuri hime) (1987) | Action|Adventure|Animation|Children |
24 | 6 | 2275 | 6.941940 | Six-String Samurai (1998) | Action|Adventure|Sci-Fi |
25 | 6 | 3479 | 6.771276 | Ladyhawke (1985) | Adventure|Fantasy|Romance |
26 | 6 | 6755 | 6.520369 | Bubba Ho-tep (2002) | Comedy|Horror |
27 | 6 | 6537 | 6.454421 | Terminator 3: Rise of the Machines (2003) | Action|Adventure|Sci-Fi |
28 | 6 | 4483 | 6.339894 | Caddyshack II (1988) | Comedy |
29 | 6 | 228 | 6.174734 | Destiny Turns on the Radio (1995) | Comedy |
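The initialize method is the class's only configuration hook. For instance, to let already-rated movies back into the ranking (a sketch reusing the objects defined above):
amazon_recommender = AmazonRecommender()
amazon_recommender.initialize(should_recommend_already_bought=True)
amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = amazon_recommender.recommend(pd.DataFrame([[1]], columns=['user_id']), ml_movies_df, 10)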
Training-test split evaluation
from evaluation_and_testing.testing import evaluate_train_test_split_implicit
amazon_recommender = AmazonRecommender()
amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]
amazon_tts_results = pd.DataFrame(
amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(amazon_tts_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.181818 | 0.311688 | 0.402597 | 0.551948 | 0.181818 | 0.257806 | 0.294682 | 0.34147 |
from recommenders.tfidf_recommender import TFIDFRecommender
tfidf_recommender = TFIDFRecommender()
tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]
tfidf_tts_results = pd.DataFrame(
tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(tfidf_tts_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | TFIDFRecommender | 0.025974 | 0.090909 | 0.136364 | 0.318182 | 0.025974 | 0.064393 | 0.083685 | 0.140799 |
tts_results = pd.concat([amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)
display(HTML(tts_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.181818 | 0.311688 | 0.402597 | 0.551948 | 0.181818 | 0.257806 | 0.294682 | 0.341470 |
1 | TFIDFRecommender | 0.025974 | 0.090909 | 0.136364 | 0.318182 | 0.025974 | 0.064393 | 0.083685 | 0.140799 |
Leave-one-out evaluation
from evaluation_and_testing.testing import evaluate_leave_one_out_implicit
amazon_recommender = AmazonRecommender()
amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(
amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]
amazon_loo_results = pd.DataFrame(
amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(amazon_loo_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.166667 | 0.256667 | 0.32 | 0.426667 | 0.166667 | 0.219086 | 0.245486 | 0.279978 |
tfidf_recommender = TFIDFRecommender()
tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]
tfidf_loo_results = pd.DataFrame(
tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
display(HTML(tfidf_loo_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | TFIDFRecommender | 0.006667 | 0.053333 | 0.123333 | 0.233333 | 0.006667 | 0.033491 | 0.062178 | 0.096151 |
loo_results = pd.concat([amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)
display(HTML(loo_results.to_html()))
 | Recommender | HR@1 | HR@3 | HR@5 | HR@10 | NDCG@1 | NDCG@3 | NDCG@5 | NDCG@10 |
---|---|---|---|---|---|---|---|---|---|
0 | AmazonRecommender | 0.166667 | 0.256667 | 0.320000 | 0.426667 | 0.166667 | 0.219086 | 0.245486 | 0.279978 |
1 | TFIDFRecommender | 0.006667 | 0.053333 | 0.123333 | 0.233333 | 0.006667 | 0.033491 | 0.062178 | 0.096151 |
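For reference, HR@k is the fraction of evaluation cases whose held-out item appears in the top k recommendations, and NDCG@k additionally rewards ranking it near the top. A minimal per-case sketch (the project's evaluate_* functions may differ in details such as negative sampling and aggregation):
def hr_at_k(ranked_item_ids, held_out_item_id, k):
    # 1 if the held-out item is among the top-k recommendations, else 0
    return int(held_out_item_id in ranked_item_ids[:k])

def ndcg_at_k(ranked_item_ids, held_out_item_id, k):
    # With a single relevant item, IDCG = 1, so NDCG reduces to 1 / log2(rank + 1)
    for rank, item_id in enumerate(ranked_item_ids[:k], start=1):
        if item_id == held_out_item_id:
            return 1.0 / np.log2(rank + 1)
    return 0.0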