1720 lines
69 KiB
Plaintext
1720 lines
69 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 112,
|
|
"id": "verified-accommodation",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"The autoreload extension is already loaded. To reload it, use:\n",
|
|
" %reload_ext autoreload\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%matplotlib inline\n",
|
|
"%load_ext autoreload\n",
|
|
"%autoreload 2\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"from IPython.display import Markdown, display, HTML\n",
|
|
"from collections import defaultdict\n",
|
|
"from sklearn.model_selection import KFold\n",
|
|
"import scipy.special as scisp\n",
|
|
"\n",
|
|
"# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)\n",
|
|
"import os\n",
|
|
"os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "educated-tourist",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Load data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 113,
|
|
"id": "looking-feeling",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>item_id</th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>genres</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>Toy Story (1995)</td>\n",
|
|
" <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>Jumanji (1995)</td>\n",
|
|
" <td>Adventure|Children|Fantasy</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>Grumpier Old Men (1995)</td>\n",
|
|
" <td>Comedy|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>Waiting to Exhale (1995)</td>\n",
|
|
" <td>Comedy|Drama|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>Father of the Bride Part II (1995)</td>\n",
|
|
" <td>Comedy</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>Heat (1995)</td>\n",
|
|
" <td>Action|Crime|Thriller</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>7</td>\n",
|
|
" <td>Sabrina (1995)</td>\n",
|
|
" <td>Comedy|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>8</td>\n",
|
|
" <td>Tom and Huck (1995)</td>\n",
|
|
" <td>Adventure|Children</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>9</td>\n",
|
|
" <td>Sudden Death (1995)</td>\n",
|
|
" <td>Action</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>9</th>\n",
|
|
" <td>10</td>\n",
|
|
" <td>GoldenEye (1995)</td>\n",
|
|
" <td>Action|Adventure|Thriller</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Number of interactions left: 1170\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"ml_ratings_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"ratings.csv\")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})\n",
|
|
"ml_movies_df = pd.read_csv(os.path.join(\"data\", \"movielens_small\", \"movies.csv\")).rename(columns={'movieId': 'item_id'})\n",
|
|
"ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')\n",
|
|
"ml_df.head(10)\n",
|
|
"\n",
|
|
"display(HTML(ml_movies_df.head(10).to_html()))\n",
|
|
"\n",
|
|
"# Filter the data to reduce the number of movies\n",
|
|
"seed = 6789\n",
|
|
"rng = np.random.RandomState(seed=seed)\n",
|
|
"left_ids = rng.choice(ml_movies_df['item_id'], size=100, replace=False)\n",
|
|
"\n",
|
|
"ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]\n",
|
|
"ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]\n",
|
|
"ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]\n",
|
|
"\n",
|
|
"print(\"Number of interactions left: {}\".format(len(ml_ratings_df)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "protecting-recognition",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Inner workings of the Amazon recommender fit method"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "plastic-brooklyn",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Shift item ids and user ids so that they are consecutive"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 71,
|
|
"id": "valuable-modem",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Item mapping\n",
|
|
"{780: 0, 1500: 1, 3479: 2, 171: 3, 1914: 4, 4896: 5, 145: 6, 267: 7, 355: 8, 435: 9, 6502: 10, 73323: 11, 112421: 12, 1783: 13, 2806: 14, 3040: 15, 3551: 16, 2135: 17, 39715: 18, 41566: 19, 5673: 20, 7064: 21, 481: 22, 6537: 23, 44761: 24, 2690: 25, 228: 26, 4890: 27, 3614: 28, 3507: 29, 3628: 30, 5954: 31, 8605: 32, 3786: 33, 6755: 34, 3468: 35, 50601: 36, 3089: 37, 55444: 38, 118270: 39, 124404: 40, 3768: 41, 233: 42, 3687: 43, 171749: 44, 104218: 45, 182749: 46, 3342: 47, 65130: 48, 84952: 49, 152970: 50, 3067: 51, 4031: 52, 1107: 53, 47382: 54, 3801: 55, 5155: 56, 5612: 57, 5214: 58, 67295: 59, 3165: 60, 1752: 61, 31223: 62, 6713: 63, 66783: 64, 2043: 65, 2903: 66, 3313: 67, 4009: 68, 91842: 69, 2190: 70, 7282: 71, 4483: 72, 2275: 73, 3567: 74, 190207: 75, 4505: 76, 95147: 77, 4552: 78, 6033: 79, 2521: 80, 4397: 81, 151315: 82, 156706: 83, 151311: 84, 959: 85, 3714: 86, 4164: 87, 4796: 88, 31260: 89, 6927: 90, 126142: 91, 73804: 92, 26357: 93, 82684: 94, 6342: 95, 32799: 96, 31921: 97, 2892: 98, 2737: 99}\n",
|
|
"\n",
|
|
"Item reverse mapping\n",
|
|
"{0: 780, 1: 1500, 2: 3479, 3: 171, 4: 1914, 5: 4896, 6: 145, 7: 267, 8: 355, 9: 435, 10: 6502, 11: 73323, 12: 112421, 13: 1783, 14: 2806, 15: 3040, 16: 3551, 17: 2135, 18: 39715, 19: 41566, 20: 5673, 21: 7064, 22: 481, 23: 6537, 24: 44761, 25: 2690, 26: 228, 27: 4890, 28: 3614, 29: 3507, 30: 3628, 31: 5954, 32: 8605, 33: 3786, 34: 6755, 35: 3468, 36: 50601, 37: 3089, 38: 55444, 39: 118270, 40: 124404, 41: 3768, 42: 233, 43: 3687, 44: 171749, 45: 104218, 46: 182749, 47: 3342, 48: 65130, 49: 84952, 50: 152970, 51: 3067, 52: 4031, 53: 1107, 54: 47382, 55: 3801, 56: 5155, 57: 5612, 58: 5214, 59: 67295, 60: 3165, 61: 1752, 62: 31223, 63: 6713, 64: 66783, 65: 2043, 66: 2903, 67: 3313, 68: 4009, 69: 91842, 70: 2190, 71: 7282, 72: 4483, 73: 2275, 74: 3567, 75: 190207, 76: 4505, 77: 95147, 78: 4552, 79: 6033, 80: 2521, 81: 4397, 82: 151315, 83: 156706, 84: 151311, 85: 959, 86: 3714, 87: 4164, 88: 4796, 89: 31260, 90: 6927, 91: 126142, 92: 73804, 93: 26357, 94: 82684, 95: 6342, 96: 32799, 97: 31921, 98: 2892, 99: 2737}\n",
|
|
"\n",
|
|
"User mapping\n",
|
|
"{1: 0, 4: 1, 6: 2, 7: 3, 11: 4, 15: 5, 17: 6, 18: 7, 19: 8, 20: 9, 21: 10, 22: 11, 23: 12, 24: 13, 27: 14, 28: 15, 29: 16, 31: 17, 32: 18, 33: 19, 34: 20, 36: 21, 38: 22, 39: 23, 40: 24, 41: 25, 42: 26, 43: 27, 44: 28, 45: 29, 46: 30, 48: 31, 50: 32, 51: 33, 53: 34, 57: 35, 58: 36, 59: 37, 61: 38, 62: 39, 63: 40, 64: 41, 66: 42, 67: 43, 68: 44, 70: 45, 71: 46, 72: 47, 73: 48, 74: 49, 75: 50, 76: 51, 78: 52, 80: 53, 82: 54, 83: 55, 84: 56, 86: 57, 88: 58, 89: 59, 90: 60, 91: 61, 94: 62, 95: 63, 96: 64, 99: 65, 100: 66, 101: 67, 103: 68, 104: 69, 105: 70, 106: 71, 108: 72, 109: 73, 111: 74, 112: 75, 113: 76, 114: 77, 115: 78, 116: 79, 117: 80, 120: 81, 121: 82, 122: 83, 125: 84, 129: 85, 132: 86, 133: 87, 135: 88, 136: 89, 137: 90, 139: 91, 140: 92, 141: 93, 142: 94, 144: 95, 148: 96, 149: 97, 150: 98, 151: 99, 153: 100, 154: 101, 156: 102, 158: 103, 160: 104, 161: 105, 162: 106, 164: 107, 165: 108, 166: 109, 167: 110, 169: 111, 170: 112, 171: 113, 173: 114, 174: 115, 175: 116, 176: 117, 177: 118, 178: 119, 179: 120, 181: 121, 182: 122, 184: 123, 186: 124, 187: 125, 190: 126, 194: 127, 195: 128, 198: 129, 199: 130, 200: 131, 201: 132, 202: 133, 203: 134, 204: 135, 205: 136, 206: 137, 210: 138, 212: 139, 213: 140, 214: 141, 215: 142, 216: 143, 217: 144, 219: 145, 220: 146, 221: 147, 222: 148, 223: 149, 226: 150, 229: 151, 230: 152, 232: 153, 233: 154, 234: 155, 235: 156, 236: 157, 239: 158, 240: 159, 243: 160, 244: 161, 246: 162, 247: 163, 249: 164, 254: 165, 256: 166, 257: 167, 260: 168, 262: 169, 263: 170, 264: 171, 265: 172, 266: 173, 269: 174, 270: 175, 271: 176, 273: 177, 274: 178, 275: 179, 276: 180, 277: 181, 279: 182, 280: 183, 282: 184, 283: 185, 284: 186, 287: 187, 288: 188, 290: 189, 291: 190, 292: 191, 294: 192, 297: 193, 298: 194, 301: 195, 302: 196, 303: 197, 304: 198, 305: 199, 306: 200, 307: 201, 308: 202, 310: 203, 312: 204, 313: 205, 314: 206, 318: 207, 321: 208, 322: 209, 325: 210, 328: 211, 330: 212, 331: 213, 332: 214, 333: 215, 334: 216, 335: 
217, 337: 218, 338: 219, 339: 220, 340: 221, 341: 222, 345: 223, 347: 224, 349: 225, 352: 226, 353: 227, 354: 228, 356: 229, 357: 230, 359: 231, 361: 232, 364: 233, 365: 234, 366: 235, 367: 236, 368: 237, 369: 238, 370: 239, 373: 240, 374: 241, 376: 242, 380: 243, 381: 244, 382: 245, 383: 246, 384: 247, 385: 248, 386: 249, 387: 250, 389: 251, 391: 252, 395: 253, 399: 254, 402: 255, 408: 256, 409: 257, 410: 258, 411: 259, 412: 260, 413: 261, 414: 262, 415: 263, 417: 264, 419: 265, 420: 266, 422: 267, 423: 268, 425: 269, 426: 270, 427: 271, 428: 272, 431: 273, 432: 274, 434: 275, 436: 276, 437: 277, 438: 278, 440: 279, 445: 280, 446: 281, 447: 282, 448: 283, 451: 284, 452: 285, 453: 286, 455: 287, 456: 288, 460: 289, 462: 290, 463: 291, 464: 292, 465: 293, 466: 294, 467: 295, 469: 296, 474: 297, 475: 298, 477: 299, 479: 300, 480: 301, 482: 302, 483: 303, 484: 304, 486: 305, 489: 306, 490: 307, 491: 308, 492: 309, 495: 310, 500: 311, 501: 312, 503: 313, 504: 314, 505: 315, 509: 316, 510: 317, 511: 318, 513: 319, 514: 320, 517: 321, 521: 322, 522: 323, 524: 324, 525: 325, 527: 326, 529: 327, 533: 328, 534: 329, 536: 330, 537: 331, 540: 332, 542: 333, 543: 334, 544: 335, 552: 336, 553: 337, 555: 338, 556: 339, 557: 340, 558: 341, 559: 342, 560: 343, 561: 344, 562: 345, 563: 346, 564: 347, 566: 348, 567: 349, 570: 350, 573: 351, 577: 352, 579: 353, 580: 354, 581: 355, 584: 356, 585: 357, 586: 358, 587: 359, 589: 360, 590: 361, 592: 362, 593: 363, 594: 364, 595: 365, 596: 366, 597: 367, 599: 368, 600: 369, 602: 370, 603: 371, 604: 372, 605: 373, 606: 374, 607: 375, 608: 376, 610: 377}\n",
|
|
"\n",
|
|
"User reverse mapping\n",
|
|
"{0: 1, 1: 4, 2: 6, 3: 7, 4: 11, 5: 15, 6: 17, 7: 18, 8: 19, 9: 20, 10: 21, 11: 22, 12: 23, 13: 24, 14: 27, 15: 28, 16: 29, 17: 31, 18: 32, 19: 33, 20: 34, 21: 36, 22: 38, 23: 39, 24: 40, 25: 41, 26: 42, 27: 43, 28: 44, 29: 45, 30: 46, 31: 48, 32: 50, 33: 51, 34: 53, 35: 57, 36: 58, 37: 59, 38: 61, 39: 62, 40: 63, 41: 64, 42: 66, 43: 67, 44: 68, 45: 70, 46: 71, 47: 72, 48: 73, 49: 74, 50: 75, 51: 76, 52: 78, 53: 80, 54: 82, 55: 83, 56: 84, 57: 86, 58: 88, 59: 89, 60: 90, 61: 91, 62: 94, 63: 95, 64: 96, 65: 99, 66: 100, 67: 101, 68: 103, 69: 104, 70: 105, 71: 106, 72: 108, 73: 109, 74: 111, 75: 112, 76: 113, 77: 114, 78: 115, 79: 116, 80: 117, 81: 120, 82: 121, 83: 122, 84: 125, 85: 129, 86: 132, 87: 133, 88: 135, 89: 136, 90: 137, 91: 139, 92: 140, 93: 141, 94: 142, 95: 144, 96: 148, 97: 149, 98: 150, 99: 151, 100: 153, 101: 154, 102: 156, 103: 158, 104: 160, 105: 161, 106: 162, 107: 164, 108: 165, 109: 166, 110: 167, 111: 169, 112: 170, 113: 171, 114: 173, 115: 174, 116: 175, 117: 176, 118: 177, 119: 178, 120: 179, 121: 181, 122: 182, 123: 184, 124: 186, 125: 187, 126: 190, 127: 194, 128: 195, 129: 198, 130: 199, 131: 200, 132: 201, 133: 202, 134: 203, 135: 204, 136: 205, 137: 206, 138: 210, 139: 212, 140: 213, 141: 214, 142: 215, 143: 216, 144: 217, 145: 219, 146: 220, 147: 221, 148: 222, 149: 223, 150: 226, 151: 229, 152: 230, 153: 232, 154: 233, 155: 234, 156: 235, 157: 236, 158: 239, 159: 240, 160: 243, 161: 244, 162: 246, 163: 247, 164: 249, 165: 254, 166: 256, 167: 257, 168: 260, 169: 262, 170: 263, 171: 264, 172: 265, 173: 266, 174: 269, 175: 270, 176: 271, 177: 273, 178: 274, 179: 275, 180: 276, 181: 277, 182: 279, 183: 280, 184: 282, 185: 283, 186: 284, 187: 287, 188: 288, 189: 290, 190: 291, 191: 292, 192: 294, 193: 297, 194: 298, 195: 301, 196: 302, 197: 303, 198: 304, 199: 305, 200: 306, 201: 307, 202: 308, 203: 310, 204: 312, 205: 313, 206: 314, 207: 318, 208: 321, 209: 322, 210: 325, 211: 328, 212: 330, 213: 331, 214: 332, 215: 333, 216: 334, 217: 
335, 218: 337, 219: 338, 220: 339, 221: 340, 222: 341, 223: 345, 224: 347, 225: 349, 226: 352, 227: 353, 228: 354, 229: 356, 230: 357, 231: 359, 232: 361, 233: 364, 234: 365, 235: 366, 236: 367, 237: 368, 238: 369, 239: 370, 240: 373, 241: 374, 242: 376, 243: 380, 244: 381, 245: 382, 246: 383, 247: 384, 248: 385, 249: 386, 250: 387, 251: 389, 252: 391, 253: 395, 254: 399, 255: 402, 256: 408, 257: 409, 258: 410, 259: 411, 260: 412, 261: 413, 262: 414, 263: 415, 264: 417, 265: 419, 266: 420, 267: 422, 268: 423, 269: 425, 270: 426, 271: 427, 272: 428, 273: 431, 274: 432, 275: 434, 276: 436, 277: 437, 278: 438, 279: 440, 280: 445, 281: 446, 282: 447, 283: 448, 284: 451, 285: 452, 286: 453, 287: 455, 288: 456, 289: 460, 290: 462, 291: 463, 292: 464, 293: 465, 294: 466, 295: 467, 296: 469, 297: 474, 298: 475, 299: 477, 300: 479, 301: 480, 302: 482, 303: 483, 304: 484, 305: 486, 306: 489, 307: 490, 308: 491, 309: 492, 310: 495, 311: 500, 312: 501, 313: 503, 314: 504, 315: 505, 316: 509, 317: 510, 318: 511, 319: 513, 320: 514, 321: 517, 322: 521, 323: 522, 324: 524, 325: 525, 326: 527, 327: 529, 328: 533, 329: 534, 330: 536, 331: 537, 332: 540, 333: 542, 334: 543, 335: 544, 336: 552, 337: 553, 338: 555, 339: 556, 340: 557, 341: 558, 342: 559, 343: 560, 344: 561, 345: 562, 346: 563, 347: 564, 348: 566, 349: 567, 350: 570, 351: 573, 352: 577, 353: 579, 354: 580, 355: 581, 356: 584, 357: 585, 358: 586, 359: 587, 360: 589, 361: 590, 362: 592, 363: 593, 364: 594, 365: 595, 366: 596, 367: 597, 368: 599, 369: 600, 370: 602, 371: 603, 372: 604, 373: 605, 374: 606, 375: 607, 376: 608, 377: 610}\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>user_id</th>\n",
|
|
" <th>item_id</th>\n",
|
|
" <th>rating</th>\n",
|
|
" <th>timestamp</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>42</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>964984086</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>97</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>964980985</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>216</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>964981725</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>310</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>945078428</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>398</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>964622830</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>416</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>964622714</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>513</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>1007574532</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>616</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>6</td>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>845553966</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>629</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>845555402</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>677</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>7</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>845554376</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"interactions_df = ml_ratings_df.copy()\n",
|
|
"\n",
|
|
"unique_item_ids = interactions_df['item_id'].unique()\n",
|
|
"item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))\n",
|
|
"item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))\n",
|
|
"unique_user_ids = interactions_df['user_id'].unique()\n",
|
|
"user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))\n",
|
|
"user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))\n",
|
|
"\n",
|
|
"interactions_df.replace({'item_id': item_id_mapping, 'user_id': user_id_mapping}, inplace=True)\n",
|
|
"\n",
|
|
"print(\"Item mapping\")\n",
|
|
"print(item_id_mapping)\n",
|
|
"print()\n",
|
|
"\n",
|
|
"print(\"Item reverse mapping\")\n",
|
|
"print(item_id_reverse_mapping)\n",
|
|
"print()\n",
|
|
"\n",
|
|
"print(\"User mapping\")\n",
|
|
"print(user_id_mapping)\n",
|
|
"print()\n",
|
|
"\n",
|
|
"print(\"User reverse mapping\")\n",
|
|
"print(user_id_reverse_mapping)\n",
|
|
"print()\n",
|
|
"\n",
|
|
"display(HTML(interactions_df.head(10).to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "basic-meeting",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Get the number of items and users"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 75,
|
|
"id": "close-massachusetts",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"n_items=100\n",
|
|
"n_users=378\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"n_items = np.max(interactions_df['item_id']) + 1\n",
|
|
"n_users = np.max(interactions_df['user_id']) + 1\n",
|
|
"\n",
|
|
"print(\"n_items={}\\nn_users={}\".format(n_items, n_users))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "permanent-corrections",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Get the maximum number of interactions per user"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 73,
|
|
"id": "peripheral-natural",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"max_interaction=31\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"n_user_interactions = interactions_df[['user_id', 'item_id']].groupby(\"user_id\").count()\n",
|
|
"# Unnecessary, but added for readability\n",
|
|
"n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})\n",
|
|
"max_interactions = n_user_interactions['n_items'].max()\n",
|
|
"\n",
|
|
"print(\"max_interaction={}\".format(max_interactions))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "basic-production",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Calculate P_Y's"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 76,
|
|
"id": "concrete-transparency",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{0: 0.17264957264957265, 1: 0.05042735042735043, 2: 0.015384615384615385, 3: 0.005128205128205128, 4: 0.007692307692307693, 5: 0.09145299145299145, 6: 0.04358974358974359, 7: 0.01452991452991453, 8: 0.035897435897435895, 9: 0.05384615384615385, 10: 0.04957264957264957, 11: 0.004273504273504274, 12: 0.002564102564102564, 13: 0.004273504273504274, 14: 0.007692307692307693, 15: 0.007692307692307693, 16: 0.011111111111111112, 17: 0.009401709401709401, 18: 0.005982905982905983, 19: 0.05299145299145299, 20: 0.028205128205128206, 21: 0.005128205128205128, 22: 0.01623931623931624, 23: 0.038461538461538464, 24: 0.010256410256410256, 25: 0.008547008547008548, 26: 0.002564102564102564, 27: 0.026495726495726495, 28: 0.006837606837606838, 29: 0.01282051282051282, 30: 0.0017094017094017094, 31: 0.018803418803418803, 32: 0.0017094017094017094, 33: 0.003418803418803419, 34: 0.011965811965811967, 35: 0.015384615384615385, 36: 0.007692307692307693, 37: 0.013675213675213675, 38: 0.002564102564102564, 39: 0.0008547008547008547, 40: 0.0008547008547008547, 41: 0.0017094017094017094, 42: 0.010256410256410256, 43: 0.0008547008547008547, 44: 0.0008547008547008547, 45: 0.004273504273504274, 46: 0.0008547008547008547, 47: 0.004273504273504274, 48: 0.004273504273504274, 49: 0.0008547008547008547, 50: 0.003418803418803419, 51: 0.008547008547008548, 52: 0.0017094017094017094, 53: 0.0017094017094017094, 54: 0.003418803418803419, 55: 0.003418803418803419, 56: 0.0008547008547008547, 57: 0.0008547008547008547, 58: 0.003418803418803419, 59: 0.003418803418803419, 60: 0.0017094017094017094, 61: 0.003418803418803419, 62: 0.0008547008547008547, 63: 0.004273504273504274, 64: 0.0017094017094017094, 65: 0.003418803418803419, 66: 0.0017094017094017094, 67: 0.0017094017094017094, 68: 0.0017094017094017094, 69: 0.0017094017094017094, 70: 0.0008547008547008547, 71: 0.0008547008547008547, 72: 0.002564102564102564, 73: 0.004273504273504274, 74: 0.0008547008547008547, 75: 0.0008547008547008547, 76: 
0.0008547008547008547, 77: 0.0017094017094017094, 78: 0.002564102564102564, 79: 0.0008547008547008547, 80: 0.0017094017094017094, 81: 0.0017094017094017094, 82: 0.002564102564102564, 83: 0.0008547008547008547, 84: 0.0008547008547008547, 85: 0.0008547008547008547, 86: 0.0008547008547008547, 87: 0.0017094017094017094, 88: 0.0017094017094017094, 89: 0.0008547008547008547, 90: 0.0008547008547008547, 91: 0.0008547008547008547, 92: 0.0008547008547008547, 93: 0.0008547008547008547, 94: 0.0008547008547008547, 95: 0.0008547008547008547, 96: 0.0008547008547008547, 97: 0.0008547008547008547, 98: 0.0008547008547008547, 99: 0.0008547008547008547}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"n_interactions = len(interactions_df)\n",
|
|
"p_y = interactions_df[['item_id', 'user_id']].groupby(\"item_id\").count().reset_index()\n",
|
|
"p_y = p_y.rename(columns={'user_id': 'P_Y'})\n",
|
|
"p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions\n",
|
|
"p_y = dict(zip(p_y['item_id'], p_y['P_Y']))\n",
|
|
"\n",
|
|
"print(p_y)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "consolidated-constant",
|
|
"metadata": {},
|
|
"source": [
|
|
"## For every X calculate the E[Y|X]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 99,
|
|
"id": "alive-cameroon",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"p_y_powers for the first item\n",
|
|
"[1.726e-01 2.981e-02 5.146e-03 8.885e-04 1.534e-04 2.648e-05 4.573e-06\n",
|
|
" 7.894e-07 1.363e-07 2.353e-08 4.063e-09 7.014e-10 1.211e-10 2.091e-11\n",
|
|
" 3.610e-12 6.232e-13 1.076e-13 1.858e-14 3.207e-15 5.537e-16 9.560e-17\n",
|
|
" 1.651e-17 2.850e-18 4.920e-19 8.494e-20 1.467e-20 2.532e-21 4.372e-22\n",
|
|
" 7.547e-23 1.303e-23 2.250e-24]\n",
|
|
"alpha_k\n",
|
|
"[ 6.290e+02 -2.785e+03 1.408e+04 -6.937e+04 3.018e+05 -1.120e+06\n",
|
|
" 3.530e+06 -9.507e+06 2.202e+07 -4.418e+07 7.716e+07 -1.179e+08\n",
|
|
" 1.579e+08 -1.860e+08 1.928e+08 -1.759e+08 1.413e+08 -9.962e+07\n",
|
|
" 6.154e+07 -3.315e+07 1.549e+07 -6.230e+06 2.134e+06 -6.142e+05\n",
|
|
" 1.458e+05 -2.778e+04 4.088e+03 -4.360e+02 3.000e+01 -1.000e+00\n",
|
|
" 0.000e+00]\n",
|
|
"\n",
|
|
"E[Y|X]\n",
|
|
"[[65.262 26.076 9.065 3.154 4.68 ]\n",
|
|
" [28.303 19.062 4.288 1.5 2.223]\n",
|
|
" [10.216 5.074 5.815 0.712 1.046]\n",
|
|
" [ 2.315 0.859 0.283 1.938 0.144]\n",
|
|
" [ 4.526 2.47 0.999 0.366 2.908]]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"e_xy = np.zeros(shape=(n_items, n_items))\n",
|
|
"e_xy[:][:] = -1e100\n",
|
|
" \n",
|
|
"items = interactions_df['item_id'].unique()\n",
|
|
" \n",
|
|
"p_y_powers = {}\n",
|
|
"for y in items:\n",
|
|
" p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])\n",
|
|
" \n",
|
|
"print(\"p_y_powers for the first item\")\n",
|
|
"print(p_y_powers[0])\n",
|
|
"\n",
|
|
"for x in items:\n",
|
|
" # Get users who bought X\n",
|
|
" c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()\n",
|
|
"\n",
|
|
" # Get users who bought only X\n",
|
|
" c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()\n",
|
|
" c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))\n",
|
|
"\n",
|
|
" # Calculate the number of non-X interactions for each user who bought X\n",
|
|
" # Include users with zero non-X interactions\n",
|
|
" n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]\n",
|
|
" n_non_x_interactions = n_non_x_interactions.groupby(\"user_id\").count()\n",
|
|
" # Unnecessary, but added for readability\n",
|
|
" n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})\n",
|
|
"\n",
|
|
" zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=[\"n_items\"], index=c_only_x) # Remove\n",
|
|
" n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])\n",
|
|
"\n",
|
|
" n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]\n",
|
|
"\n",
|
|
" # Calculate the expected numbers of Y products bought by clients who bought X\n",
|
|
" alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)\n",
|
|
" for abs_c in n_non_x_interactions[\"n_items\"]])\n",
|
|
" for k in range(1, max_interactions + 1)])\n",
|
|
" \n",
|
|
" if x == 0:\n",
|
|
" print(\"alpha_k\")\n",
|
|
" print(alpha_k)\n",
|
|
" print()\n",
|
|
"\n",
|
|
" for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y\n",
|
|
" if y != x:\n",
|
|
" e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])\n",
|
|
" else:\n",
|
|
" e_xy[x][y] = n_users * p_y[x]\n",
|
|
"\n",
|
|
"print(\"E[Y|X]\")\n",
|
|
"print(np.around(e_xy[:10, :10], 3))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "acknowledged-threshold",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Get the user-item interaction matrix"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 89,
|
|
"id": "extraordinary-mexico",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
|
|
" [0. 1. 0. 1. 1. 1. 0. 0. 0. 0.]\n",
|
|
" [1. 0. 0. 1. 0. 0. 1. 1. 1. 1.]\n",
|
|
" [1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
|
|
" [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
|
|
" [1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
|
|
" [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
|
|
" [1. 0. 0. 0. 0. 1. 1. 0. 0. 1.]\n",
|
|
" [0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]\n",
|
|
" [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# mapping to int is necessary because of how iterrows works\n",
|
|
"r = np.zeros(shape=(n_users, n_items))\n",
|
|
"for idx, interaction in interactions_df.iterrows():\n",
|
|
" r[int(interaction['user_id'])][int(interaction['item_id'])] = 1\n",
|
|
" \n",
|
|
"print(r[:10, :10])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "lovely-password",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Calculate the number of users who bought both X and Y"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 91,
|
|
"id": "rubber-detector",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]\n",
|
|
" [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]\n",
|
|
" [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]\n",
|
|
" [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]\n",
|
|
" [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]\n",
|
|
" [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]\n",
|
|
" [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]\n",
|
|
" [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]\n",
|
|
" [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]\n",
|
|
" [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 63.]]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Simple and slow method (commented out)\n",
|
|
"\n",
|
|
"# n_xy = np.zeros(shape=(n_items, n_items))\n",
|
|
"\n",
|
|
"# for x in items:\n",
|
|
"# for y in items:\n",
|
|
"# users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())\n",
|
|
"# users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())\n",
|
|
"# users_x_and_y = users_x & users_y\n",
|
|
"# n_xy[x][y] = len(users_x_and_y)\n",
|
|
"\n",
|
|
"# Optimized method (can be further optimized by using sparse matrices)\n",
|
|
"\n",
|
|
"n_xy = np.matmul(r.T, r)\n",
|
|
"\n",
|
|
"print(n_xy[:10, :10])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "distinguished-consequence",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Calculate the scores"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 97,
|
|
"id": "pointed-deputy",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]\n",
|
|
" [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]\n",
|
|
" [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]\n",
|
|
" [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]\n",
|
|
" [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]\n",
|
|
" [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]\n",
|
|
" [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]\n",
|
|
" [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]\n",
|
|
" [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]\n",
|
|
" [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)\n",
|
|
"\n",
|
|
"print(np.around(scores[:10, :10], 3))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "endangered-stomach",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Final comparison"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 103,
|
|
"id": "prepared-fraction",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"E[Y|X]\n",
|
|
"[[65.262 26.076 9.065 3.154 4.68 41.571 23.082 8.592 19.542 27.522]\n",
|
|
" [28.303 19.062 4.288 1.5 2.223 18.99 10.768 4.066 9.15 12.778]\n",
|
|
" [10.216 5.074 5.815 0.712 1.046 7.386 4.577 1.872 3.964 5.308]\n",
|
|
" [ 2.315 0.859 0.283 1.938 0.144 1.433 0.754 0.267 0.631 0.911]\n",
|
|
" [ 4.526 2.47 0.999 0.366 2.908 3.453 2.245 0.951 1.962 2.574]\n",
|
|
" [47.984 20.534 7.279 2.549 3.776 34.569 18.241 6.902 15.507 21.636]\n",
|
|
" [25.303 11.206 4.05 1.429 2.112 17.265 16.477 3.843 8.524 11.789]\n",
|
|
" [ 9.094 4.124 1.561 0.561 0.826 6.205 3.701 5.492 3.186 4.326]\n",
|
|
" [21.633 9.823 3.601 1.276 1.884 14.955 8.776 3.417 13.569 10.322]\n",
|
|
" [25.03 10.257 3.571 1.243 1.844 16.332 9.082 3.385 7.691 20.354]]\n",
|
|
"\n",
|
|
"N(X, Y)\n",
|
|
"[[202. 34. 15. 3. 3. 66. 36. 10. 25. 34.]\n",
|
|
" [ 34. 59. 6. 2. 5. 24. 12. 4. 8. 12.]\n",
|
|
" [ 15. 6. 18. 1. 2. 7. 3. 4. 6. 5.]\n",
|
|
" [ 3. 2. 1. 6. 1. 1. 1. 1. 2. 2.]\n",
|
|
" [ 3. 5. 2. 1. 9. 3. 2. 1. 1. 0.]\n",
|
|
" [ 66. 24. 7. 1. 3. 107. 20. 5. 16. 18.]\n",
|
|
" [ 36. 12. 3. 1. 2. 20. 51. 8. 16. 17.]\n",
|
|
" [ 10. 4. 4. 1. 1. 5. 8. 17. 8. 10.]\n",
|
|
" [ 25. 8. 6. 2. 1. 16. 16. 8. 42. 23.]\n",
|
|
" [ 34. 12. 5. 2. 0. 18. 17. 10. 23. 63.]]\n",
|
|
"\n",
|
|
"Scores\n",
|
|
"[[16.926 1.552 1.971 -0.087 -0.777 3.789 2.689 0.48 1.235 1.235]\n",
|
|
" [ 1.071 9.148 0.827 0.408 1.863 1.15 0.376 -0.033 -0.38 -0.218]\n",
|
|
" [ 1.497 0.411 5.053 0.341 0.932 -0.142 -0.737 1.555 1.023 -0.134]\n",
|
|
" [ 0.451 1.23 1.349 2.917 2.259 -0.361 0.284 1.417 1.724 1.141]\n",
|
|
" [-0.717 1.61 1.002 1.048 3.573 -0.244 -0.164 0.051 -0.687 -1.604]\n",
|
|
" [ 2.601 0.765 -0.103 -0.97 -0.399 12.319 0.412 -0.724 0.125 -0.782]\n",
|
|
" [ 2.127 0.237 -0.522 -0.359 -0.077 0.658 8.505 2.121 2.561 1.518]\n",
|
|
" [ 0.3 -0.061 1.952 0.585 0.192 -0.484 2.235 4.91 2.697 2.728]\n",
|
|
" [ 0.724 -0.582 1.265 0.641 -0.644 0.27 2.439 2.479 7.718 3.946]\n",
|
|
" [ 1.793 0.544 0.756 0.679 -1.358 0.413 2.627 3.596 5.52 9.453]]\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(\"E[Y|X]\")\n",
|
|
"print(np.around(e_xy[:10, :10], 3))\n",
|
|
"print()\n",
|
|
"\n",
|
|
"print(\"N(X, Y)\")\n",
|
|
"print(n_xy[:10, :10])\n",
|
|
"print()\n",
|
|
"\n",
|
|
"print(\"Scores\")\n",
|
|
"print(np.around(scores[:10, :10], 3))\n",
|
|
"print()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "distant-archive",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Inner workings of the Amazon recommender recommend method"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 111,
|
|
"id": "aerial-shipping",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Recommendation: 1, Brick (2005), 6.122652596595853\n",
|
|
"Recommendation: 1, Oh, God! (1977), 5.908857666844879\n",
|
|
"Recommendation: 1, Bubba Ho-tep (2002), 5.830666625469312\n",
|
|
"Recommendation: 1, Meatballs (1979), 5.56930833865894\n",
|
|
"Recommendation: 1, Millennium Actress (Sennen joyû) (2001), 5.502504256363742\n",
|
|
"Recommendation: 1, Honeymoon in Vegas (1992), 5.387478215471393\n",
|
|
"Recommendation: 1, Six-String Samurai (1998), 5.225652131462832\n",
|
|
"Recommendation: 1, Grass Is Greener, The (1960), 5.144470412494206\n",
|
|
"Recommendation: 1, Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), 4.796473011676857\n",
|
|
"Recommendation: 1, Clara's Heart (1988), 4.608515964550741\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"user_id = 1\n",
|
|
"should_recommend_already_bought = False\n",
|
|
"n_recommendations = 10\n",
|
|
"\n",
|
|
"mapped_user_id = user_id_mapping[user_id]\n",
|
|
"\n",
|
|
"x_list = interactions_df.loc[interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()\n",
|
|
"final_scores = np.sum(scores[x_list], axis=0)\n",
|
|
"\n",
|
|
"# Choose n recommendations based on highest scores\n",
|
|
"if not should_recommend_already_bought:\n",
|
|
" final_scores[x_list] = -1e100\n",
|
|
"\n",
|
|
"chosen_ids = np.argsort(-final_scores)[:n_recommendations]\n",
|
|
"\n",
|
|
"for item_id in chosen_ids:\n",
|
|
" print(\"Recommendation: {}, {}, {}\".format(user_id_reverse_mapping[mapped_user_id],\n",
|
|
" ml_movies_df.loc[ml_movies_df['item_id'] == item_id_reverse_mapping[item_id], \n",
|
|
" 'title'].iloc[0],\n",
|
|
" final_scores[item_id]))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "opponent-prediction",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Amazon recommender"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 48,
|
|
"id": "fancy-return",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from recommenders.recommender import Recommender\n",
|
|
"\n",
|
|
"class AmazonRecommender(Recommender):\n",
|
|
" \"\"\"\n",
|
|
" Basic item-to-item collaborative filtering algorithm used in Amazon.com as described in:\n",
|
|
" - Linden G., Smith B., York Y., Amazon.com Recommendations. Item-to-Item Collaborative Filtering,\n",
|
|
" IEEE Internet Computing, 2003,\n",
|
|
" - Smith B., Linden G., Two Decades of Recommender Systems at Amazon.com, IEEE Internet Computing, 2017.\n",
|
|
" \"\"\"\n",
|
|
"\n",
|
|
" def __init__(self):\n",
|
|
" super().__init__()\n",
|
|
" self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])\n",
|
|
" self.interactions_df = None\n",
|
|
" self.item_id_mapping = None\n",
|
|
" self.user_id_mapping = None\n",
|
|
" self.item_id_reverse_mapping = None\n",
|
|
" self.user_id_reverse_mapping = None\n",
|
|
" self.e_xy = None\n",
|
|
" self.n_xy = None\n",
|
|
" self.scores = None\n",
|
|
" self.most_popular_items = None\n",
|
|
" self.should_recommend_already_bought = False\n",
|
|
"\n",
|
|
" def initialize(self, **params):\n",
|
|
" if 'should_recommend_already_bought' in params:\n",
|
|
" self.should_recommend_already_bought = params['should_recommend_already_bought']\n",
|
|
"\n",
|
|
" def fit(self, interactions_df, users_df, items_df):\n",
|
|
" \"\"\"\n",
|
|
" Training of the recommender.\n",
|
|
"\n",
|
|
" :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items\n",
|
|
" defined by user_id, item_id and features of the interaction.\n",
|
|
" :param pd.DataFrame users_df: DataFrame with users and their features defined by\n",
|
|
" user_id and the user feature columns.\n",
|
|
" :param pd.DataFrame items_df: DataFrame with items and their features defined\n",
|
|
" by item_id and the item feature columns.\n",
|
|
" \"\"\"\n",
|
|
"\n",
|
|
" # Shift item ids and user ids so that they are consecutive\n",
|
|
"\n",
|
|
" unique_item_ids = interactions_df['item_id'].unique()\n",
|
|
" self.item_id_mapping = dict(zip(unique_item_ids, list(range(len(unique_item_ids)))))\n",
|
|
" self.item_id_reverse_mapping = dict(zip(list(range(len(unique_item_ids))), unique_item_ids))\n",
|
|
" unique_user_ids = interactions_df['user_id'].unique()\n",
|
|
" self.user_id_mapping = dict(zip(unique_user_ids, list(range(len(unique_user_ids)))))\n",
|
|
" self.user_id_reverse_mapping = dict(zip(list(range(len(unique_user_ids))), unique_user_ids))\n",
|
|
" \n",
|
|
" interactions_df = interactions_df.copy()\n",
|
|
" interactions_df.replace({'item_id': self.item_id_mapping, 'user_id': self.user_id_mapping}, inplace=True)\n",
|
|
"\n",
|
|
" # Get the number of items and users\n",
|
|
"\n",
|
|
" self.interactions_df = interactions_df\n",
|
|
" n_items = np.max(interactions_df['item_id']) + 1\n",
|
|
" n_users = np.max(interactions_df['user_id']) + 1\n",
|
|
"\n",
|
|
" # Get maximal number of interactions\n",
|
|
"\n",
|
|
" n_user_interactions = interactions_df[['user_id', 'item_id']].groupby(\"user_id\").count()\n",
|
|
" # Unnecessary, but added for readability\n",
|
|
" n_user_interactions = n_user_interactions.rename(columns={'item_id': 'n_items'})\n",
|
|
" max_interactions = n_user_interactions['n_items'].max()\n",
|
|
"\n",
|
|
" # Calculate P_Y's\n",
|
|
"\n",
|
|
" n_interactions = len(interactions_df)\n",
|
|
" p_y = interactions_df[['item_id', 'user_id']].groupby(\"item_id\").count().reset_index()\n",
|
|
" p_y = p_y.rename(columns={'user_id': 'P_Y'})\n",
|
|
" p_y.loc[:, 'P_Y'] = p_y['P_Y'] / n_interactions\n",
|
|
" p_y = dict(zip(p_y['item_id'], p_y['P_Y']))\n",
|
|
"\n",
|
|
" # Get the series of all items\n",
|
|
"\n",
|
|
" # items = list(range(n_items))\n",
|
|
" items = interactions_df['item_id'].unique()\n",
|
|
"\n",
|
|
" # For every X calculate the E[Y|X]\n",
|
|
"\n",
|
|
" e_xy = np.zeros(shape=(n_items, n_items))\n",
|
|
" e_xy[:][:] = -1e100\n",
|
|
"\n",
|
|
" p_y_powers = {}\n",
|
|
" for y in items:\n",
|
|
" p_y_powers[y] = np.array([p_y[y]**k for k in range(1, max_interactions + 1)])\n",
|
|
"\n",
|
|
" for x in items:\n",
|
|
" # Get users who bought X\n",
|
|
" c_x = interactions_df.loc[interactions_df['item_id'] == x]['user_id'].unique()\n",
|
|
"\n",
|
|
" # Get users who bought only X\n",
|
|
" c_only_x = interactions_df.loc[interactions_df['item_id'] != x]['user_id'].unique()\n",
|
|
" c_only_x = list(set(c_x.tolist()) - set(c_only_x.tolist()))\n",
|
|
"\n",
|
|
" # Calculate the number of non-X interactions for each user who bought X\n",
|
|
" # Include users with zero non-X interactions\n",
|
|
" n_non_x_interactions = interactions_df.loc[interactions_df['item_id'] != x, ['user_id', 'item_id']]\n",
|
|
" n_non_x_interactions = n_non_x_interactions.groupby(\"user_id\").count()\n",
|
|
" # Unnecessary, but added for readability\n",
|
|
" n_non_x_interactions = n_non_x_interactions.rename(columns={'item_id': 'n_items'})\n",
|
|
"\n",
|
|
" zero_non_x_interactions = pd.DataFrame([[0]]*len(c_only_x), columns=[\"n_items\"], index=c_only_x) # Remove\n",
|
|
" n_non_x_interactions = pd.concat([n_non_x_interactions, zero_non_x_interactions])\n",
|
|
"\n",
|
|
" n_non_x_interactions = n_non_x_interactions.loc[c_x.tolist()]\n",
|
|
"\n",
|
|
" # Calculate the expected numbers of Y products bought by clients who bought X\n",
|
|
" alpha_k = np.array([np.sum([(-1)**(k + 1) * scisp.binom(abs_c, k)\n",
|
|
" for abs_c in n_non_x_interactions[\"n_items\"]])\n",
|
|
" for k in range(1, max_interactions + 1)])\n",
|
|
"\n",
|
|
" for y in items: # Optimize to use only those Y's which have at least one client who bought both X and Y\n",
|
|
" if y != x:\n",
|
|
" e_xy[x][y] = np.sum(alpha_k * p_y_powers[y])\n",
|
|
" else:\n",
|
|
" e_xy[x][y] = n_users * p_y[x]\n",
|
|
"\n",
|
|
" self.e_xy = e_xy\n",
|
|
"\n",
|
|
" # Calculate the number of users who bought both X and Y\n",
|
|
"\n",
|
|
" # Simple and slow method (commented out)\n",
|
|
"\n",
|
|
" # n_xy = np.zeros(shape=(n_items, n_items))\n",
|
|
"\n",
|
|
" # for x in items:\n",
|
|
" # for y in items:\n",
|
|
" # users_x = set(interactions_df.loc[interactions_df['item_id'] == x]['user_id'].tolist())\n",
|
|
" # users_y = set(interactions_df.loc[interactions_df['item_id'] == y]['user_id'].tolist())\n",
|
|
" # users_x_and_y = users_x & users_y\n",
|
|
" # n_xy[x][y] = len(users_x_and_y)\n",
|
|
"\n",
|
|
" # Optimized method (can be further optimized by using sparse matrices)\n",
|
|
"\n",
|
|
" # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)\n",
|
|
" r = np.zeros(shape=(n_users, n_items))\n",
|
|
" for idx, interaction in interactions_df.iterrows():\n",
|
|
" r[int(interaction['user_id'])][int(interaction['item_id'])] = 1\n",
|
|
"\n",
|
|
" # Get the number of users who bought both X and Y\n",
|
|
"\n",
|
|
" n_xy = np.matmul(r.T, r)\n",
|
|
"\n",
|
|
" self.n_xy = n_xy\n",
|
|
" \n",
|
|
" # Calculate the scores\n",
|
|
"\n",
|
|
" self.scores = np.divide(n_xy - e_xy, np.sqrt(e_xy), out=np.zeros_like(n_xy), where=e_xy != 0)\n",
|
|
" \n",
|
|
" # Find the most popular items for the cold start problem\n",
|
|
" \n",
|
|
" offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(by='item_id').count()\n",
|
|
" offers_count = offers_count.sort_values('user_id', ascending=False)\n",
|
|
" self.most_popular_items = offers_count.index\n",
|
|
"\n",
|
|
" def recommend(self, users_df, items_df, n_recommendations=1):\n",
|
|
" \"\"\"\n",
|
|
" Serving of recommendations. Scores items in items_df for each user in users_df and returns\n",
|
|
" top n_recommendations for each user.\n",
|
|
"\n",
|
|
" :param pd.DataFrame users_df: DataFrame with users and their features for which\n",
|
|
" recommendations should be generated.\n",
|
|
" :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.\n",
|
|
" :param int n_recommendations: Number of recommendations to be returned for each user.\n",
|
|
" :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations\n",
|
|
" for each user.\n",
|
|
" :rtype: pd.DataFrame\n",
|
|
" \"\"\"\n",
|
|
"\n",
|
|
" # Clean previous recommendations (iloc could be used alternatively)\n",
|
|
" self.recommender_df = self.recommender_df[:0]\n",
|
|
" \n",
|
|
" # Handle users not in the training data\n",
|
|
"\n",
|
|
" # Map item ids\n",
|
|
" \n",
|
|
" items_df = items_df.copy()\n",
|
|
" items_df.replace({'item_id': self.user_id_mapping}, inplace=True)\n",
|
|
"\n",
|
|
" # Generate recommendations\n",
|
|
"\n",
|
|
" for idx, user in users_df.iterrows():\n",
|
|
" recommendations = []\n",
|
|
" \n",
|
|
" user_id = user['user_id']\n",
|
|
" \n",
|
|
" if user_id in self.user_id_mapping:\n",
|
|
" mapped_user_id = self.user_id_mapping[user_id]\n",
|
|
" \n",
|
|
" x_list = self.interactions_df.loc[self.interactions_df['user_id'] == mapped_user_id]['item_id'].tolist()\n",
|
|
" final_scores = np.sum(self.scores[x_list], axis=0)\n",
|
|
"\n",
|
|
" # Choose n recommendations based on highest scores\n",
|
|
" if not self.should_recommend_already_bought:\n",
|
|
" final_scores[x_list] = -1e100\n",
|
|
"\n",
|
|
" chosen_ids = np.argsort(-final_scores)[:n_recommendations]\n",
|
|
"\n",
|
|
" for item_id in chosen_ids:\n",
|
|
" recommendations.append(\n",
|
|
" {\n",
|
|
" 'user_id': self.user_id_reverse_mapping[mapped_user_id],\n",
|
|
" 'item_id': self.item_id_reverse_mapping[item_id],\n",
|
|
" 'score': final_scores[item_id]\n",
|
|
" }\n",
|
|
" )\n",
|
|
" else: # For new users recommend most popular items\n",
|
|
" for i in range(n_recommendations):\n",
|
|
" recommendations.append(\n",
|
|
" {\n",
|
|
" 'user_id': user['user_id'],\n",
|
|
" 'item_id': self.item_id_reverse_mapping[self.most_popular_items[i]],\n",
|
|
" 'score': 1.0\n",
|
|
" }\n",
|
|
" )\n",
|
|
"\n",
|
|
" user_recommendations = pd.DataFrame(recommendations)\n",
|
|
"\n",
|
|
" self.recommender_df = pd.concat([self.recommender_df, user_recommendations])\n",
|
|
"\n",
|
|
" return self.recommender_df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"id": "nonprofit-roads",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Recommendations\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>user_id</th>\n",
|
|
" <th>item_id</th>\n",
|
|
" <th>score</th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>genres</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>44761</td>\n",
|
|
" <td>6.122653</td>\n",
|
|
" <td>Brick (2005)</td>\n",
|
|
" <td>Crime|Drama|Film-Noir|Mystery</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>5214</td>\n",
|
|
" <td>5.908858</td>\n",
|
|
" <td>Oh, God! (1977)</td>\n",
|
|
" <td>Comedy|Fantasy</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>6755</td>\n",
|
|
" <td>5.830667</td>\n",
|
|
" <td>Bubba Ho-tep (2002)</td>\n",
|
|
" <td>Comedy|Horror</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3040</td>\n",
|
|
" <td>5.569308</td>\n",
|
|
" <td>Meatballs (1979)</td>\n",
|
|
" <td>Comedy</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>6713</td>\n",
|
|
" <td>5.502504</td>\n",
|
|
" <td>Millennium Actress (Sennen joyû) (2001)</td>\n",
|
|
" <td>Animation|Drama|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3614</td>\n",
|
|
" <td>5.387478</td>\n",
|
|
" <td>Honeymoon in Vegas (1992)</td>\n",
|
|
" <td>Comedy|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2275</td>\n",
|
|
" <td>5.225652</td>\n",
|
|
" <td>Six-String Samurai (1998)</td>\n",
|
|
" <td>Action|Adventure|Sci-Fi</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>4796</td>\n",
|
|
" <td>5.144470</td>\n",
|
|
" <td>Grass Is Greener, The (1960)</td>\n",
|
|
" <td>Comedy|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>4896</td>\n",
|
|
" <td>4.796473</td>\n",
|
|
" <td>Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)</td>\n",
|
|
" <td>Adventure|Children|Fantasy</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>9</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3714</td>\n",
|
|
" <td>4.608516</td>\n",
|
|
" <td>Clara's Heart (1988)</td>\n",
|
|
" <td>Drama</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>10</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>3614</td>\n",
|
|
" <td>7.825335</td>\n",
|
|
" <td>Honeymoon in Vegas (1992)</td>\n",
|
|
" <td>Comedy|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>11</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>6713</td>\n",
|
|
" <td>7.407051</td>\n",
|
|
" <td>Millennium Actress (Sennen joyû) (2001)</td>\n",
|
|
" <td>Animation|Drama|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>12</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>2690</td>\n",
|
|
" <td>6.599105</td>\n",
|
|
" <td>Ideal Husband, An (1999)</td>\n",
|
|
" <td>Comedy|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>13</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>44761</td>\n",
|
|
" <td>6.205835</td>\n",
|
|
" <td>Brick (2005)</td>\n",
|
|
" <td>Crime|Drama|Film-Noir|Mystery</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>14</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>3628</td>\n",
|
|
" <td>6.186298</td>\n",
|
|
" <td>Flying Tigers (1942)</td>\n",
|
|
" <td>Action|Drama|Romance|War</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>15</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>6755</td>\n",
|
|
" <td>5.977848</td>\n",
|
|
" <td>Bubba Ho-tep (2002)</td>\n",
|
|
" <td>Comedy|Horror</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>16</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>959</td>\n",
|
|
" <td>5.919668</td>\n",
|
|
" <td>Of Human Bondage (1934)</td>\n",
|
|
" <td>Drama</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>17</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>31260</td>\n",
|
|
" <td>5.919668</td>\n",
|
|
" <td>Boys Town (1938)</td>\n",
|
|
" <td>Drama</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>18</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>6033</td>\n",
|
|
" <td>5.919668</td>\n",
|
|
" <td>Mystery Date (1991)</td>\n",
|
|
" <td>Comedy</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>19</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>3714</td>\n",
|
|
" <td>5.919668</td>\n",
|
|
" <td>Clara's Heart (1988)</td>\n",
|
|
" <td>Drama</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>20</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>3614</td>\n",
|
|
" <td>11.392962</td>\n",
|
|
" <td>Honeymoon in Vegas (1992)</td>\n",
|
|
" <td>Comedy|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>21</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>31921</td>\n",
|
|
" <td>8.329693</td>\n",
|
|
" <td>Seven-Per-Cent Solution, The (1976)</td>\n",
|
|
" <td>Adventure|Comedy|Crime|Drama|Mystery|Thriller</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>22</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>8.236954</td>\n",
|
|
" <td>Hard Rain (1998)</td>\n",
|
|
" <td>Action|Crime|Thriller</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>23</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>95147</td>\n",
|
|
" <td>8.006113</td>\n",
|
|
" <td>Dragon Ball: Sleeping Princess in Devil's Castle (Doragon bôru: Majinjô no nemuri hime) (1987)</td>\n",
|
|
" <td>Action|Adventure|Animation|Children</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>24</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>2275</td>\n",
|
|
" <td>6.941940</td>\n",
|
|
" <td>Six-String Samurai (1998)</td>\n",
|
|
" <td>Action|Adventure|Sci-Fi</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>3479</td>\n",
|
|
" <td>6.771276</td>\n",
|
|
" <td>Ladyhawke (1985)</td>\n",
|
|
" <td>Adventure|Fantasy|Romance</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>26</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>6755</td>\n",
|
|
" <td>6.520369</td>\n",
|
|
" <td>Bubba Ho-tep (2002)</td>\n",
|
|
" <td>Comedy|Horror</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>27</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>6537</td>\n",
|
|
" <td>6.454421</td>\n",
|
|
" <td>Terminator 3: Rise of the Machines (2003)</td>\n",
|
|
" <td>Action|Adventure|Sci-Fi</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>28</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>4483</td>\n",
|
|
" <td>6.339894</td>\n",
|
|
" <td>Caddyshack II (1988)</td>\n",
|
|
" <td>Comedy</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>29</th>\n",
|
|
" <td>6</td>\n",
|
|
" <td>228</td>\n",
|
|
" <td>6.174734</td>\n",
|
|
" <td>Destiny Turns on the Radio (1995)</td>\n",
|
|
" <td>Comedy</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Quick test of the recommender\n",
|
|
"\n",
|
|
"amazon_recommender = AmazonRecommender()\n",
|
|
"amazon_recommender.fit(ml_ratings_df, None, ml_movies_df)\n",
|
|
"recommendations = amazon_recommender.recommend(pd.DataFrame([[1], [4], [6]], columns=['user_id']), ml_movies_df, 10)\n",
|
|
"\n",
|
|
"recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')\n",
|
|
"print(\"Recommendations\")\n",
|
|
"display(HTML(recommendations.to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "framed-negative",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Training-test split evaluation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 55,
|
|
"id": "romantic-music",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Recommender</th>\n",
|
|
" <th>HR@1</th>\n",
|
|
" <th>HR@3</th>\n",
|
|
" <th>HR@5</th>\n",
|
|
" <th>HR@10</th>\n",
|
|
" <th>NDCG@1</th>\n",
|
|
" <th>NDCG@3</th>\n",
|
|
" <th>NDCG@5</th>\n",
|
|
" <th>NDCG@10</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>AmazonRecommender</td>\n",
|
|
" <td>0.181818</td>\n",
|
|
" <td>0.311688</td>\n",
|
|
" <td>0.402597</td>\n",
|
|
" <td>0.551948</td>\n",
|
|
" <td>0.181818</td>\n",
|
|
" <td>0.257806</td>\n",
|
|
" <td>0.294682</td>\n",
|
|
" <td>0.34147</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from evaluation_and_testing.testing import evaluate_train_test_split_implicit\n",
|
|
"\n",
|
|
"amazon_recommender = AmazonRecommender()\n",
|
|
"\n",
|
|
"amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(\n",
|
|
" amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n",
|
|
"\n",
|
|
"amazon_tts_results = pd.DataFrame(\n",
|
|
" amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
|
|
"\n",
|
|
"display(HTML(amazon_tts_results.to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 57,
|
|
"id": "saving-harrison",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Recommender</th>\n",
|
|
" <th>HR@1</th>\n",
|
|
" <th>HR@3</th>\n",
|
|
" <th>HR@5</th>\n",
|
|
" <th>HR@10</th>\n",
|
|
" <th>NDCG@1</th>\n",
|
|
" <th>NDCG@3</th>\n",
|
|
" <th>NDCG@5</th>\n",
|
|
" <th>NDCG@10</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>TFIDFRecommender</td>\n",
|
|
" <td>0.025974</td>\n",
|
|
" <td>0.090909</td>\n",
|
|
" <td>0.136364</td>\n",
|
|
" <td>0.318182</td>\n",
|
|
" <td>0.025974</td>\n",
|
|
" <td>0.064393</td>\n",
|
|
" <td>0.083685</td>\n",
|
|
" <td>0.140799</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from recommenders.tfidf_recommender import TFIDFRecommender\n",
|
|
"\n",
|
|
"tfidf_recommender = TFIDFRecommender()\n",
|
|
"\n",
|
|
"tfidf_tts_results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(\n",
|
|
" tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]\n",
|
|
"\n",
|
|
"tfidf_tts_results = pd.DataFrame(\n",
|
|
" tfidf_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
|
|
"\n",
|
|
"display(HTML(tfidf_tts_results.to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 59,
|
|
"id": "random-source",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Recommender</th>\n",
|
|
" <th>HR@1</th>\n",
|
|
" <th>HR@3</th>\n",
|
|
" <th>HR@5</th>\n",
|
|
" <th>HR@10</th>\n",
|
|
" <th>NDCG@1</th>\n",
|
|
" <th>NDCG@3</th>\n",
|
|
" <th>NDCG@5</th>\n",
|
|
" <th>NDCG@10</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>AmazonRecommender</td>\n",
|
|
" <td>0.181818</td>\n",
|
|
" <td>0.311688</td>\n",
|
|
" <td>0.402597</td>\n",
|
|
" <td>0.551948</td>\n",
|
|
" <td>0.181818</td>\n",
|
|
" <td>0.257806</td>\n",
|
|
" <td>0.294682</td>\n",
|
|
" <td>0.341470</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>TFIDFRecommender</td>\n",
|
|
" <td>0.025974</td>\n",
|
|
" <td>0.090909</td>\n",
|
|
" <td>0.136364</td>\n",
|
|
" <td>0.318182</td>\n",
|
|
" <td>0.025974</td>\n",
|
|
" <td>0.064393</td>\n",
|
|
" <td>0.083685</td>\n",
|
|
" <td>0.140799</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"tts_results = pd.concat([amazon_tts_results, tfidf_tts_results]).reset_index(drop=True)\n",
|
|
"display(HTML(tts_results.to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "continued-harassment",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Leave-one-out evaluation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 62,
|
|
"id": "prerequisite-lounge",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Recommender</th>\n",
|
|
" <th>HR@1</th>\n",
|
|
" <th>HR@3</th>\n",
|
|
" <th>HR@5</th>\n",
|
|
" <th>HR@10</th>\n",
|
|
" <th>NDCG@1</th>\n",
|
|
" <th>NDCG@3</th>\n",
|
|
" <th>NDCG@5</th>\n",
|
|
" <th>NDCG@10</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>AmazonRecommender</td>\n",
|
|
" <td>0.166667</td>\n",
|
|
" <td>0.256667</td>\n",
|
|
" <td>0.32</td>\n",
|
|
" <td>0.426667</td>\n",
|
|
" <td>0.166667</td>\n",
|
|
" <td>0.219086</td>\n",
|
|
" <td>0.245486</td>\n",
|
|
" <td>0.279978</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from evaluation_and_testing.testing import evaluate_leave_one_out_implicit\n",
|
|
"\n",
|
|
"amazon_recommender = AmazonRecommender()\n",
|
|
"\n",
|
|
"amazon_loo_results = [['AmazonRecommender'] + list(evaluate_leave_one_out_implicit(\n",
|
|
" amazon_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n",
|
|
"\n",
|
|
"amazon_loo_results = pd.DataFrame(\n",
|
|
" amazon_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
|
|
"\n",
|
|
"display(HTML(amazon_loo_results.to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 60,
|
|
"id": "behind-cambodia",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Recommender</th>\n",
|
|
" <th>HR@1</th>\n",
|
|
" <th>HR@3</th>\n",
|
|
" <th>HR@5</th>\n",
|
|
" <th>HR@10</th>\n",
|
|
" <th>NDCG@1</th>\n",
|
|
" <th>NDCG@3</th>\n",
|
|
" <th>NDCG@5</th>\n",
|
|
" <th>NDCG@10</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>TFIDFRecommender</td>\n",
|
|
" <td>0.006667</td>\n",
|
|
" <td>0.053333</td>\n",
|
|
" <td>0.123333</td>\n",
|
|
" <td>0.233333</td>\n",
|
|
" <td>0.006667</td>\n",
|
|
" <td>0.033491</td>\n",
|
|
" <td>0.062178</td>\n",
|
|
" <td>0.096151</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"tfidf_recommender = TFIDFRecommender()\n",
|
|
"\n",
|
|
"tfidf_loo_results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(\n",
|
|
" tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]\n",
|
|
"\n",
|
|
"tfidf_loo_results = pd.DataFrame(\n",
|
|
" tfidf_loo_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])\n",
|
|
"\n",
|
|
"display(HTML(tfidf_loo_results.to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 63,
|
|
"id": "lightweight-password",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Recommender</th>\n",
|
|
" <th>HR@1</th>\n",
|
|
" <th>HR@3</th>\n",
|
|
" <th>HR@5</th>\n",
|
|
" <th>HR@10</th>\n",
|
|
" <th>NDCG@1</th>\n",
|
|
" <th>NDCG@3</th>\n",
|
|
" <th>NDCG@5</th>\n",
|
|
" <th>NDCG@10</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>AmazonRecommender</td>\n",
|
|
" <td>0.166667</td>\n",
|
|
" <td>0.256667</td>\n",
|
|
" <td>0.320000</td>\n",
|
|
" <td>0.426667</td>\n",
|
|
" <td>0.166667</td>\n",
|
|
" <td>0.219086</td>\n",
|
|
" <td>0.245486</td>\n",
|
|
" <td>0.279978</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>TFIDFRecommender</td>\n",
|
|
" <td>0.006667</td>\n",
|
|
" <td>0.053333</td>\n",
|
|
" <td>0.123333</td>\n",
|
|
" <td>0.233333</td>\n",
|
|
" <td>0.006667</td>\n",
|
|
" <td>0.033491</td>\n",
|
|
" <td>0.062178</td>\n",
|
|
" <td>0.096151</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"loo_results = pd.concat([amazon_loo_results, tfidf_loo_results]).reset_index(drop=True)\n",
|
|
"display(HTML(loo_results.to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "mediterranean-residence",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.8"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|