{
 "cells": [
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T22:43:58.610507Z",
     "start_time": "2024-12-09T22:43:58.602149Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import numpy as np"
   ],
   "id": "5a823fabad3c186f",
   "outputs": [],
   "execution_count": 32
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T22:43:58.621153Z",
     "start_time": "2024-12-09T22:43:58.612510Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def split_by_user(data, test_size=0.2, random_state=42):\n",
    "    \"\"\"Split a ratings frame into train/test parts so that no user appears in both.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Frame with a 'userId' column.\n",
    "    test_size : float, default 0.2\n",
    "        Fraction of the unique users held out for testing. The user count is\n",
    "        truncated with int(), so a small user set may yield no test users.\n",
    "    random_state : int or None, default 42\n",
    "        Seed for the user sample; the same seed selects the same users.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (train_data, test_data) : tuple of pandas.DataFrame\n",
    "        Independent copies of the rows of the non-test and the test users.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If test_size is outside [0, 1].\n",
    "    \"\"\"\n",
    "    if not 0 <= test_size <= 1:\n",
    "        raise ValueError(f'test_size must be within [0, 1], got {test_size!r}')\n",
    "    unique_users = data['userId'].unique()\n",
    "    # A private RandomState draws the same users as np.random.seed() followed by\n",
    "    # np.random.choice() would, but leaves the global NumPy RNG untouched.\n",
    "    rng = np.random.RandomState(random_state)\n",
    "    test_users = rng.choice(unique_users, size=int(len(unique_users) * test_size), replace=False)\n",
    "    in_test = data['userId'].isin(test_users)\n",
    "    # Explicit copies, so columns can be added to either part later without\n",
    "    # pandas raising a SettingWithCopyWarning.\n",
    "    train_data = data[~in_test].copy()\n",
    "    test_data = data[in_test].copy()\n",
    "    return train_data, test_data"
   ],
   "id": "d694dbe819b591ac",
   "outputs": [],
   "execution_count": 33
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T22:43:58.689968Z",
     "start_time": "2024-12-09T22:43:58.622148Z"
    }
   },
   "cell_type": "code",
   "source": [
    "movies = pd.read_csv('ml-latest-small/movies.csv')\n",
    "ratings = pd.read_csv('ml-latest-small/ratings.csv')"
   ],
   "id": "1bc78bafbae06c89",
   "outputs": [],
   "execution_count": 34
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T22:43:58.707803Z",
     "start_time": "2024-12-09T22:43:58.690972Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Split only values that are still raw '|'-delimited strings, so re-running this\n",
    "# cell keeps the genre lists instead of replacing them with NaN.\n",
    "movies['genres'] = movies['genres'].map(lambda g: g.split('|') if isinstance(g, str) else g)"
   ],
   "id": "2543aa8216425342",
   "outputs": [],
   "execution_count": 35
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T22:43:58.742861Z",
     "start_time": "2024-12-09T22:43:58.709808Z"
    }
   },
   "cell_type": "code",
   "source": "data = pd.merge(ratings, movies, on=\"movieId\")",
   "id": "b3fa37255dccb066",
   "outputs": [],
   "execution_count": 36
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T22:43:58.759240Z",
     "start_time": "2024-12-09T22:43:58.743879Z"
    }
   },
   "cell_type":
"code", "source": "data", "id": "c9542abeaa0c59d7", "outputs": [ { "data": { "text/plain": [ " userId movieId rating timestamp title \\\n", "0 1 1 4.0 964982703 Toy Story (1995) \n", "1 1 3 4.0 964981247 Grumpier Old Men (1995) \n", "2 1 6 4.0 964982224 Heat (1995) \n", "3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n", "4 1 50 5.0 964982931 Usual Suspects, The (1995) \n", "... ... ... ... ... ... \n", "100831 610 166534 4.0 1493848402 Split (2017) \n", "100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n", "100833 610 168250 5.0 1494273047 Get Out (2017) \n", "100834 610 168252 5.0 1493846352 Logan (2017) \n", "100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n", "\n", " genres \n", "0 [Adventure, Animation, Children, Comedy, Fantasy] \n", "1 [Comedy, Romance] \n", "2 [Action, Crime, Thriller] \n", "3 [Mystery, Thriller] \n", "4 [Crime, Mystery, Thriller] \n", "... ... \n", "100831 [Drama, Horror, Thriller] \n", "100832 [Action, Crime, Thriller] \n", "100833 [Horror] \n", "100834 [Action, Sci-Fi] \n", "100835 [Action, Crime, Drama, Thriller] \n", "\n", "[100836 rows x 6 columns]" ], "text/html": [ "
\n", " | userId | \n", "movieId | \n", "rating | \n", "timestamp | \n", "title | \n", "genres | \n", "
---|---|---|---|---|---|---|
0 | \n", "1 | \n", "1 | \n", "4.0 | \n", "964982703 | \n", "Toy Story (1995) | \n", "[Adventure, Animation, Children, Comedy, Fantasy] | \n", "
1 | \n", "1 | \n", "3 | \n", "4.0 | \n", "964981247 | \n", "Grumpier Old Men (1995) | \n", "[Comedy, Romance] | \n", "
2 | \n", "1 | \n", "6 | \n", "4.0 | \n", "964982224 | \n", "Heat (1995) | \n", "[Action, Crime, Thriller] | \n", "
3 | \n", "1 | \n", "47 | \n", "5.0 | \n", "964983815 | \n", "Seven (a.k.a. Se7en) (1995) | \n", "[Mystery, Thriller] | \n", "
4 | \n", "1 | \n", "50 | \n", "5.0 | \n", "964982931 | \n", "Usual Suspects, The (1995) | \n", "[Crime, Mystery, Thriller] | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
100831 | \n", "610 | \n", "166534 | \n", "4.0 | \n", "1493848402 | \n", "Split (2017) | \n", "[Drama, Horror, Thriller] | \n", "
100832 | \n", "610 | \n", "168248 | \n", "5.0 | \n", "1493850091 | \n", "John Wick: Chapter Two (2017) | \n", "[Action, Crime, Thriller] | \n", "
100833 | \n", "610 | \n", "168250 | \n", "5.0 | \n", "1494273047 | \n", "Get Out (2017) | \n", "[Horror] | \n", "
100834 | \n", "610 | \n", "168252 | \n", "5.0 | \n", "1493846352 | \n", "Logan (2017) | \n", "[Action, Sci-Fi] | \n", "
100835 | \n", "610 | \n", "170875 | \n", "3.0 | \n", "1493846415 | \n", "The Fate of the Furious (2017) | \n", "[Action, Crime, Drama, Thriller] | \n", "
100836 rows × 6 columns
\n", "\n", " | userId | \n", "movieId | \n", "rating | \n", "timestamp | \n", "title | \n", "genres | \n", "genreMatch | \n", "similarUsers_x | \n", "similarUsers_y | \n", "popularity | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "1 | \n", "4.0 | \n", "964982703 | \n", "Toy Story (1995) | \n", "[Adventure, Animation, Children, Comedy, Fantasy] | \n", "4.44 | \n", "4.50 | \n", "4.50 | \n", "33.81 | \n", "
1 | \n", "1 | \n", "3 | \n", "4.0 | \n", "964981247 | \n", "Grumpier Old Men (1995) | \n", "[Comedy, Romance] | \n", "4.29 | \n", "4.00 | \n", "4.00 | \n", "8.81 | \n", "
2 | \n", "1 | \n", "6 | \n", "4.0 | \n", "964982224 | \n", "Heat (1995) | \n", "[Action, Crime, Thriller] | \n", "4.27 | \n", "3.00 | \n", "3.00 | \n", "16.19 | \n", "
3 | \n", "1 | \n", "47 | \n", "5.0 | \n", "964983815 | \n", "Seven (a.k.a. Se7en) (1995) | \n", "[Mystery, Thriller] | \n", "4.16 | \n", "3.88 | \n", "3.88 | \n", "33.20 | \n", "
4 | \n", "1 | \n", "50 | \n", "5.0 | \n", "964982931 | \n", "Usual Suspects, The (1995) | \n", "[Crime, Mystery, Thriller] | \n", "4.22 | \n", "4.75 | \n", "4.75 | \n", "32.38 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
73172 | \n", "610 | \n", "166534 | \n", "4.0 | \n", "1493848402 | \n", "Split (2017) | \n", "[Drama, Horror, Thriller] | \n", "3.65 | \n", "NaN | \n", "NaN | \n", "0.82 | \n", "
73173 | \n", "610 | \n", "168248 | \n", "5.0 | \n", "1493850091 | \n", "John Wick: Chapter Two (2017) | \n", "[Action, Crime, Thriller] | \n", "3.66 | \n", "5.00 | \n", "5.00 | \n", "1.02 | \n", "
73174 | \n", "610 | \n", "168250 | \n", "5.0 | \n", "1494273047 | \n", "Get Out (2017) | \n", "[Horror] | \n", "3.51 | \n", "NaN | \n", "NaN | \n", "2.66 | \n", "
73175 | \n", "610 | \n", "168252 | \n", "5.0 | \n", "1493846352 | \n", "Logan (2017) | \n", "[Action, Sci-Fi] | \n", "3.63 | \n", "4.50 | \n", "4.50 | \n", "4.30 | \n", "
73176 | \n", "610 | \n", "170875 | \n", "3.0 | \n", "1493846415 | \n", "The Fate of the Furious (2017) | \n", "[Action, Crime, Drama, Thriller] | \n", "3.71 | \n", "NaN | \n", "NaN | \n", "0.41 | \n", "
73177 rows × 10 columns
\n", "\n", " | userId | \n", "movieId | \n", "rating | \n", "timestamp | \n", "title | \n", "genres | \n", "genreMatch | \n", "similarUsers_x | \n", "similarUsers_y | \n", "popularity | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "3 | \n", "31 | \n", "0.5 | \n", "1306463578 | \n", "Dangerous Minds (1995) | \n", "[Drama] | \n", "0.75 | \n", "2.00 | \n", "2.00 | \n", "5.74 | \n", "
1 | \n", "3 | \n", "527 | \n", "0.5 | \n", "1306464275 | \n", "Schindler's List (1993) | \n", "[Drama, War] | \n", "0.62 | \n", "3.67 | \n", "3.67 | \n", "31.97 | \n", "
2 | \n", "3 | \n", "647 | \n", "0.5 | \n", "1306463619 | \n", "Courage Under Fire (1996) | \n", "[Action, Crime, Drama, War] | \n", "1.33 | \n", "3.00 | \n", "3.00 | \n", "5.74 | \n", "
3 | \n", "3 | \n", "688 | \n", "0.5 | \n", "1306464228 | \n", "Operation Dumbo Drop (1995) | \n", "[Action, Adventure, Comedy, War] | \n", "1.95 | \n", "1.50 | \n", "1.50 | \n", "3.28 | \n", "
4 | \n", "3 | \n", "720 | \n", "0.5 | \n", "1306463595 | \n", "Wallace & Gromit: The Best of Aardman Animatio... | \n", "[Adventure, Animation, Comedy] | \n", "1.41 | \n", "4.50 | \n", "4.50 | \n", "5.74 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
27654 | \n", "609 | \n", "892 | \n", "3.0 | \n", "847221080 | \n", "Twelfth Night (1996) | \n", "[Comedy, Drama, Romance] | \n", "3.28 | \n", "NaN | \n", "NaN | \n", "4.10 | \n", "
27655 | \n", "609 | \n", "1056 | \n", "3.0 | \n", "847221080 | \n", "Jude (1996) | \n", "[Drama] | \n", "3.37 | \n", "NaN | \n", "NaN | \n", "1.64 | \n", "
27656 | \n", "609 | \n", "1059 | \n", "3.0 | \n", "847221054 | \n", "William Shakespeare's Romeo + Juliet (1996) | \n", "[Drama, Romance] | \n", "3.28 | \n", "NaN | \n", "NaN | \n", "7.38 | \n", "
27657 | \n", "609 | \n", "1150 | \n", "4.0 | \n", "847221054 | \n", "Return of Martin Guerre, The (Retour de Martin... | \n", "[Drama] | \n", "3.37 | \n", "NaN | \n", "NaN | \n", "0.82 | \n", "
27658 | \n", "609 | \n", "1161 | \n", "4.0 | \n", "847221080 | \n", "Tin Drum, The (Blechtrommel, Die) (1979) | \n", "[Drama, War] | \n", "3.43 | \n", "NaN | \n", "NaN | \n", "0.82 | \n", "
27659 rows × 10 columns
\n", "