{ "cells": [
 { "metadata": { "ExecuteTime": { "end_time": "2024-12-09T22:43:58.610507Z", "start_time": "2024-12-09T22:43:58.602149Z" } }, "cell_type": "code", "source": [ "import pandas as pd\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import numpy as np" ], "id": "5a823fabad3c186f", "outputs": [], "execution_count": 32 },
 { "metadata": { "ExecuteTime": { "end_time": "2024-12-09T22:43:58.621153Z", "start_time": "2024-12-09T22:43:58.612510Z" } }, "cell_type": "code", "source": [
  "def split_by_user(data, test_size=0.2, random_state=42):\n",
  "    \"\"\"Split rows into train and test sets so that no user appears in both.\n",
  "\n",
  "    A random subset of the distinct ``userId`` values is held out. Every row\n",
  "    of a held-out user goes to the test set; all other rows go to the\n",
  "    training set.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    data : pandas.DataFrame\n",
  "        Frame containing a ``userId`` column.\n",
  "    test_size : float, default 0.2\n",
  "        Fraction of distinct users to hold out, within [0, 1]. The number of\n",
  "        held-out users is truncated toward zero, so a small input can\n",
  "        produce an empty test set.\n",
  "    random_state : int, default 42\n",
  "        Seed used to pick the held-out users.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    tuple of (pandas.DataFrame, pandas.DataFrame)\n",
  "        ``(train_data, test_data)``, each an independent copy of its rows.\n",
  "\n",
  "    Raises\n",
  "    ------\n",
  "    ValueError\n",
  "        If ``test_size`` is outside [0, 1].\n",
  "    \"\"\"\n",
  "    if not 0 <= test_size <= 1:\n",
  "        raise ValueError(f'test_size must be between 0 and 1, got {test_size!r}')\n",
  "    unique_users = data['userId'].unique()\n",
  "    n_test_users = int(len(unique_users) * test_size)\n",
  "    # A private RandomState seeded with random_state draws the same users as\n",
  "    # np.random.seed + np.random.choice, but leaves NumPy's global generator\n",
  "    # untouched for the rest of the notebook.\n",
  "    rng = np.random.RandomState(random_state)\n",
  "    test_users = rng.choice(unique_users, size=n_test_users, replace=False)\n",
  "    # Build the membership mask once and reuse it for both halves.\n",
  "    in_test = data['userId'].isin(test_users)\n",
  "    # Return copies so columns can be added to either half without pandas'\n",
  "    # copy-vs-view warning.\n",
  "    train_data = data[~in_test].copy()\n",
  "    test_data = data[in_test].copy()\n",
  "    return train_data, test_data"
 ], "id": "d694dbe819b591ac", "outputs": [], "execution_count": 33 },
 { "metadata": { "ExecuteTime": { "end_time": "2024-12-09T22:43:58.689968Z", "start_time": "2024-12-09T22:43:58.622148Z" } }, "cell_type": "code", "source": [ "movies = pd.read_csv('ml-latest-small/movies.csv')\n", "ratings = pd.read_csv('ml-latest-small/ratings.csv')" ], "id": "1bc78bafbae06c89", "outputs": [], "execution_count": 34 },
 { "metadata": { "ExecuteTime": { "end_time": "2024-12-09T22:43:58.707803Z", "start_time": "2024-12-09T22:43:58.690972Z" } }, "cell_type": "code", "source": "movies['genres'] = movies['genres'].str.split('|')", "id": "2543aa8216425342", "outputs": [], "execution_count": 35 },
 { "metadata": { "ExecuteTime": { "end_time": "2024-12-09T22:43:58.742861Z", "start_time": "2024-12-09T22:43:58.709808Z" } }, "cell_type": "code", "source": "data = pd.merge(ratings, movies, on=\"movieId\")", "id": "b3fa37255dccb066", "outputs": [], "execution_count": 36 },
 { "metadata": { "ExecuteTime": { "end_time": "2024-12-09T22:43:58.759240Z", "start_time": "2024-12-09T22:43:58.743879Z" } }, "cell_type": 
"code", "source": "data", "id": "c9542abeaa0c59d7", "outputs": [ { "data": { "text/plain": [ " userId movieId rating timestamp title \\\n", "0 1 1 4.0 964982703 Toy Story (1995) \n", "1 1 3 4.0 964981247 Grumpier Old Men (1995) \n", "2 1 6 4.0 964982224 Heat (1995) \n", "3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n", "4 1 50 5.0 964982931 Usual Suspects, The (1995) \n", "... ... ... ... ... ... \n", "100831 610 166534 4.0 1493848402 Split (2017) \n", "100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n", "100833 610 168250 5.0 1494273047 Get Out (2017) \n", "100834 610 168252 5.0 1493846352 Logan (2017) \n", "100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n", "\n", " genres \n", "0 [Adventure, Animation, Children, Comedy, Fantasy] \n", "1 [Comedy, Romance] \n", "2 [Action, Crime, Thriller] \n", "3 [Mystery, Thriller] \n", "4 [Crime, Mystery, Thriller] \n", "... ... \n", "100831 [Drama, Horror, Thriller] \n", "100832 [Action, Crime, Thriller] \n", "100833 [Horror] \n", "100834 [Action, Sci-Fi] \n", "100835 [Action, Crime, Drama, Thriller] \n", "\n", "[100836 rows x 6 columns]" ], "text/html": [ "
\n", " | userId | \n", "movieId | \n", "rating | \n", "timestamp | \n", "title | \n", "genres | \n", "
---|---|---|---|---|---|---|
0 | \n", "1 | \n", "1 | \n", "4.0 | \n", "964982703 | \n", "Toy Story (1995) | \n", "[Adventure, Animation, Children, Comedy, Fantasy] | \n", "
1 | \n", "1 | \n", "3 | \n", "4.0 | \n", "964981247 | \n", "Grumpier Old Men (1995) | \n", "[Comedy, Romance] | \n", "
2 | \n", "1 | \n", "6 | \n", "4.0 | \n", "964982224 | \n", "Heat (1995) | \n", "[Action, Crime, Thriller] | \n", "
3 | \n", "1 | \n", "47 | \n", "5.0 | \n", "964983815 | \n", "Seven (a.k.a. Se7en) (1995) | \n", "[Mystery, Thriller] | \n", "
4 | \n", "1 | \n", "50 | \n", "5.0 | \n", "964982931 | \n", "Usual Suspects, The (1995) | \n", "[Crime, Mystery, Thriller] | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
100831 | \n", "610 | \n", "166534 | \n", "4.0 | \n", "1493848402 | \n", "Split (2017) | \n", "[Drama, Horror, Thriller] | \n", "
100832 | \n", "610 | \n", "168248 | \n", "5.0 | \n", "1493850091 | \n", "John Wick: Chapter Two (2017) | \n", "[Action, Crime, Thriller] | \n", "
100833 | \n", "610 | \n", "168250 | \n", "5.0 | \n", "1494273047 | \n", "Get Out (2017) | \n", "[Horror] | \n", "
100834 | \n", "610 | \n", "168252 | \n", "5.0 | \n", "1493846352 | \n", "Logan (2017) | \n", "[Action, Sci-Fi] | \n", "
100835 | \n", "610 | \n", "170875 | \n", "3.0 | \n", "1493846415 | \n", "The Fate of the Furious (2017) | \n", "[Action, Crime, Drama, Thriller] | \n", "
100836 rows × 6 columns
\n", "\n", " | userId | \n", "movieId | \n", "rating | \n", "timestamp | \n", "title | \n", "genres | \n", "genreMatch | \n", "similarUsers | \n", "popularity | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "1 | \n", "4.0 | \n", "964982703 | \n", "Toy Story (1995) | \n", "[Adventure, Animation, Children, Comedy, Fantasy] | \n", "4.44 | \n", "4.50 | \n", "33.81 | \n", "
1 | \n", "1 | \n", "3 | \n", "4.0 | \n", "964981247 | \n", "Grumpier Old Men (1995) | \n", "[Comedy, Romance] | \n", "4.29 | \n", "4.00 | \n", "8.81 | \n", "
2 | \n", "1 | \n", "6 | \n", "4.0 | \n", "964982224 | \n", "Heat (1995) | \n", "[Action, Crime, Thriller] | \n", "4.27 | \n", "3.00 | \n", "16.19 | \n", "
3 | \n", "1 | \n", "47 | \n", "5.0 | \n", "964983815 | \n", "Seven (a.k.a. Se7en) (1995) | \n", "[Mystery, Thriller] | \n", "4.16 | \n", "3.88 | \n", "33.20 | \n", "
4 | \n", "1 | \n", "50 | \n", "5.0 | \n", "964982931 | \n", "Usual Suspects, The (1995) | \n", "[Crime, Mystery, Thriller] | \n", "4.22 | \n", "4.75 | \n", "32.38 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
73172 | \n", "610 | \n", "166534 | \n", "4.0 | \n", "1493848402 | \n", "Split (2017) | \n", "[Drama, Horror, Thriller] | \n", "3.65 | \n", "NaN | \n", "0.82 | \n", "
73173 | \n", "610 | \n", "168248 | \n", "5.0 | \n", "1493850091 | \n", "John Wick: Chapter Two (2017) | \n", "[Action, Crime, Thriller] | \n", "3.66 | \n", "5.00 | \n", "1.02 | \n", "
73174 | \n", "610 | \n", "168250 | \n", "5.0 | \n", "1494273047 | \n", "Get Out (2017) | \n", "[Horror] | \n", "3.51 | \n", "NaN | \n", "2.66 | \n", "
73175 | \n", "610 | \n", "168252 | \n", "5.0 | \n", "1493846352 | \n", "Logan (2017) | \n", "[Action, Sci-Fi] | \n", "3.63 | \n", "4.50 | \n", "4.30 | \n", "
73176 | \n", "610 | \n", "170875 | \n", "3.0 | \n", "1493846415 | \n", "The Fate of the Furious (2017) | \n", "[Action, Crime, Drama, Thriller] | \n", "3.71 | \n", "NaN | \n", "0.41 | \n", "
73177 rows × 9 columns
\n", "\n", " | userId | \n", "movieId | \n", "rating | \n", "timestamp | \n", "title | \n", "genres | \n", "genreMatch | \n", "similarUsers | \n", "popularity | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "3 | \n", "31 | \n", "0.5 | \n", "1306463578 | \n", "Dangerous Minds (1995) | \n", "[Drama] | \n", "0.75 | \n", "2.00 | \n", "5.74 | \n", "
1 | \n", "3 | \n", "527 | \n", "0.5 | \n", "1306464275 | \n", "Schindler's List (1993) | \n", "[Drama, War] | \n", "0.62 | \n", "3.67 | \n", "31.97 | \n", "
2 | \n", "3 | \n", "647 | \n", "0.5 | \n", "1306463619 | \n", "Courage Under Fire (1996) | \n", "[Action, Crime, Drama, War] | \n", "1.33 | \n", "3.00 | \n", "5.74 | \n", "
3 | \n", "3 | \n", "688 | \n", "0.5 | \n", "1306464228 | \n", "Operation Dumbo Drop (1995) | \n", "[Action, Adventure, Comedy, War] | \n", "1.95 | \n", "1.50 | \n", "3.28 | \n", "
4 | \n", "3 | \n", "720 | \n", "0.5 | \n", "1306463595 | \n", "Wallace & Gromit: The Best of Aardman Animatio... | \n", "[Adventure, Animation, Comedy] | \n", "1.41 | \n", "4.50 | \n", "5.74 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
27654 | \n", "609 | \n", "892 | \n", "3.0 | \n", "847221080 | \n", "Twelfth Night (1996) | \n", "[Comedy, Drama, Romance] | \n", "3.28 | \n", "NaN | \n", "4.10 | \n", "
27655 | \n", "609 | \n", "1056 | \n", "3.0 | \n", "847221080 | \n", "Jude (1996) | \n", "[Drama] | \n", "3.37 | \n", "NaN | \n", "1.64 | \n", "
27656 | \n", "609 | \n", "1059 | \n", "3.0 | \n", "847221054 | \n", "William Shakespeare's Romeo + Juliet (1996) | \n", "[Drama, Romance] | \n", "3.28 | \n", "NaN | \n", "7.38 | \n", "
27657 | \n", "609 | \n", "1150 | \n", "4.0 | \n", "847221054 | \n", "Return of Martin Guerre, The (Retour de Martin... | \n", "[Drama] | \n", "3.37 | \n", "NaN | \n", "0.82 | \n", "
27658 | \n", "609 | \n", "1161 | \n", "4.0 | \n", "847221080 | \n", "Tin Drum, The (Blechtrommel, Die) (1979) | \n", "[Drama, War] | \n", "3.43 | \n", "NaN | \n", "0.82 | \n", "
27659 rows × 9 columns
\n", "