# %%
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# %%
def split_by_user(data, test_size=0.2, random_state=42):
    """Split a ratings frame into train/test parts so that no user is in both.

    A fraction ``test_size`` of the distinct ``userId`` values is sampled
    without replacement. Every row of a sampled user goes to the test part and
    all remaining rows go to the train part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``userId`` column.
    test_size : float, default 0.2
        Fraction of distinct users assigned to the test part. The user count
        is truncated with ``int()``, so a small input can yield zero test users
        (and therefore an empty test frame).
    random_state : int or None, default 42
        Seed for the user sampling.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` as independent copies that keep the row
        labels of ``data``.
    """
    unique_users = data['userId'].unique()

    # A private RandomState draws exactly the same users as the legacy
    # np.random.seed(...) + np.random.choice(...) pair, but leaves NumPy's
    # global random stream untouched for the rest of the notebook.
    rng = np.random.RandomState(random_state)
    n_test_users = int(len(unique_users) * test_size)
    test_users = rng.choice(unique_users, size=n_test_users, replace=False)

    in_test = data['userId'].isin(test_users)
    # Copies, so that adding columns later never writes into a slice of `data`.
    return data[~in_test].copy(), data[in_test].copy()

# %%
movies_raw = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# %%
# Derive `movies` from `movies_raw` instead of overwriting the column in place:
# re-running this cell then always splits the original '|'-separated strings
# rather than an already-split list column.
movies = movies_raw.assign(genres=movies_raw['genres'].str.split('|'))

# %%
data = pd.merge(ratings, movies, on="movieId")

# %%
data

# %%
train_data, test_data = split_by_user(data)

# Invariants of a by-user split: every row lands in exactly one part and no
# user appears in both parts.
assert len(train_data) + len(test_data) == len(data)
assert set(train_data['userId']).isdisjoint(test_data['userId'])

# %%
train_data

# %%
test_data
# %%
def add_genre_match(frame):
    """Return a copy of ``frame`` with a ``genreMatch`` column.

    For every rating row the value is the mean, over the distinct genres of
    that row's movie, of the user's average rating per genre. The per-genre
    averages are computed from ``frame`` itself. Rows for which no genre
    average exists (for example a missing ``genres`` value) get ``0``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs ``userId``, ``genres`` (a list of genre names, or a single
        string) and ``rating`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with ``genreMatch`` appended; ``frame`` is not modified.
    """
    rows = frame[['userId', 'genres', 'rating']].copy()
    # Positional row id, so the result does not depend on the index labels.
    rows['_row'] = np.arange(len(rows))
    # One line per (rating row, genre).
    exploded = rows.explode('genres').reset_index(drop=True)

    user_genre_rating = (
        exploded.groupby(['userId', 'genres'], as_index=False)['rating']
        .mean()
        .rename(columns={'rating': 'avg_genre_rating'})
    )

    # A single join replaces filtering the averages table once per rating row.
    scored = exploded[['_row', 'userId', 'genres']].merge(
        user_genre_rating, on=['userId', 'genres'], how='left'
    )
    genre_match = (
        scored.drop_duplicates(subset=['_row', 'genres'])  # count each genre once per row
        .groupby('_row')['avg_genre_rating']
        .mean()
        .fillna(0)
    )

    result = frame.copy()
    # `genre_match` is ordered by `_row`, i.e. by row position in `frame`.
    result['genreMatch'] = genre_match.to_numpy()
    return result


def add_similar_users_rating(frame, top_n=5):
    """Return a copy of ``frame`` with a ``similarUsers`` column.

    Users are compared by cosine similarity of their rating vectors, with
    unrated movies counted as 0. For each rating row the value is the mean
    rating that the user's ``top_n`` most similar *other* users gave to the
    same movie, or NaN when none of them rated it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs ``userId``, ``movieId`` and ``rating`` columns, with at most one
        row per (userId, movieId) pair (required by ``pivot``).
    top_n : int, default 5
        Number of nearest neighbours to average over.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with ``similarUsers`` appended; ``frame`` is not modified.
    """
    user_movie = frame.pivot(index='userId', columns='movieId', values='rating')
    similarity = pd.DataFrame(
        cosine_similarity(user_movie.fillna(0)),
        index=user_movie.index,
        columns=user_movie.index,
    )

    # Neighbours are resolved once per user instead of once per rating row.
    neighbour_mean = np.full(user_movie.shape, np.nan)
    for position, user_id in enumerate(user_movie.index):
        # Drop the user explicitly: relying on "the user sorts first" breaks
        # when another user ties at the top similarity.
        neighbours = similarity[user_id].drop(user_id).nlargest(top_n).index
        # Column-wise mean skips NaN, i.e. neighbours who did not rate the movie.
        neighbour_mean[position] = user_movie.loc[neighbours].mean().to_numpy()

    user_pos = user_movie.index.get_indexer(frame['userId'])
    movie_pos = user_movie.columns.get_indexer(frame['movieId'])

    result = frame.copy()
    result['similarUsers'] = neighbour_mean[user_pos, movie_pos]
    return result


def duplicate_similar_users(frame):
    """Join ``frame`` with its own ``similarUsers`` column on (userId, movieId).

    The join replaces ``similarUsers`` with the two columns ``similarUsers_x``
    and ``similarUsers_y`` and gives the result a fresh 0..n-1 index.

    NOTE(review): both columns carry the same values, so this step looks
    redundant. It is kept so the exported CSV columns stay as they were;
    confirm which column downstream code reads before removing it.
    """
    lookup = frame[['userId', 'movieId', 'similarUsers']]
    return pd.merge(frame, lookup, on=['userId', 'movieId'], how='left')


def add_popularity(frame):
    """Return a copy of ``frame`` with ``popularity``: the number of rows in
    ``frame`` that share the row's ``movieId``."""
    return frame.assign(
        popularity=frame.groupby('movieId')['movieId'].transform('size')
    )


def build_features(frame, top_n=5):
    """Run all feature steps on one split and return the resulting new frame.

    Columns are appended in this order: ``genreMatch``, ``similarUsers_x``,
    ``similarUsers_y``, ``popularity``.

    NOTE(review): every feature is computed only from the rows of ``frame``
    itself, and ``genreMatch`` averages include the row's own rating. Confirm
    this is intended for whatever model consumes these files.
    """
    featured = add_genre_match(frame)
    featured = add_similar_users_rating(featured, top_n=top_n)
    featured = duplicate_similar_users(featured)
    return add_popularity(featured)


def get_top_movies(data, top_n=5):
    """Return each user's ``top_n`` rows, ranked by rating and then popularity
    (both descending)."""
    ranked = data.sort_values(
        by=['userId', 'rating', 'popularity'],
        ascending=[True, False, False],
    )
    return ranked.groupby('userId').head(top_n)

# %%
# New names instead of overwriting train_data / test_data: this cell can be
# re-run without stacking features on top of already-featured frames.
train_features = build_features(train_data)
test_features = build_features(test_data)

# %%
import os

# to_csv does not create missing folders; without this a fresh checkout fails
# only after all features have been computed.
os.makedirs('datasets', exist_ok=True)

train_features.to_csv('datasets/train_all.csv', index=False)
test_features.to_csv('datasets/test_all.csv', index=False)

# %%
train_top = get_top_movies(train_features)
test_top = get_top_movies(test_features)

# %%
train_top.to_csv('datasets/train_top.csv', index=False)
test_top.to_csv('datasets/test_top.csv', index=False)