rekomendacja_filmow/movies_data.ipynb
2024-12-07 03:00:29 +01:00

883 lines
30 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.637007Z",
"start_time": "2024-12-07T01:55:39.633398Z"
}
},
"cell_type": "code",
"source": [
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.metrics.pairwise import cosine_similarity"
],
"id": "5a823fabad3c186f",
"outputs": [],
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.646149Z",
"start_time": "2024-12-07T01:55:39.641016Z"
}
},
"cell_type": "code",
"source": [
"def split_by_user(data, test_size=0.2, random_state=42):\n",
"    \"\"\"Split a ratings frame into train/test parts so that no user appears in both.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : pandas.DataFrame\n",
"        Frame with a 'userId' column; all rows of a user end up on the same side.\n",
"    test_size : float, default 0.2\n",
"        Fraction of the unique users (not of the rows) held out for testing.\n",
"    random_state : int or None, default 42\n",
"        Seed for the user draw; the same seed always selects the same users.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple of pandas.DataFrame\n",
"        (train_data, test_data), both keeping the original row index.\n",
"    \"\"\"\n",
"    unique_users = data['userId'].unique()\n",
"    n_test_users = int(len(unique_users) * test_size)\n",
"\n",
"    # A private RandomState produces the same draw as np.random.seed(random_state)\n",
"    # followed by np.random.choice, but leaves NumPy's global RNG state untouched.\n",
"    rng = np.random.RandomState(random_state)\n",
"    test_users = rng.choice(unique_users, size=n_test_users, replace=False)\n",
"\n",
"    in_test = data['userId'].isin(test_users)\n",
"    return data[~in_test], data[in_test]"
],
"id": "d694dbe819b591ac",
"outputs": [],
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.715567Z",
"start_time": "2024-12-07T01:55:39.660120Z"
}
},
"cell_type": "code",
"source": [
"# Read the ratings and movies tables from the local 'ml-latest-small' folder\n",
"# (paths are relative to the notebook's working directory).\n",
"ratings = pd.read_csv('ml-latest-small/ratings.csv')\n",
"movies = pd.read_csv('ml-latest-small/movies.csv')"
],
"id": "1bc78bafbae06c89",
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.729648Z",
"start_time": "2024-12-07T01:55:39.717570Z"
}
},
"cell_type": "code",
"source": [
"# Turn the pipe-separated genre string of every movie into a list of genre names.\n",
"# Only string values are split, so re-running this cell leaves already-split lists\n",
"# untouched (a second .str.split('|') on the lists would yield NaN instead).\n",
"movies['genres'] = movies['genres'].map(\n",
"    lambda genres: genres.split('|') if isinstance(genres, str) else genres\n",
")"
],
"id": "2543aa8216425342",
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.758043Z",
"start_time": "2024-12-07T01:55:39.730651Z"
}
},
"cell_type": "code",
"source": [
"# Attach the movie columns to every rating (inner join on the shared movieId key).\n",
"data = ratings.merge(movies, on=\"movieId\")"
],
"id": "b3fa37255dccb066",
"outputs": [],
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.775205Z",
"start_time": "2024-12-07T01:55:39.759045Z"
}
},
"cell_type": "code",
"source": "# Preview the merged ratings/movies frame (pandas shows only the first and last rows).\ndata",
"id": "c9542abeaa0c59d7",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp title \\\n",
"0 1 1 4.0 964982703 Toy Story (1995) \n",
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
"2 1 6 4.0 964982224 Heat (1995) \n",
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
"... ... ... ... ... ... \n",
"100831 610 166534 4.0 1493848402 Split (2017) \n",
"100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
"100833 610 168250 5.0 1494273047 Get Out (2017) \n",
"100834 610 168252 5.0 1493846352 Logan (2017) \n",
"100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
"\n",
" genres \n",
"0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
"1 [Comedy, Romance] \n",
"2 [Action, Crime, Thriller] \n",
"3 [Mystery, Thriller] \n",
"4 [Crime, Mystery, Thriller] \n",
"... ... \n",
"100831 [Drama, Horror, Thriller] \n",
"100832 [Action, Crime, Thriller] \n",
"100833 [Horror] \n",
"100834 [Action, Sci-Fi] \n",
"100835 [Action, Crime, Drama, Thriller] \n",
"\n",
"[100836 rows x 6 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>[Comedy, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" <td>Heat (1995)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>[Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>[Crime, Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100831</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" <td>1493848402</td>\n",
" <td>Split (2017)</td>\n",
" <td>[Drama, Horror, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100832</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" <td>1493850091</td>\n",
" <td>John Wick: Chapter Two (2017)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100833</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" <td>1494273047</td>\n",
" <td>Get Out (2017)</td>\n",
" <td>[Horror]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100834</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" <td>1493846352</td>\n",
" <td>Logan (2017)</td>\n",
" <td>[Action, Sci-Fi]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100835</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" <td>1493846415</td>\n",
" <td>The Fate of the Furious (2017)</td>\n",
" <td>[Action, Crime, Drama, Thriller]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100836 rows × 6 columns</p>\n",
"</div>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.789305Z",
"start_time": "2024-12-07T01:55:39.776211Z"
}
},
"cell_type": "code",
"source": [
"# Hold out 20% of the users (seed 42) so that no user appears in both parts.\n",
"train_data, test_data = split_by_user(data, test_size=0.2, random_state=42)"
],
"id": "b66149513c0c6b0e",
"outputs": [],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.800344Z",
"start_time": "2024-12-07T01:55:39.790308Z"
}
},
"cell_type": "code",
"source": "# Preview the training part of the split.\ntrain_data",
"id": "2cf1266ad52d70b0",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp title \\\n",
"0 1 1 4.0 964982703 Toy Story (1995) \n",
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
"2 1 6 4.0 964982224 Heat (1995) \n",
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
"... ... ... ... ... ... \n",
"100831 610 166534 4.0 1493848402 Split (2017) \n",
"100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
"100833 610 168250 5.0 1494273047 Get Out (2017) \n",
"100834 610 168252 5.0 1493846352 Logan (2017) \n",
"100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
"\n",
" genres \n",
"0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
"1 [Comedy, Romance] \n",
"2 [Action, Crime, Thriller] \n",
"3 [Mystery, Thriller] \n",
"4 [Crime, Mystery, Thriller] \n",
"... ... \n",
"100831 [Drama, Horror, Thriller] \n",
"100832 [Action, Crime, Thriller] \n",
"100833 [Horror] \n",
"100834 [Action, Sci-Fi] \n",
"100835 [Action, Crime, Drama, Thriller] \n",
"\n",
"[73177 rows x 6 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>[Comedy, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" <td>Heat (1995)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>[Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>[Crime, Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100831</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" <td>1493848402</td>\n",
" <td>Split (2017)</td>\n",
" <td>[Drama, Horror, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100832</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" <td>1493850091</td>\n",
" <td>John Wick: Chapter Two (2017)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100833</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" <td>1494273047</td>\n",
" <td>Get Out (2017)</td>\n",
" <td>[Horror]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100834</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" <td>1493846352</td>\n",
" <td>Logan (2017)</td>\n",
" <td>[Action, Sci-Fi]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100835</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" <td>1493846415</td>\n",
" <td>The Fate of the Furious (2017)</td>\n",
" <td>[Action, Crime, Drama, Thriller]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>73177 rows × 6 columns</p>\n",
"</div>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 9
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:55:39.813332Z",
"start_time": "2024-12-07T01:55:39.803336Z"
}
},
"cell_type": "code",
"source": "# Preview the held-out (test) part of the split.\ntest_data",
"id": "93c87a58f9f4d704",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp \\\n",
"261 3 31 0.5 1306463578 \n",
"262 3 527 0.5 1306464275 \n",
"263 3 647 0.5 1306463619 \n",
"264 3 688 0.5 1306464228 \n",
"265 3 720 0.5 1306463595 \n",
"... ... ... ... ... \n",
"99529 609 892 3.0 847221080 \n",
"99530 609 1056 3.0 847221080 \n",
"99531 609 1059 3.0 847221054 \n",
"99532 609 1150 4.0 847221054 \n",
"99533 609 1161 4.0 847221080 \n",
"\n",
" title \\\n",
"261 Dangerous Minds (1995) \n",
"262 Schindler's List (1993) \n",
"263 Courage Under Fire (1996) \n",
"264 Operation Dumbo Drop (1995) \n",
"265 Wallace & Gromit: The Best of Aardman Animatio... \n",
"... ... \n",
"99529 Twelfth Night (1996) \n",
"99530 Jude (1996) \n",
"99531 William Shakespeare's Romeo + Juliet (1996) \n",
"99532 Return of Martin Guerre, The (Retour de Martin... \n",
"99533 Tin Drum, The (Blechtrommel, Die) (1979) \n",
"\n",
" genres \n",
"261 [Drama] \n",
"262 [Drama, War] \n",
"263 [Action, Crime, Drama, War] \n",
"264 [Action, Adventure, Comedy, War] \n",
"265 [Adventure, Animation, Comedy] \n",
"... ... \n",
"99529 [Comedy, Drama, Romance] \n",
"99530 [Drama] \n",
"99531 [Drama, Romance] \n",
"99532 [Drama] \n",
"99533 [Drama, War] \n",
"\n",
"[27659 rows x 6 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>261</th>\n",
" <td>3</td>\n",
" <td>31</td>\n",
" <td>0.5</td>\n",
" <td>1306463578</td>\n",
" <td>Dangerous Minds (1995)</td>\n",
" <td>[Drama]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>262</th>\n",
" <td>3</td>\n",
" <td>527</td>\n",
" <td>0.5</td>\n",
" <td>1306464275</td>\n",
" <td>Schindler's List (1993)</td>\n",
" <td>[Drama, War]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>263</th>\n",
" <td>3</td>\n",
" <td>647</td>\n",
" <td>0.5</td>\n",
" <td>1306463619</td>\n",
" <td>Courage Under Fire (1996)</td>\n",
" <td>[Action, Crime, Drama, War]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>3</td>\n",
" <td>688</td>\n",
" <td>0.5</td>\n",
" <td>1306464228</td>\n",
" <td>Operation Dumbo Drop (1995)</td>\n",
" <td>[Action, Adventure, Comedy, War]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>3</td>\n",
" <td>720</td>\n",
" <td>0.5</td>\n",
" <td>1306463595</td>\n",
" <td>Wallace &amp; Gromit: The Best of Aardman Animatio...</td>\n",
" <td>[Adventure, Animation, Comedy]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99529</th>\n",
" <td>609</td>\n",
" <td>892</td>\n",
" <td>3.0</td>\n",
" <td>847221080</td>\n",
" <td>Twelfth Night (1996)</td>\n",
" <td>[Comedy, Drama, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99530</th>\n",
" <td>609</td>\n",
" <td>1056</td>\n",
" <td>3.0</td>\n",
" <td>847221080</td>\n",
" <td>Jude (1996)</td>\n",
" <td>[Drama]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99531</th>\n",
" <td>609</td>\n",
" <td>1059</td>\n",
" <td>3.0</td>\n",
" <td>847221054</td>\n",
" <td>William Shakespeare's Romeo + Juliet (1996)</td>\n",
" <td>[Drama, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99532</th>\n",
" <td>609</td>\n",
" <td>1150</td>\n",
" <td>4.0</td>\n",
" <td>847221054</td>\n",
" <td>Return of Martin Guerre, The (Retour de Martin...</td>\n",
" <td>[Drama]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99533</th>\n",
" <td>609</td>\n",
" <td>1161</td>\n",
" <td>4.0</td>\n",
" <td>847221080</td>\n",
" <td>Tin Drum, The (Blechtrommel, Die) (1979)</td>\n",
" <td>[Drama, War]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>27659 rows × 6 columns</p>\n",
"</div>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 10
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:56:24.904397Z",
"start_time": "2024-12-07T01:55:39.833745Z"
}
},
"cell_type": "code",
"source": [
"# Feature 'genreMatch': mean of the user's per-genre average ratings, taken over\n",
"# the genres of the rated movie.\n",
"# NOTE(review): the per-genre averages come from the same split they are attached\n",
"# to, so every row's own rating feeds into its genreMatch - confirm this is intended.\n",
"\n",
"\n",
"def build_user_genre_rating(expanded_data):\n",
"    \"\"\"Return the average rating per (userId, genre).\n",
"\n",
"    `expanded_data` is a ratings frame already exploded to one genre per row; the\n",
"    result has the columns 'userId', 'genres' and 'avg_genre_rating'.\n",
"    \"\"\"\n",
"    return (\n",
"        expanded_data.groupby(['userId', 'genres'])['rating']\n",
"        .mean()\n",
"        .reset_index()\n",
"        .rename(columns={'rating': 'avg_genre_rating'})\n",
"    )\n",
"\n",
"\n",
"def make_genre_match_calc(user_genre_rating):\n",
"    \"\"\"Build a row -> genreMatch function for one split.\n",
"\n",
"    The averages are copied into a (userId, genre) -> value dictionary once, so a\n",
"    row is scored with a few lookups instead of filtering the whole\n",
"    `user_genre_rating` frame on every call. Scores are the same up to\n",
"    floating-point rounding.\n",
"    \"\"\"\n",
"    avg_by_user_genre = user_genre_rating.set_index(['userId', 'genres'])['avg_genre_rating'].to_dict()\n",
"\n",
"    def genre_match_calc(row):\n",
"        \"\"\"Mean per-genre average of row['userId'] over row['genres'] (0 if none is known).\"\"\"\n",
"        user_id = row['userId']\n",
"        genres = row['genres']\n",
"\n",
"        if isinstance(genres, str):\n",
"            genres = [genres]\n",
"\n",
"        # Every genre counts once; sorting keeps the summation order deterministic.\n",
"        scores = [\n",
"            avg_by_user_genre[(user_id, genre)]\n",
"            for genre in sorted(set(genres))\n",
"            if (user_id, genre) in avg_by_user_genre\n",
"        ]\n",
"        return float(np.mean(scores)) if scores else 0\n",
"\n",
"    return genre_match_calc\n",
"\n",
"\n",
"# Work on copies so that adding columns does not write into slices of `data`.\n",
"train_data = train_data.copy()\n",
"test_data = test_data.copy()\n",
"\n",
"expanded_train_data = train_data.explode('genres')\n",
"train_user_genre_rating = build_user_genre_rating(expanded_train_data)\n",
"train_user_preferences = train_user_genre_rating.pivot(index='userId', columns='genres', values='avg_genre_rating').fillna(0)\n",
"train_genre_match_calc = make_genre_match_calc(train_user_genre_rating)\n",
"train_data['genreMatch'] = train_data.apply(train_genre_match_calc, axis=1)\n",
"\n",
"expanded_test_data = test_data.explode('genres')\n",
"test_user_genre_rating = build_user_genre_rating(expanded_test_data)\n",
"test_user_preferences = test_user_genre_rating.pivot(index='userId', columns='genres', values='avg_genre_rating').fillna(0)\n",
"test_genre_match_calc = make_genre_match_calc(test_user_genre_rating)\n",
"test_data['genreMatch'] = test_data.apply(test_genre_match_calc, axis=1)"
],
"id": "88296c8c47cdbf60",
"outputs": [],
"execution_count": 11
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:56:54.150918Z",
"start_time": "2024-12-07T01:56:24.905400Z"
}
},
"cell_type": "code",
"source": [
"# Feature 'similarUsers': mean rating that the most similar users gave to the same\n",
"# movie. Similarity is the cosine between users' rating vectors, where movies a\n",
"# user has not rated count as 0.\n",
"\n",
"\n",
"def make_similar_users_average(user_movie, user_similarity_df):\n",
"    \"\"\"Build an `average(user_id, movie_id, top_n=5)` function for one split.\n",
"\n",
"    The returned function gives the mean rating that the `top_n` users most\n",
"    similar to `user_id` gave to `movie_id`, or NaN when none of them rated it.\n",
"    Neighbour lists are cached per (user_id, top_n), so the similarity column of\n",
"    a user is sorted once instead of once per rated movie.\n",
"    \"\"\"\n",
"    neighbours_cache = {}\n",
"\n",
"    def nearest_users(user_id, top_n):\n",
"        key = (user_id, top_n)\n",
"        if key not in neighbours_cache:\n",
"            # Remove the user explicitly instead of skipping position 0 of the\n",
"            # sorted list: with tied similarities the user need not come first.\n",
"            others = user_similarity_df[user_id].drop(labels=user_id)\n",
"            neighbours_cache[key] = others.sort_values(ascending=False).index[:top_n]\n",
"        return neighbours_cache[key]\n",
"\n",
"    def average(user_id, movie_id, top_n=5):\n",
"        similar_ratings = user_movie.loc[nearest_users(user_id, top_n), movie_id]\n",
"        return similar_ratings.dropna().mean()\n",
"\n",
"    return average\n",
"\n",
"\n",
"train_user_movie = train_data.pivot(index='userId', columns='movieId', values='rating')\n",
"train_user_movie_filled = train_user_movie.fillna(0)\n",
"train_user_similarity = cosine_similarity(train_user_movie_filled)\n",
"train_user_similarity_df = pd.DataFrame(train_user_similarity, index=train_user_movie.index, columns=train_user_movie.index)\n",
"train_average = make_similar_users_average(train_user_movie, train_user_similarity_df)\n",
"\n",
"train_data['similarUsers'] = train_data.apply(\n",
"    lambda row: train_average(row['userId'], row['movieId']), axis=1\n",
")\n",
"\n",
"test_user_movie = test_data.pivot(index='userId', columns='movieId', values='rating')\n",
"test_user_movie_filled = test_user_movie.fillna(0)\n",
"test_user_similarity = cosine_similarity(test_user_movie_filled)\n",
"test_user_similarity_df = pd.DataFrame(test_user_similarity, index=test_user_movie.index, columns=test_user_movie.index)\n",
"test_average = make_similar_users_average(test_user_movie, test_user_similarity_df)\n",
"\n",
"test_data['similarUsers'] = test_data.apply(\n",
"    lambda row: test_average(row['userId'], row['movieId']), axis=1\n",
")\n"
],
"id": "e931f4041a1802fb",
"outputs": [],
"execution_count": 12
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:56:54.194857Z",
"start_time": "2024-12-07T01:56:54.151920Z"
}
},
"cell_type": "code",
"source": [
"# Join the (userId, movieId, similarUsers) triples back onto each split.\n",
"# NOTE(review): both sides of each merge already carry 'similarUsers', so pandas\n",
"# suffixes the overlap and the result holds 'similarUsers_x' and 'similarUsers_y'\n",
"# instead of a single 'similarUsers' column - confirm downstream code expects that.\n",
"train_ratings = train_data[['userId', 'movieId', 'similarUsers']]\n",
"train_data = train_data.merge(train_ratings, how='left', on=['userId', 'movieId'])\n",
"\n",
"test_ratings = test_data[['userId', 'movieId', 'similarUsers']]\n",
"test_data = test_data.merge(test_ratings, how='left', on=['userId', 'movieId'])"
],
"id": "ce65ce417e7f5207",
"outputs": [],
"execution_count": 13
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:56:54.224008Z",
"start_time": "2024-12-07T01:56:54.195860Z"
}
},
"cell_type": "code",
"source": [
"# Popularity = number of rating rows a movie has within the same split.\n",
"train_popularity = train_data.groupby('movieId').size().rename('popularity').reset_index()\n",
"train_data = train_data.merge(train_popularity, how='left', on='movieId')\n",
"\n",
"test_popularity = test_data.groupby('movieId').size().rename('popularity').reset_index()\n",
"test_data = test_data.merge(test_popularity, how='left', on='movieId')"
],
"id": "aa798201db531188",
"outputs": [],
"execution_count": 14
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:56:54.819973Z",
"start_time": "2024-12-07T01:56:54.225011Z"
}
},
"cell_type": "code",
"source": [
"# DataFrame.to_csv does not create missing folders, so make sure the target exists.\n",
"Path('datasets').mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Persist both feature tables; index=False keeps the row index out of the files.\n",
"train_data.to_csv('datasets/train_all.csv', index=False)\n",
"test_data.to_csv('datasets/test_all.csv', index=False)"
],
"id": "f7b5130c72ad35af",
"outputs": [],
"execution_count": 15
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:56:54.846938Z",
"start_time": "2024-12-07T01:56:54.820977Z"
}
},
"cell_type": "code",
"source": [
"def get_top_movies(data, top_n=5):\n",
"    \"\"\"Return each user's `top_n` best-rated movies.\n",
"\n",
"    Rows are ordered by userId (ascending), then rating and popularity (both\n",
"    descending), and the first `top_n` rows of every user are kept.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : pandas.DataFrame\n",
"        Frame with 'userId', 'rating' and 'popularity' columns; it is not modified.\n",
"    top_n : int, default 5\n",
"        Maximum number of rows kept per user.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        The selected rows in sorted order, with their original index labels.\n",
"    \"\"\"\n",
"    ranked = data.sort_values(\n",
"        by=['userId', 'rating', 'popularity'],\n",
"        ascending=[True, False, False],\n",
"    )\n",
"    return ranked.groupby('userId').head(top_n)\n",
"\n",
"# Build the per-user top lists for both splits.\n",
"train_top, test_top = (get_top_movies(split) for split in (train_data, test_data))"
],
"id": "20dba13e7a3d105b",
"outputs": [],
"execution_count": 16
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T01:56:54.873768Z",
"start_time": "2024-12-07T01:56:54.847941Z"
}
},
"cell_type": "code",
"source": [
"# Save the per-user top lists as CSV files without the row index.\n",
"test_top.to_csv('datasets/test_top.csv', index=False)\n",
"train_top.to_csv('datasets/train_top.csv', index=False)"
],
"id": "be9f6106c5e4b04a",
"outputs": [],
"execution_count": 17
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}