rekomendacja_filmow/movies_data.ipynb

883 lines
30 KiB
Plaintext
Raw Normal View History

2024-12-07 03:00:29 +01:00
{
"cells": [
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.179886Z",
"start_time": "2024-12-07T02:10:47.169245Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np"
],
"id": "5a823fabad3c186f",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 18
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.197156Z",
"start_time": "2024-12-07T02:10:47.181892Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"def split_by_user(data, test_size=0.2, random_state=42):\n",
" unique_users = data['userId'].unique()\n",
" np.random.seed(random_state)\n",
" test_users = np.random.choice(unique_users, size=int(len(unique_users) * test_size), replace=False)\n",
" test_data = data[data['userId'].isin(test_users)]\n",
" train_data = data[~data['userId'].isin(test_users)]\n",
" return train_data, test_data"
],
"id": "d694dbe819b591ac",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 19
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.278189Z",
"start_time": "2024-12-07T02:10:47.199151Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"movies = pd.read_csv('ml-latest-small/movies.csv')\n",
"ratings = pd.read_csv('ml-latest-small/ratings.csv')"
],
"id": "1bc78bafbae06c89",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 20
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.298677Z",
"start_time": "2024-12-07T02:10:47.280192Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": "movies['genres'] = movies['genres'].str.split('|')",
"id": "2543aa8216425342",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 21
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.334238Z",
"start_time": "2024-12-07T02:10:47.299673Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": "data = pd.merge(ratings, movies, on=\"movieId\")",
"id": "b3fa37255dccb066",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 22
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.349030Z",
"start_time": "2024-12-07T02:10:47.336244Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": "data",
"id": "c9542abeaa0c59d7",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp title \\\n",
"0 1 1 4.0 964982703 Toy Story (1995) \n",
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
"2 1 6 4.0 964982224 Heat (1995) \n",
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
"... ... ... ... ... ... \n",
"100831 610 166534 4.0 1493848402 Split (2017) \n",
"100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
"100833 610 168250 5.0 1494273047 Get Out (2017) \n",
"100834 610 168252 5.0 1493846352 Logan (2017) \n",
"100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
"\n",
" genres \n",
"0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
"1 [Comedy, Romance] \n",
"2 [Action, Crime, Thriller] \n",
"3 [Mystery, Thriller] \n",
"4 [Crime, Mystery, Thriller] \n",
"... ... \n",
"100831 [Drama, Horror, Thriller] \n",
"100832 [Action, Crime, Thriller] \n",
"100833 [Horror] \n",
"100834 [Action, Sci-Fi] \n",
"100835 [Action, Crime, Drama, Thriller] \n",
"\n",
"[100836 rows x 6 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>[Comedy, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" <td>Heat (1995)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>[Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>[Crime, Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100831</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" <td>1493848402</td>\n",
" <td>Split (2017)</td>\n",
" <td>[Drama, Horror, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100832</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" <td>1493850091</td>\n",
" <td>John Wick: Chapter Two (2017)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100833</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" <td>1494273047</td>\n",
" <td>Get Out (2017)</td>\n",
" <td>[Horror]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100834</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" <td>1493846352</td>\n",
" <td>Logan (2017)</td>\n",
" <td>[Action, Sci-Fi]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100835</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" <td>1493846415</td>\n",
" <td>The Fate of the Furious (2017)</td>\n",
" <td>[Action, Crime, Drama, Thriller]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100836 rows × 6 columns</p>\n",
"</div>"
]
},
2024-12-07 03:17:45 +01:00
"execution_count": 23,
2024-12-07 03:00:29 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-12-07 03:17:45 +01:00
"execution_count": 23
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.374751Z",
"start_time": "2024-12-07T02:10:47.352036Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": "train_data, test_data = split_by_user(data)",
"id": "b66149513c0c6b0e",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 24
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.387434Z",
"start_time": "2024-12-07T02:10:47.375755Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": "train_data",
"id": "2cf1266ad52d70b0",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp title \\\n",
"0 1 1 4.0 964982703 Toy Story (1995) \n",
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
"2 1 6 4.0 964982224 Heat (1995) \n",
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
"... ... ... ... ... ... \n",
"100831 610 166534 4.0 1493848402 Split (2017) \n",
"100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
"100833 610 168250 5.0 1494273047 Get Out (2017) \n",
"100834 610 168252 5.0 1493846352 Logan (2017) \n",
"100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
"\n",
" genres \n",
"0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
"1 [Comedy, Romance] \n",
"2 [Action, Crime, Thriller] \n",
"3 [Mystery, Thriller] \n",
"4 [Crime, Mystery, Thriller] \n",
"... ... \n",
"100831 [Drama, Horror, Thriller] \n",
"100832 [Action, Crime, Thriller] \n",
"100833 [Horror] \n",
"100834 [Action, Sci-Fi] \n",
"100835 [Action, Crime, Drama, Thriller] \n",
"\n",
"[73177 rows x 6 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>[Comedy, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" <td>Heat (1995)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>[Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>[Crime, Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100831</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" <td>1493848402</td>\n",
" <td>Split (2017)</td>\n",
" <td>[Drama, Horror, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100832</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" <td>1493850091</td>\n",
" <td>John Wick: Chapter Two (2017)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100833</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" <td>1494273047</td>\n",
" <td>Get Out (2017)</td>\n",
" <td>[Horror]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100834</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" <td>1493846352</td>\n",
" <td>Logan (2017)</td>\n",
" <td>[Action, Sci-Fi]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100835</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" <td>1493846415</td>\n",
" <td>The Fate of the Furious (2017)</td>\n",
" <td>[Action, Crime, Drama, Thriller]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>73177 rows × 6 columns</p>\n",
"</div>"
]
},
2024-12-07 03:17:45 +01:00
"execution_count": 25,
2024-12-07 03:00:29 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-12-07 03:17:45 +01:00
"execution_count": 25
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:10:47.402433Z",
"start_time": "2024-12-07T02:10:47.389431Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": "test_data",
"id": "93c87a58f9f4d704",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp \\\n",
"261 3 31 0.5 1306463578 \n",
"262 3 527 0.5 1306464275 \n",
"263 3 647 0.5 1306463619 \n",
"264 3 688 0.5 1306464228 \n",
"265 3 720 0.5 1306463595 \n",
"... ... ... ... ... \n",
"99529 609 892 3.0 847221080 \n",
"99530 609 1056 3.0 847221080 \n",
"99531 609 1059 3.0 847221054 \n",
"99532 609 1150 4.0 847221054 \n",
"99533 609 1161 4.0 847221080 \n",
"\n",
" title \\\n",
"261 Dangerous Minds (1995) \n",
"262 Schindler's List (1993) \n",
"263 Courage Under Fire (1996) \n",
"264 Operation Dumbo Drop (1995) \n",
"265 Wallace & Gromit: The Best of Aardman Animatio... \n",
"... ... \n",
"99529 Twelfth Night (1996) \n",
"99530 Jude (1996) \n",
"99531 William Shakespeare's Romeo + Juliet (1996) \n",
"99532 Return of Martin Guerre, The (Retour de Martin... \n",
"99533 Tin Drum, The (Blechtrommel, Die) (1979) \n",
"\n",
" genres \n",
"261 [Drama] \n",
"262 [Drama, War] \n",
"263 [Action, Crime, Drama, War] \n",
"264 [Action, Adventure, Comedy, War] \n",
"265 [Adventure, Animation, Comedy] \n",
"... ... \n",
"99529 [Comedy, Drama, Romance] \n",
"99530 [Drama] \n",
"99531 [Drama, Romance] \n",
"99532 [Drama] \n",
"99533 [Drama, War] \n",
"\n",
"[27659 rows x 6 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>261</th>\n",
" <td>3</td>\n",
" <td>31</td>\n",
" <td>0.5</td>\n",
" <td>1306463578</td>\n",
" <td>Dangerous Minds (1995)</td>\n",
" <td>[Drama]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>262</th>\n",
" <td>3</td>\n",
" <td>527</td>\n",
" <td>0.5</td>\n",
" <td>1306464275</td>\n",
" <td>Schindler's List (1993)</td>\n",
" <td>[Drama, War]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>263</th>\n",
" <td>3</td>\n",
" <td>647</td>\n",
" <td>0.5</td>\n",
" <td>1306463619</td>\n",
" <td>Courage Under Fire (1996)</td>\n",
" <td>[Action, Crime, Drama, War]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>3</td>\n",
" <td>688</td>\n",
" <td>0.5</td>\n",
" <td>1306464228</td>\n",
" <td>Operation Dumbo Drop (1995)</td>\n",
" <td>[Action, Adventure, Comedy, War]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>3</td>\n",
" <td>720</td>\n",
" <td>0.5</td>\n",
" <td>1306463595</td>\n",
" <td>Wallace &amp; Gromit: The Best of Aardman Animatio...</td>\n",
" <td>[Adventure, Animation, Comedy]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99529</th>\n",
" <td>609</td>\n",
" <td>892</td>\n",
" <td>3.0</td>\n",
" <td>847221080</td>\n",
" <td>Twelfth Night (1996)</td>\n",
" <td>[Comedy, Drama, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99530</th>\n",
" <td>609</td>\n",
" <td>1056</td>\n",
" <td>3.0</td>\n",
" <td>847221080</td>\n",
" <td>Jude (1996)</td>\n",
" <td>[Drama]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99531</th>\n",
" <td>609</td>\n",
" <td>1059</td>\n",
" <td>3.0</td>\n",
" <td>847221054</td>\n",
" <td>William Shakespeare's Romeo + Juliet (1996)</td>\n",
" <td>[Drama, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99532</th>\n",
" <td>609</td>\n",
" <td>1150</td>\n",
" <td>4.0</td>\n",
" <td>847221054</td>\n",
" <td>Return of Martin Guerre, The (Retour de Martin...</td>\n",
" <td>[Drama]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99533</th>\n",
" <td>609</td>\n",
" <td>1161</td>\n",
" <td>4.0</td>\n",
" <td>847221080</td>\n",
" <td>Tin Drum, The (Blechtrommel, Die) (1979)</td>\n",
" <td>[Drama, War]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>27659 rows × 6 columns</p>\n",
"</div>"
]
},
2024-12-07 03:17:45 +01:00
"execution_count": 26,
2024-12-07 03:00:29 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-12-07 03:17:45 +01:00
"execution_count": 26
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:11:35.298333Z",
"start_time": "2024-12-07T02:10:47.404434Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"expanded_train_data = train_data.explode('genres')\n",
"train_data = train_data.copy()\n",
"test_data = test_data.copy()\n",
"\n",
"train_user_genre_rating = (\n",
" expanded_train_data.groupby(['userId', 'genres'])['rating']\n",
" .mean()\n",
" .reset_index()\n",
" .rename(columns={'rating': 'avg_genre_rating'})\n",
")\n",
"\n",
"train_user_preferences = train_user_genre_rating.pivot(index='userId', columns='genres', values='avg_genre_rating').fillna(0)\n",
"\n",
"def train_genre_match_calc(row):\n",
" user_id = row['userId']\n",
" genres = row['genres']\n",
"\n",
" if isinstance(genres, str):\n",
" genres = [genres]\n",
"\n",
" user_pref = train_user_genre_rating[train_user_genre_rating['userId'] == user_id]\n",
" genre_scores = user_pref[user_pref['genres'].isin(genres)]['avg_genre_rating']\n",
"\n",
" if not genre_scores.empty:\n",
2024-12-07 03:17:45 +01:00
" return round(genre_scores.mean(), 2)\n",
2024-12-07 03:00:29 +01:00
" else:\n",
" return 0\n",
"\n",
"train_data['genreMatch'] = train_data.apply(train_genre_match_calc, axis=1)\n",
"\n",
"\n",
"expanded_test_data = test_data.explode('genres')\n",
"\n",
"test_user_genre_rating = (\n",
" expanded_test_data.groupby(['userId', 'genres'])['rating']\n",
" .mean()\n",
" .reset_index()\n",
" .rename(columns={'rating': 'avg_genre_rating'})\n",
")\n",
"\n",
"test_user_preferences = test_user_genre_rating.pivot(index='userId', columns='genres', values='avg_genre_rating').fillna(0)\n",
"\n",
"def test_genre_match_calc(row):\n",
" user_id = row['userId']\n",
" genres = row['genres']\n",
"\n",
" if isinstance(genres, str):\n",
" genres = [genres]\n",
"\n",
" user_pref = test_user_genre_rating[test_user_genre_rating['userId'] == user_id]\n",
" genre_scores = user_pref[user_pref['genres'].isin(genres)]['avg_genre_rating']\n",
"\n",
" if not genre_scores.empty:\n",
2024-12-07 03:17:45 +01:00
" return round(genre_scores.mean(), 2)\n",
2024-12-07 03:00:29 +01:00
" else:\n",
" return 0\n",
"\n",
"test_data['genreMatch'] = test_data.apply(test_genre_match_calc, axis=1)"
],
"id": "88296c8c47cdbf60",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 27
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:12:06.387910Z",
"start_time": "2024-12-07T02:11:35.300336Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"train_user_movie = train_data.pivot(index='userId', columns='movieId', values='rating')\n",
"train_user_movie_filled = train_user_movie.fillna(0)\n",
"train_user_similarity = cosine_similarity(train_user_movie_filled)\n",
"train_user_similarity_df = pd.DataFrame(train_user_similarity, index=train_user_movie.index, columns=train_user_movie.index)\n",
"\n",
"def train_average(user_id, movie_id, top_n=5):\n",
" similar_users = train_user_similarity_df[user_id].sort_values(ascending=False).index[1:top_n + 1]\n",
" similar_ratings = train_user_movie.loc[similar_users, movie_id]\n",
"\n",
2024-12-07 03:17:45 +01:00
" return round(similar_ratings.dropna().mean(), 2)\n",
2024-12-07 03:00:29 +01:00
"\n",
"train_data['similarUsers'] = train_data.apply(\n",
" lambda row: train_average(row['userId'], row['movieId']), axis=1\n",
")\n",
"\n",
"test_user_movie = test_data.pivot(index='userId', columns='movieId', values='rating')\n",
"test_user_movie_filled = test_user_movie.fillna(0)\n",
"test_user_similarity = cosine_similarity(test_user_movie_filled)\n",
"test_user_similarity_df = pd.DataFrame(test_user_similarity, index=test_user_movie.index, columns=test_user_movie.index)\n",
"\n",
"def test_average(user_id, movie_id, top_n=5):\n",
" similar_users = test_user_similarity_df[user_id].sort_values(ascending=False).index[1:top_n + 1]\n",
" similar_ratings = test_user_movie.loc[similar_users, movie_id]\n",
"\n",
2024-12-07 03:17:45 +01:00
" return round(similar_ratings.dropna().mean(), 2)\n",
2024-12-07 03:00:29 +01:00
"\n",
"test_data['similarUsers'] = test_data.apply(\n",
" lambda row: test_average(row['userId'], row['movieId']), axis=1\n",
")\n"
],
"id": "e931f4041a1802fb",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 28
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:12:06.432258Z",
"start_time": "2024-12-07T02:12:06.388912Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"train_ratings = train_data[['userId', 'movieId', 'similarUsers']]\n",
"train_data = pd.merge(train_data, train_ratings, on=['userId', 'movieId'], how='left')\n",
"\n",
"test_ratings = test_data[['userId', 'movieId', 'similarUsers']]\n",
"test_data = pd.merge(test_data, test_ratings, on=['userId', 'movieId'], how='left')"
],
"id": "ce65ce417e7f5207",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 29
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:12:06.459220Z",
"start_time": "2024-12-07T02:12:06.433260Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"train_popularity = train_data.groupby('movieId').size().reset_index(name='popularity')\n",
"train_data = pd.merge(train_data, train_popularity, on='movieId', how='left')\n",
"\n",
"test_popularity = test_data.groupby('movieId').size().reset_index(name='popularity')\n",
"test_data = pd.merge(test_data, test_popularity, on='movieId', how='left')"
],
"id": "aa798201db531188",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 30
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:12:06.988425Z",
"start_time": "2024-12-07T02:12:06.459645Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"train_data.to_csv('datasets/train_all.csv', index=False)\n",
"test_data.to_csv('datasets/test_all.csv', index=False)"
],
"id": "f7b5130c72ad35af",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 31
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:12:07.017641Z",
"start_time": "2024-12-07T02:12:06.989428Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"def get_top_movies(data):\n",
" top = (data.sort_values(by=['userId', 'rating', 'popularity'], ascending=[True, False, False]).groupby('userId').head(5))\n",
" return top\n",
"\n",
"train_top = get_top_movies(train_data)\n",
"test_top = get_top_movies(test_data)"
],
"id": "20dba13e7a3d105b",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 32
2024-12-07 03:00:29 +01:00
},
{
"metadata": {
"ExecuteTime": {
2024-12-07 03:17:45 +01:00
"end_time": "2024-12-07T02:12:07.042554Z",
"start_time": "2024-12-07T02:12:07.018643Z"
2024-12-07 03:00:29 +01:00
}
},
"cell_type": "code",
"source": [
"train_top.to_csv('datasets/train_top.csv', index=False)\n",
"test_top.to_csv('datasets/test_top.csv', index=False)"
],
"id": "be9f6106c5e4b04a",
"outputs": [],
2024-12-07 03:17:45 +01:00
"execution_count": 33
2024-12-07 03:00:29 +01:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}