rekomendacja_filmow/movies_data.ipynb
2024-12-09 23:53:05 +01:00

1012 lines
35 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.610507Z",
"start_time": "2024-12-09T22:43:58.602149Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np"
],
"id": "5a823fabad3c186f",
"outputs": [],
"execution_count": 32
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.621153Z",
"start_time": "2024-12-09T22:43:58.612510Z"
}
},
"cell_type": "code",
"source": [
"def split_by_user(data, test_size=0.2, random_state=42):\n",
"    \"\"\"Split a ratings frame into train and test parts with disjoint users.\n",
"\n",
"    Whole users are held out: every row of a sampled user goes to the test\n",
"    part and all remaining rows go to the train part.\n",
"\n",
"    Args:\n",
"        data: DataFrame with a 'userId' column.\n",
"        test_size: Fraction of the unique users to hold out; the count is\n",
"            truncated with int(), so small inputs may yield zero test users.\n",
"        random_state: Seed for the user sampling.\n",
"\n",
"    Returns:\n",
"        Tuple (train_data, test_data) of row subsets of ``data``.\n",
"    \"\"\"\n",
"    unique_users = data['userId'].unique()\n",
"    n_test_users = int(len(unique_users) * test_size)\n",
"    # A private RandomState draws the same users as np.random.seed(random_state)\n",
"    # followed by np.random.choice, but does not reseed NumPy's global generator\n",
"    # as a side effect of calling this function.\n",
"    rng = np.random.RandomState(random_state)\n",
"    test_users = rng.choice(unique_users, size=n_test_users, replace=False)\n",
"    in_test = data['userId'].isin(test_users)\n",
"    return data[~in_test], data[in_test]"
],
"id": "d694dbe819b591ac",
"outputs": [],
"execution_count": 33
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.689968Z",
"start_time": "2024-12-09T22:43:58.622148Z"
}
},
"cell_type": "code",
"source": [
"# Load the movie catalogue and the user ratings from the local ml-latest-small/ folder.\n",
"# As the merged frame displayed further down shows, ratings.csv supplies userId, movieId,\n",
"# rating and timestamp, while movies.csv supplies movieId, title and genres.\n",
"movies = pd.read_csv('ml-latest-small/movies.csv')\n",
"ratings = pd.read_csv('ml-latest-small/ratings.csv')"
],
"id": "1bc78bafbae06c89",
"outputs": [],
"execution_count": 34
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.707803Z",
"start_time": "2024-12-09T22:43:58.690972Z"
}
},
"cell_type": "code",
"source": "# Replace each movie's '|'-separated genre string with a list of genre names.\nmovies['genres'] = movies['genres'].str.split('|')",
"id": "2543aa8216425342",
"outputs": [],
"execution_count": 35
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.742861Z",
"start_time": "2024-12-09T22:43:58.709808Z"
}
},
"cell_type": "code",
"source": "# Attach title and genres to every rating row by joining on movieId.\ndata = pd.merge(ratings, movies, on=\"movieId\")",
"id": "b3fa37255dccb066",
"outputs": [],
"execution_count": 36
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.759240Z",
"start_time": "2024-12-09T22:43:58.743879Z"
}
},
"cell_type": "code",
"source": "data",
"id": "c9542abeaa0c59d7",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp title \\\n",
"0 1 1 4.0 964982703 Toy Story (1995) \n",
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
"2 1 6 4.0 964982224 Heat (1995) \n",
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
"... ... ... ... ... ... \n",
"100831 610 166534 4.0 1493848402 Split (2017) \n",
"100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
"100833 610 168250 5.0 1494273047 Get Out (2017) \n",
"100834 610 168252 5.0 1493846352 Logan (2017) \n",
"100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
"\n",
" genres \n",
"0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
"1 [Comedy, Romance] \n",
"2 [Action, Crime, Thriller] \n",
"3 [Mystery, Thriller] \n",
"4 [Crime, Mystery, Thriller] \n",
"... ... \n",
"100831 [Drama, Horror, Thriller] \n",
"100832 [Action, Crime, Thriller] \n",
"100833 [Horror] \n",
"100834 [Action, Sci-Fi] \n",
"100835 [Action, Crime, Drama, Thriller] \n",
"\n",
"[100836 rows x 6 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>[Comedy, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" <td>Heat (1995)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>[Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>[Crime, Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100831</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" <td>1493848402</td>\n",
" <td>Split (2017)</td>\n",
" <td>[Drama, Horror, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100832</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" <td>1493850091</td>\n",
" <td>John Wick: Chapter Two (2017)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100833</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" <td>1494273047</td>\n",
" <td>Get Out (2017)</td>\n",
" <td>[Horror]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100834</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" <td>1493846352</td>\n",
" <td>Logan (2017)</td>\n",
" <td>[Action, Sci-Fi]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100835</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" <td>1493846415</td>\n",
" <td>The Fate of the Furious (2017)</td>\n",
" <td>[Action, Crime, Drama, Thriller]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100836 rows × 6 columns</p>\n",
"</div>"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 37
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.779353Z",
"start_time": "2024-12-09T22:43:58.760243Z"
}
},
"cell_type": "code",
"source": "# Hold out whole users for testing, using split_by_user's defaults (test_size=0.2, random_state=42).\ntrain_data, test_data = split_by_user(data)",
"id": "b66149513c0c6b0e",
"outputs": [],
"execution_count": 38
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:44:41.206825Z",
"start_time": "2024-12-09T22:43:58.780355Z"
}
},
"cell_type": "code",
"source": [
"def _mean_rating_per_user_genre(expanded):\n",
"    \"\"\"Average the ratings of a genre-exploded frame per (userId, genres) pair.\"\"\"\n",
"    return (\n",
"        expanded.groupby(['userId', 'genres'])['rating']\n",
"        .mean()\n",
"        .reset_index()\n",
"        .rename(columns={'rating': 'avg_genre_rating'})\n",
"    )\n",
"\n",
"\n",
"def _make_genre_match_calc(user_genre_rating):\n",
"    \"\"\"Build a row-wise genreMatch scorer from a (userId, genres, avg_genre_rating) table.\n",
"\n",
"    The table is grouped per user once, so scoring a row only inspects that\n",
"    user's own genre averages instead of filtering the whole table per row.\n",
"\n",
"    Returns:\n",
"        A function taking a row with 'userId' and 'genres' and returning the mean\n",
"        of the user's average ratings over the row's genres, rounded to two\n",
"        decimals, or 0 when the user has no average for any of those genres.\n",
"    \"\"\"\n",
"    prefs_by_user = {\n",
"        user_id: group.set_index('genres')['avg_genre_rating']\n",
"        for user_id, group in user_genre_rating.groupby('userId')\n",
"    }\n",
"\n",
"    def genre_match_calc(row):\n",
"        genres = row['genres']\n",
"        # A single genre may arrive as a bare string; treat it as a one-item list.\n",
"        if isinstance(genres, str):\n",
"            genres = [genres]\n",
"\n",
"        user_pref = prefs_by_user.get(row['userId'])\n",
"        if user_pref is None:\n",
"            return 0\n",
"\n",
"        genre_scores = user_pref[user_pref.index.isin(genres)]\n",
"        if genre_scores.empty:\n",
"            return 0\n",
"        return round(genre_scores.mean(), 2)\n",
"\n",
"    return genre_match_calc\n",
"\n",
"\n",
"# NOTE(review): each split's averages are computed from that split's own ratings,\n",
"# so a row's genreMatch includes the rating of that very row - confirm this is intended.\n",
"expanded_train_data = train_data.explode('genres')\n",
"train_data = train_data.copy()\n",
"test_data = test_data.copy()\n",
"\n",
"train_user_genre_rating = _mean_rating_per_user_genre(expanded_train_data)\n",
"\n",
"train_user_preferences = train_user_genre_rating.pivot(index='userId', columns='genres', values='avg_genre_rating').fillna(0)\n",
"\n",
"train_genre_match_calc = _make_genre_match_calc(train_user_genre_rating)\n",
"\n",
"train_data['genreMatch'] = train_data.apply(train_genre_match_calc, axis=1)\n",
"\n",
"\n",
"expanded_test_data = test_data.explode('genres')\n",
"\n",
"test_user_genre_rating = _mean_rating_per_user_genre(expanded_test_data)\n",
"\n",
"test_user_preferences = test_user_genre_rating.pivot(index='userId', columns='genres', values='avg_genre_rating').fillna(0)\n",
"\n",
"test_genre_match_calc = _make_genre_match_calc(test_user_genre_rating)\n",
"\n",
"test_data['genreMatch'] = test_data.apply(test_genre_match_calc, axis=1)"
],
"id": "88296c8c47cdbf60",
"outputs": [],
"execution_count": 39
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.267834Z",
"start_time": "2024-12-09T22:44:41.207821Z"
}
},
"cell_type": "code",
"source": [
"def _make_neighbour_average(user_movie, similarity_df):\n",
"    \"\"\"Build a scorer that averages the most similar users' ratings of a movie.\n",
"\n",
"    Args:\n",
"        user_movie: userId x movieId rating matrix with NaN for unrated movies.\n",
"        similarity_df: userId x userId similarity frame over the same users.\n",
"\n",
"    Returns:\n",
"        A function (user_id, movie_id, top_n=5) giving the mean rating of\n",
"        movie_id among the top_n users most similar to user_id, rounded to two\n",
"        decimals, or NaN when none of them rated the movie.\n",
"    \"\"\"\n",
"    neighbours = {}\n",
"\n",
"    def neighbour_average(user_id, movie_id, top_n=5):\n",
"        key = (user_id, top_n)\n",
"        if key not in neighbours:\n",
"            # Rank the neighbours once per user rather than once per rated row. The user\n",
"            # is dropped by label instead of assuming they sort first: a tie or a rounding\n",
"            # error in the self-similarity would otherwise keep the user's own rating in.\n",
"            ranked = similarity_df[user_id].drop(user_id).sort_values(ascending=False)\n",
"            neighbours[key] = ranked.index[:top_n]\n",
"        similar_ratings = user_movie.loc[neighbours[key], movie_id]\n",
"        return round(similar_ratings.dropna().mean(), 2)\n",
"\n",
"    return neighbour_average\n",
"\n",
"\n",
"# NOTE(review): similarities are computed within each split from ratings that include\n",
"# the row being scored - confirm this is the intended feature definition.\n",
"train_user_movie = train_data.pivot(index='userId', columns='movieId', values='rating')\n",
"train_user_movie_filled = train_user_movie.fillna(0)\n",
"train_user_similarity = cosine_similarity(train_user_movie_filled)\n",
"train_user_similarity_df = pd.DataFrame(train_user_similarity, index=train_user_movie.index, columns=train_user_movie.index)\n",
"\n",
"train_average = _make_neighbour_average(train_user_movie, train_user_similarity_df)\n",
"\n",
"train_data['similarUsers'] = train_data.apply(\n",
"    lambda row: train_average(row['userId'], row['movieId']), axis=1\n",
")\n",
"\n",
"test_user_movie = test_data.pivot(index='userId', columns='movieId', values='rating')\n",
"test_user_movie_filled = test_user_movie.fillna(0)\n",
"test_user_similarity = cosine_similarity(test_user_movie_filled)\n",
"test_user_similarity_df = pd.DataFrame(test_user_similarity, index=test_user_movie.index, columns=test_user_movie.index)\n",
"\n",
"test_average = _make_neighbour_average(test_user_movie, test_user_similarity_df)\n",
"\n",
"test_data['similarUsers'] = test_data.apply(\n",
"    lambda row: test_average(row['userId'], row['movieId']), axis=1\n",
")\n"
],
"id": "e931f4041a1802fb",
"outputs": [],
"execution_count": 40
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.311604Z",
"start_time": "2024-12-09T22:45:09.268839Z"
}
},
"cell_type": "code",
"source": [
"# Self-merge: train_ratings is a column subset of train_data, so both sides carry\n",
"# 'similarUsers' and the result holds it twice, as similarUsers_x and similarUsers_y\n",
"# (visible in the frames displayed at the end of the notebook). The result also gets a\n",
"# fresh 0..n-1 row index.\n",
"# NOTE(review): this looks redundant - confirm nothing downstream relies on the suffixed\n",
"# column names before dropping the merge.\n",
"train_ratings = train_data[['userId', 'movieId', 'similarUsers']]\n",
"train_data = pd.merge(train_data, train_ratings, on=['userId', 'movieId'], how='left')\n",
"\n",
"# Same self-merge for the test split.\n",
"test_ratings = test_data[['userId', 'movieId', 'similarUsers']]\n",
"test_data = pd.merge(test_data, test_ratings, on=['userId', 'movieId'], how='left')"
],
"id": "ce65ce417e7f5207",
"outputs": [],
"execution_count": 41
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.342614Z",
"start_time": "2024-12-09T22:45:09.313582Z"
}
},
"cell_type": "code",
"source": [
"def _with_popularity(frame):\n",
"    \"\"\"Append a 'popularity' column: a movie's rating count as a percentage of the frame's users.\n",
"\n",
"    Returns the number of distinct users, the raw per-movie rating counts and\n",
"    the frame with the rounded percentage merged in.\n",
"    \"\"\"\n",
"    n_users = frame['userId'].nunique()\n",
"    counts = frame.groupby('movieId').size().reset_index(name='popularity')\n",
"    merged = pd.merge(frame, counts, on='movieId', how='left')\n",
"    merged['popularity'] = ((merged['popularity'] / n_users) * 100).round(2)\n",
"    return n_users, counts, merged\n",
"\n",
"\n",
"# Each split is scored against its own user count and its own rating counts.\n",
"train_users, train_popularity, train_data = _with_popularity(train_data)\n",
"test_users, test_popularity, test_data = _with_popularity(test_data)"
],
"id": "aa798201db531188",
"outputs": [],
"execution_count": 42
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.915862Z",
"start_time": "2024-12-09T22:45:09.344595Z"
}
},
"cell_type": "code",
"source": [
"# Persist both fully featured splits; index=False keeps the row index out of the files.\n",
"# NOTE(review): assumes the datasets/ directory already exists - confirm.\n",
"train_data.to_csv('datasets/train_all.csv', index=False)\n",
"test_data.to_csv('datasets/test_all.csv', index=False)"
],
"id": "f7b5130c72ad35af",
"outputs": [],
"execution_count": 43
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.942641Z",
"start_time": "2024-12-09T22:45:09.916835Z"
}
},
"cell_type": "code",
"source": [
"def get_top_movies(data, n=5):\n",
"    \"\"\"Return each user's n best rows, ranked by rating and then by popularity.\n",
"\n",
"    Args:\n",
"        data: DataFrame with 'userId', 'rating' and 'popularity' columns.\n",
"        n: Maximum number of rows to keep per user (default 5).\n",
"\n",
"    Returns:\n",
"        The selected rows, ordered by userId ascending, then rating and\n",
"        popularity descending; users with fewer than n rows keep all of them.\n",
"    \"\"\"\n",
"    ranked = data.sort_values(\n",
"        by=['userId', 'rating', 'popularity'],\n",
"        ascending=[True, False, False],\n",
"    )\n",
"    return ranked.groupby('userId').head(n)\n",
"\n",
"# Select each user's top-ranked rows (at most five per user) for both splits.\n",
"train_top = get_top_movies(train_data)\n",
"test_top = get_top_movies(test_data)"
],
"id": "20dba13e7a3d105b",
"outputs": [],
"execution_count": 44
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.968052Z",
"start_time": "2024-12-09T22:45:09.943175Z"
}
},
"cell_type": "code",
"source": [
"# Persist the per-user top rows of both splits; index=False keeps the row index out of the files.\n",
"# NOTE(review): assumes the datasets/ directory already exists - confirm.\n",
"train_top.to_csv('datasets/train_top.csv', index=False)\n",
"test_top.to_csv('datasets/test_top.csv', index=False)"
],
"id": "be9f6106c5e4b04a",
"outputs": [],
"execution_count": 45
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:49:01.766906Z",
"start_time": "2024-12-09T22:49:01.744809Z"
}
},
"cell_type": "code",
"source": "train_data",
"id": "f6ea94b0951b8471",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp title \\\n",
"0 1 1 4.0 964982703 Toy Story (1995) \n",
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
"2 1 6 4.0 964982224 Heat (1995) \n",
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
"... ... ... ... ... ... \n",
"73172 610 166534 4.0 1493848402 Split (2017) \n",
"73173 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
"73174 610 168250 5.0 1494273047 Get Out (2017) \n",
"73175 610 168252 5.0 1493846352 Logan (2017) \n",
"73176 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
"\n",
" genres genreMatch \\\n",
"0 [Adventure, Animation, Children, Comedy, Fantasy] 4.44 \n",
"1 [Comedy, Romance] 4.29 \n",
"2 [Action, Crime, Thriller] 4.27 \n",
"3 [Mystery, Thriller] 4.16 \n",
"4 [Crime, Mystery, Thriller] 4.22 \n",
"... ... ... \n",
"73172 [Drama, Horror, Thriller] 3.65 \n",
"73173 [Action, Crime, Thriller] 3.66 \n",
"73174 [Horror] 3.51 \n",
"73175 [Action, Sci-Fi] 3.63 \n",
"73176 [Action, Crime, Drama, Thriller] 3.71 \n",
"\n",
" similarUsers_x similarUsers_y popularity \n",
"0 4.50 4.50 33.81 \n",
"1 4.00 4.00 8.81 \n",
"2 3.00 3.00 16.19 \n",
"3 3.88 3.88 33.20 \n",
"4 4.75 4.75 32.38 \n",
"... ... ... ... \n",
"73172 NaN NaN 0.82 \n",
"73173 5.00 5.00 1.02 \n",
"73174 NaN NaN 2.66 \n",
"73175 4.50 4.50 4.30 \n",
"73176 NaN NaN 0.41 \n",
"\n",
"[73177 rows x 10 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>genreMatch</th>\n",
" <th>similarUsers_x</th>\n",
" <th>similarUsers_y</th>\n",
" <th>popularity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
" <td>4.44</td>\n",
" <td>4.50</td>\n",
" <td>4.50</td>\n",
" <td>33.81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>[Comedy, Romance]</td>\n",
" <td>4.29</td>\n",
" <td>4.00</td>\n",
" <td>4.00</td>\n",
" <td>8.81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" <td>Heat (1995)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" <td>4.27</td>\n",
" <td>3.00</td>\n",
" <td>3.00</td>\n",
" <td>16.19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>[Mystery, Thriller]</td>\n",
" <td>4.16</td>\n",
" <td>3.88</td>\n",
" <td>3.88</td>\n",
" <td>33.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>[Crime, Mystery, Thriller]</td>\n",
" <td>4.22</td>\n",
" <td>4.75</td>\n",
" <td>4.75</td>\n",
" <td>32.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73172</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" <td>1493848402</td>\n",
" <td>Split (2017)</td>\n",
" <td>[Drama, Horror, Thriller]</td>\n",
" <td>3.65</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.82</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73173</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" <td>1493850091</td>\n",
" <td>John Wick: Chapter Two (2017)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" <td>3.66</td>\n",
" <td>5.00</td>\n",
" <td>5.00</td>\n",
" <td>1.02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73174</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" <td>1494273047</td>\n",
" <td>Get Out (2017)</td>\n",
" <td>[Horror]</td>\n",
" <td>3.51</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73175</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" <td>1493846352</td>\n",
" <td>Logan (2017)</td>\n",
" <td>[Action, Sci-Fi]</td>\n",
" <td>3.63</td>\n",
" <td>4.50</td>\n",
" <td>4.50</td>\n",
" <td>4.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73176</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" <td>1493846415</td>\n",
" <td>The Fate of the Furious (2017)</td>\n",
" <td>[Action, Crime, Drama, Thriller]</td>\n",
" <td>3.71</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.41</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>73177 rows × 10 columns</p>\n",
"</div>"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 46
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:49:14.765827Z",
"start_time": "2024-12-09T22:49:14.748582Z"
}
},
"cell_type": "code",
"source": "test_data",
"id": "e0c5ec2294939201",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp \\\n",
"0 3 31 0.5 1306463578 \n",
"1 3 527 0.5 1306464275 \n",
"2 3 647 0.5 1306463619 \n",
"3 3 688 0.5 1306464228 \n",
"4 3 720 0.5 1306463595 \n",
"... ... ... ... ... \n",
"27654 609 892 3.0 847221080 \n",
"27655 609 1056 3.0 847221080 \n",
"27656 609 1059 3.0 847221054 \n",
"27657 609 1150 4.0 847221054 \n",
"27658 609 1161 4.0 847221080 \n",
"\n",
" title \\\n",
"0 Dangerous Minds (1995) \n",
"1 Schindler's List (1993) \n",
"2 Courage Under Fire (1996) \n",
"3 Operation Dumbo Drop (1995) \n",
"4 Wallace & Gromit: The Best of Aardman Animatio... \n",
"... ... \n",
"27654 Twelfth Night (1996) \n",
"27655 Jude (1996) \n",
"27656 William Shakespeare's Romeo + Juliet (1996) \n",
"27657 Return of Martin Guerre, The (Retour de Martin... \n",
"27658 Tin Drum, The (Blechtrommel, Die) (1979) \n",
"\n",
" genres genreMatch similarUsers_x \\\n",
"0 [Drama] 0.75 2.00 \n",
"1 [Drama, War] 0.62 3.67 \n",
"2 [Action, Crime, Drama, War] 1.33 3.00 \n",
"3 [Action, Adventure, Comedy, War] 1.95 1.50 \n",
"4 [Adventure, Animation, Comedy] 1.41 4.50 \n",
"... ... ... ... \n",
"27654 [Comedy, Drama, Romance] 3.28 NaN \n",
"27655 [Drama] 3.37 NaN \n",
"27656 [Drama, Romance] 3.28 NaN \n",
"27657 [Drama] 3.37 NaN \n",
"27658 [Drama, War] 3.43 NaN \n",
"\n",
" similarUsers_y popularity \n",
"0 2.00 5.74 \n",
"1 3.67 31.97 \n",
"2 3.00 5.74 \n",
"3 1.50 3.28 \n",
"4 4.50 5.74 \n",
"... ... ... \n",
"27654 NaN 4.10 \n",
"27655 NaN 1.64 \n",
"27656 NaN 7.38 \n",
"27657 NaN 0.82 \n",
"27658 NaN 0.82 \n",
"\n",
"[27659 rows x 10 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>genreMatch</th>\n",
" <th>similarUsers_x</th>\n",
" <th>similarUsers_y</th>\n",
" <th>popularity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>31</td>\n",
" <td>0.5</td>\n",
" <td>1306463578</td>\n",
" <td>Dangerous Minds (1995)</td>\n",
" <td>[Drama]</td>\n",
" <td>0.75</td>\n",
" <td>2.00</td>\n",
" <td>2.00</td>\n",
" <td>5.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>527</td>\n",
" <td>0.5</td>\n",
" <td>1306464275</td>\n",
" <td>Schindler's List (1993)</td>\n",
" <td>[Drama, War]</td>\n",
" <td>0.62</td>\n",
" <td>3.67</td>\n",
" <td>3.67</td>\n",
" <td>31.97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>647</td>\n",
" <td>0.5</td>\n",
" <td>1306463619</td>\n",
" <td>Courage Under Fire (1996)</td>\n",
" <td>[Action, Crime, Drama, War]</td>\n",
" <td>1.33</td>\n",
" <td>3.00</td>\n",
" <td>3.00</td>\n",
" <td>5.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>688</td>\n",
" <td>0.5</td>\n",
" <td>1306464228</td>\n",
" <td>Operation Dumbo Drop (1995)</td>\n",
" <td>[Action, Adventure, Comedy, War]</td>\n",
" <td>1.95</td>\n",
" <td>1.50</td>\n",
" <td>1.50</td>\n",
" <td>3.28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>720</td>\n",
" <td>0.5</td>\n",
" <td>1306463595</td>\n",
" <td>Wallace &amp; Gromit: The Best of Aardman Animatio...</td>\n",
" <td>[Adventure, Animation, Comedy]</td>\n",
" <td>1.41</td>\n",
" <td>4.50</td>\n",
" <td>4.50</td>\n",
" <td>5.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27654</th>\n",
" <td>609</td>\n",
" <td>892</td>\n",
" <td>3.0</td>\n",
" <td>847221080</td>\n",
" <td>Twelfth Night (1996)</td>\n",
" <td>[Comedy, Drama, Romance]</td>\n",
" <td>3.28</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27655</th>\n",
" <td>609</td>\n",
" <td>1056</td>\n",
" <td>3.0</td>\n",
" <td>847221080</td>\n",
" <td>Jude (1996)</td>\n",
" <td>[Drama]</td>\n",
" <td>3.37</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.64</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27656</th>\n",
" <td>609</td>\n",
" <td>1059</td>\n",
" <td>3.0</td>\n",
" <td>847221054</td>\n",
" <td>William Shakespeare's Romeo + Juliet (1996)</td>\n",
" <td>[Drama, Romance]</td>\n",
" <td>3.28</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27657</th>\n",
" <td>609</td>\n",
" <td>1150</td>\n",
" <td>4.0</td>\n",
" <td>847221054</td>\n",
" <td>Return of Martin Guerre, The (Retour de Martin...</td>\n",
" <td>[Drama]</td>\n",
" <td>3.37</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.82</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27658</th>\n",
" <td>609</td>\n",
" <td>1161</td>\n",
" <td>4.0</td>\n",
" <td>847221080</td>\n",
" <td>Tin Drum, The (Blechtrommel, Die) (1979)</td>\n",
" <td>[Drama, War]</td>\n",
" <td>3.43</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.82</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>27659 rows × 10 columns</p>\n",
"</div>"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 47
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}