rekomendacja_filmow/movies_data.ipynb
2024-12-11 12:06:46 +01:00

1002 lines
34 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.610507Z",
"start_time": "2024-12-09T22:43:58.602149Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np"
],
"id": "5a823fabad3c186f",
"outputs": [],
"execution_count": 32
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.621153Z",
"start_time": "2024-12-09T22:43:58.612510Z"
}
},
"cell_type": "code",
"source": [
"def split_by_user(data, test_size=0.2, random_state=42):\n",
"    \"\"\"Split rating rows into train and test sets that share no user.\n",
"\n",
"    Args:\n",
"        data: DataFrame with a 'userId' column.\n",
"        test_size: Fraction of the distinct users assigned to the test set;\n",
"            the resulting user count is truncated towards zero.\n",
"        random_state: Seed for the user draw; the same seed selects the same users.\n",
"\n",
"    Returns:\n",
"        (train_data, test_data): every row of a drawn user goes to test_data,\n",
"        all remaining rows go to train_data.\n",
"    \"\"\"\n",
"    unique_users = data['userId'].unique()\n",
"    # A private RandomState draws exactly what np.random.seed + np.random.choice drew,\n",
"    # but leaves the global NumPy generator untouched for the rest of the notebook.\n",
"    rng = np.random.RandomState(random_state)\n",
"    n_test_users = int(len(unique_users) * test_size)\n",
"    test_users = rng.choice(unique_users, size=n_test_users, replace=False)\n",
"    in_test = data['userId'].isin(test_users)\n",
"    return data[~in_test], data[in_test]"
],
"id": "d694dbe819b591ac",
"outputs": [],
"execution_count": 33
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.689968Z",
"start_time": "2024-12-09T22:43:58.622148Z"
}
},
"cell_type": "code",
"source": [
"# Read the movie catalogue and the rating log from the ml-latest-small directory, in that order.\n",
"movies, ratings = map(\n",
"    pd.read_csv,\n",
"    ('ml-latest-small/movies.csv', 'ml-latest-small/ratings.csv'),\n",
")"
],
"id": "1bc78bafbae06c89",
"outputs": [],
"execution_count": 34
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.707803Z",
"start_time": "2024-12-09T22:43:58.690972Z"
}
},
"cell_type": "code",
"source": [
"# Turn the '|'-separated genre string into a list of genres.\n",
"# Only values that are still strings are split: `.str.split` on an already-split\n",
"# column would replace every list with NaN, so this form is safe to re-run.\n",
"movies['genres'] = movies['genres'].map(\n",
"    lambda genres: genres.split('|') if isinstance(genres, str) else genres\n",
")"
],
"id": "2543aa8216425342",
"outputs": [],
"execution_count": 35
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.742861Z",
"start_time": "2024-12-09T22:43:58.709808Z"
}
},
"cell_type": "code",
"source": [
"# Attach the movie columns to every rating row (inner join on movieId).\n",
"data = ratings.merge(movies, on=\"movieId\")"
],
"id": "b3fa37255dccb066",
"outputs": [],
"execution_count": 36
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.759240Z",
"start_time": "2024-12-09T22:43:58.743879Z"
}
},
"cell_type": "code",
"source": "data",
"id": "c9542abeaa0c59d7",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp title \\\n",
"0 1 1 4.0 964982703 Toy Story (1995) \n",
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
"2 1 6 4.0 964982224 Heat (1995) \n",
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
"... ... ... ... ... ... \n",
"100831 610 166534 4.0 1493848402 Split (2017) \n",
"100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
"100833 610 168250 5.0 1494273047 Get Out (2017) \n",
"100834 610 168252 5.0 1493846352 Logan (2017) \n",
"100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
"\n",
" genres \n",
"0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
"1 [Comedy, Romance] \n",
"2 [Action, Crime, Thriller] \n",
"3 [Mystery, Thriller] \n",
"4 [Crime, Mystery, Thriller] \n",
"... ... \n",
"100831 [Drama, Horror, Thriller] \n",
"100832 [Action, Crime, Thriller] \n",
"100833 [Horror] \n",
"100834 [Action, Sci-Fi] \n",
"100835 [Action, Crime, Drama, Thriller] \n",
"\n",
"[100836 rows x 6 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>[Comedy, Romance]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" <td>Heat (1995)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>[Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>[Crime, Mystery, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100831</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" <td>1493848402</td>\n",
" <td>Split (2017)</td>\n",
" <td>[Drama, Horror, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100832</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" <td>1493850091</td>\n",
" <td>John Wick: Chapter Two (2017)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100833</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" <td>1494273047</td>\n",
" <td>Get Out (2017)</td>\n",
" <td>[Horror]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100834</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" <td>1493846352</td>\n",
" <td>Logan (2017)</td>\n",
" <td>[Action, Sci-Fi]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100835</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" <td>1493846415</td>\n",
" <td>The Fate of the Furious (2017)</td>\n",
" <td>[Action, Crime, Drama, Thriller]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100836 rows × 6 columns</p>\n",
"</div>"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 37
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:43:58.779353Z",
"start_time": "2024-12-09T22:43:58.760243Z"
}
},
"cell_type": "code",
"source": [
"# Hold out 20% of the users with a fixed seed; train and test never share a user.\n",
"train_data, test_data = split_by_user(data, test_size=0.2, random_state=42)"
],
"id": "b66149513c0c6b0e",
"outputs": [],
"execution_count": 38
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:44:41.206825Z",
"start_time": "2024-12-09T22:43:58.780355Z"
}
},
"cell_type": "code",
"source": [
"def build_genre_profile(frame):\n",
"    \"\"\"Summarise how each user in `frame` rates every genre.\n",
"\n",
"    Returns a tuple (expanded, user_genre_rating, user_preferences):\n",
"    - expanded: `frame` with one row per (rating row, genre);\n",
"    - user_genre_rating: mean rating per (userId, genres) pair in column 'avg_genre_rating';\n",
"    - user_preferences: the same means pivoted to a userId x genre table, missing pairs set to 0.\n",
"    \"\"\"\n",
"    expanded = frame.explode('genres')\n",
"    user_genre_rating = (\n",
"        expanded.groupby(['userId', 'genres'])['rating']\n",
"        .mean()\n",
"        .reset_index()\n",
"        .rename(columns={'rating': 'avg_genre_rating'})\n",
"    )\n",
"    user_preferences = user_genre_rating.pivot(\n",
"        index='userId', columns='genres', values='avg_genre_rating'\n",
"    ).fillna(0)\n",
"    return expanded, user_genre_rating, user_preferences\n",
"\n",
"\n",
"def index_genre_ratings_by_user(user_genre_rating):\n",
"    \"\"\"Map each userId to a Series of that user's mean ratings, indexed by genre.\"\"\"\n",
"    return {\n",
"        user_id: group.set_index('genres')['avg_genre_rating']\n",
"        for user_id, group in user_genre_rating.groupby('userId')\n",
"    }\n",
"\n",
"\n",
"def genre_match_calc(row, genre_ratings_by_user):\n",
"    \"\"\"Mean of the user's per-genre average ratings over the genres listed in `row`.\n",
"\n",
"    Returns the mean rounded to 2 decimals, or 0 when the user has no average\n",
"    for any of the row's genres.\n",
"    \"\"\"\n",
"    genres = row['genres']\n",
"    if isinstance(genres, str):\n",
"        genres = [genres]\n",
"\n",
"    # One dictionary lookup per row instead of filtering the whole table by userId.\n",
"    user_pref = genre_ratings_by_user.get(row['userId'])\n",
"    if user_pref is None:\n",
"        return 0\n",
"\n",
"    genre_scores = user_pref[user_pref.index.isin(genres)]\n",
"    if genre_scores.empty:\n",
"        return 0\n",
"    return round(genre_scores.mean(), 2)\n",
"\n",
"\n",
"# Work on copies so the new column is not written into a slice of `data`.\n",
"train_data = train_data.copy()\n",
"test_data = test_data.copy()\n",
"\n",
"# NOTE(review): each profile is built from the same rows it is applied to, so a row's\n",
"# genreMatch includes that row's own rating, and the test profile uses test ratings.\n",
"# Confirm this is intended before using genreMatch as a model feature.\n",
"expanded_train_data, train_user_genre_rating, train_user_preferences = build_genre_profile(train_data)\n",
"train_genre_ratings_by_user = index_genre_ratings_by_user(train_user_genre_rating)\n",
"\n",
"\n",
"def train_genre_match_calc(row):\n",
"    \"\"\"genre_match_calc bound to the train-set genre profile.\"\"\"\n",
"    return genre_match_calc(row, train_genre_ratings_by_user)\n",
"\n",
"\n",
"train_data['genreMatch'] = train_data.apply(train_genre_match_calc, axis=1)\n",
"\n",
"expanded_test_data, test_user_genre_rating, test_user_preferences = build_genre_profile(test_data)\n",
"test_genre_ratings_by_user = index_genre_ratings_by_user(test_user_genre_rating)\n",
"\n",
"\n",
"def test_genre_match_calc(row):\n",
"    \"\"\"genre_match_calc bound to the test-set genre profile.\"\"\"\n",
"    return genre_match_calc(row, test_genre_ratings_by_user)\n",
"\n",
"\n",
"test_data['genreMatch'] = test_data.apply(test_genre_match_calc, axis=1)"
],
"id": "88296c8c47cdbf60",
"outputs": [],
"execution_count": 39
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.267834Z",
"start_time": "2024-12-09T22:44:41.207821Z"
}
},
"cell_type": "code",
"source": [
"def build_user_similarity(frame):\n",
"    \"\"\"Build the user x movie rating table of `frame` and the user-user cosine similarity.\n",
"\n",
"    Returns a tuple (user_movie, user_movie_filled, similarity, similarity_df):\n",
"    - user_movie: ratings pivoted to userId x movieId, NaN where a user has no rating;\n",
"    - user_movie_filled: the same table with NaN replaced by 0;\n",
"    - similarity: cosine similarity between the rows of user_movie_filled;\n",
"    - similarity_df: `similarity` labelled with userId on both axes.\n",
"    \"\"\"\n",
"    user_movie = frame.pivot(index='userId', columns='movieId', values='rating')\n",
"    user_movie_filled = user_movie.fillna(0)\n",
"    similarity = cosine_similarity(user_movie_filled)\n",
"    similarity_df = pd.DataFrame(similarity, index=user_movie.index, columns=user_movie.index)\n",
"    return user_movie, user_movie_filled, similarity, similarity_df\n",
"\n",
"\n",
"def similar_users_average(user_movie, similarity_df, neighbour_cache, user_id, movie_id, top_n=5):\n",
"    \"\"\"Mean rating of `movie_id` among the `top_n` users most similar to `user_id`.\n",
"\n",
"    The user is removed from the ranking explicitly rather than by skipping the first\n",
"    sorted entry, so a tie in similarity can no longer make the user their own neighbour.\n",
"    The ranking is computed once per (user_id, top_n) and stored in `neighbour_cache`.\n",
"\n",
"    Returns the mean rounded to 2 decimals, or NaN when none of the neighbours rated the movie.\n",
"    \"\"\"\n",
"    cache_key = (user_id, top_n)\n",
"    if cache_key not in neighbour_cache:\n",
"        ranked = (\n",
"            similarity_df[user_id]\n",
"            .drop(user_id)\n",
"            # mergesort is stable, so equally similar users keep a reproducible order\n",
"            .sort_values(ascending=False, kind='mergesort')\n",
"        )\n",
"        neighbour_cache[cache_key] = ranked.index[:top_n]\n",
"\n",
"    similar_ratings = user_movie.loc[neighbour_cache[cache_key], movie_id]\n",
"    return round(similar_ratings.dropna().mean(), 2)\n",
"\n",
"\n",
"(\n",
"    train_user_movie,\n",
"    train_user_movie_filled,\n",
"    train_user_similarity,\n",
"    train_user_similarity_df,\n",
") = build_user_similarity(train_data)\n",
"train_neighbour_cache = {}\n",
"\n",
"\n",
"def train_average(user_id, movie_id, top_n=5):\n",
"    \"\"\"similar_users_average bound to the train-set tables.\"\"\"\n",
"    return similar_users_average(\n",
"        train_user_movie, train_user_similarity_df, train_neighbour_cache, user_id, movie_id, top_n\n",
"    )\n",
"\n",
"\n",
"train_data['similarUsers'] = train_data.apply(\n",
"    lambda row: train_average(row['userId'], row['movieId']), axis=1\n",
")\n",
"\n",
"(\n",
"    test_user_movie,\n",
"    test_user_movie_filled,\n",
"    test_user_similarity,\n",
"    test_user_similarity_df,\n",
") = build_user_similarity(test_data)\n",
"test_neighbour_cache = {}\n",
"\n",
"\n",
"def test_average(user_id, movie_id, top_n=5):\n",
"    \"\"\"similar_users_average bound to the test-set tables.\"\"\"\n",
"    return similar_users_average(\n",
"        test_user_movie, test_user_similarity_df, test_neighbour_cache, user_id, movie_id, top_n\n",
"    )\n",
"\n",
"\n",
"test_data['similarUsers'] = test_data.apply(\n",
"    lambda row: test_average(row['userId'], row['movieId']), axis=1\n",
")\n"
],
"id": "e931f4041a1802fb",
"outputs": [],
"execution_count": 40
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.311604Z",
"start_time": "2024-12-09T22:45:09.268839Z"
}
},
"cell_type": "code",
"source": [
"# `similarUsers` is already a column of both frames. Merging a frame with its own\n",
"# (userId, movieId, similarUsers) slice only duplicated that column as\n",
"# `similarUsers_x` / `similarUsers_y`, which removed the `similarUsers` name from the frame\n",
"# and from the exported CSV files. The frames are therefore kept as they are and\n",
"# only receive the fresh 0..n-1 index that the merge used to produce.\n",
"train_ratings = train_data[['userId', 'movieId', 'similarUsers']]\n",
"train_data = train_data.reset_index(drop=True)\n",
"\n",
"test_ratings = test_data[['userId', 'movieId', 'similarUsers']]\n",
"test_data = test_data.reset_index(drop=True)"
],
"id": "ce65ce417e7f5207",
"outputs": [],
"execution_count": 41
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T22:45:09.342614Z",
"start_time": "2024-12-09T22:45:09.313582Z"
}
},
"cell_type": "code",
"source": [
"def count_movie_ratings(frame):\n",
"    \"\"\"Number of rating rows per movie, as columns 'movieId' and 'popularity'.\"\"\"\n",
"    return frame.groupby('movieId').size().reset_index(name='popularity')\n",
"\n",
"\n",
"def with_popularity(frame, rating_counts, n_users):\n",
"    \"\"\"Return `frame` with a 'popularity' column: ratings of the movie per 100 users.\n",
"\n",
"    The value is rating_counts / n_users * 100, rounded to 2 decimals. An existing\n",
"    'popularity' column is overwritten instead of being suffixed by a merge, so the\n",
"    cell can be re-run. The returned frame has a fresh 0..n-1 index.\n",
"    \"\"\"\n",
"    counts_by_movie = rating_counts.set_index('movieId')['popularity']\n",
"    share = ((frame['movieId'].map(counts_by_movie) / n_users) * 100).round(2)\n",
"    # to_numpy() assigns by position, independent of the index being reset\n",
"    return frame.reset_index(drop=True).assign(popularity=share.to_numpy())\n",
"\n",
"\n",
"train_users = train_data['userId'].nunique()\n",
"test_users = test_data['userId'].nunique()\n",
"\n",
"train_popularity = count_movie_ratings(train_data)\n",
"train_data = with_popularity(train_data, train_popularity, train_users)\n",
"\n",
"test_popularity = count_movie_ratings(test_data)\n",
"test_data = with_popularity(test_data, test_popularity, test_users)"
],
"id": "aa798201db531188",
"outputs": [],
"execution_count": 42
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-11T11:04:32.648126Z",
"start_time": "2024-12-11T11:04:31.503628Z"
}
},
"cell_type": "code",
"source": [
"# Write both feature tables without the index column.\n",
"for split_frame, csv_path in (\n",
"    (train_data, 'datasets/train_all.csv'),\n",
"    (test_data, 'datasets/test_all.csv'),\n",
"):\n",
"    split_frame.to_csv(csv_path, index=False)"
],
"id": "f7b5130c72ad35af",
"outputs": [],
"execution_count": 53
},
{
"metadata": {},
"cell_type": "code",
"source": [
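"# Manual cleanup, kept for reference. It applies only when the frames carry the\n",
"# `similarUsers_x` / `similarUsers_y` pair that pandas creates when both sides of a merge\n",
"# already contain a `similarUsers` column.\n",
"# train_data = train_data.drop(columns=['similarUsers_y'])\n",
"# test_data = test_data.drop(columns=['similarUsers_y'])"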
],
"id": "4cd347cc3bfd35aa",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-11T11:03:13.267007Z",
"start_time": "2024-12-11T11:03:13.258841Z"
}
},
"cell_type": "code",
"source": [
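"# Manual cleanup, kept for reference. It restores the `similarUsers` column name\n",
"# when a merge has suffixed it to `similarUsers_x`.\n",
"# train_data.rename(columns={'similarUsers_x': 'similarUsers'}, inplace=True)\n",
"# test_data.rename(columns={'similarUsers_x': 'similarUsers'}, inplace=True)"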
],
"id": "aa8a10762dd70a4d",
"outputs": [],
"execution_count": 50
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-11T11:04:43.594264Z",
"start_time": "2024-12-11T11:04:43.495639Z"
}
},
"cell_type": "code",
"source": [
"def get_top_movies(data, n=5):\n",
"    \"\"\"Return each user's `n` highest-rated rows.\n",
"\n",
"    Rows are ordered by userId ascending, then rating descending, then popularity\n",
"    descending, and the first `n` rows of every user are kept.\n",
"\n",
"    Args:\n",
"        data: DataFrame with 'userId', 'rating' and 'popularity' columns.\n",
"        n: Number of rows to keep per user (default 5).\n",
"\n",
"    Returns:\n",
"        DataFrame with at most `n` rows per user, in the sorted order.\n",
"    \"\"\"\n",
"    ranked = data.sort_values(\n",
"        by=['userId', 'rating', 'popularity'],\n",
"        ascending=[True, False, False],\n",
"    )\n",
"    return ranked.groupby('userId').head(n)\n",
"\n",
"# Same top-rated selection for both splits.\n",
"train_top, test_top = (get_top_movies(split) for split in (train_data, test_data))"
],
"id": "20dba13e7a3d105b",
"outputs": [],
"execution_count": 54
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-11T11:04:45.194584Z",
"start_time": "2024-12-11T11:04:45.132035Z"
}
},
"cell_type": "code",
"source": [
"# Write both top-rated tables without the index column.\n",
"for top_frame, csv_path in (\n",
"    (train_top, 'datasets/train_top.csv'),\n",
"    (test_top, 'datasets/test_top.csv'),\n",
"):\n",
"    top_frame.to_csv(csv_path, index=False)"
],
"id": "be9f6106c5e4b04a",
"outputs": [],
"execution_count": 55
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-11T11:03:47.746798Z",
"start_time": "2024-12-11T11:03:47.714278Z"
}
},
"cell_type": "code",
"source": "train_data",
"id": "f6ea94b0951b8471",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp title \\\n",
"0 1 1 4.0 964982703 Toy Story (1995) \n",
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
"2 1 6 4.0 964982224 Heat (1995) \n",
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
"... ... ... ... ... ... \n",
"73172 610 166534 4.0 1493848402 Split (2017) \n",
"73173 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
"73174 610 168250 5.0 1494273047 Get Out (2017) \n",
"73175 610 168252 5.0 1493846352 Logan (2017) \n",
"73176 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
"\n",
" genres genreMatch \\\n",
"0 [Adventure, Animation, Children, Comedy, Fantasy] 4.44 \n",
"1 [Comedy, Romance] 4.29 \n",
"2 [Action, Crime, Thriller] 4.27 \n",
"3 [Mystery, Thriller] 4.16 \n",
"4 [Crime, Mystery, Thriller] 4.22 \n",
"... ... ... \n",
"73172 [Drama, Horror, Thriller] 3.65 \n",
"73173 [Action, Crime, Thriller] 3.66 \n",
"73174 [Horror] 3.51 \n",
"73175 [Action, Sci-Fi] 3.63 \n",
"73176 [Action, Crime, Drama, Thriller] 3.71 \n",
"\n",
" similarUsers popularity \n",
"0 4.50 33.81 \n",
"1 4.00 8.81 \n",
"2 3.00 16.19 \n",
"3 3.88 33.20 \n",
"4 4.75 32.38 \n",
"... ... ... \n",
"73172 NaN 0.82 \n",
"73173 5.00 1.02 \n",
"73174 NaN 2.66 \n",
"73175 4.50 4.30 \n",
"73176 NaN 0.41 \n",
"\n",
"[73177 rows x 9 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>genreMatch</th>\n",
" <th>similarUsers</th>\n",
" <th>popularity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>964982703</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
" <td>4.44</td>\n",
" <td>4.50</td>\n",
" <td>33.81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>964981247</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>[Comedy, Romance]</td>\n",
" <td>4.29</td>\n",
" <td>4.00</td>\n",
" <td>8.81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>964982224</td>\n",
" <td>Heat (1995)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" <td>4.27</td>\n",
" <td>3.00</td>\n",
" <td>16.19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>5.0</td>\n",
" <td>964983815</td>\n",
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
" <td>[Mystery, Thriller]</td>\n",
" <td>4.16</td>\n",
" <td>3.88</td>\n",
" <td>33.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>5.0</td>\n",
" <td>964982931</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>[Crime, Mystery, Thriller]</td>\n",
" <td>4.22</td>\n",
" <td>4.75</td>\n",
" <td>32.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73172</th>\n",
" <td>610</td>\n",
" <td>166534</td>\n",
" <td>4.0</td>\n",
" <td>1493848402</td>\n",
" <td>Split (2017)</td>\n",
" <td>[Drama, Horror, Thriller]</td>\n",
" <td>3.65</td>\n",
" <td>NaN</td>\n",
" <td>0.82</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73173</th>\n",
" <td>610</td>\n",
" <td>168248</td>\n",
" <td>5.0</td>\n",
" <td>1493850091</td>\n",
" <td>John Wick: Chapter Two (2017)</td>\n",
" <td>[Action, Crime, Thriller]</td>\n",
" <td>3.66</td>\n",
" <td>5.00</td>\n",
" <td>1.02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73174</th>\n",
" <td>610</td>\n",
" <td>168250</td>\n",
" <td>5.0</td>\n",
" <td>1494273047</td>\n",
" <td>Get Out (2017)</td>\n",
" <td>[Horror]</td>\n",
" <td>3.51</td>\n",
" <td>NaN</td>\n",
" <td>2.66</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73175</th>\n",
" <td>610</td>\n",
" <td>168252</td>\n",
" <td>5.0</td>\n",
" <td>1493846352</td>\n",
" <td>Logan (2017)</td>\n",
" <td>[Action, Sci-Fi]</td>\n",
" <td>3.63</td>\n",
" <td>4.50</td>\n",
" <td>4.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73176</th>\n",
" <td>610</td>\n",
" <td>170875</td>\n",
" <td>3.0</td>\n",
" <td>1493846415</td>\n",
" <td>The Fate of the Furious (2017)</td>\n",
" <td>[Action, Crime, Drama, Thriller]</td>\n",
" <td>3.71</td>\n",
" <td>NaN</td>\n",
" <td>0.41</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>73177 rows × 9 columns</p>\n",
"</div>"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 51
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-11T11:04:01.183022Z",
"start_time": "2024-12-11T11:04:01.159989Z"
}
},
"cell_type": "code",
"source": "test_data",
"id": "e0c5ec2294939201",
"outputs": [
{
"data": {
"text/plain": [
" userId movieId rating timestamp \\\n",
"0 3 31 0.5 1306463578 \n",
"1 3 527 0.5 1306464275 \n",
"2 3 647 0.5 1306463619 \n",
"3 3 688 0.5 1306464228 \n",
"4 3 720 0.5 1306463595 \n",
"... ... ... ... ... \n",
"27654 609 892 3.0 847221080 \n",
"27655 609 1056 3.0 847221080 \n",
"27656 609 1059 3.0 847221054 \n",
"27657 609 1150 4.0 847221054 \n",
"27658 609 1161 4.0 847221080 \n",
"\n",
" title \\\n",
"0 Dangerous Minds (1995) \n",
"1 Schindler's List (1993) \n",
"2 Courage Under Fire (1996) \n",
"3 Operation Dumbo Drop (1995) \n",
"4 Wallace & Gromit: The Best of Aardman Animatio... \n",
"... ... \n",
"27654 Twelfth Night (1996) \n",
"27655 Jude (1996) \n",
"27656 William Shakespeare's Romeo + Juliet (1996) \n",
"27657 Return of Martin Guerre, The (Retour de Martin... \n",
"27658 Tin Drum, The (Blechtrommel, Die) (1979) \n",
"\n",
" genres genreMatch similarUsers popularity \n",
"0 [Drama] 0.75 2.00 5.74 \n",
"1 [Drama, War] 0.62 3.67 31.97 \n",
"2 [Action, Crime, Drama, War] 1.33 3.00 5.74 \n",
"3 [Action, Adventure, Comedy, War] 1.95 1.50 3.28 \n",
"4 [Adventure, Animation, Comedy] 1.41 4.50 5.74 \n",
"... ... ... ... ... \n",
"27654 [Comedy, Drama, Romance] 3.28 NaN 4.10 \n",
"27655 [Drama] 3.37 NaN 1.64 \n",
"27656 [Drama, Romance] 3.28 NaN 7.38 \n",
"27657 [Drama] 3.37 NaN 0.82 \n",
"27658 [Drama, War] 3.43 NaN 0.82 \n",
"\n",
"[27659 rows x 9 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>genreMatch</th>\n",
" <th>similarUsers</th>\n",
" <th>popularity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>31</td>\n",
" <td>0.5</td>\n",
" <td>1306463578</td>\n",
" <td>Dangerous Minds (1995)</td>\n",
" <td>[Drama]</td>\n",
" <td>0.75</td>\n",
" <td>2.00</td>\n",
" <td>5.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>527</td>\n",
" <td>0.5</td>\n",
" <td>1306464275</td>\n",
" <td>Schindler's List (1993)</td>\n",
" <td>[Drama, War]</td>\n",
" <td>0.62</td>\n",
" <td>3.67</td>\n",
" <td>31.97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>647</td>\n",
" <td>0.5</td>\n",
" <td>1306463619</td>\n",
" <td>Courage Under Fire (1996)</td>\n",
" <td>[Action, Crime, Drama, War]</td>\n",
" <td>1.33</td>\n",
" <td>3.00</td>\n",
" <td>5.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>688</td>\n",
" <td>0.5</td>\n",
" <td>1306464228</td>\n",
" <td>Operation Dumbo Drop (1995)</td>\n",
" <td>[Action, Adventure, Comedy, War]</td>\n",
" <td>1.95</td>\n",
" <td>1.50</td>\n",
" <td>3.28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>720</td>\n",
" <td>0.5</td>\n",
" <td>1306463595</td>\n",
" <td>Wallace &amp; Gromit: The Best of Aardman Animatio...</td>\n",
" <td>[Adventure, Animation, Comedy]</td>\n",
" <td>1.41</td>\n",
" <td>4.50</td>\n",
" <td>5.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27654</th>\n",
" <td>609</td>\n",
" <td>892</td>\n",
" <td>3.0</td>\n",
" <td>847221080</td>\n",
" <td>Twelfth Night (1996)</td>\n",
" <td>[Comedy, Drama, Romance]</td>\n",
" <td>3.28</td>\n",
" <td>NaN</td>\n",
" <td>4.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27655</th>\n",
" <td>609</td>\n",
" <td>1056</td>\n",
" <td>3.0</td>\n",
" <td>847221080</td>\n",
" <td>Jude (1996)</td>\n",
" <td>[Drama]</td>\n",
" <td>3.37</td>\n",
" <td>NaN</td>\n",
" <td>1.64</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27656</th>\n",
" <td>609</td>\n",
" <td>1059</td>\n",
" <td>3.0</td>\n",
" <td>847221054</td>\n",
" <td>William Shakespeare's Romeo + Juliet (1996)</td>\n",
" <td>[Drama, Romance]</td>\n",
" <td>3.28</td>\n",
" <td>NaN</td>\n",
" <td>7.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27657</th>\n",
" <td>609</td>\n",
" <td>1150</td>\n",
" <td>4.0</td>\n",
" <td>847221054</td>\n",
" <td>Return of Martin Guerre, The (Retour de Martin...</td>\n",
" <td>[Drama]</td>\n",
" <td>3.37</td>\n",
" <td>NaN</td>\n",
" <td>0.82</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27658</th>\n",
" <td>609</td>\n",
" <td>1161</td>\n",
" <td>4.0</td>\n",
" <td>847221080</td>\n",
" <td>Tin Drum, The (Blechtrommel, Die) (1979)</td>\n",
" <td>[Drama, War]</td>\n",
" <td>3.43</td>\n",
" <td>NaN</td>\n",
" <td>0.82</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>27659 rows × 9 columns</p>\n",
"</div>"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 52
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}