1002 lines
34 KiB
Plaintext
1002 lines
34 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:43:58.610507Z",
|
||
"start_time": "2024-12-09T22:43:58.602149Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
||
"import numpy as np"
|
||
],
|
||
"id": "5a823fabad3c186f",
|
||
"outputs": [],
|
||
"execution_count": 32
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:43:58.621153Z",
|
||
"start_time": "2024-12-09T22:43:58.612510Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
"def split_by_user(data, test_size=0.2, random_state=42):\n",
"    \"\"\"Split ratings into train and test sets so that no user appears in both.\n",
"\n",
"    Args:\n",
"        data: DataFrame with a 'userId' column.\n",
"        test_size: Fraction of the distinct users assigned to the test set.\n",
"        random_state: Seed used to sample the test users.\n",
"\n",
"    Returns:\n",
"        (train_data, test_data): rows of the users kept for training and rows of\n",
"        the sampled held-out users.\n",
"    \"\"\"\n",
"    unique_users = data['userId'].unique()\n",
"    # A private RandomState keeps the split reproducible without re-seeding the\n",
"    # notebook-wide NumPy RNG; seeded with the same value it draws the same users\n",
"    # as the previous np.random.seed() + np.random.choice() sequence.\n",
"    rng = np.random.RandomState(random_state)\n",
"    n_test_users = int(len(unique_users) * test_size)\n",
"    test_users = rng.choice(unique_users, size=n_test_users, replace=False)\n",
"    is_test = data['userId'].isin(test_users)\n",
"    return data[~is_test], data[is_test]"
|
||
],
|
||
"id": "d694dbe819b591ac",
|
||
"outputs": [],
|
||
"execution_count": 33
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:43:58.689968Z",
|
||
"start_time": "2024-12-09T22:43:58.622148Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"movies = pd.read_csv('ml-latest-small/movies.csv')\n",
|
||
"ratings = pd.read_csv('ml-latest-small/ratings.csv')"
|
||
],
|
||
"id": "1bc78bafbae06c89",
|
||
"outputs": [],
|
||
"execution_count": 34
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:43:58.707803Z",
|
||
"start_time": "2024-12-09T22:43:58.690972Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
"source": [
"# Split the pipe-separated genre string into a list of genre names. Values that are\n",
"# not strings (e.g. lists from an earlier run of this cell) are left untouched, so\n",
"# re-running the cell no longer turns the column into NaN as .str.split would.\n",
"movies['genres'] = movies['genres'].map(\n",
"    lambda genres: genres.split('|') if isinstance(genres, str) else genres\n",
")"
],
|
||
"id": "2543aa8216425342",
|
||
"outputs": [],
|
||
"execution_count": 35
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:43:58.742861Z",
|
||
"start_time": "2024-12-09T22:43:58.709808Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
"source": [
"# Attach each movie's title and genres to its ratings (inner join on movieId).\n",
"data = ratings.merge(movies, on='movieId')"
],
|
||
"id": "b3fa37255dccb066",
|
||
"outputs": [],
|
||
"execution_count": 36
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:43:58.759240Z",
|
||
"start_time": "2024-12-09T22:43:58.743879Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": "data",
|
||
"id": "c9542abeaa0c59d7",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" userId movieId rating timestamp title \\\n",
|
||
"0 1 1 4.0 964982703 Toy Story (1995) \n",
|
||
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
|
||
"2 1 6 4.0 964982224 Heat (1995) \n",
|
||
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
|
||
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
|
||
"... ... ... ... ... ... \n",
|
||
"100831 610 166534 4.0 1493848402 Split (2017) \n",
|
||
"100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
|
||
"100833 610 168250 5.0 1494273047 Get Out (2017) \n",
|
||
"100834 610 168252 5.0 1493846352 Logan (2017) \n",
|
||
"100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
|
||
"\n",
|
||
" genres \n",
|
||
"0 [Adventure, Animation, Children, Comedy, Fantasy] \n",
|
||
"1 [Comedy, Romance] \n",
|
||
"2 [Action, Crime, Thriller] \n",
|
||
"3 [Mystery, Thriller] \n",
|
||
"4 [Crime, Mystery, Thriller] \n",
|
||
"... ... \n",
|
||
"100831 [Drama, Horror, Thriller] \n",
|
||
"100832 [Action, Crime, Thriller] \n",
|
||
"100833 [Horror] \n",
|
||
"100834 [Action, Sci-Fi] \n",
|
||
"100835 [Action, Crime, Drama, Thriller] \n",
|
||
"\n",
|
||
"[100836 rows x 6 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>userId</th>\n",
|
||
" <th>movieId</th>\n",
|
||
" <th>rating</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>genres</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>964982703</td>\n",
|
||
" <td>Toy Story (1995)</td>\n",
|
||
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>964981247</td>\n",
|
||
" <td>Grumpier Old Men (1995)</td>\n",
|
||
" <td>[Comedy, Romance]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>964982224</td>\n",
|
||
" <td>Heat (1995)</td>\n",
|
||
" <td>[Action, Crime, Thriller]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>47</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>964983815</td>\n",
|
||
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
|
||
" <td>[Mystery, Thriller]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>50</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>964982931</td>\n",
|
||
" <td>Usual Suspects, The (1995)</td>\n",
|
||
" <td>[Crime, Mystery, Thriller]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>100831</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>166534</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1493848402</td>\n",
|
||
" <td>Split (2017)</td>\n",
|
||
" <td>[Drama, Horror, Thriller]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>100832</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>168248</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>1493850091</td>\n",
|
||
" <td>John Wick: Chapter Two (2017)</td>\n",
|
||
" <td>[Action, Crime, Thriller]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>100833</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>168250</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>1494273047</td>\n",
|
||
" <td>Get Out (2017)</td>\n",
|
||
" <td>[Horror]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>100834</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>168252</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>1493846352</td>\n",
|
||
" <td>Logan (2017)</td>\n",
|
||
" <td>[Action, Sci-Fi]</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>100835</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>170875</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1493846415</td>\n",
|
||
" <td>The Fate of the Furious (2017)</td>\n",
|
||
" <td>[Action, Crime, Drama, Thriller]</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>100836 rows × 6 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 37
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:43:58.779353Z",
|
||
"start_time": "2024-12-09T22:43:58.760243Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": "train_data, test_data = split_by_user(data)",
|
||
"id": "b66149513c0c6b0e",
|
||
"outputs": [],
|
||
"execution_count": 38
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:44:41.206825Z",
|
||
"start_time": "2024-12-09T22:43:58.780355Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
"train_data = train_data.copy()\n",
"test_data = test_data.copy()\n",
"\n",
"\n",
"def average_genre_ratings(expanded_data):\n",
"    \"\"\"Average each user's ratings per genre.\n",
"\n",
"    Args:\n",
"        expanded_data: Ratings exploded to one row per (rating, genre) pair.\n",
"\n",
"    Returns:\n",
"        DataFrame with the columns userId, genres and avg_genre_rating.\n",
"    \"\"\"\n",
"    return (\n",
"        expanded_data.groupby(['userId', 'genres'])['rating']\n",
"        .mean()\n",
"        .reset_index()\n",
"        .rename(columns={'rating': 'avg_genre_rating'})\n",
"    )\n",
"\n",
"\n",
"def genre_match_calc(row, genre_rating_lookup):\n",
"    \"\"\"Mean of the user's per-genre average ratings over the genres of one movie.\n",
"\n",
"    Args:\n",
"        row: Ratings row with 'userId' and 'genres' (a list of genre names or a\n",
"            single genre string).\n",
"        genre_rating_lookup: Dict mapping (userId, genre) to the user's average\n",
"            rating for that genre.\n",
"\n",
"    Returns:\n",
"        The mean rounded to 2 decimals, or 0 when the user has no average for any\n",
"        of the movie's genres.\n",
"    \"\"\"\n",
"    genres = row['genres']\n",
"    if isinstance(genres, str):\n",
"        genres = [genres]\n",
"\n",
"    user_id = row['userId']\n",
"    # sorted(set(...)) mirrors the earlier isin() filter on the genre-sorted table:\n",
"    # a repeated genre counts once and scores are combined in genre order.\n",
"    scores = [\n",
"        genre_rating_lookup[(user_id, genre)]\n",
"        for genre in sorted(set(genres))\n",
"        if (user_id, genre) in genre_rating_lookup\n",
"    ]\n",
"    if not scores:\n",
"        return 0\n",
"    # nanmean skips missing averages, as Series.mean() did.\n",
"    return round(np.nanmean(scores), 2)\n",
"\n",
"\n",
"expanded_train_data = train_data.explode('genres')\n",
"train_user_genre_rating = average_genre_ratings(expanded_train_data)\n",
"train_user_preferences = train_user_genre_rating.pivot(index='userId', columns='genres', values='avg_genre_rating').fillna(0)\n",
"# One dict lookup per genre replaces filtering the whole preference table for every row.\n",
"train_genre_lookup = train_user_genre_rating.set_index(['userId', 'genres'])['avg_genre_rating'].to_dict()\n",
"\n",
"\n",
"def train_genre_match_calc(row):\n",
"    \"\"\"genreMatch of a training row, scored against the training users' genre averages.\"\"\"\n",
"    return genre_match_calc(row, train_genre_lookup)\n",
"\n",
"\n",
"train_data['genreMatch'] = train_data.apply(train_genre_match_calc, axis=1)\n",
"\n",
"\n",
"expanded_test_data = test_data.explode('genres')\n",
"test_user_genre_rating = average_genre_ratings(expanded_test_data)\n",
"test_user_preferences = test_user_genre_rating.pivot(index='userId', columns='genres', values='avg_genre_rating').fillna(0)\n",
"test_genre_lookup = test_user_genre_rating.set_index(['userId', 'genres'])['avg_genre_rating'].to_dict()\n",
"\n",
"\n",
"def test_genre_match_calc(row):\n",
"    \"\"\"genreMatch of a test row, scored against the test users' genre averages.\"\"\"\n",
"    return genre_match_calc(row, test_genre_lookup)\n",
"\n",
"\n",
"test_data['genreMatch'] = test_data.apply(test_genre_match_calc, axis=1)"
|
||
],
|
||
"id": "88296c8c47cdbf60",
|
||
"outputs": [],
|
||
"execution_count": 39
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:45:09.267834Z",
|
||
"start_time": "2024-12-09T22:44:41.207821Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
"def rank_similar_users(user_similarity_df):\n",
"    \"\"\"Map every user to the index of all other users, most similar first.\n",
"\n",
"    The user is dropped from their own ranking explicitly: skipping position 0 of\n",
"    the sorted similarities is not safe, because another user with a tied\n",
"    similarity can sort ahead of the user, which would let the user's own rating\n",
"    into their similar-users average.\n",
"    \"\"\"\n",
"    return {\n",
"        user_id: similarities.drop(user_id).sort_values(ascending=False).index\n",
"        for user_id, similarities in user_similarity_df.items()\n",
"    }\n",
"\n",
"\n",
"def similar_users_average(user_id, movie_id, user_movie, ranked_users, top_n=5):\n",
"    \"\"\"Average rating that the top_n most similar users gave to a movie.\n",
"\n",
"    Args:\n",
"        user_id: User whose neighbours are consulted.\n",
"        movie_id: Movie whose ratings are averaged.\n",
"        user_movie: userId x movieId rating matrix with NaN for unrated movies.\n",
"        ranked_users: Output of rank_similar_users for the same set of users.\n",
"        top_n: Number of most similar users to consider.\n",
"\n",
"    Returns:\n",
"        The mean rating rounded to 2 decimals, or NaN when none of the neighbours\n",
"        rated the movie.\n",
"    \"\"\"\n",
"    neighbours = ranked_users[user_id][:top_n]\n",
"    neighbour_ratings = user_movie.loc[neighbours, movie_id]\n",
"    return round(neighbour_ratings.dropna().mean(), 2)\n",
"\n",
"\n",
"train_user_movie = train_data.pivot(index='userId', columns='movieId', values='rating')\n",
"train_user_movie_filled = train_user_movie.fillna(0)\n",
"train_user_similarity = cosine_similarity(train_user_movie_filled)\n",
"train_user_similarity_df = pd.DataFrame(train_user_similarity, index=train_user_movie.index, columns=train_user_movie.index)\n",
"# Rank the neighbours once per user instead of re-sorting the similarities for every rating row.\n",
"train_ranked_users = rank_similar_users(train_user_similarity_df)\n",
"\n",
"\n",
"def train_average(user_id, movie_id, top_n=5):\n",
"    \"\"\"Similar-users average of a movie for a training user.\"\"\"\n",
"    return similar_users_average(user_id, movie_id, train_user_movie, train_ranked_users, top_n)\n",
"\n",
"\n",
"train_data['similarUsers'] = train_data.apply(\n",
"    lambda row: train_average(row['userId'], row['movieId']), axis=1\n",
")\n",
"\n",
"test_user_movie = test_data.pivot(index='userId', columns='movieId', values='rating')\n",
"test_user_movie_filled = test_user_movie.fillna(0)\n",
"test_user_similarity = cosine_similarity(test_user_movie_filled)\n",
"test_user_similarity_df = pd.DataFrame(test_user_similarity, index=test_user_movie.index, columns=test_user_movie.index)\n",
"test_ranked_users = rank_similar_users(test_user_similarity_df)\n",
"\n",
"\n",
"def test_average(user_id, movie_id, top_n=5):\n",
"    \"\"\"Similar-users average of a movie for a test user.\"\"\"\n",
"    return similar_users_average(user_id, movie_id, test_user_movie, test_ranked_users, top_n)\n",
"\n",
"\n",
"test_data['similarUsers'] = test_data.apply(\n",
"    lambda row: test_average(row['userId'], row['movieId']), axis=1\n",
")\n"
|
||
],
|
||
"id": "e931f4041a1802fb",
|
||
"outputs": [],
|
||
"execution_count": 40
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:45:09.311604Z",
|
||
"start_time": "2024-12-09T22:45:09.268839Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
"# 'similarUsers' is already a column of train_data / test_data at this point.\n",
"# Merging it back on (userId, movieId) only duplicated it as similarUsers_x /\n",
"# similarUsers_y, which then had to be dropped and renamed by hand, so the frames\n",
"# are no longer merged with themselves. The lookup tables are kept, and the index\n",
"# is renumbered from 0 as the merge used to do.\n",
"train_ratings = train_data[['userId', 'movieId', 'similarUsers']]\n",
"train_data = train_data.reset_index(drop=True)\n",
"\n",
"test_ratings = test_data[['userId', 'movieId', 'similarUsers']]\n",
"test_data = test_data.reset_index(drop=True)"
|
||
],
|
||
"id": "ce65ce417e7f5207",
|
||
"outputs": [],
|
||
"execution_count": 41
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-09T22:45:09.342614Z",
|
||
"start_time": "2024-12-09T22:45:09.313582Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
"def add_popularity(ratings_frame):\n",
"    \"\"\"Add a 'popularity' column: ratings per movie as a percentage of the frame's users.\n",
"\n",
"    Args:\n",
"        ratings_frame: Ratings with 'userId' and 'movieId' columns.\n",
"\n",
"    Returns:\n",
"        (frame, rating_counts, n_users): the frame with 'popularity' added and a\n",
"        fresh 0..n-1 index, the per-movie rating counts (columns movieId and\n",
"        popularity), and the number of distinct users in the frame.\n",
"    \"\"\"\n",
"    n_users = ratings_frame['userId'].nunique()\n",
"    rating_counts = ratings_frame.groupby('movieId').size().reset_index(name='popularity')\n",
"    counts_by_movie = rating_counts.set_index('movieId')['popularity']\n",
"    # Map the counts onto the rows and overwrite the column instead of merging: a\n",
"    # merge adds popularity_x / popularity_y when this cell is re-run on frames\n",
"    # that already carry 'popularity'.\n",
"    popularity = ((ratings_frame['movieId'].map(counts_by_movie) / n_users) * 100).round(2)\n",
"    with_popularity = ratings_frame.assign(popularity=popularity).reset_index(drop=True)\n",
"    return with_popularity, rating_counts, n_users\n",
"\n",
"\n",
"train_data, train_popularity, train_users = add_popularity(train_data)\n",
"test_data, test_popularity, test_users = add_popularity(test_data)"
|
||
],
|
||
"id": "aa798201db531188",
|
||
"outputs": [],
|
||
"execution_count": 42
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-11T11:04:32.648126Z",
|
||
"start_time": "2024-12-11T11:04:31.503628Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"train_data.to_csv('datasets/train_all.csv', index=False)\n",
|
||
"test_data.to_csv('datasets/test_all.csv', index=False)"
|
||
],
|
||
"id": "f7b5130c72ad35af",
|
||
"outputs": [],
|
||
"execution_count": 53
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# train_data = train_data.drop(columns=['similarUsers_y'])\n",
|
||
"# test_data = test_data.drop(columns=['similarUsers_y'])"
|
||
],
|
||
"id": "4cd347cc3bfd35aa",
|
||
"outputs": [],
|
||
"execution_count": null
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-11T11:03:13.267007Z",
|
||
"start_time": "2024-12-11T11:03:13.258841Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# train_data.rename(columns={'similarUsers_x': 'similarUsers'}, inplace=True)\n",
|
||
"# test_data.rename(columns={'similarUsers_x': 'similarUsers'}, inplace=True)"
|
||
],
|
||
"id": "aa8a10762dd70a4d",
|
||
"outputs": [],
|
||
"execution_count": 50
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-11T11:04:43.594264Z",
|
||
"start_time": "2024-12-11T11:04:43.495639Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
"def get_top_movies(data, n=5):\n",
"    \"\"\"Return each user's n highest-rated movies.\n",
"\n",
"    Rows are ordered by userId, then by rating and popularity (both descending),\n",
"    so ties on rating are broken in favour of the more popular movie.\n",
"\n",
"    Args:\n",
"        data: DataFrame with 'userId', 'rating' and 'popularity' columns.\n",
"        n: Number of rows to keep per user (defaults to the previous fixed 5).\n",
"\n",
"    Returns:\n",
"        DataFrame with at most n rows per user, in the sorted order.\n",
"    \"\"\"\n",
"    ranked = data.sort_values(\n",
"        by=['userId', 'rating', 'popularity'],\n",
"        ascending=[True, False, False],\n",
"    )\n",
"    return ranked.groupby('userId').head(n)\n",
|
||
"\n",
|
||
"train_top = get_top_movies(train_data)\n",
|
||
"test_top = get_top_movies(test_data)"
|
||
],
|
||
"id": "20dba13e7a3d105b",
|
||
"outputs": [],
|
||
"execution_count": 54
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-11T11:04:45.194584Z",
|
||
"start_time": "2024-12-11T11:04:45.132035Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"train_top.to_csv('datasets/train_top.csv', index=False)\n",
|
||
"test_top.to_csv('datasets/test_top.csv', index=False)"
|
||
],
|
||
"id": "be9f6106c5e4b04a",
|
||
"outputs": [],
|
||
"execution_count": 55
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-11T11:03:47.746798Z",
|
||
"start_time": "2024-12-11T11:03:47.714278Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": "train_data",
|
||
"id": "f6ea94b0951b8471",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" userId movieId rating timestamp title \\\n",
|
||
"0 1 1 4.0 964982703 Toy Story (1995) \n",
|
||
"1 1 3 4.0 964981247 Grumpier Old Men (1995) \n",
|
||
"2 1 6 4.0 964982224 Heat (1995) \n",
|
||
"3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) \n",
|
||
"4 1 50 5.0 964982931 Usual Suspects, The (1995) \n",
|
||
"... ... ... ... ... ... \n",
|
||
"73172 610 166534 4.0 1493848402 Split (2017) \n",
|
||
"73173 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) \n",
|
||
"73174 610 168250 5.0 1494273047 Get Out (2017) \n",
|
||
"73175 610 168252 5.0 1493846352 Logan (2017) \n",
|
||
"73176 610 170875 3.0 1493846415 The Fate of the Furious (2017) \n",
|
||
"\n",
|
||
" genres genreMatch \\\n",
|
||
"0 [Adventure, Animation, Children, Comedy, Fantasy] 4.44 \n",
|
||
"1 [Comedy, Romance] 4.29 \n",
|
||
"2 [Action, Crime, Thriller] 4.27 \n",
|
||
"3 [Mystery, Thriller] 4.16 \n",
|
||
"4 [Crime, Mystery, Thriller] 4.22 \n",
|
||
"... ... ... \n",
|
||
"73172 [Drama, Horror, Thriller] 3.65 \n",
|
||
"73173 [Action, Crime, Thriller] 3.66 \n",
|
||
"73174 [Horror] 3.51 \n",
|
||
"73175 [Action, Sci-Fi] 3.63 \n",
|
||
"73176 [Action, Crime, Drama, Thriller] 3.71 \n",
|
||
"\n",
|
||
" similarUsers popularity \n",
|
||
"0 4.50 33.81 \n",
|
||
"1 4.00 8.81 \n",
|
||
"2 3.00 16.19 \n",
|
||
"3 3.88 33.20 \n",
|
||
"4 4.75 32.38 \n",
|
||
"... ... ... \n",
|
||
"73172 NaN 0.82 \n",
|
||
"73173 5.00 1.02 \n",
|
||
"73174 NaN 2.66 \n",
|
||
"73175 4.50 4.30 \n",
|
||
"73176 NaN 0.41 \n",
|
||
"\n",
|
||
"[73177 rows x 9 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>userId</th>\n",
|
||
" <th>movieId</th>\n",
|
||
" <th>rating</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>genres</th>\n",
|
||
" <th>genreMatch</th>\n",
|
||
" <th>similarUsers</th>\n",
|
||
" <th>popularity</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>964982703</td>\n",
|
||
" <td>Toy Story (1995)</td>\n",
|
||
" <td>[Adventure, Animation, Children, Comedy, Fantasy]</td>\n",
|
||
" <td>4.44</td>\n",
|
||
" <td>4.50</td>\n",
|
||
" <td>33.81</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>964981247</td>\n",
|
||
" <td>Grumpier Old Men (1995)</td>\n",
|
||
" <td>[Comedy, Romance]</td>\n",
|
||
" <td>4.29</td>\n",
|
||
" <td>4.00</td>\n",
|
||
" <td>8.81</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>964982224</td>\n",
|
||
" <td>Heat (1995)</td>\n",
|
||
" <td>[Action, Crime, Thriller]</td>\n",
|
||
" <td>4.27</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>16.19</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>47</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>964983815</td>\n",
|
||
" <td>Seven (a.k.a. Se7en) (1995)</td>\n",
|
||
" <td>[Mystery, Thriller]</td>\n",
|
||
" <td>4.16</td>\n",
|
||
" <td>3.88</td>\n",
|
||
" <td>33.20</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>50</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>964982931</td>\n",
|
||
" <td>Usual Suspects, The (1995)</td>\n",
|
||
" <td>[Crime, Mystery, Thriller]</td>\n",
|
||
" <td>4.22</td>\n",
|
||
" <td>4.75</td>\n",
|
||
" <td>32.38</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>73172</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>166534</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1493848402</td>\n",
|
||
" <td>Split (2017)</td>\n",
|
||
" <td>[Drama, Horror, Thriller]</td>\n",
|
||
" <td>3.65</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.82</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>73173</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>168248</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>1493850091</td>\n",
|
||
" <td>John Wick: Chapter Two (2017)</td>\n",
|
||
" <td>[Action, Crime, Thriller]</td>\n",
|
||
" <td>3.66</td>\n",
|
||
" <td>5.00</td>\n",
|
||
" <td>1.02</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>73174</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>168250</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>1494273047</td>\n",
|
||
" <td>Get Out (2017)</td>\n",
|
||
" <td>[Horror]</td>\n",
|
||
" <td>3.51</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2.66</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>73175</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>168252</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>1493846352</td>\n",
|
||
" <td>Logan (2017)</td>\n",
|
||
" <td>[Action, Sci-Fi]</td>\n",
|
||
" <td>3.63</td>\n",
|
||
" <td>4.50</td>\n",
|
||
" <td>4.30</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>73176</th>\n",
|
||
" <td>610</td>\n",
|
||
" <td>170875</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>1493846415</td>\n",
|
||
" <td>The Fate of the Furious (2017)</td>\n",
|
||
" <td>[Action, Crime, Drama, Thriller]</td>\n",
|
||
" <td>3.71</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.41</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>73177 rows × 9 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 51,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 51
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-11T11:04:01.183022Z",
|
||
"start_time": "2024-12-11T11:04:01.159989Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": "test_data",
|
||
"id": "e0c5ec2294939201",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" userId movieId rating timestamp \\\n",
|
||
"0 3 31 0.5 1306463578 \n",
|
||
"1 3 527 0.5 1306464275 \n",
|
||
"2 3 647 0.5 1306463619 \n",
|
||
"3 3 688 0.5 1306464228 \n",
|
||
"4 3 720 0.5 1306463595 \n",
|
||
"... ... ... ... ... \n",
|
||
"27654 609 892 3.0 847221080 \n",
|
||
"27655 609 1056 3.0 847221080 \n",
|
||
"27656 609 1059 3.0 847221054 \n",
|
||
"27657 609 1150 4.0 847221054 \n",
|
||
"27658 609 1161 4.0 847221080 \n",
|
||
"\n",
|
||
" title \\\n",
|
||
"0 Dangerous Minds (1995) \n",
|
||
"1 Schindler's List (1993) \n",
|
||
"2 Courage Under Fire (1996) \n",
|
||
"3 Operation Dumbo Drop (1995) \n",
|
||
"4 Wallace & Gromit: The Best of Aardman Animatio... \n",
|
||
"... ... \n",
|
||
"27654 Twelfth Night (1996) \n",
|
||
"27655 Jude (1996) \n",
|
||
"27656 William Shakespeare's Romeo + Juliet (1996) \n",
|
||
"27657 Return of Martin Guerre, The (Retour de Martin... \n",
|
||
"27658 Tin Drum, The (Blechtrommel, Die) (1979) \n",
|
||
"\n",
|
||
" genres genreMatch similarUsers popularity \n",
|
||
"0 [Drama] 0.75 2.00 5.74 \n",
|
||
"1 [Drama, War] 0.62 3.67 31.97 \n",
|
||
"2 [Action, Crime, Drama, War] 1.33 3.00 5.74 \n",
|
||
"3 [Action, Adventure, Comedy, War] 1.95 1.50 3.28 \n",
|
||
"4 [Adventure, Animation, Comedy] 1.41 4.50 5.74 \n",
|
||
"... ... ... ... ... \n",
|
||
"27654 [Comedy, Drama, Romance] 3.28 NaN 4.10 \n",
|
||
"27655 [Drama] 3.37 NaN 1.64 \n",
|
||
"27656 [Drama, Romance] 3.28 NaN 7.38 \n",
|
||
"27657 [Drama] 3.37 NaN 0.82 \n",
|
||
"27658 [Drama, War] 3.43 NaN 0.82 \n",
|
||
"\n",
|
||
"[27659 rows x 9 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>userId</th>\n",
|
||
" <th>movieId</th>\n",
|
||
" <th>rating</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" <th>title</th>\n",
|
||
" <th>genres</th>\n",
|
||
" <th>genreMatch</th>\n",
|
||
" <th>similarUsers</th>\n",
|
||
" <th>popularity</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>31</td>\n",
|
||
" <td>0.5</td>\n",
|
||
" <td>1306463578</td>\n",
|
||
" <td>Dangerous Minds (1995)</td>\n",
|
||
" <td>[Drama]</td>\n",
|
||
" <td>0.75</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>5.74</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>527</td>\n",
|
||
" <td>0.5</td>\n",
|
||
" <td>1306464275</td>\n",
|
||
" <td>Schindler's List (1993)</td>\n",
|
||
" <td>[Drama, War]</td>\n",
|
||
" <td>0.62</td>\n",
|
||
" <td>3.67</td>\n",
|
||
" <td>31.97</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>647</td>\n",
|
||
" <td>0.5</td>\n",
|
||
" <td>1306463619</td>\n",
|
||
" <td>Courage Under Fire (1996)</td>\n",
|
||
" <td>[Action, Crime, Drama, War]</td>\n",
|
||
" <td>1.33</td>\n",
|
||
" <td>3.00</td>\n",
|
||
" <td>5.74</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>688</td>\n",
|
||
" <td>0.5</td>\n",
|
||
" <td>1306464228</td>\n",
|
||
" <td>Operation Dumbo Drop (1995)</td>\n",
|
||
" <td>[Action, Adventure, Comedy, War]</td>\n",
|
||
" <td>1.95</td>\n",
|
||
" <td>1.50</td>\n",
|
||
" <td>3.28</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>3</td>\n",
|
||
" <td>720</td>\n",
|
||
" <td>0.5</td>\n",
|
||
" <td>1306463595</td>\n",
|
||
" <td>Wallace & Gromit: The Best of Aardman Animatio...</td>\n",
|
||
" <td>[Adventure, Animation, Comedy]</td>\n",
|
||
" <td>1.41</td>\n",
|
||
" <td>4.50</td>\n",
|
||
" <td>5.74</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27654</th>\n",
|
||
" <td>609</td>\n",
|
||
" <td>892</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>847221080</td>\n",
|
||
" <td>Twelfth Night (1996)</td>\n",
|
||
" <td>[Comedy, Drama, Romance]</td>\n",
|
||
" <td>3.28</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>4.10</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27655</th>\n",
|
||
" <td>609</td>\n",
|
||
" <td>1056</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>847221080</td>\n",
|
||
" <td>Jude (1996)</td>\n",
|
||
" <td>[Drama]</td>\n",
|
||
" <td>3.37</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.64</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27656</th>\n",
|
||
" <td>609</td>\n",
|
||
" <td>1059</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>847221054</td>\n",
|
||
" <td>William Shakespeare's Romeo + Juliet (1996)</td>\n",
|
||
" <td>[Drama, Romance]</td>\n",
|
||
" <td>3.28</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>7.38</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27657</th>\n",
|
||
" <td>609</td>\n",
|
||
" <td>1150</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>847221054</td>\n",
|
||
" <td>Return of Martin Guerre, The (Retour de Martin...</td>\n",
|
||
" <td>[Drama]</td>\n",
|
||
" <td>3.37</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.82</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27658</th>\n",
|
||
" <td>609</td>\n",
|
||
" <td>1161</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>847221080</td>\n",
|
||
" <td>Tin Drum, The (Blechtrommel, Die) (1979)</td>\n",
|
||
" <td>[Drama, War]</td>\n",
|
||
" <td>3.43</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.82</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>27659 rows × 9 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 52
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 2
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython2",
|
||
"version": "2.7.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|