34 KiB
34 KiB
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def split_by_user(data, test_size=0.2, random_state=42):
    """Split a ratings frame into train/test parts so no user is in both.

    Args:
        data: DataFrame with a ``userId`` column.
        test_size: Fraction of the distinct users assigned to the test part;
            the resulting user count is truncated with ``int``.
        random_state: Seed used to sample the test users.

    Returns:
        ``(train_data, test_data)``: the rows of ``data`` whose user was not
        sampled, and the rows whose user was sampled, respectively.
    """
    unique_users = data['userId'].unique()
    # A private generator draws the same users as seeding the global NumPy RNG
    # with ``random_state`` would, but leaves the global random stream alone.
    rng = np.random.RandomState(random_state)
    test_users = rng.choice(
        unique_users,
        size=int(len(unique_users) * test_size),
        replace=False,
    )
    in_test = data['userId'].isin(test_users)
    return data[~in_test], data[in_test]
# Load the MovieLens "latest small" movie catalogue and rating log.
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Turn the pipe-separated genre string into a list of genre names per movie.
genre_lists = movies['genres'].str.split('|')
movies['genres'] = genre_lists

# One row per rating, enriched with the columns of the rated movie.
data = ratings.merge(movies, on="movieId")
data
userId | movieId | rating | timestamp | title | genres | |
---|---|---|---|---|---|---|
0 | 1 | 1 | 4.0 | 964982703 | Toy Story (1995) | [Adventure, Animation, Children, Comedy, Fantasy] |
1 | 1 | 3 | 4.0 | 964981247 | Grumpier Old Men (1995) | [Comedy, Romance] |
2 | 1 | 6 | 4.0 | 964982224 | Heat (1995) | [Action, Crime, Thriller] |
3 | 1 | 47 | 5.0 | 964983815 | Seven (a.k.a. Se7en) (1995) | [Mystery, Thriller] |
4 | 1 | 50 | 5.0 | 964982931 | Usual Suspects, The (1995) | [Crime, Mystery, Thriller] |
... | ... | ... | ... | ... | ... | ... |
100831 | 610 | 166534 | 4.0 | 1493848402 | Split (2017) | [Drama, Horror, Thriller] |
100832 | 610 | 168248 | 5.0 | 1493850091 | John Wick: Chapter Two (2017) | [Action, Crime, Thriller] |
100833 | 610 | 168250 | 5.0 | 1494273047 | Get Out (2017) | [Horror] |
100834 | 610 | 168252 | 5.0 | 1493846352 | Logan (2017) | [Action, Sci-Fi] |
100835 | 610 | 170875 | 3.0 | 1493846415 | The Fate of the Furious (2017) | [Action, Crime, Drama, Thriller] |
100836 rows × 6 columns
# Split by user, then take independent copies so that columns can be added
# to each part later on.
train_data, test_data = split_by_user(data)
train_data = train_data.copy()
test_data = test_data.copy()

# One row per (rating, genre) pair.
expanded_train_data = train_data.explode('genres')

# Mean rating each training user gave to each genre, in long format.
train_genre_means = expanded_train_data.groupby(['userId', 'genres'])['rating'].mean()
train_user_genre_rating = train_genre_means.rename('avg_genre_rating').reset_index()

# Wide user x genre table; genres a user never rated are filled with 0.
train_user_preferences = train_user_genre_rating.pivot(
    index='userId',
    columns='genres',
    values='avg_genre_rating',
).fillna(0)
def train_genre_match_calc(row):
    """Score how well a movie's genres match the row's user (training split).

    Args:
        row: Mapping-like row providing ``userId`` and ``genres`` (a single
            genre string or a collection of genre names).

    Returns:
        The mean of the user's ``avg_genre_rating`` values in
        ``train_user_genre_rating`` over the row's genres, rounded to two
        decimals, or ``0`` when no such value exists.
    """
    wanted_genres = row['genres']
    if isinstance(wanted_genres, str):
        # A lone genre name is treated as a one-element collection.
        wanted_genres = [wanted_genres]

    is_user = train_user_genre_rating['userId'] == row['userId']
    user_rows = train_user_genre_rating[is_user]
    matched = user_rows.loc[user_rows['genres'].isin(wanted_genres), 'avg_genre_rating']

    if matched.empty:
        return 0
    return round(matched.mean(), 2)
# Row-wise genre-match score for every training rating.
train_data['genreMatch'] = train_data.apply(train_genre_match_calc, axis='columns')

# One row per (rating, genre) pair of the test split.
expanded_test_data = test_data.explode('genres')

# Mean rating each test user gave to each genre, in long format.
test_genre_means = expanded_test_data.groupby(['userId', 'genres'])['rating'].mean()
test_user_genre_rating = test_genre_means.rename('avg_genre_rating').reset_index()

# Wide user x genre table; genres a user never rated are filled with 0.
test_user_preferences = test_user_genre_rating.pivot(
    index='userId',
    columns='genres',
    values='avg_genre_rating',
).fillna(0)
def test_genre_match_calc(row):
    """Score how well a movie's genres match the row's user (test split).

    Args:
        row: Mapping-like row providing ``userId`` and ``genres`` (a single
            genre string or a collection of genre names).

    Returns:
        The mean of the user's ``avg_genre_rating`` values in
        ``test_user_genre_rating`` over the row's genres, rounded to two
        decimals, or ``0`` when no such value exists.
    """
    wanted_genres = row['genres']
    if isinstance(wanted_genres, str):
        # A lone genre name is treated as a one-element collection.
        wanted_genres = [wanted_genres]

    is_user = test_user_genre_rating['userId'] == row['userId']
    user_rows = test_user_genre_rating[is_user]
    matched = user_rows.loc[user_rows['genres'].isin(wanted_genres), 'avg_genre_rating']

    if matched.empty:
        return 0
    return round(matched.mean(), 2)
# Row-wise genre-match score for every test rating.
test_data['genreMatch'] = test_data.apply(test_genre_match_calc, axis='columns')

# User x movie rating matrix of the training split (NaN = not rated).
train_user_movie = train_data.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

# The similarity computation gets 0 in place of the missing ratings.
train_user_movie_filled = train_user_movie.fillna(value=0)
train_user_similarity = cosine_similarity(train_user_movie_filled)

# Label both axes of the square similarity matrix with the user ids.
train_user_ids = train_user_movie.index
train_user_similarity_df = pd.DataFrame(
    data=train_user_similarity,
    index=train_user_ids,
    columns=train_user_ids,
)
def train_average(user_id, movie_id, top_n=5):
    """Average rating of a movie among the users most similar to ``user_id``.

    Picks the ``top_n`` highest-scoring other users from the ``user_id``
    column of ``train_user_similarity_df`` and averages their ratings for
    ``movie_id`` in ``train_user_movie``; neighbours who did not rate the
    movie are ignored.

    Args:
        user_id: Label of the user whose neighbours are consulted.
        movie_id: Column label of the movie in ``train_user_movie``.
        top_n: Number of most similar users to consider.

    Returns:
        The mean neighbour rating rounded to two decimals, or NaN when none
        of the selected neighbours rated the movie.
    """
    similarities = train_user_similarity_df[user_id]
    # Remove the user by label instead of assuming they sort first: with a tie
    # at the top similarity, skipping position 0 could discard a real
    # neighbour and keep the user's own rating in the average.
    neighbours = similarities.drop(labels=user_id, errors='ignore')
    similar_users = neighbours.sort_values(ascending=False).index[:top_n]
    similar_ratings = train_user_movie.loc[similar_users, movie_id]
    return round(similar_ratings.dropna().mean(), 2)
# Mean rating given to each row's movie by that user's nearest neighbours.
train_data['similarUsers'] = train_data.apply(
    lambda rating_row: train_average(rating_row['userId'], rating_row['movieId']),
    axis='columns',
)

# User x movie rating matrix of the test split (NaN = not rated).
test_user_movie = test_data.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

# The similarity computation gets 0 in place of the missing ratings.
test_user_movie_filled = test_user_movie.fillna(value=0)
test_user_similarity = cosine_similarity(test_user_movie_filled)

# Label both axes of the square similarity matrix with the user ids.
test_user_ids = test_user_movie.index
test_user_similarity_df = pd.DataFrame(
    data=test_user_similarity,
    index=test_user_ids,
    columns=test_user_ids,
)
def test_average(user_id, movie_id, top_n=5):
    """Average rating of a movie among the users most similar to ``user_id``.

    Picks the ``top_n`` highest-scoring other users from the ``user_id``
    column of ``test_user_similarity_df`` and averages their ratings for
    ``movie_id`` in ``test_user_movie``; neighbours who did not rate the
    movie are ignored.

    Args:
        user_id: Label of the user whose neighbours are consulted.
        movie_id: Column label of the movie in ``test_user_movie``.
        top_n: Number of most similar users to consider.

    Returns:
        The mean neighbour rating rounded to two decimals, or NaN when none
        of the selected neighbours rated the movie.
    """
    similarities = test_user_similarity_df[user_id]
    # Remove the user by label instead of assuming they sort first: with a tie
    # at the top similarity, skipping position 0 could discard a real
    # neighbour and keep the user's own rating in the average.
    neighbours = similarities.drop(labels=user_id, errors='ignore')
    similar_users = neighbours.sort_values(ascending=False).index[:top_n]
    similar_ratings = test_user_movie.loc[similar_users, movie_id]
    return round(similar_ratings.dropna().mean(), 2)
# Mean rating given to each row's movie by that user's nearest neighbours.
test_data['similarUsers'] = test_data.apply(
    lambda row: test_average(row['userId'], row['movieId']), axis=1
)

# ``similarUsers`` is already a column of both frames here. Merging a frame
# with its own (userId, movieId, similarUsers) subset would only duplicate it
# as ``similarUsers_x`` / ``similarUsers_y``, so no self-merge is performed.

# Number of distinct users per split, used to normalise popularity.
train_users = train_data['userId'].nunique()
test_users = test_data['userId'].nunique()

# popularity = rating rows per movie as a percentage of the split's users.
train_popularity = train_data.groupby('movieId').size().reset_index(name='popularity')
train_data = pd.merge(train_data, train_popularity, on='movieId', how='left')
train_data['popularity'] = ((train_data['popularity'] / train_users) * 100).round(2)

test_popularity = test_data.groupby('movieId').size().reset_index(name='popularity')
test_data = pd.merge(test_data, test_popularity, on='movieId', how='left')
test_data['popularity'] = ((test_data['popularity'] / test_users) * 100).round(2)

# Persist the fully featured splits.
train_data.to_csv('datasets/train_all.csv', index=False)
test_data.to_csv('datasets/test_all.csv', index=False)
def get_top_movies(data, top_n=5):
    """Return each user's highest-rated rows, breaking ties by popularity.

    Args:
        data: DataFrame with ``userId``, ``rating`` and ``popularity`` columns.
        top_n: Maximum number of rows kept per user. Defaults to 5.

    Returns:
        DataFrame holding at most ``top_n`` rows per user, ordered by
        ascending ``userId``, then descending ``rating`` and ``popularity``.
    """
    ranked = data.sort_values(
        by=['userId', 'rating', 'popularity'],
        ascending=[True, False, False],
    )
    # ``head`` keeps the first rows of every group in the sorted order.
    return ranked.groupby('userId').head(top_n)
# Best-rated rows per user for each split, in the same order as the inputs.
train_top, test_top = map(get_top_movies, (train_data, test_data))

# Write both selections next to the full feature tables.
train_top.to_csv(path_or_buf='datasets/train_top.csv', index=False)
test_top.to_csv(path_or_buf='datasets/test_top.csv', index=False)

# Notebook display of the final training frame.
train_data
userId | movieId | rating | timestamp | title | genres | genreMatch | similarUsers | popularity | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 4.0 | 964982703 | Toy Story (1995) | [Adventure, Animation, Children, Comedy, Fantasy] | 4.44 | 4.50 | 33.81 |
1 | 1 | 3 | 4.0 | 964981247 | Grumpier Old Men (1995) | [Comedy, Romance] | 4.29 | 4.00 | 8.81 |
2 | 1 | 6 | 4.0 | 964982224 | Heat (1995) | [Action, Crime, Thriller] | 4.27 | 3.00 | 16.19 |
3 | 1 | 47 | 5.0 | 964983815 | Seven (a.k.a. Se7en) (1995) | [Mystery, Thriller] | 4.16 | 3.88 | 33.20 |
4 | 1 | 50 | 5.0 | 964982931 | Usual Suspects, The (1995) | [Crime, Mystery, Thriller] | 4.22 | 4.75 | 32.38 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
73172 | 610 | 166534 | 4.0 | 1493848402 | Split (2017) | [Drama, Horror, Thriller] | 3.65 | NaN | 0.82 |
73173 | 610 | 168248 | 5.0 | 1493850091 | John Wick: Chapter Two (2017) | [Action, Crime, Thriller] | 3.66 | 5.00 | 1.02 |
73174 | 610 | 168250 | 5.0 | 1494273047 | Get Out (2017) | [Horror] | 3.51 | NaN | 2.66 |
73175 | 610 | 168252 | 5.0 | 1493846352 | Logan (2017) | [Action, Sci-Fi] | 3.63 | 4.50 | 4.30 |
73176 | 610 | 170875 | 3.0 | 1493846415 | The Fate of the Furious (2017) | [Action, Crime, Drama, Thriller] | 3.71 | NaN | 0.41 |
73177 rows × 9 columns
# Notebook cell: display the final test frame.
test_data
userId | movieId | rating | timestamp | title | genres | genreMatch | similarUsers | popularity | |
---|---|---|---|---|---|---|---|---|---|
0 | 3 | 31 | 0.5 | 1306463578 | Dangerous Minds (1995) | [Drama] | 0.75 | 2.00 | 5.74 |
1 | 3 | 527 | 0.5 | 1306464275 | Schindler's List (1993) | [Drama, War] | 0.62 | 3.67 | 31.97 |
2 | 3 | 647 | 0.5 | 1306463619 | Courage Under Fire (1996) | [Action, Crime, Drama, War] | 1.33 | 3.00 | 5.74 |
3 | 3 | 688 | 0.5 | 1306464228 | Operation Dumbo Drop (1995) | [Action, Adventure, Comedy, War] | 1.95 | 1.50 | 3.28 |
4 | 3 | 720 | 0.5 | 1306463595 | Wallace & Gromit: The Best of Aardman Animatio... | [Adventure, Animation, Comedy] | 1.41 | 4.50 | 5.74 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
27654 | 609 | 892 | 3.0 | 847221080 | Twelfth Night (1996) | [Comedy, Drama, Romance] | 3.28 | NaN | 4.10 |
27655 | 609 | 1056 | 3.0 | 847221080 | Jude (1996) | [Drama] | 3.37 | NaN | 1.64 |
27656 | 609 | 1059 | 3.0 | 847221054 | William Shakespeare's Romeo + Juliet (1996) | [Drama, Romance] | 3.28 | NaN | 7.38 |
27657 | 609 | 1150 | 4.0 | 847221054 | Return of Martin Guerre, The (Retour de Martin... | [Drama] | 3.37 | NaN | 0.82 |
27658 | 609 | 1161 | 4.0 | 847221080 | Tin Drum, The (Blechtrommel, Die) (1979) | [Drama, War] | 3.43 | NaN | 0.82 |
27659 rows × 9 columns