30 KiB
30 KiB
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def split_by_user(data, test_size=0.2, random_state=42):
    """Split a ratings frame into train and test partitions by whole users.

    Every row belonging to a given ``userId`` ends up in exactly one of the
    two partitions, so no user appears in both.

    Args:
        data: DataFrame with a ``userId`` column.
        test_size: Fraction of the distinct users to place in the test
            partition; the user count is truncated with ``int()``.
        random_state: Seed for the user sample. The same seed always selects
            the same users.

    Returns:
        A ``(train_data, test_data)`` tuple of row subsets of ``data``.
    """
    unique_users = data['userId'].unique()
    n_test_users = int(len(unique_users) * test_size)

    # A private RandomState draws exactly what np.random.seed(random_state)
    # followed by np.random.choice(...) would, but leaves the process-wide
    # NumPy RNG untouched for any other code that relies on it.
    rng = np.random.RandomState(random_state)
    test_users = rng.choice(unique_users, size=n_test_users, replace=False)

    # Build the membership mask once and use it for both partitions.
    in_test = data['userId'].isin(test_users)
    train_data = data[~in_test]
    test_data = data[in_test]
    return train_data, test_data
# Read the two source tables from the ml-latest-small/ directory.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

# `genres` is split on '|' so each movie carries a list of genre names.
movies['genres'] = movies['genres'].str.split('|')

# One row per rating, enriched with the columns of the rated movie.
data = ratings.merge(movies, on="movieId")
data
userId | movieId | rating | timestamp | title | genres | |
---|---|---|---|---|---|---|
0 | 1 | 1 | 4.0 | 964982703 | Toy Story (1995) | [Adventure, Animation, Children, Comedy, Fantasy] |
1 | 1 | 3 | 4.0 | 964981247 | Grumpier Old Men (1995) | [Comedy, Romance] |
2 | 1 | 6 | 4.0 | 964982224 | Heat (1995) | [Action, Crime, Thriller] |
3 | 1 | 47 | 5.0 | 964983815 | Seven (a.k.a. Se7en) (1995) | [Mystery, Thriller] |
4 | 1 | 50 | 5.0 | 964982931 | Usual Suspects, The (1995) | [Crime, Mystery, Thriller] |
... | ... | ... | ... | ... | ... | ... |
100831 | 610 | 166534 | 4.0 | 1493848402 | Split (2017) | [Drama, Horror, Thriller] |
100832 | 610 | 168248 | 5.0 | 1493850091 | John Wick: Chapter Two (2017) | [Action, Crime, Thriller] |
100833 | 610 | 168250 | 5.0 | 1494273047 | Get Out (2017) | [Horror] |
100834 | 610 | 168252 | 5.0 | 1493846352 | Logan (2017) | [Action, Sci-Fi] |
100835 | 610 | 170875 | 3.0 | 1493846415 | The Fate of the Furious (2017) | [Action, Crime, Drama, Thriller] |
100836 rows × 6 columns
# Partition the merged ratings by user using split_by_user's defaults
# (test_size=0.2, random_state=42).
train_data, test_data = split_by_user(data)
# Bare expression: notebook-style display of the training partition.
train_data
userId | movieId | rating | timestamp | title | genres | |
---|---|---|---|---|---|---|
0 | 1 | 1 | 4.0 | 964982703 | Toy Story (1995) | [Adventure, Animation, Children, Comedy, Fantasy] |
1 | 1 | 3 | 4.0 | 964981247 | Grumpier Old Men (1995) | [Comedy, Romance] |
2 | 1 | 6 | 4.0 | 964982224 | Heat (1995) | [Action, Crime, Thriller] |
3 | 1 | 47 | 5.0 | 964983815 | Seven (a.k.a. Se7en) (1995) | [Mystery, Thriller] |
4 | 1 | 50 | 5.0 | 964982931 | Usual Suspects, The (1995) | [Crime, Mystery, Thriller] |
... | ... | ... | ... | ... | ... | ... |
100831 | 610 | 166534 | 4.0 | 1493848402 | Split (2017) | [Drama, Horror, Thriller] |
100832 | 610 | 168248 | 5.0 | 1493850091 | John Wick: Chapter Two (2017) | [Action, Crime, Thriller] |
100833 | 610 | 168250 | 5.0 | 1494273047 | Get Out (2017) | [Horror] |
100834 | 610 | 168252 | 5.0 | 1493846352 | Logan (2017) | [Action, Sci-Fi] |
100835 | 610 | 170875 | 3.0 | 1493846415 | The Fate of the Furious (2017) | [Action, Crime, Drama, Thriller] |
73177 rows × 6 columns
# Bare expression: notebook-style display of the test partition.
test_data
userId | movieId | rating | timestamp | title | genres | |
---|---|---|---|---|---|---|
261 | 3 | 31 | 0.5 | 1306463578 | Dangerous Minds (1995) | [Drama] |
262 | 3 | 527 | 0.5 | 1306464275 | Schindler's List (1993) | [Drama, War] |
263 | 3 | 647 | 0.5 | 1306463619 | Courage Under Fire (1996) | [Action, Crime, Drama, War] |
264 | 3 | 688 | 0.5 | 1306464228 | Operation Dumbo Drop (1995) | [Action, Adventure, Comedy, War] |
265 | 3 | 720 | 0.5 | 1306463595 | Wallace & Gromit: The Best of Aardman Animatio... | [Adventure, Animation, Comedy] |
... | ... | ... | ... | ... | ... | ... |
99529 | 609 | 892 | 3.0 | 847221080 | Twelfth Night (1996) | [Comedy, Drama, Romance] |
99530 | 609 | 1056 | 3.0 | 847221080 | Jude (1996) | [Drama] |
99531 | 609 | 1059 | 3.0 | 847221054 | William Shakespeare's Romeo + Juliet (1996) | [Drama, Romance] |
99532 | 609 | 1150 | 4.0 | 847221054 | Return of Martin Guerre, The (Retour de Martin... | [Drama] |
99533 | 609 | 1161 | 4.0 | 847221080 | Tin Drum, The (Blechtrommel, Die) (1979) | [Drama, War] |
27659 rows × 6 columns
# Work on independent copies of both partitions (they are row selections of
# `data`); presumably this avoids pandas' SettingWithCopyWarning when new
# columns are assigned to them later.
train_data = train_data.copy()
test_data = test_data.copy()

# One row per (rating, genre) pair: a movie with k genres becomes k rows.
expanded_train_data = train_data.explode('genres')

# Long table: mean rating each training user gave to each genre.
train_user_genre_rating = (
    expanded_train_data
    .groupby(['userId', 'genres'])['rating']
    .mean()
    .rename('avg_genre_rating')
    .reset_index()
)

# Wide table: users as rows, genres as columns; genres a user never rated
# are filled with 0.
train_user_preferences = train_user_genre_rating.pivot(
    index='userId',
    columns='genres',
    values='avg_genre_rating',
)
train_user_preferences = train_user_preferences.fillna(0)
def train_genre_match_calc(row):
    """Score how well a movie's genres match a training user's genre averages.

    Looks up the module-level ``train_user_genre_rating`` table and averages
    the ``avg_genre_rating`` values of the row's user over the row's genres.

    Args:
        row: Mapping-like record with ``userId`` and ``genres`` entries;
            ``genres`` may be a single genre string or a collection of them.

    Returns:
        The mean of the matching per-genre averages, or ``0`` when the user
        has no entry for any of the row's genres.
    """
    wanted_genres = row['genres']
    # A bare string is treated as a single genre.
    if isinstance(wanted_genres, str):
        wanted_genres = [wanted_genres]

    same_user = train_user_genre_rating['userId'] == row['userId']
    same_genre = train_user_genre_rating['genres'].isin(wanted_genres)
    matched = train_user_genre_rating.loc[same_user & same_genre, 'avg_genre_rating']

    if matched.empty:
        return 0
    return matched.mean()
# Per-rating genre-match feature for the training partition.
train_data['genreMatch'] = train_data.apply(train_genre_match_calc, axis='columns')

# Same aggregation as for training, built from the test partition only.
# NOTE(review): these test-side averages are derived from the test ratings
# themselves - confirm that is intended for evaluation.
expanded_test_data = test_data.explode('genres')
test_user_genre_rating = (
    expanded_test_data
    .groupby(['userId', 'genres'])['rating']
    .mean()
    .rename('avg_genre_rating')
    .reset_index()
)

# Wide users-by-genres table; unrated genres are filled with 0.
test_user_preferences = test_user_genre_rating.pivot(
    index='userId',
    columns='genres',
    values='avg_genre_rating',
)
test_user_preferences = test_user_preferences.fillna(0)
def test_genre_match_calc(row):
    """Score how well a movie's genres match a test user's genre averages.

    Looks up the module-level ``test_user_genre_rating`` table and averages
    the ``avg_genre_rating`` values of the row's user over the row's genres.

    Args:
        row: Mapping-like record with ``userId`` and ``genres`` entries;
            ``genres`` may be a single genre string or a collection of them.

    Returns:
        The mean of the matching per-genre averages, or ``0`` when the user
        has no entry for any of the row's genres.
    """
    wanted_genres = row['genres']
    # A bare string is treated as a single genre.
    if isinstance(wanted_genres, str):
        wanted_genres = [wanted_genres]

    same_user = test_user_genre_rating['userId'] == row['userId']
    same_genre = test_user_genre_rating['genres'].isin(wanted_genres)
    matched = test_user_genre_rating.loc[same_user & same_genre, 'avg_genre_rating']

    if matched.empty:
        return 0
    return matched.mean()
# Per-rating genre-match feature for the test partition.
test_data['genreMatch'] = test_data.apply(test_genre_match_calc, axis='columns')

# Users-by-movies rating matrix for training; NaN marks "not rated".
train_user_movie = train_data.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
# Missing ratings are replaced by 0 before computing cosine similarity.
train_user_movie_filled = train_user_movie.fillna(value=0)

# Pairwise user-user cosine similarity, labelled with user ids on both axes.
train_user_similarity = cosine_similarity(train_user_movie_filled)
train_user_similarity_df = pd.DataFrame(
    train_user_similarity,
    index=train_user_movie.index,
    columns=train_user_movie.index,
)
def train_average(user_id, movie_id, top_n=5):
    """Mean rating of a movie among a training user's most similar users.

    Uses the module-level ``train_user_similarity_df`` (user-user similarity)
    and ``train_user_movie`` (users-by-movies ratings, NaN when unrated).

    Args:
        user_id: Label of the user in ``train_user_similarity_df``.
        movie_id: Column label of the movie in ``train_user_movie``.
        top_n: Number of most-similar other users to consider.

    Returns:
        The mean of the ratings those neighbours gave to ``movie_id``,
        ignoring neighbours who did not rate it; NaN when none of them did.
    """
    # Remove the user explicitly rather than assuming they sort first: when
    # another user ties with the self-similarity, skipping position 0 could
    # drop that neighbour and keep the user, leaking the user's own rating
    # into the feature.
    others = train_user_similarity_df[user_id].drop(labels=user_id)
    similar_users = others.sort_values(ascending=False).index[:top_n]
    similar_ratings = train_user_movie.loc[similar_users, movie_id]
    return similar_ratings.dropna().mean()
# Neighbour-average feature for every training rating.
train_data['similarUsers'] = train_data.apply(
    lambda rating_row: train_average(rating_row['userId'], rating_row['movieId']),
    axis='columns',
)

# Users-by-movies rating matrix for the test partition; NaN marks "not rated".
test_user_movie = test_data.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
# Missing ratings are replaced by 0 before computing cosine similarity.
test_user_movie_filled = test_user_movie.fillna(value=0)

# Pairwise user-user cosine similarity, labelled with user ids on both axes.
test_user_similarity = cosine_similarity(test_user_movie_filled)
test_user_similarity_df = pd.DataFrame(
    test_user_similarity,
    index=test_user_movie.index,
    columns=test_user_movie.index,
)
def test_average(user_id, movie_id, top_n=5):
    """Mean rating of a movie among a test user's most similar users.

    Uses the module-level ``test_user_similarity_df`` (user-user similarity)
    and ``test_user_movie`` (users-by-movies ratings, NaN when unrated).

    Args:
        user_id: Label of the user in ``test_user_similarity_df``.
        movie_id: Column label of the movie in ``test_user_movie``.
        top_n: Number of most-similar other users to consider.

    Returns:
        The mean of the ratings those neighbours gave to ``movie_id``,
        ignoring neighbours who did not rate it; NaN when none of them did.
    """
    # Remove the user explicitly rather than assuming they sort first: when
    # another user ties with the self-similarity, skipping position 0 could
    # drop that neighbour and keep the user, leaking the user's own rating
    # into the feature.
    others = test_user_similarity_df[user_id].drop(labels=user_id)
    similar_users = others.sort_values(ascending=False).index[:top_n]
    similar_ratings = test_user_movie.loc[similar_users, movie_id]
    return similar_ratings.dropna().mean()
# Neighbour-average feature for every test rating.
test_data['similarUsers'] = test_data.apply(
lambda row: test_average(row['userId'], row['movieId']), axis=1
)
# NOTE(review): each partition is left-merged with a three-column slice of
# itself on (userId, movieId). Because `similarUsers` is present on both
# sides, pandas' default suffixes apply and the result carries
# `similarUsers_x` and `similarUsers_y` instead of `similarUsers`. The merge
# looks redundant; confirm nothing downstream reads the suffixed columns
# before removing it.
train_ratings = train_data[['userId', 'movieId', 'similarUsers']]
train_data = pd.merge(train_data, train_ratings, on=['userId', 'movieId'], how='left')
test_ratings = test_data[['userId', 'movieId', 'similarUsers']]
test_data = pd.merge(test_data, test_ratings, on=['userId', 'movieId'], how='left')
# popularity = number of rating rows per movieId, counted separately within
# each partition and attached to every rating of that movie.
train_popularity = train_data.groupby('movieId').size().reset_index(name='popularity')
train_data = pd.merge(train_data, train_popularity, on='movieId', how='left')
test_popularity = test_data.groupby('movieId').size().reset_index(name='popularity')
test_data = pd.merge(test_data, test_popularity, on='movieId', how='left')
# Persist the fully-featured partitions without the row index.
train_data.to_csv('datasets/train_all.csv', index=False)
test_data.to_csv('datasets/test_all.csv', index=False)
def get_top_movies(data, top_n=5):
    """Select each user's highest-rated rows, breaking ties by popularity.

    Args:
        data: DataFrame with ``userId``, ``rating`` and ``popularity`` columns.
        top_n: Maximum number of rows to keep per user (default 5, the
            previously hard-coded value).

    Returns:
        A DataFrame with at most ``top_n`` rows per ``userId``, ordered by
        user ascending, then rating and popularity descending.
    """
    ranked = data.sort_values(
        by=['userId', 'rating', 'popularity'],
        ascending=[True, False, False],
    )
    # head() on the groupby keeps the sorted order within each user.
    return ranked.groupby('userId').head(top_n)
# Reduce each partition to its per-user top picks, then persist both
# without the row index.
train_top, test_top = get_top_movies(train_data), get_top_movies(test_data)
train_top.to_csv(path_or_buf='datasets/train_top.csv', index=False)
test_top.to_csv(path_or_buf='datasets/test_top.csv', index=False)