rekomendacja_filmow/movies_data.ipynb
2024-12-07 03:17:45 +01:00

30 KiB
Raw Blame History

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def split_by_user(data, test_size=0.2, random_state=42):
    """Split a ratings frame into train/test parts with disjoint users.

    A fraction ``test_size`` of the distinct ``userId`` values (rounded down)
    is sampled without replacement. Every row of a sampled user goes to the
    test part; all remaining rows go to the train part.

    Args:
        data: DataFrame with a ``userId`` column.
        test_size: Fraction of the distinct users to hold out.
        random_state: Seed for the sampling; the same seed gives the same
            split.

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    unique_users = data['userId'].unique()
    # A private generator keeps the split reproducible without reseeding
    # NumPy's global RNG as a side effect. RandomState is the legacy generator
    # type behind np.random.seed(), so the sampled users are the ones the
    # seeded global state would have produced.
    rng = np.random.RandomState(random_state)
    n_test_users = int(len(unique_users) * test_size)
    test_users = rng.choice(unique_users, size=n_test_users, replace=False)

    in_test = data['userId'].isin(test_users)
    return data[~in_test], data[in_test]
# Read the movie catalogue and the ratings table
# (presumably the MovieLens "latest-small" dump, going by the folder name).
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Turn the pipe-separated genre string into a list of genre names so that it
# can be exploded into one row per genre later on.
genre_lists = movies['genres'].str.split('|')
movies['genres'] = genre_lists

# One row per rating, enriched with the title and genres of the rated movie.
data = pd.merge(left=ratings, right=movies, on='movieId')
data
userId movieId rating timestamp title genres
0 1 1 4.0 964982703 Toy Story (1995) [Adventure, Animation, Children, Comedy, Fantasy]
1 1 3 4.0 964981247 Grumpier Old Men (1995) [Comedy, Romance]
2 1 6 4.0 964982224 Heat (1995) [Action, Crime, Thriller]
3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) [Mystery, Thriller]
4 1 50 5.0 964982931 Usual Suspects, The (1995) [Crime, Mystery, Thriller]
... ... ... ... ... ... ...
100831 610 166534 4.0 1493848402 Split (2017) [Drama, Horror, Thriller]
100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) [Action, Crime, Thriller]
100833 610 168250 5.0 1494273047 Get Out (2017) [Horror]
100834 610 168252 5.0 1493846352 Logan (2017) [Action, Sci-Fi]
100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) [Action, Crime, Drama, Thriller]

100836 rows × 6 columns

# Hold out a random 20% of the users (the split_by_user defaults) as the test
# part; all ratings of a user end up on the same side of the split.
train_data, test_data = split_by_user(data)
train_data  # notebook cell output: preview of the train part
userId movieId rating timestamp title genres
0 1 1 4.0 964982703 Toy Story (1995) [Adventure, Animation, Children, Comedy, Fantasy]
1 1 3 4.0 964981247 Grumpier Old Men (1995) [Comedy, Romance]
2 1 6 4.0 964982224 Heat (1995) [Action, Crime, Thriller]
3 1 47 5.0 964983815 Seven (a.k.a. Se7en) (1995) [Mystery, Thriller]
4 1 50 5.0 964982931 Usual Suspects, The (1995) [Crime, Mystery, Thriller]
... ... ... ... ... ... ...
100831 610 166534 4.0 1493848402 Split (2017) [Drama, Horror, Thriller]
100832 610 168248 5.0 1493850091 John Wick: Chapter Two (2017) [Action, Crime, Thriller]
100833 610 168250 5.0 1494273047 Get Out (2017) [Horror]
100834 610 168252 5.0 1493846352 Logan (2017) [Action, Sci-Fi]
100835 610 170875 3.0 1493846415 The Fate of the Furious (2017) [Action, Crime, Drama, Thriller]

73177 rows × 6 columns

test_data  # notebook cell output: preview of the held-out users' ratings
userId movieId rating timestamp title genres
261 3 31 0.5 1306463578 Dangerous Minds (1995) [Drama]
262 3 527 0.5 1306464275 Schindler's List (1993) [Drama, War]
263 3 647 0.5 1306463619 Courage Under Fire (1996) [Action, Crime, Drama, War]
264 3 688 0.5 1306464228 Operation Dumbo Drop (1995) [Action, Adventure, Comedy, War]
265 3 720 0.5 1306463595 Wallace & Gromit: The Best of Aardman Animatio... [Adventure, Animation, Comedy]
... ... ... ... ... ... ...
99529 609 892 3.0 847221080 Twelfth Night (1996) [Comedy, Drama, Romance]
99530 609 1056 3.0 847221080 Jude (1996) [Drama]
99531 609 1059 3.0 847221054 William Shakespeare's Romeo + Juliet (1996) [Drama, Romance]
99532 609 1150 4.0 847221054 Return of Martin Guerre, The (Retour de Martin... [Drama]
99533 609 1161 4.0 847221080 Tin Drum, The (Blechtrommel, Die) (1979) [Drama, War]

27659 rows × 6 columns

# Work on independent copies so that the feature columns added further down
# are written to standalone frames.
train_data = train_data.copy()
test_data = test_data.copy()

# One row per (rating, genre) pair: a movie with three genres yields three rows.
expanded_train_data = train_data.explode('genres')

# Mean rating every train user gave to each genre, in long format
# (columns: userId, genres, avg_genre_rating).
train_user_genre_rating = (
    expanded_train_data
    .groupby(['userId', 'genres'], as_index=False)['rating']
    .mean()
    .rename(columns={'rating': 'avg_genre_rating'})
)

# The same averages as a user x genre matrix; genres a user never rated are 0.
train_user_preferences = train_user_genre_rating.pivot(
    index='userId',
    columns='genres',
    values='avg_genre_rating',
).fillna(0)

def train_genre_match_calc(row):
    """Score how well a movie's genres match a train user's genre averages.

    Looks up, in ``train_user_genre_rating``, the user's average rating for
    each genre of the movie and averages those values.

    Args:
        row: Mapping-like record with ``userId`` and ``genres`` entries;
            ``genres`` is a list of genre names or a single genre string.

    Returns:
        The mean of the matching per-genre averages rounded to two decimals,
        or 0 when the user has no average for any of the genres.
    """
    user_id = row['userId']
    movie_genres = row['genres']

    # A lone genre string is treated as a one-element genre list.
    if isinstance(movie_genres, str):
        movie_genres = [movie_genres]

    table = train_user_genre_rating
    is_match = (table['userId'] == user_id) & table['genres'].isin(movie_genres)
    matched_scores = table.loc[is_match, 'avg_genre_rating']

    if matched_scores.empty:
        return 0
    return round(matched_scores.mean(), 2)

# genreMatch feature for every train rating (evaluated row by row).
train_data['genreMatch'] = train_data.apply(train_genre_match_calc, axis='columns')


# Same genre statistics for the held-out users, built from the test part only.
expanded_test_data = test_data.explode('genres')

# Mean rating every test user gave to each genre, in long format
# (columns: userId, genres, avg_genre_rating).
test_user_genre_rating = (
    expanded_test_data
    .groupby(['userId', 'genres'], as_index=False)['rating']
    .mean()
    .rename(columns={'rating': 'avg_genre_rating'})
)

# User x genre matrix of those averages; genres a user never rated are 0.
test_user_preferences = test_user_genre_rating.pivot(
    index='userId',
    columns='genres',
    values='avg_genre_rating',
).fillna(0)

def test_genre_match_calc(row):
    user_id = row['userId']
    genres = row['genres']

    if isinstance(genres, str):
        genres = [genres]

    user_pref = test_user_genre_rating[test_user_genre_rating['userId'] == user_id]
    genre_scores = user_pref[user_pref['genres'].isin(genres)]['avg_genre_rating']

    if not genre_scores.empty:
        return round(genre_scores.mean(), 2)
    else:
        return 0

# genreMatch feature for every test rating (evaluated row by row).
test_data['genreMatch'] = test_data.apply(test_genre_match_calc, axis='columns')

# User x movie rating matrix of the train part; NaN marks "not rated".
train_user_movie = train_data.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

# For the similarity computation a missing rating counts as 0.
train_user_movie_filled = train_user_movie.fillna(value=0)
train_user_similarity = cosine_similarity(train_user_movie_filled)

# Label both axes with the user ids so similarities can be looked up by user.
train_user_similarity_df = pd.DataFrame(
    data=train_user_similarity,
    index=train_user_movie.index,
    columns=train_user_movie.index,
)

def train_average(user_id, movie_id, top_n=5):
    """Return the mean rating the most similar train users gave a movie.

    Args:
        user_id: Label of the user in ``train_user_similarity_df``.
        movie_id: Column label of the movie in ``train_user_movie``.
        top_n: Number of most similar users to average over.

    Returns:
        Mean of those users' ratings for ``movie_id`` rounded to two decimals,
        or NaN when none of them rated the movie.
    """
    # Exclude the user by label instead of skipping the first sorted entry:
    # when another user ties with the self-similarity, the user is not
    # guaranteed to sort first, and their own rating would leak into the mean.
    neighbour_scores = train_user_similarity_df[user_id].drop(user_id)
    similar_users = neighbour_scores.sort_values(ascending=False).index[:top_n]
    similar_ratings = train_user_movie.loc[similar_users, movie_id]

    # Neighbours who did not rate the movie (NaN) are left out of the mean.
    return round(similar_ratings.dropna().mean(), 2)

# similarUsers feature: for each train rating, the mean rating that the
# user's nearest neighbours gave to the same movie.
train_data['similarUsers'] = train_data.apply(
    lambda rating_row: train_average(rating_row['userId'], rating_row['movieId']),
    axis='columns',
)

# User x movie rating matrix of the test part; NaN marks "not rated".
test_user_movie = test_data.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

# For the similarity computation a missing rating counts as 0.
test_user_movie_filled = test_user_movie.fillna(value=0)
test_user_similarity = cosine_similarity(test_user_movie_filled)

# Label both axes with the user ids so similarities can be looked up by user.
test_user_similarity_df = pd.DataFrame(
    data=test_user_similarity,
    index=test_user_movie.index,
    columns=test_user_movie.index,
)

def test_average(user_id, movie_id, top_n=5):
    similar_users = test_user_similarity_df[user_id].sort_values(ascending=False).index[1:top_n + 1]
    similar_ratings = test_user_movie.loc[similar_users, movie_id]

    return round(similar_ratings.dropna().mean(), 2)

# similarUsers feature: for each test rating, the mean rating that the
# user's nearest neighbours gave to the same movie.
test_data['similarUsers'] = test_data.apply(
    lambda rating_row: test_average(rating_row['userId'], rating_row['movieId']),
    axis='columns',
)

# NOTE(review): each frame is joined with a projection of itself. Both sides
# carry a `similarUsers` column, so the merge output holds the suffixed pair
# `similarUsers_x` / `similarUsers_y` instead of a single `similarUsers`
# column. This looks unintended, but the exported CSV schema depends on it -
# confirm what downstream consumers expect before simplifying.
train_ratings = train_data[['userId', 'movieId', 'similarUsers']]
train_data = pd.merge(
    left=train_data,
    right=train_ratings,
    on=['userId', 'movieId'],
    how='left',
)

test_ratings = test_data[['userId', 'movieId', 'similarUsers']]
test_data = pd.merge(
    left=test_data,
    right=test_ratings,
    on=['userId', 'movieId'],
    how='left',
)

# popularity feature: number of ratings a movie received within its own part.
train_popularity = (
    train_data
    .groupby('movieId')
    .size()
    .reset_index(name='popularity')
)
train_data = pd.merge(left=train_data, right=train_popularity, on='movieId', how='left')

test_popularity = (
    test_data
    .groupby('movieId')
    .size()
    .reset_index(name='popularity')
)
test_data = pd.merge(left=test_data, right=test_popularity, on='movieId', how='left')
# Export the fully featured train/test tables (row index not written).
# The target directory is created first so that the export does not fail
# when `datasets/` is missing, e.g. on a fresh checkout.
import os

os.makedirs('datasets', exist_ok=True)
train_data.to_csv('datasets/train_all.csv', index=False)
test_data.to_csv('datasets/test_all.csv', index=False)
def get_top_movies(data, top_n=5):
    """Return each user's best-rated rows, most popular first among ties.

    Rows are ordered by ``userId`` ascending, then ``rating`` and
    ``popularity`` descending, and the first ``top_n`` rows of every user are
    kept.

    Args:
        data: DataFrame with ``userId``, ``rating`` and ``popularity`` columns.
        top_n: Maximum number of rows to keep per user (default 5, the value
            that used to be hard-coded).

    Returns:
        DataFrame with at most ``top_n`` rows per user, in the sorted order.
    """
    ranked = data.sort_values(
        by=['userId', 'rating', 'popularity'],
        ascending=[True, False, False],
    )
    return ranked.groupby('userId').head(top_n)

# Reduce each part to the top rows per user chosen by get_top_movies ...
train_top = get_top_movies(data=train_data)
test_top = get_top_movies(data=test_data)

# ... and export them next to the full tables (row index not written).
train_top.to_csv(path_or_buf='datasets/train_top.csv', index=False)
test_top.to_csv(path_or_buf='datasets/test_top.csv', index=False)