Added weighted metadata to the model

This commit is contained in:
jan-kordas 2025-01-05 23:34:35 +01:00
parent 117b7775da
commit 04b88b3616
13 changed files with 2547 additions and 2499 deletions

View File

@ -1150,6 +1150,7 @@ Joe Cappelletti
Kenshô Ono
Reina Ueda
Jun'ichi Suwabe
Toa Yukinari
Kôhei Kiyasu
Kenji Utsumi
Ayumu Murase
@ -1162,15 +1163,14 @@ John Rafter Lee
Shigeru Matsuzaki
Akiko Nakamura
Toshiko Fujita
Keiko Han
Aya Suzaki
Atsushi Tamaru
Yûki Kaneko
Lilas Ikuta
Ano
Erin Yvette
Stephen Fu
Grace Lu
Yumi Kawai
Mizuki Yoshida
Yôichirô Saitô
Kengo Kawanishi
Shun Oguri
Haruma Miura
@ -1276,9 +1276,6 @@ Kirby Morrow
Shôta Sometani
Tetsurô Sagawa
Gorô Naya
Debora Rabbai
Greg Wolfe
Elisa Wain
Shôtarô Morikubo
Ryuji Aigase
Takehito Koyasu
@ -2372,7 +2369,7 @@ Akshay Kumar
Sonu Sood
Forest Whitaker
Gong Chan-shik
Won Tae Min
Won Tae-min
Do Woo
Dominic Cooper
Tyrone Power
@ -3119,9 +3116,6 @@ Joe Rogan
Kochi
Avijit Halder
Shanti Das
Kerem Bürsin
Dan Babic
Adrianna Costa
Dan Cohen
Louise Dueno
Nell Hardie
@ -3136,9 +3130,6 @@ Joel Sartore
Shndar Noradin Ali
Ahmed Noradin Ali
Noradin Khalaf Ali
Jerry Springer
Angel Anes
Maria Gara
Peter Andre
Dominic Applewhite
Bruce Dickinson
@ -3372,10 +3363,6 @@ Eloise Eonnet
Hadrian Dagannaud-Brouard
Caitlin Zerra Rose
Roy Wood Jr.
Shirley Alvarez
Melissa Arroyo
Sarah Ashworth
Joe Francis
Cory Lane
Taryn Carter
Jacqueline Lovell
@ -3386,9 +3373,6 @@ Vitaly S. Alexius
Jeff Dunham
Joel Johansson
Jason 'Wee Man' Acuña
Shah Rukh Khan
Saif Ali Khan
Rajkumar Hirani
Derren Brown
Richard Pope
Simon Tcherniak

1 Fred Ward
1150 Kenshô Ono
1151 Reina Ueda
1152 Jun'ichi Suwabe
1153 Toa Yukinari
1154 Kôhei Kiyasu
1155 Kenji Utsumi
1156 Ayumu Murase
1163 Shigeru Matsuzaki
1164 Akiko Nakamura
1165 Toshiko Fujita
Keiko Han
1166 Aya Suzaki
1167 Atsushi Tamaru
1168 Yûki Kaneko
1169 Lilas Ikuta
1170 Ano
1171 Erin Yvette Yumi Kawai
1172 Stephen Fu Mizuki Yoshida
1173 Grace Lu Yôichirô Saitô
1174 Kengo Kawanishi
1175 Shun Oguri
1176 Haruma Miura
1276 Shôta Sometani
1277 Tetsurô Sagawa
1278 Gorô Naya
Debora Rabbai
Greg Wolfe
Elisa Wain
1279 Shôtarô Morikubo
1280 Ryuji Aigase
1281 Takehito Koyasu
2369 Sonu Sood
2370 Forest Whitaker
2371 Gong Chan-shik
2372 Won Tae Min Won Tae-min
2373 Do Woo
2374 Dominic Cooper
2375 Tyrone Power
3116 Kochi
3117 Avijit Halder
3118 Shanti Das
Kerem Bürsin
Dan Babic
Adrianna Costa
3119 Dan Cohen
3120 Louise Dueno
3121 Nell Hardie
3130 Shndar Noradin Ali
3131 Ahmed Noradin Ali
3132 Noradin Khalaf Ali
Jerry Springer
Angel Anes
Maria Gara
3133 Peter Andre
3134 Dominic Applewhite
3135 Bruce Dickinson
3363 Hadrian Dagannaud-Brouard
3364 Caitlin Zerra Rose
3365 Roy Wood Jr.
Shirley Alvarez
Melissa Arroyo
Sarah Ashworth
Joe Francis
3366 Cory Lane
3367 Taryn Carter
3368 Jacqueline Lovell
3373 Jeff Dunham
3374 Joel Johansson
3375 Jason 'Wee Man' Acuña
Shah Rukh Khan
Saif Ali Khan
Rajkumar Hirani
3376 Derren Brown
3377 Richard Pope
3378 Simon Tcherniak

View File

@ -456,7 +456,6 @@ Satoshi Nishimura
Susumu Mitsunaka
Naoki Miyahara
Osamu Dezaki
Jun'ichi Satô
Tomoyuki Kurokawa
Kiyotaka Oshiyama
Kenji Kamiyama
@ -480,7 +479,6 @@ Yûsuke Yamamoto
Takayuki Hirao
Gorô Miyazaki
Sunao Katabuchi
Toshiya Shinohara
Mayumi Nishimoto
Hiroaki Akagi
Taichi Ishidate
@ -509,12 +507,10 @@ Shinji Itadaki
Junji Shimizu
Naoko Kusumi
Mizuho Nishikubo
See production info at IMDbPro
Yasunao Aoki
Nana Harada
Hiroshi Haraguchi
Keisuke Inoue
Iku Suzuki
Hirotsugu Kawasaki
Norihiko Sutô
Atsushi Takeuchi
@ -856,7 +852,6 @@ David McMahon
Brian Flemming
Gaurav Jani
Gabriela Cowperthwaite
Jillian Schlesinger
Chuan Lu
Raoul Peck
Joshua Oppenheimer
@ -1356,7 +1351,6 @@ Anders Østergaard
Brett Harvey
Zana Briski
Ross Kauffman
Steve Meyer
Michael Rossato-Bennett
Adnan Al-Kaissy
Lou Albano
@ -1364,10 +1358,6 @@ Muhammad Ali
Franny Armstrong
John Scheinfeld
Karzan Kardozi
Panagiotis Tsartsianidis
Jerry Springer
Angel Anes
Maria Gara
Chris Howe
Sam Wrench
Bradley Cooper
@ -1476,8 +1466,6 @@ Etan Cohen
Johnny Knoxville
Steve-O
Chris Pontius
Scott Cope
Joe Francis
Jason Williams
Glenn Weiss
Nick Sweeney
@ -1490,8 +1478,6 @@ Richard A. Preuss
Joel Johansson
Bill Chapman
Jason 'Wee Man' Acuña
Shah Rukh Khan
Saif Ali Khan
Troy Miller
Spencer Davis Gray
Ricardo Benítez Garrido

1 Guy Hamilton
456 Susumu Mitsunaka
457 Naoki Miyahara
458 Osamu Dezaki
Jun'ichi Satô
459 Tomoyuki Kurokawa
460 Kiyotaka Oshiyama
461 Kenji Kamiyama
479 Takayuki Hirao
480 Gorô Miyazaki
481 Sunao Katabuchi
Toshiya Shinohara
482 Mayumi Nishimoto
483 Hiroaki Akagi
484 Taichi Ishidate
507 Junji Shimizu
508 Naoko Kusumi
509 Mizuho Nishikubo
See production info at IMDbPro
510 Yasunao Aoki
511 Nana Harada
512 Hiroshi Haraguchi
513 Keisuke Inoue
Iku Suzuki
514 Hirotsugu Kawasaki
515 Norihiko Sutô
516 Atsushi Takeuchi
852 Brian Flemming
853 Gaurav Jani
854 Gabriela Cowperthwaite
Jillian Schlesinger
855 Chuan Lu
856 Raoul Peck
857 Joshua Oppenheimer
1351 Brett Harvey
1352 Zana Briski
1353 Ross Kauffman
Steve Meyer
1354 Michael Rossato-Bennett
1355 Adnan Al-Kaissy
1356 Lou Albano
1358 Franny Armstrong
1359 John Scheinfeld
1360 Karzan Kardozi
Panagiotis Tsartsianidis
Jerry Springer
Angel Anes
Maria Gara
1361 Chris Howe
1362 Sam Wrench
1363 Bradley Cooper
1466 Johnny Knoxville
1467 Steve-O
1468 Chris Pontius
Scott Cope
Joe Francis
1469 Jason Williams
1470 Glenn Weiss
1471 Nick Sweeney
1478 Joel Johansson
1479 Bill Chapman
1480 Jason 'Wee Man' Acuña
Shah Rukh Khan
Saif Ali Khan
1481 Troy Miller
1482 Spencer Davis Gray
1483 Ricardo Benítez Garrido

View File

@ -51,6 +51,7 @@ Tragedy
Slapstick
Globetrotting Adventure
Spy
Political Thriller
Psychological Thriller
Caper
Docudrama
@ -88,7 +89,6 @@ Hand-Drawn Animation
Pop Musical
Buddy Cop
Survival
Political Thriller
Teen Fantasy
Spaghetti Western
Western

1 Action
51 Slapstick
52 Globetrotting Adventure
53 Spy
54 Political Thriller
55 Psychological Thriller
56 Caper
57 Docudrama
89 Pop Musical
90 Buddy Cop
91 Survival
Political Thriller
92 Teen Fantasy
93 Spaghetti Western
94 Western

File diff suppressed because one or more lines are too long

4
src/app/how to run.txt Normal file
View File

@ -0,0 +1,4 @@
Change into the src folder:
cd src
Then start the app from there:
python app/main.py
Example: C:\UAM-Repos\PJN-PROJEKT\src>python app/main.py

View File

@ -14,10 +14,14 @@
{% for recommendation in recommendations %}
<li class="list-group-item">
<h5><strong>{{ recommendation.title }}</strong></h5>
<p><strong>Rating:</strong> {{ recommendation.rating }}</p>
<p><strong>Release Date:</strong> {{ recommendation.release_date }}</p>
<p><strong>Director:</strong> {{ recommendation.directors }}</p>
<p><strong>Cast:</strong> {{ recommendation.stars }}</p>
<p><strong>Genres:</strong> {{ recommendation.genres }}</p>
<p><strong>Description:</strong> {{ recommendation.description }}</p>
<p><strong>Duration:</strong> {{ recommendation.duration_minutes }} minutes</p>
<p><strong>Rating:</strong> {{ recommendation.rating }}</p>
<p>
<strong>More Info:</strong>
<a href="{{ recommendation.url }}" target="_blank" class="link-primary">View Details</a>

View File

@ -1,42 +1,100 @@
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import pickle
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from process_input.analyze_input import analyze_users_input
def load_embeddings(embeddings_path):
    """Return the object pickled in the file at ``embeddings_path``.

    The file is opened in binary mode and handed to ``pickle.load``; the
    deserialized object is returned as-is.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at embedding files produced by this project.
    """
    with open(embeddings_path, 'rb') as handle:
        return pickle.load(handle)
def enhance_user_input(user_input):
    """Rewrite a free-text request as a sentence built from its extracted metadata.

    ``analyze_users_input`` is run on ``user_input`` and its ``'actors'``,
    ``'directors'``, ``'genres'``, ``'years'`` and ``'duration'`` entries are
    turned into phrases ("featuring ...", "directed by ...", ...). The phrases
    are joined with single spaces. If nothing was extracted, the original
    ``user_input`` is returned unchanged.

    A debug line showing the original text and the phrases is printed.
    """
    extracted = analyze_users_input(user_input)
    fragments = []

    # The three list-valued fields share one pattern: "<lead-in> A and B".
    list_fields = (
        ('actors', 'featuring'),
        ('directors', 'directed by'),
        ('genres', 'in the genre of'),
    )
    for key, lead_in in list_fields:
        if extracted[key]:
            fragments.append(f"{lead_in} {' and '.join(extracted[key])}")

    if extracted['years']:
        # 'years' is unpacked as exactly two values: range start and range end.
        first_year, last_year = extracted['years']
        fragments.append(f"released between {first_year} and {last_year}")

    if extracted['duration']:
        fragments.append(f"with a runtime under {extracted['duration']} minutes")

    print(f"Enhanced Input: {user_input} -> {', '.join(fragments)}")

    enhanced = " ".join(fragments)
    return enhanced if enhanced else user_input
def generate_user_embedding(user_input, model_type, model, tokenizer=None):
    """Embed the user's request with the selected model.

    The raw text is first rewritten by ``enhance_user_input``; the rewritten
    text (not the raw input) is what gets embedded on every model path.

    Args:
        user_input: Free-text request from the user.
        model_type: ``'bert'``, ``'sentence-transformer'`` or ``'sentence-bert'``.
        model: For ``'bert'`` it is called as ``model(**inputs)``; for the
            other two types ``model.encode(...)`` is used.
        tokenizer: Required when ``model_type`` is ``'bert'``; unused otherwise.

    Returns:
        For ``'bert'``: the hidden state at sequence position 0 of
        ``last_hidden_state`` (presumably the [CLS] token -- confirm against
        the model in use), detached and converted to a NumPy array.
        Otherwise: whatever ``model.encode(..., convert_to_tensor=False)``
        returns.

    Raises:
        ValueError: If ``model_type`` is ``'bert'`` and no tokenizer was
            given, or if ``model_type`` is not one of the supported values.
    """
    enhanced_input = enhance_user_input(user_input)
    print(f"Enhanced Input for Embedding: {enhanced_input}")

    if model_type == 'bert':
        if tokenizer is None:
            raise ValueError("Tokenizer is required for BERT model.")
        # Tokenize the enhanced text once; the stale tokenization of the raw
        # input was immediately overwritten and only cost time.
        inputs = tokenizer(
            enhanced_input,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length',
        )
        outputs = model(**inputs)
        return outputs.last_hidden_state[:, 0, :].squeeze(0).detach().numpy()

    if model_type in ('sentence-transformer', 'sentence-bert'):
        # Encode the enhanced text; a leftover early return on the raw input
        # previously made this line unreachable.
        return model.encode(enhanced_input, convert_to_tensor=False)

    raise ValueError("Invalid model type. Choose 'bert', 'sentence-transformer', or 'sentence-bert'.")
def _join_list_literal(value):
    """Render a stringified list such as "['A', 'B']" as "A, B"; pass anything else through."""
    import ast  # local import: only this helper needs it

    if not (isinstance(value, str) and value.startswith('[')):
        return value
    try:
        # literal_eval only accepts Python literals, so CSV content can never
        # run code the way eval() could.
        items = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not a valid list literal (e.g. text that merely starts with '[').
        return value
    return ', '.join(map(str, items))


def recommend_movies(user_input, df, embeddings, model_type, model, tokenizer=None):
    """Return the ten best-scoring movies for a free-text request.

    Each row's score is its cosine similarity to the request embedding,
    multiplied by a weight. The weight starts at 1.0 and grows when the row
    matches metadata extracted from the request by ``analyze_users_input``:

    * +8 for every requested actor found in ``stars``
    * +8 for every requested director found in ``directors``
    * +8 for every requested genre found in ``genres``
    * +8 if ``release_date`` falls inside the requested year range
    * +4 if ``duration_minutes`` is at most the requested duration

    Args:
        user_input: Free-text request from the user.
        df: Movie DataFrame whose rows line up with ``embeddings``. A
            ``similarity`` column holding the weighted score is written onto
            it in place.
        embeddings: Precomputed movie embeddings passed to ``cosine_similarity``.
        model_type, model, tokenizer: Forwarded to ``generate_user_embedding``.

    Returns:
        A DataFrame with the top ten rows and the columns ``title``,
        ``rating``, ``genres``, ``release_date``, ``directors``,
        ``description``, ``duration_minutes``, ``stars``, ``url`` and
        ``similarity``. Stringified lists in ``genres``, ``directors`` and
        ``stars`` are rendered as comma-separated text.
    """
    user_embedding = generate_user_embedding(user_input, model_type, model, tokenizer)
    similarities = cosine_similarity([user_embedding], embeddings).flatten()

    analyzed_data = analyze_users_input(user_input)
    weights = pd.Series(1.0, index=df.index)

    # (DataFrame column, key in the analyzed request) pairs that are matched
    # by case-insensitive substring search.
    text_matches = (
        ('stars', 'actors'),
        ('directors', 'directors'),
        ('genres', 'genres'),
    )
    for column, key in text_matches:
        if analyzed_data[key]:
            for term in analyzed_data[key]:
                # regex=False: names like "Roy Wood Jr." contain regex
                # metacharacters and must be matched literally.
                weights += df[column].str.contains(term, case=False, na=False, regex=False) * 8

    if analyzed_data['years']:
        start_year, end_year = map(int, analyzed_data['years'])
        # Unparseable release dates become 0 so they never fall in the range.
        df_years = pd.to_numeric(df['release_date'], errors='coerce').fillna(0).astype(int)
        weights += ((df_years >= start_year) & (df_years <= end_year)) * 8

    if analyzed_data['duration']:
        weights += (df['duration_minutes'] <= analyzed_data['duration']) * 4

    df['similarity'] = similarities * weights

    # Copy the slice so the display formatting below never touches ``df``.
    recommendations = df.sort_values(by='similarity', ascending=False).head(10).copy()
    for column in ('genres', 'directors', 'stars'):
        recommendations[column] = recommendations[column].apply(_join_list_literal)

    return recommendations[[
        'title', 'rating', 'genres', 'release_date', 'directors',
        'description', 'duration_minutes', 'stars', 'url', 'similarity',
    ]]

View File

@ -30,7 +30,7 @@ def prepare_set_with_regex(data_path, save_path=None):
actors_set.append(actor)
if save_path is not None:
with open(save_path, "w", newline="") as file:
with open(save_path, "w", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
for actor in actors_set:
@ -39,6 +39,7 @@ def prepare_set_with_regex(data_path, save_path=None):
return
def prepare_set_from_list(data_path, save_path=None):
df = pd.read_csv(data_path, converters={
'stars': ast.literal_eval,

View File

@ -14,11 +14,13 @@ def prepare_set_from_list(data_path, save_path=None):
directors_set.append(director)
if save_path is not None:
with open(save_path, "w", newline="") as file:
with open(save_path, "w", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
for director in directors_set:
writer.writerow([director])
data_path = "../../data/movies_data.csv"
save_path = "../../data/directors_set.csv"
save_path = "../../data/directors_set.csv"
prepare_set_from_list(data_path, save_path)

View File

@ -27,7 +27,7 @@ def prepare_set_with_regex(data_path, save_path=None):
genres_set.append(genre)
if save_path is not None:
with open(save_path, "w", newline="") as file:
with open(save_path, "w", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
for genre in genres_set:
writer.writerow([genre])
@ -47,7 +47,7 @@ def prepare_set_from_list(data_path, save_path=None):
genres_set.append(genre)
if save_path is not None:
with open(save_path, "w", newline="") as file:
with open(save_path, "w", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
for genre in genres_set:
writer.writerow([genre])

View File

@ -1,8 +1,7 @@
import pandas as pd
def convert_duration_to_minutes(duration):
if not isinstance(duration, str):
if not isinstance(duration, str):
return None
try:
parts = duration.split()
@ -10,39 +9,47 @@ def convert_duration_to_minutes(duration):
minutes = int(parts[1][:-1]) if len(parts) > 1 and 'm' in parts[1] else 0
return hours * 60 + minutes
except (ValueError, IndexError):
return None
return None
def format_field(field):
    """Render a list-like field as comma-separated text.

    Args:
        field: A list, a string holding a Python list literal (for example
            ``"['Action', 'Drama']"``), or any other value.

    Returns:
        ``", "``-joined text for lists and for strings that parse as a list
        literal; every other value (including strings that start with ``[``
        but are not valid literals) is returned unchanged.
    """
    import ast  # local import so the block does not depend on file-level imports

    if isinstance(field, list):
        return ", ".join(map(str, field))
    if isinstance(field, str) and field.startswith('['):
        try:
            # literal_eval only accepts literals, so CSV content can never
            # execute code the way eval() could.
            parsed = ast.literal_eval(field)
        except (ValueError, SyntaxError):
            return field
        return ", ".join(map(str, parsed))
    return field
def load_and_preprocess(data_path, save_path=None):
print("Loading data...")
df = pd.read_csv(data_path)
print("Preprocessing data...")
df['duration_minutes'] = df['duration'].apply(convert_duration_to_minutes)
df = df[df['duration_minutes'].notnull() & (df['duration_minutes'] >= 60)]
columns_to_parse = ['genres', 'directors', 'stars', 'keywords']
for column in columns_to_parse:
df[column] = df[column].apply(eval)
df[column] = df[column].apply(eval).apply(format_field)
df['description'] = df['description'].fillna('').astype(str)
df['storyline'] = df['storyline'].fillna('').astype(str)
df['combined_text'] = (
df['description'] + " " +
df['storyline'] + " " +
df['keywords'].apply(lambda x: " ".join(x))
df['description'] + " " +
df['storyline'] + " " +
df['keywords'] + " " +
"featuring " + df['stars'] + " " +
"directed by " + df['directors'] + " " +
"in the genre of " + df['genres'] + " " +
"released in " + df['release_date'].astype(str) + " " +
"with a runtime of " + df['duration_minutes'].astype(str) + " minutes"
)
df = df.drop(columns=['storyline', 'keywords', 'duration'])
print("Preprocessing complete.")
if save_path:
@ -51,10 +58,12 @@ def load_and_preprocess(data_path, save_path=None):
return df
data_path = "../../data/movies_data.csv"
save_path = "../../data/preprocessed_data.csv"
data_path = "../../data/movies_data.csv"
save_path = "../../data/preprocessed_data.csv"
df = load_and_preprocess(data_path, save_path)
# Uncomment to preview the preprocessed data
# print("\nPreprocessed Data Preview:")
# print(df.head())
# print(df.head())

View File

@ -1,22 +1,22 @@
import spacy
import pandas as pd
from src.process_input.get_duration import get_movie_duration
from src.process_input.get_genres import get_movie_genres
from src.process_input.get_people import get_actor_or_director
from src.process_input.get_years import get_movie_years
from process_input.get_duration import get_movie_duration
from process_input.get_genres import get_movie_genres
from process_input.get_people import get_actor_or_director
from process_input.get_years import get_movie_years
def load_data():
sets = {}
df = pd.read_csv('../../data/directors_set.csv', header=None)
df = pd.read_csv('../data/directors_set.csv', header=None)
df.columns = ['Name']
sets["directors"] = df['Name'].str.lower().tolist()
df = pd.read_csv('../../data/actors_set.csv', header=None)
df = pd.read_csv('../data/actors_set.csv', header=None)
df.columns = ['Name']
sets["actors"] = df['Name'].str.lower().tolist()
df = pd.read_csv('../../data/genres_set.csv', header=None)
df = pd.read_csv('../data/genres_set.csv', header=None)
df.columns = ['Name']
sets["genres"] = df['Name'].str.lower().tolist()