Added weighted metadata to the model
This commit is contained in:
parent
117b7775da
commit
04b88b3616
@ -1150,6 +1150,7 @@ Joe Cappelletti
|
||||
Kenshô Ono
|
||||
Reina Ueda
|
||||
Jun'ichi Suwabe
|
||||
Toa Yukinari
|
||||
Kôhei Kiyasu
|
||||
Kenji Utsumi
|
||||
Ayumu Murase
|
||||
@ -1162,15 +1163,14 @@ John Rafter Lee
|
||||
Shigeru Matsuzaki
|
||||
Akiko Nakamura
|
||||
Toshiko Fujita
|
||||
Keiko Han
|
||||
Aya Suzaki
|
||||
Atsushi Tamaru
|
||||
Yûki Kaneko
|
||||
Lilas Ikuta
|
||||
Ano
|
||||
Erin Yvette
|
||||
Stephen Fu
|
||||
Grace Lu
|
||||
Yumi Kawai
|
||||
Mizuki Yoshida
|
||||
Yôichirô Saitô
|
||||
Kengo Kawanishi
|
||||
Shun Oguri
|
||||
Haruma Miura
|
||||
@ -1276,9 +1276,6 @@ Kirby Morrow
|
||||
Shôta Sometani
|
||||
Tetsurô Sagawa
|
||||
Gorô Naya
|
||||
Debora Rabbai
|
||||
Greg Wolfe
|
||||
Elisa Wain
|
||||
Shôtarô Morikubo
|
||||
Ryuji Aigase
|
||||
Takehito Koyasu
|
||||
@ -2372,7 +2369,7 @@ Akshay Kumar
|
||||
Sonu Sood
|
||||
Forest Whitaker
|
||||
Gong Chan-shik
|
||||
Won Tae Min
|
||||
Won Tae-min
|
||||
Do Woo
|
||||
Dominic Cooper
|
||||
Tyrone Power
|
||||
@ -3119,9 +3116,6 @@ Joe Rogan
|
||||
Kochi
|
||||
Avijit Halder
|
||||
Shanti Das
|
||||
Kerem Bürsin
|
||||
Dan Babic
|
||||
Adrianna Costa
|
||||
Dan Cohen
|
||||
Louise Dueno
|
||||
Nell Hardie
|
||||
@ -3136,9 +3130,6 @@ Joel Sartore
|
||||
Shndar Noradin Ali
|
||||
Ahmed Noradin Ali
|
||||
Noradin Khalaf Ali
|
||||
Jerry Springer
|
||||
Angel Anes
|
||||
Maria Gara
|
||||
Peter Andre
|
||||
Dominic Applewhite
|
||||
Bruce Dickinson
|
||||
@ -3372,10 +3363,6 @@ Eloise Eonnet
|
||||
Hadrian Dagannaud-Brouard
|
||||
Caitlin Zerra Rose
|
||||
Roy Wood Jr.
|
||||
Shirley Alvarez
|
||||
Melissa Arroyo
|
||||
Sarah Ashworth
|
||||
Joe Francis
|
||||
Cory Lane
|
||||
Taryn Carter
|
||||
Jacqueline Lovell
|
||||
@ -3386,9 +3373,6 @@ Vitaly S. Alexius
|
||||
Jeff Dunham
|
||||
Joel Johansson
|
||||
Jason 'Wee Man' Acuña
|
||||
Shah Rukh Khan
|
||||
Saif Ali Khan
|
||||
Rajkumar Hirani
|
||||
Derren Brown
|
||||
Richard Pope
|
||||
Simon Tcherniak
|
||||
|
|
@ -456,7 +456,6 @@ Satoshi Nishimura
|
||||
Susumu Mitsunaka
|
||||
Naoki Miyahara
|
||||
Osamu Dezaki
|
||||
Jun'ichi Satô
|
||||
Tomoyuki Kurokawa
|
||||
Kiyotaka Oshiyama
|
||||
Kenji Kamiyama
|
||||
@ -480,7 +479,6 @@ Yûsuke Yamamoto
|
||||
Takayuki Hirao
|
||||
Gorô Miyazaki
|
||||
Sunao Katabuchi
|
||||
Toshiya Shinohara
|
||||
Mayumi Nishimoto
|
||||
Hiroaki Akagi
|
||||
Taichi Ishidate
|
||||
@ -509,12 +507,10 @@ Shinji Itadaki
|
||||
Junji Shimizu
|
||||
Naoko Kusumi
|
||||
Mizuho Nishikubo
|
||||
See production info at IMDbPro
|
||||
Yasunao Aoki
|
||||
Nana Harada
|
||||
Hiroshi Haraguchi
|
||||
Keisuke Inoue
|
||||
Iku Suzuki
|
||||
Hirotsugu Kawasaki
|
||||
Norihiko Sutô
|
||||
Atsushi Takeuchi
|
||||
@ -856,7 +852,6 @@ David McMahon
|
||||
Brian Flemming
|
||||
Gaurav Jani
|
||||
Gabriela Cowperthwaite
|
||||
Jillian Schlesinger
|
||||
Chuan Lu
|
||||
Raoul Peck
|
||||
Joshua Oppenheimer
|
||||
@ -1356,7 +1351,6 @@ Anders Østergaard
|
||||
Brett Harvey
|
||||
Zana Briski
|
||||
Ross Kauffman
|
||||
Steve Meyer
|
||||
Michael Rossato-Bennett
|
||||
Adnan Al-Kaissy
|
||||
Lou Albano
|
||||
@ -1364,10 +1358,6 @@ Muhammad Ali
|
||||
Franny Armstrong
|
||||
John Scheinfeld
|
||||
Karzan Kardozi
|
||||
Panagiotis Tsartsianidis
|
||||
Jerry Springer
|
||||
Angel Anes
|
||||
Maria Gara
|
||||
Chris Howe
|
||||
Sam Wrench
|
||||
Bradley Cooper
|
||||
@ -1476,8 +1466,6 @@ Etan Cohen
|
||||
Johnny Knoxville
|
||||
Steve-O
|
||||
Chris Pontius
|
||||
Scott Cope
|
||||
Joe Francis
|
||||
Jason Williams
|
||||
Glenn Weiss
|
||||
Nick Sweeney
|
||||
@ -1490,8 +1478,6 @@ Richard A. Preuss
|
||||
Joel Johansson
|
||||
Bill Chapman
|
||||
Jason 'Wee Man' Acuña
|
||||
Shah Rukh Khan
|
||||
Saif Ali Khan
|
||||
Troy Miller
|
||||
Spencer Davis Gray
|
||||
Ricardo Benítez Garrido
|
||||
|
|
@ -51,6 +51,7 @@ Tragedy
|
||||
Slapstick
|
||||
Globetrotting Adventure
|
||||
Spy
|
||||
Political Thriller
|
||||
Psychological Thriller
|
||||
Caper
|
||||
Docudrama
|
||||
@ -88,7 +89,6 @@ Hand-Drawn Animation
|
||||
Pop Musical
|
||||
Buddy Cop
|
||||
Survival
|
||||
Political Thriller
|
||||
Teen Fantasy
|
||||
Spaghetti Western
|
||||
Western
|
||||
|
|
File diff suppressed because one or more lines are too long
Binary file not shown.
4
src/app/how to run.txt
Normal file
4
src/app/how to run.txt
Normal file
@ -0,0 +1,4 @@
|
||||
Change into the src folder: cd src
|
||||
python app/main.py
|
||||
|
||||
Example: C:\UAM-Repos\PJN-PROJEKT\src> python app/main.py
|
@ -14,10 +14,14 @@
|
||||
{% for recommendation in recommendations %}
|
||||
<li class="list-group-item">
|
||||
<h5><strong>{{ recommendation.title }}</strong></h5>
|
||||
<p><strong>Rating:</strong> {{ recommendation.rating }}</p>
|
||||
<p><strong>Release Date:</strong> {{ recommendation.release_date }}</p>
|
||||
<p><strong>Director:</strong> {{ recommendation.directors }}</p>
|
||||
<p><strong>Cast:</strong> {{ recommendation.stars }}</p>
|
||||
<p><strong>Genres:</strong> {{ recommendation.genres }}</p>
|
||||
<p><strong>Description:</strong> {{ recommendation.description }}</p>
|
||||
<p><strong>Duration:</strong> {{ recommendation.duration_minutes }} minutes</p>
|
||||
<p><strong>Rating:</strong> {{ recommendation.rating }}</p>
|
||||
|
||||
<p>
|
||||
<strong>More Info:</strong>
|
||||
<a href="{{ recommendation.url }}" target="_blank" class="link-primary">View Details</a>
|
||||
|
@ -1,42 +1,100 @@
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
import pandas as pd
|
||||
import pickle
|
||||
from transformers import BertTokenizer, BertModel
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from process_input.analyze_input import analyze_users_input
|
||||
|
||||
|
||||
def load_embeddings(embeddings_path):
    """Load precomputed embeddings from a pickle file.

    Args:
        embeddings_path: Path of the pickle file to read (opened in binary mode).

    Returns:
        The object that was pickled into the file, returned as-is.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at embedding files generated by this project.
    with open(embeddings_path, 'rb') as f:
        return pickle.load(f)
|
||||
|
||||
|
||||
def enhance_user_input(user_input):
    """Rewrite a free-text query as a sentence built from its extracted metadata.

    The query is passed to ``analyze_users_input``; each non-empty field of the
    result ('actors', 'directors', 'genres', 'years', 'duration') contributes
    one phrase. The phrases are joined with single spaces.

    Args:
        user_input: Raw query text typed by the user.

    Returns:
        The space-joined phrases, or ``user_input`` unchanged when no field
        produced a phrase.
    """
    extracted = analyze_users_input(user_input)
    phrases = []

    # Fields holding several names share the same "<lead-in> a and b" wording.
    name_fields = (
        ('actors', 'featuring'),
        ('directors', 'directed by'),
        ('genres', 'in the genre of'),
    )
    for key, lead_in in name_fields:
        names = extracted[key]
        if names:
            phrases.append(f"{lead_in} {' and '.join(names)}")

    year_range = extracted['years']
    if year_range:
        start, end = year_range
        phrases.append(f"released between {start} and {end}")

    max_minutes = extracted['duration']
    if max_minutes:
        phrases.append(f"with a runtime under {max_minutes} minutes")

    # Debug trace: the log line separates phrases with commas, the returned
    # text with spaces.
    print(f"Enhanced Input: {user_input} -> {', '.join(phrases)}")

    # NOTE(review): when any phrase is produced, the rest of the user's
    # wording is dropped from the returned text -- confirm this is intended.
    return " ".join(phrases) or user_input
|
||||
|
||||
|
||||
def generate_user_embedding(user_input, model_type, model, tokenizer=None):
    """Embed the user's query after enriching it with extracted metadata.

    The query is first rewritten by ``enhance_user_input``; the enhanced text
    (not the raw query) is what gets embedded, in both model branches.

    Args:
        user_input: Raw query text typed by the user.
        model_type: 'bert', 'sentence-transformer' or 'sentence-bert'.
        model: Model object; called as ``model(**inputs)`` for 'bert' and as
            ``model.encode(...)`` for the sentence-transformer types.
        tokenizer: Tokenizer callable, required only when ``model_type`` is
            'bert'.

    Returns:
        For 'bert': the hidden state at token position 0 as a NumPy array.
        Otherwise: whatever ``model.encode(..., convert_to_tensor=False)``
        returns.

    Raises:
        ValueError: If ``model_type`` is 'bert' without a tokenizer, or if
            ``model_type`` is not one of the supported values.
    """
    enhanced_input = enhance_user_input(user_input)
    print(f"Enhanced Input for Embedding: {enhanced_input}")

    if model_type == 'bert':
        if tokenizer is None:
            raise ValueError("Tokenizer is required for BERT model.")
        # Tokenize the enhanced text only; the earlier call on the raw
        # user_input was overwritten immediately and has been dropped.
        inputs = tokenizer(
            enhanced_input,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length',
        )
        outputs = model(**inputs)
        # Position 0 of the sequence (presumably the [CLS] token for BERT
        # tokenizers) is used as the sentence vector.
        return outputs.last_hidden_state[:, 0, :].squeeze(0).detach().numpy()

    if model_type in ('sentence-transformer', 'sentence-bert'):
        # Previously a stale `return model.encode(user_input, ...)` preceded
        # this line and made the enhanced-input encoding unreachable.
        return model.encode(enhanced_input, convert_to_tensor=False)

    raise ValueError("Invalid model type. Choose 'bert', 'sentence-transformer', or 'sentence-bert'.")
|
||||
|
||||
|
||||
def _listlike_to_text(value):
    """Render a stringified list such as "['a', 'b']" as "a, b"; pass anything else through."""
    import ast

    if isinstance(value, str) and value.startswith('['):
        try:
            # literal_eval only accepts Python literals, so cell text is never executed.
            return ', '.join(map(str, ast.literal_eval(value)))
        except (ValueError, SyntaxError, TypeError):
            return value
    return value


def recommend_movies(user_input, df, embeddings, model_type, model, tokenizer=None):
    """Return the ten best-scoring movies for a free-text query.

    The score of each row is its cosine similarity to the query embedding
    multiplied by a weight. The weight starts at 1.0 and grows by 8 for every
    requested actor, director or genre found in the row, by 8 when the row's
    release year lies inside the requested range, and by 4 when its duration
    does not exceed the requested duration.

    Args:
        user_input: Raw query text typed by the user.
        df: Movie DataFrame. Reads the 'stars', 'directors', 'genres',
            'release_date' and 'duration_minutes' columns; a 'similarity'
            column holding the final score is written to it in place.
        embeddings: Row embeddings to compare the query against (assumed to be
            in the same order as ``df`` rows -- TODO confirm).
        model_type: Forwarded to ``generate_user_embedding``.
        model: Forwarded to ``generate_user_embedding``.
        tokenizer: Forwarded to ``generate_user_embedding``.

    Returns:
        DataFrame with the 10 highest-scoring rows and the columns 'title',
        'rating', 'genres', 'release_date', 'directors', 'description',
        'duration_minutes', 'stars', 'url' and 'similarity'. Stringified lists
        in 'genres', 'directors' and 'stars' are rendered as comma-separated
        text.
    """
    user_embedding = generate_user_embedding(user_input, model_type, model, tokenizer)
    similarities = cosine_similarity([user_embedding], embeddings).flatten()

    analyzed_data = analyze_users_input(user_input)
    weights = pd.Series(1.0, index=df.index)

    # (DataFrame column, key in the analysed query) pairs sharing one rule.
    text_matches = (
        ('stars', 'actors'),
        ('directors', 'directors'),
        ('genres', 'genres'),
    )
    for column, key in text_matches:
        for term in analyzed_data[key] or ():
            # regex=False: names such as "Roy Wood Jr." must match literally,
            # not be interpreted as regular expressions.
            weights += df[column].str.contains(term, case=False, na=False, regex=False) * 8

    if analyzed_data['years']:
        start_year, end_year = map(int, analyzed_data['years'])
        # Unparseable release dates become year 0 and so never fall in range.
        df_years = pd.to_numeric(df['release_date'], errors='coerce').fillna(0).astype(int)
        weights += ((df_years >= start_year) & (df_years <= end_year)) * 8

    if analyzed_data['duration']:
        weights += (df['duration_minutes'] <= analyzed_data['duration']) * 4

    # NOTE(review): a negative cosine similarity is pushed further down by a
    # larger weight -- confirm that is acceptable for metadata matches.
    final_scores = similarities * weights
    df['similarity'] = final_scores

    # Copy the slice so the column rewrites below act on an independent frame.
    recommendations = df.sort_values(by='similarity', ascending=False).head(10).copy()
    for column in ('genres', 'directors', 'stars'):
        recommendations[column] = recommendations[column].apply(_listlike_to_text)

    return recommendations[[
        'title', 'rating', 'genres', 'release_date', 'directors',
        'description', 'duration_minutes', 'stars', 'url', 'similarity',
    ]]
|
||||
|
@ -30,7 +30,7 @@ def prepare_set_with_regex(data_path, save_path=None):
|
||||
actors_set.append(actor)
|
||||
|
||||
if save_path is not None:
|
||||
with open(save_path, "w", newline="") as file:
|
||||
with open(save_path, "w", newline="", encoding="utf-8") as file:
|
||||
writer = csv.writer(file)
|
||||
|
||||
for actor in actors_set:
|
||||
@ -39,6 +39,7 @@ def prepare_set_with_regex(data_path, save_path=None):
|
||||
return
|
||||
|
||||
|
||||
|
||||
def prepare_set_from_list(data_path, save_path=None):
|
||||
df = pd.read_csv(data_path, converters={
|
||||
'stars': ast.literal_eval,
|
||||
|
@ -14,11 +14,13 @@ def prepare_set_from_list(data_path, save_path=None):
|
||||
directors_set.append(director)
|
||||
|
||||
if save_path is not None:
|
||||
with open(save_path, "w", newline="") as file:
|
||||
with open(save_path, "w", newline="", encoding="utf-8") as file:
|
||||
writer = csv.writer(file)
|
||||
for director in directors_set:
|
||||
writer.writerow([director])
|
||||
|
||||
|
||||
data_path = "../../data/movies_data.csv"
|
||||
save_path = "../../data/directors_set.csv"
|
||||
save_path = "../../data/directors_set.csv"
|
||||
|
||||
prepare_set_from_list(data_path, save_path)
|
@ -27,7 +27,7 @@ def prepare_set_with_regex(data_path, save_path=None):
|
||||
genres_set.append(genre)
|
||||
|
||||
if save_path is not None:
|
||||
with open(save_path, "w", newline="") as file:
|
||||
with open(save_path, "w", newline="", encoding="utf-8") as file:
|
||||
writer = csv.writer(file)
|
||||
for genre in genres_set:
|
||||
writer.writerow([genre])
|
||||
@ -47,7 +47,7 @@ def prepare_set_from_list(data_path, save_path=None):
|
||||
genres_set.append(genre)
|
||||
|
||||
if save_path is not None:
|
||||
with open(save_path, "w", newline="") as file:
|
||||
with open(save_path, "w", newline="", encoding="utf-8") as file:
|
||||
writer = csv.writer(file)
|
||||
for genre in genres_set:
|
||||
writer.writerow([genre])
|
||||
|
@ -1,8 +1,7 @@
|
||||
import pandas as pd
|
||||
|
||||
def convert_duration_to_minutes(duration):
|
||||
|
||||
if not isinstance(duration, str):
|
||||
if not isinstance(duration, str):
|
||||
return None
|
||||
try:
|
||||
parts = duration.split()
|
||||
@ -10,39 +9,47 @@ def convert_duration_to_minutes(duration):
|
||||
minutes = int(parts[1][:-1]) if len(parts) > 1 and 'm' in parts[1] else 0
|
||||
return hours * 60 + minutes
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def format_field(field):
    """Render a list, or the string form of a list, as comma-separated text.

    Args:
        field: A list, a string such as "['a', 'b']", or any other value.

    Returns:
        ``", "``-joined items when ``field`` is a list or a string that parses
        as a Python literal starting with '['; otherwise ``field`` unchanged
        (including strings that start with '[' but are not valid literals).
    """
    import ast

    if isinstance(field, list):
        return ", ".join(map(str, field))
    if isinstance(field, str) and field.startswith('['):
        try:
            # literal_eval accepts only Python literals, so cell text coming
            # from the CSV can never be executed the way eval() would.
            parsed = ast.literal_eval(field)
        except (ValueError, SyntaxError, TypeError):
            return field
        return ", ".join(map(str, parsed))
    return field
|
||||
|
||||
|
||||
|
||||
def load_and_preprocess(data_path, save_path=None):
|
||||
print("Loading data...")
|
||||
|
||||
df = pd.read_csv(data_path)
|
||||
|
||||
print("Preprocessing data...")
|
||||
|
||||
|
||||
df['duration_minutes'] = df['duration'].apply(convert_duration_to_minutes)
|
||||
|
||||
df = df[df['duration_minutes'].notnull() & (df['duration_minutes'] >= 60)]
|
||||
|
||||
columns_to_parse = ['genres', 'directors', 'stars', 'keywords']
|
||||
for column in columns_to_parse:
|
||||
df[column] = df[column].apply(eval)
|
||||
|
||||
|
||||
df[column] = df[column].apply(eval).apply(format_field)
|
||||
|
||||
df['description'] = df['description'].fillna('').astype(str)
|
||||
df['storyline'] = df['storyline'].fillna('').astype(str)
|
||||
|
||||
|
||||
df['combined_text'] = (
|
||||
df['description'] + " " +
|
||||
df['storyline'] + " " +
|
||||
df['keywords'].apply(lambda x: " ".join(x))
|
||||
df['description'] + " " +
|
||||
df['storyline'] + " " +
|
||||
df['keywords'] + " " +
|
||||
"featuring " + df['stars'] + " " +
|
||||
"directed by " + df['directors'] + " " +
|
||||
"in the genre of " + df['genres'] + " " +
|
||||
"released in " + df['release_date'].astype(str) + " " +
|
||||
"with a runtime of " + df['duration_minutes'].astype(str) + " minutes"
|
||||
)
|
||||
|
||||
|
||||
df = df.drop(columns=['storyline', 'keywords', 'duration'])
|
||||
|
||||
|
||||
print("Preprocessing complete.")
|
||||
|
||||
if save_path:
|
||||
@ -51,10 +58,12 @@ def load_and_preprocess(data_path, save_path=None):
|
||||
|
||||
return df
|
||||
|
||||
data_path = "../../data/movies_data.csv"
|
||||
save_path = "../../data/preprocessed_data.csv"
|
||||
|
||||
data_path = "../../data/movies_data.csv"
|
||||
save_path = "../../data/preprocessed_data.csv"
|
||||
|
||||
df = load_and_preprocess(data_path, save_path)
|
||||
|
||||
# Uncomment to preview the preprocessed data
|
||||
# print("\nPreprocessed Data Preview:")
|
||||
# print(df.head())
|
||||
# print(df.head())
|
||||
|
@ -1,22 +1,22 @@
|
||||
import spacy
|
||||
import pandas as pd
|
||||
from src.process_input.get_duration import get_movie_duration
|
||||
from src.process_input.get_genres import get_movie_genres
|
||||
from src.process_input.get_people import get_actor_or_director
|
||||
from src.process_input.get_years import get_movie_years
|
||||
from process_input.get_duration import get_movie_duration
|
||||
from process_input.get_genres import get_movie_genres
|
||||
from process_input.get_people import get_actor_or_director
|
||||
from process_input.get_years import get_movie_years
|
||||
|
||||
|
||||
def load_data():
|
||||
sets = {}
|
||||
df = pd.read_csv('../../data/directors_set.csv', header=None)
|
||||
df = pd.read_csv('../data/directors_set.csv', header=None)
|
||||
df.columns = ['Name']
|
||||
sets["directors"] = df['Name'].str.lower().tolist()
|
||||
|
||||
df = pd.read_csv('../../data/actors_set.csv', header=None)
|
||||
df = pd.read_csv('../data/actors_set.csv', header=None)
|
||||
df.columns = ['Name']
|
||||
sets["actors"] = df['Name'].str.lower().tolist()
|
||||
|
||||
df = pd.read_csv('../../data/genres_set.csv', header=None)
|
||||
df = pd.read_csv('../data/genres_set.csv', header=None)
|
||||
df.columns = ['Name']
|
||||
sets["genres"] = df['Name'].str.lower().tolist()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user