- Implemented get_movie_data.py
- Saved example of movie data
- Changed columns for movie data
- Added new packages (pandas, tabulate)

jakzar 2024-12-23 17:50:03 +01:00
parent 5d6b2c4427
commit 8f42247244
5 changed files with 254 additions and 4 deletions

56
data/movie_data.json Normal file
View File

@ -0,0 +1,56 @@
{
"title":{
"0":"The Dark Knight"
},
"release_date":{
"0":"2008"
},
"main_genres":{
"0":[
"Action"
]
},
"sub_genres":{
"0":[
"Action Epic",
"Epic",
"Superhero",
"Crime",
"Drama",
"Thriller"
]
},
"directors":{
"0":[
"Christopher Nolan"
]
},
"stars":{
"0":[
"Christian Bale",
"Heath Ledger",
"Aaron Eckhart"
]
},
"rating":{
"0":9.0
},
"duration":{
"0":"2h 32m"
},
"description":{
"0":"When a menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman, James Gordon and Harvey Dent must work together to put an end to the madness."
},
"storyline":{
"0":"Gotham's new District Attorney has been elected. His name is Harvey Dent, and he has a radical new agenda that threatens to take down Gotham's organized crime underworld once and for all with an iron fist. But the emergence of the rogue vigilante known as Batman has caused problems for Dent and his agenda. A new criminal mastermind known only as \"The Joker\" has arrived and aims to take Gotham out from under Harvey Dent's iron fist. The Joker stages a masterfully planned bank robbery and robs the Gotham mob blind. He uses this money to stage a series of horrific and strategic attacks against the city and its people, each one carefully planned and aimed at Dent and Batman, while causing the rest of the city to enter panic mode. Meanwhile, Batman thinks he might have found a lead to The Joker thanks to Wayne Enterprises' dealings with a shady Chinese banker, and that takes Batman and Alfred to Hong Kong. The Joker has no rules, but Batman has only one, and the Joker aims to make Batman break his only rule. But who will be the one to take him out, will it be rogue vigilante Batman, or will it be elected official Harvey Dent, the new hero with a face?"
},
"keywords":{
"0":[
"psychopath",
"superhero",
"moral dilemma",
"clown",
"criminal mastermind"
]
}
}
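
The JSON above is pandas' column-oriented to_json output: each top-level key is one of config.movie_columns, mapping the row index "0" to that row's value. A minimal sketch of loading it back, assuming the data_save_location of "data/" (the snippet itself is not part of the commit):

import pandas as pd

# pandas' default orient="columns" matches the dict-of-dicts layout above
movie_df = pd.read_json("data/movie_data.json")

# one row per movie; list-valued cells (genres, stars, keywords) come back as Python lists
print(movie_df.loc[0, "title"], movie_df.loc[0, "rating"])
print(movie_df.loc[0, "sub_genres"])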

Binary file not shown.

config.py
View File

@ -1,2 +1,15 @@
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'}
data_save_location="data/"
movie_columns=[
    "title",
    "release_date",
    "main_genres",
    "sub_genres",
    "directors",
    "stars",
    "rating",
    "duration",
    "description",
    "storyline",
    "keywords"
]

get_movie_data.py Normal file
View File

@ -0,0 +1,183 @@
import requests
from bs4 import BeautifulSoup
import config
import json
import re
import pandas as pd
"""
QUICK INFO
returns DataFrame with columns from config.movie_columns
url - url of movie
main_genre - main genre of movie
save=False - if True save to file movie_data.json
iter - number of done tries to get data
MAXIMUM NUMBER OF TRIES = 3
Gets data for one movie from imdb
Return DataFrame with columns from config.movie_columns
"""
def get_movie_data(url, main_genre, save=False, iter=0):
    if iter > 3:
        raise Exception("Too many tries.")
    result = pd.DataFrame(columns=config.movie_columns)
    result.loc[0, "main_genres"] = [main_genre]
    site = requests.get(url, headers=config.headers)
    soup = BeautifulSoup(site.text, 'html.parser')
    # Title
    title = soup.find(
        "h1", {"data-testid": "hero__pageTitle"}
    )
    if title:
        title = title.text.strip()
        result.loc[0, "title"] = title
    else:
        # Element missing, retry the whole page
        return get_movie_data(url, main_genre, save, iter + 1)
    # Release date and duration
    details_row = soup.find(
        "ul", {"role": "presentation", "class": "ipc-inline-list ipc-inline-list--show-dividers sc-ec65ba05-2 joVhBE baseAlt"}
    )
    if details_row:
        details = details_row.find_all("li")
        for detail in details:
            text = detail.text.strip()
            if re.match(r"^[0-9]{4}$", text):
                result.loc[0, "release_date"] = text
            if re.match(r"^[0-9]*h [0-9]*m", text):
                result.loc[0, "duration"] = text
    else:
        return get_movie_data(url, main_genre, save, iter + 1)
    # Find genres and process them (main and subgenres)
    genres_row = soup.find("div",
        {"data-testid": "interests"}
    )
    if genres_row:
        genres = genres_row.find_all("a",
            {"class": "ipc-chip ipc-chip--on-baseAlt"})
        found_genres = []
        for genre in genres:
            genre_text = genre.text.strip()
            if genre_text not in result.loc[0, "main_genres"]:
                found_genres.append(genre_text)
        result.loc[0, "sub_genres"] = found_genres
    else:
        return get_movie_data(url, main_genre, save, iter + 1)
    # Find rating
    rating = soup.find("span",
        {"class": "sc-d541859f-1 imUuxf"}
    )
    if rating:
        rating = float(rating.text.strip())
        result.loc[0, "rating"] = rating
    else:
        return get_movie_data(url, main_genre, save, iter + 1)
    # Find description
    description = soup.find("span",
        {"data-testid": "plot-xs_to_m"})
    if description:
        description = description.text.strip()
        result.loc[0, "description"] = description
    else:
        return get_movie_data(url, main_genre, save, iter + 1)
    # Find directors and stars
    directors = []
    stars = []
    # There are 3 rows on the IMDB page, in order: directors, writers, stars
    cast_rows = soup.find_all("div",
        "ipc-metadata-list-item__content-container")
    if cast_rows:
        directors_links = cast_rows[0].find_all("a")
        for director in directors_links:
            directors.append(director.text.strip())
        result.loc[0, "directors"] = directors
        stars_links = cast_rows[2].find_all("a")
        for star in stars_links:
            stars.append(star.text.strip())
        result.loc[0, "stars"] = stars
    else:
        return get_movie_data(url, main_genre, save, iter + 1)
    # Find storyline
    # Due to the fact that the movie page has lazy loading,
    # we need to go to the page that contains the plot summary
    plot_summary_url = url + "plotsummary/"
    plot_summary_site = requests.get(plot_summary_url, headers=config.headers)
    plot_summary_soup = BeautifulSoup(plot_summary_site.text, 'html.parser')
    summaries = plot_summary_soup.find_all(
        "li",
        {"class": "ipc-metadata-list__item"}
    )
    if summaries:
        text_summaries = []
        for summary in summaries:
            if summary is not None:
                try:
                    plot_summary = summary.find("div",
                        {"class": "ipc-html-content-inner-div"}).text.strip()
                except AttributeError:
                    continue
                if plot_summary:
                    text_summaries.append(plot_summary)
        # Pick 2nd longest summary (if only one, pick it; if none, pick empty string)
        if len(text_summaries) > 1:
            longest_summary = sorted(text_summaries, key=len)[-2]
        else:
            longest_summary = text_summaries[0] if text_summaries else ""
        # Remove the author of the summary (split on the em dash IMDb puts before the contributor name)
        if "\u2014" in longest_summary:
            longest_summary = longest_summary.split("\u2014")[0].strip()
        result.loc[0, "storyline"] = longest_summary
    else:
        return get_movie_data(url, main_genre, save, iter + 1)
    # Find keywords
    # Also a different URL than the main movie page
    keywords = []
    keywords_url = url + "keywords/"
    keywords_site = requests.get(keywords_url, headers=config.headers)
    keywords_soup = BeautifulSoup(keywords_site.text, 'html.parser')
    all_keywords = keywords_soup.find_all("a",
        {"class": "ipc-metadata-list-summary-item__t"})
    if all_keywords:
        for keyword in all_keywords:
            if keyword:
                try:
                    keyword_text = keyword.text.strip()
                except AttributeError:
                    continue
                # Keep at most the first 5 keywords
                if keyword_text and len(keywords) < 5:
                    keywords.append(keyword_text)
        if len(keywords) == 0:
            keywords = [""]
        result.loc[0, "keywords"] = keywords
    else:
        return get_movie_data(url, main_genre, save, iter + 1)
    if save:
        result.to_json(config.data_save_location + "movie_data.json", indent=4)
    else:
        return result
get_movie_data("https://www.imdb.com/title/tt0468569/",main_genre="Action",save=True)

View File

@ -45,6 +45,4 @@ def get_movies_links_for_interest(url, save=False):
        with open(config.data_save_location+'movies_links.json', 'w') as f:
            json.dump(results, f, indent=4)
    else:
        return results
get_movies_links_for_interest("https://www.imdb.com/interest/in0000001/")
        return results
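
With the module-level call removed, get_movies_links_for_interest only runs when called and returns its results, so the two scrapers can be chained. The sketch below assumes the module names (get_movies_links, get_movie_data) and that the returned links are a list of movie page URLs; neither is confirmed by the fragment shown here:

import pandas as pd

# hypothetical import paths - the commit does not show how these modules are meant to be imported;
# note that importing get_movie_data also runs the example call at the bottom of that file
from get_movies_links import get_movies_links_for_interest
from get_movie_data import get_movie_data

# assumption: a list of movie page URLs for the given interest
links = get_movies_links_for_interest("https://www.imdb.com/interest/in0000001/")

# fetch the first few movies and combine them into one DataFrame
movies = pd.concat(
    (get_movie_data(url, main_genre="Action") for url in links[:5]),
    ignore_index=True,
)
print(movies[["title", "rating"]])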