- Implemented get_movie_data.py - Saved example of movie data - Changed columns for movie data - Added new packages (pandas, tabulate)
This commit is contained in:
parent
5d6b2c4427
commit
8f42247244
56
data/movie_data.json
Normal file
56
data/movie_data.json
Normal file
@ -0,0 +1,56 @@
|
||||
{
|
||||
"title":{
|
||||
"0":"The Dark Knight"
|
||||
},
|
||||
"release_date":{
|
||||
"0":"2008"
|
||||
},
|
||||
"main_genres":{
|
||||
"0":[
|
||||
"Action"
|
||||
]
|
||||
},
|
||||
"sub_genres":{
|
||||
"0":[
|
||||
"Action Epic",
|
||||
"Epic",
|
||||
"Superhero",
|
||||
"Crime",
|
||||
"Drama",
|
||||
"Thriller"
|
||||
]
|
||||
},
|
||||
"directors":{
|
||||
"0":[
|
||||
"Christopher Nolan"
|
||||
]
|
||||
},
|
||||
"stars":{
|
||||
"0":[
|
||||
"Christian Bale",
|
||||
"Heath Ledger",
|
||||
"Aaron Eckhart"
|
||||
]
|
||||
},
|
||||
"rating":{
|
||||
"0":9.0
|
||||
},
|
||||
"duration":{
|
||||
"0":"2h 32m"
|
||||
},
|
||||
"description":{
|
||||
"0":"When a menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman, James Gordon and Harvey Dent must work together to put an end to the madness."
|
||||
},
|
||||
"storyline":{
|
||||
"0":"Gotham's new District Attorney has been elected. His name is Harvey Dent, and he has a radical new agenda that threatens to take down Gotham's organized crime underworld once and for all with an iron fist. But the emergence of the rogue vigilante known as Batman has caused problems for Dent and his agenda. A new criminal mastermind known only as \"The Joker\" has arrived and aims to take Gotham out from under Harvey Dent's iron fist. The Joker stages a masterfully planned bank robbery and robs the Gotham mob blind. He uses this money to stage a series of horrific and strategic attacks against the city and its people, each one carefully planned and aimed at Dent and Batman, while causing the rest of the city to enter panic mode. Meanwhile, Batman thinks he might have found a lead to The Joker thanks to Wayne Enterprises' dealings with a shady Chinese banker, and that takes Batman and Alfred to Hong Kong. The Joker has no rules, but Batman has only one, and the Joker aims to make Batman break his only rule. But who will be the one to take him out, will it be rogue vigilante Batman, or will it be elected official Harvey Dent, the new hero with a face?"
|
||||
},
|
||||
"keywords":{
|
||||
"0":[
|
||||
"psychopath",
|
||||
"superhero",
|
||||
"moral dilemma",
|
||||
"clown",
|
||||
"criminal mastermind"
|
||||
]
|
||||
}
|
||||
}
|
BIN
requirements.txt
BIN
requirements.txt
Binary file not shown.
@ -1,2 +1,15 @@
|
||||
# HTTP headers the scraper passes as ``headers=`` to ``requests.get``.
# The value is a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
}

# Prefix for every file the scrapers write. File names are appended with
# plain string concatenation, so the trailing "/" is required.
data_save_location = "data/"

# Column names, in order, of the one-row DataFrame built for each movie.
movie_columns = [
    "title",
    "release_date",
    "main_genres",
    "sub_genres",
    "directors",
    "stars",
    "rating",
    "duration",
    "description",
    "storyline",
    "keywords",
]
|
183
src/data_scrapper/get_movie_data.py
Normal file
183
src/data_scrapper/get_movie_data.py
Normal file
@ -0,0 +1,183 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import config
|
||||
import json
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
"""
|
||||
QUICK INFO
|
||||
returns DataFrame with columns from config.movie_columns
|
||||
|
||||
url - url of movie
|
||||
main_genre - main genre of movie
|
||||
save=False - if True save to file movie_data.json
|
||||
iter - number of tries already made to get the data (used for retries)
|
||||
|
||||
MAXIMUM NUMBER OF TRIES = 3
|
||||
Gets data for one movie from IMDb
|
||||
Returns a DataFrame with columns from config.movie_columns
|
||||
"""
|
||||
|
||||
|
||||
# Maximum number of complete scrape attempts before giving up.
_MAX_TRIES = 3
# Only the first few non-empty keywords of the keywords page are kept.
_MAX_KEYWORDS = 5
# Seconds to wait for a response; without a timeout requests can block forever.
_REQUEST_TIMEOUT = 10
# A bare four-digit year, e.g. "2008".
_YEAR_RE = re.compile(r"^[0-9]{4}$")
# Runtime such as "2h 32m", "2h" or "45m" (callers must reject empty text).
_DURATION_RE = re.compile(r"^(?:[0-9]+h)?\s*(?:[0-9]+m)?$")


def _fetch_soup(page_url):
    """Download ``page_url`` with the configured headers and parse the HTML."""
    response = requests.get(
        page_url, headers=config.headers, timeout=_REQUEST_TIMEOUT
    )
    return BeautifulSoup(response.text, 'html.parser')


def _parse_main_page(soup, main_genre):
    """Extract the main-page fields; return None if an expected element is missing."""
    main_genres = [main_genre]
    record = {"main_genres": main_genres}

    # Title
    title = soup.find("h1", {"data-testid": "hero__pageTitle"})
    if title is None:
        return None
    record["title"] = title.text.strip()

    # Release date and duration share one inline list below the title.
    # NOTE(review): the "sc-..." class names here and for the rating look
    # auto-generated and may change between site builds - confirm.
    details_row = soup.find(
        "ul",
        {"role": "presentation", "class": "ipc-inline-list ipc-inline-list--show-dividers sc-ec65ba05-2 joVhBE baseAlt"},
    )
    if details_row is None:
        return None
    for detail in details_row.find_all("li"):
        text = detail.text.strip()
        if _YEAR_RE.match(text):
            record["release_date"] = text
        elif text and _DURATION_RE.match(text):
            record["duration"] = text

    # Genres: every chip that is not the main genre counts as a sub-genre.
    genres_row = soup.find("div", {"data-testid": "interests"})
    if genres_row is None:
        return None
    chips = genres_row.find_all("a", {"class": "ipc-chip ipc-chip--on-baseAlt"})
    chip_texts = (chip.text.strip() for chip in chips)
    record["sub_genres"] = [
        genre for genre in chip_texts if genre not in main_genres
    ]

    # Rating
    rating = soup.find("span", {"class": "sc-d541859f-1 imUuxf"})
    if rating is None:
        return None
    try:
        record["rating"] = float(rating.text.strip())
    except ValueError:
        # Unparsable rating text is treated like a missing element.
        return None

    # Description
    description = soup.find("span", {"data-testid": "plot-xs_to_m"})
    if description is None:
        return None
    record["description"] = description.text.strip()

    # There are 3 rows on the IMDb page in order: directors, writers, stars.
    cast_rows = soup.find_all(
        "div", "ipc-metadata-list-item__content-container"
    )
    if len(cast_rows) < 3:
        return None
    record["directors"] = [
        link.text.strip() for link in cast_rows[0].find_all("a")
    ]
    record["stars"] = [
        link.text.strip() for link in cast_rows[2].find_all("a")
    ]
    return record


def _parse_storyline(soup):
    """Pick the storyline from the plot-summary page; None if no entries exist."""
    items = soup.find_all("li", {"class": "ipc-metadata-list__item"})
    if not items:
        return None

    texts = []
    for item in items:
        content = item.find("div", {"class": "ipc-html-content-inner-div"})
        if content is None:
            continue
        text = content.text.strip()
        if text:
            texts.append(text)

    # Pick the 2nd longest summary (if only one, pick it; if none, use "").
    if len(texts) > 1:
        storyline = sorted(texts, key=len)[-2]
    else:
        storyline = texts[0] if texts else ""

    # Remove the author credit that follows an em dash.
    return storyline.split("—", 1)[0].strip()


def _parse_keywords(soup):
    """Collect up to _MAX_KEYWORDS keywords; None if the page lists none."""
    links = soup.find_all("a", {"class": "ipc-metadata-list-summary-item__t"})
    if not links:
        return None

    keywords = []
    for link in links:
        text = link.text.strip()
        if text:
            keywords.append(text)
            if len(keywords) == _MAX_KEYWORDS:
                break
    # An empty-string placeholder is stored when every link text was blank.
    return keywords or [""]


def _scrape_once(url, main_genre):
    """Run one full scrape attempt; return the record dict or None on a miss."""
    record = _parse_main_page(_fetch_soup(url), main_genre)
    if record is None:
        return None

    # The movie page lazy-loads the storyline and keywords, so they are read
    # from their own sub-pages. Normalise the trailing slash first so that a
    # URL given without one still yields valid sub-page URLs.
    base_url = url.rstrip("/") + "/"

    storyline = _parse_storyline(_fetch_soup(base_url + "plotsummary/"))
    if storyline is None:
        return None

    keywords = _parse_keywords(_fetch_soup(base_url + "keywords/"))
    if keywords is None:
        return None

    record["storyline"] = storyline
    record["keywords"] = keywords
    return record


def get_movie_data(url, main_genre, save=False, iter=0):
    """Scrape one IMDb movie into a single-row DataFrame.

    Each attempt fetches three pages: the movie page itself, its
    ``plotsummary/`` page and its ``keywords/`` page. If any expected page
    element is missing, the whole scrape is retried until ``_MAX_TRIES``
    attempts have been used.

    Args:
        url: URL of the movie page, e.g.
            ``https://www.imdb.com/title/tt0468569/``. A missing trailing
            slash is tolerated.
        main_genre: Main genre of the movie. Stored as the only entry of
            ``main_genres`` and excluded from ``sub_genres``.
        save: If True, also write the result as JSON to
            ``config.data_save_location + "movie_data.json"``.
        iter: Number of tries already used. The name shadows the builtin but
            is kept so existing keyword callers continue to work.

    Returns:
        A DataFrame with one row (index 0) and the columns listed in
        ``config.movie_columns``. It is returned whether or not ``save`` is
        set. Release date and duration are left empty (NaN) when the details
        list contains no matching entry.

    Raises:
        Exception: "Too many tries." when no attempt found every element.
    """
    record = None
    for _ in range(iter, _MAX_TRIES):
        record = _scrape_once(url, main_genre)
        if record is not None:
            break
    if record is None:
        raise Exception("Too many tries.")

    # Building the frame from a dict keeps list-valued cells (genres, stars,
    # keywords) intact without relying on cell-wise .loc assignment.
    result = pd.DataFrame([record], columns=config.movie_columns)

    if save:
        result.to_json(config.data_save_location + "movie_data.json", indent=4)
    return result
|
||||
|
||||
if __name__ == "__main__":
    # Manual example run: scrape one movie and, because save=True, write it to
    # config.data_save_location + "movie_data.json". The guard keeps a plain
    # import of this module from triggering network requests or overwriting
    # that file.
    get_movie_data(
        "https://www.imdb.com/title/tt0468569/",
        main_genre="Action",
        save=True,
    )
|
||||
|
@ -45,6 +45,4 @@ def get_movies_links_for_interest(url, save=False):
|
||||
with open(config.data_save_location+'movies_links.json', 'w') as f:
|
||||
json.dump(results, f, indent=4)
|
||||
else:
|
||||
return results
|
||||
|
||||
get_movies_links_for_interest("https://www.imdb.com/interest/in0000001/")
|
||||
return results
|
Loading…
Reference in New Issue
Block a user