- Changed the way the movie's description is fetched
- Added a new column (link to the movie) to the data
- Added a progress bar to the scraper

Jakub Zaręba 2025-01-05 13:44:22 +01:00
parent 25b962a3d3
commit 5fde8c1600
5 changed files with 6361 additions and 2583 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -10,5 +10,6 @@ movie_columns=[
    "duration",
    "description",
    "storyline",
    "keywords"
    "keywords",
    "url"
]
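
For orientation, a minimal sketch of what the updated column list implies for the scraper's output. The leading columns are assumed (only the tail of the list is visible in this hunk) and the URL is a placeholder:

# Minimal sketch; leading columns are assumed, only the tail is shown in the hunk above.
import pandas as pd

movie_columns = ["title", "duration", "description", "storyline", "keywords", "url"]
result = pd.DataFrame(columns=movie_columns)
result.loc[0, "url"] = "https://www.imdb.com/title/tt0000000/"  # placeholder value for the new column
print(result.columns.tolist())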

View File

@@ -4,7 +4,9 @@ import config
import json
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import html
"""
QUICK INFO
returns DataFrame with columns from config.movie_columns
@@ -24,13 +26,30 @@ Return DataFrame with columns from config.movie_columns
def get_movie_data(url, save=False):
    result=pd.DataFrame(columns=config.movie_columns)
    not_found_flag=False
    # Version using requests
    try:
        site = requests.get(url, headers=config.headers)
    except requests.exceptions.ConnectionError:
        print("Connection error. Trying again")
        return "CONNECTION_ERROR"
    soup=BeautifulSoup(site.text, 'html.parser')
"""# Version using selenium
try:
chrome_options = Options()
#chrome_options.add_argument("--headless") # Tryb bez interfejsu graficznego
chrome_options.add_argument("--window-size=1920,1080") # Ustaw wymiary okna
driver = webdriver.Chrome(options=chrome_options)
driver.get(url)
site=driver.page_source
soup=BeautifulSoup(site, 'html.parser')
except:
print("Connection error. Trying again")
return "CONNECTION_ERROR"
"""
    # Title
    title=soup.find(
        "h1",{"data-testid":"hero__pageTitle"}
@@ -82,13 +101,25 @@ def get_movie_data(url, save=False):
        not_found_flag=True
    # Find description
    description=soup.find("span",
        {"data-testid":"plot-xs_to_m"})
    if description:
        description=description.text.strip()
    # The description is sometimes truncated because of its length.
    # The easiest way to get the full text is to find the script tag
    # that embeds it and read the description from the JSON it stores.
    try:
        found_script=soup.find("script", {"type":"application/ld+json"})
        script=json.loads(found_script.text.strip())
        description=script.get("description")
        description=html.unescape(description)
    except Exception:
        print("Error getting description")
        description=""
    if description != "":
        result.loc[0, "description"]=description
    else:
        not_found_flag=True
    # Find directors
    directors=[]
    stars=[]
@@ -166,6 +197,9 @@ def get_movie_data(url, save=False):
    else:
        not_found_flag=True
    # Set url
    result.loc[0, "url"]=url
    if not_found_flag:
        print("One or more movie data fields were not found")
        return None
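
The heart of the description change, as a self-contained sketch: rather than scraping the truncated <span>, parse the JSON-LD metadata block that IMDb embeds in the page and read the full, HTML-escaped description from it. The function name and the headers argument are illustrative, not from the repository:

import html
import json

import requests
from bs4 import BeautifulSoup

def fetch_full_description(url, headers=None):  # hypothetical helper, mirrors the diff above
    site = requests.get(url, headers=headers)
    soup = BeautifulSoup(site.text, "html.parser")
    # IMDb embeds page metadata as JSON-LD in a <script> tag in <head>.
    script_tag = soup.find("script", {"type": "application/ld+json"})
    if script_tag is None:
        return ""
    try:
        data = json.loads(script_tag.text.strip())
    except json.JSONDecodeError:
        return ""
    # The description field carries the untruncated text, HTML-escaped.
    return html.unescape(data.get("description", ""))

Called as fetch_full_description(movie_url, headers=config.headers), it returns an empty string on any parse failure, matching the fallback behaviour in the diff.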

View File

@@ -6,13 +6,16 @@ import pandas as pd
import config
import json
import time
from tqdm import tqdm
def imdb_scrapper(default_sleep_time=5):
    # Try to load already scraped data
    try:
        with open(config.data_save_location+"interests_movie_links.json", 'r') as f:
            interests_movie_links = json.load(f)
        print("Loaded already scraped movie links...")
    except (FileNotFoundError, json.JSONDecodeError):
        print("Getting movie links...")
        interests=get_interests.get_interests()
        interests_movie_links={}
        number_of_genres=len(interests.keys())
@@ -43,6 +46,7 @@ def imdb_scrapper(default_sleep_time=5):
        json.dump(interests_movie_links, f, indent=4)
    # Now get data for each movie
    print("Getting movie data...")
    movies=pd.DataFrame(columns=config.movie_columns)
    number_of_genres=len(interests_movie_links.keys())
    i=0
@@ -51,7 +55,9 @@ def imdb_scrapper(default_sleep_time=5):
        # Wait 60s after each genre to avoid rate limit
        if i > 0:
            time.sleep(60)
        for movie_link in interests_movie_links[genre]:
        movies_count=len(interests_movie_links[genre])
        j=0
        for movie_link in tqdm(interests_movie_links[genre], desc=f"Processing {genre}", unit="movie"):
            # Wait 5s after each movie to avoid rate limit
            time.sleep(default_sleep_time)
            movie_data=get_movie_data.get_movie_data(movie_link)
@@ -89,6 +95,7 @@ def imdb_scrapper(default_sleep_time=5):
                error_saver.error_data_saver(e, movie_data, movie_link)
                continue
            movies.to_csv(config.data_save_location+"movies_data.csv", index=False)
            j+=1
        i+=1
    movies.to_csv(config.data_save_location+"movies_data.csv", index=False)
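
Finally, the progress-bar pattern the commit introduces, reduced to a runnable sketch; the links dict and the short sleep are placeholder values standing in for the real scraping work:

import time
from tqdm import tqdm

links = {"comedy": ["url_a", "url_b"], "drama": ["url_c"]}  # hypothetical data
for genre, movie_links in links.items():
    # tqdm wraps the iterable and draws a per-genre bar, so the long
    # per-movie rate-limit sleeps no longer look like a hang.
    for movie_link in tqdm(movie_links, desc=f"Processing {genre}", unit="movie"):
        time.sleep(0.1)  # stands in for the scrape plus rate-limit sleep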