- Changed the way a movie's description is retrieved - Added a new column (link to the movie) to the data - Added a progress bar to the scraper
This commit is contained in:
parent
25b962a3d3
commit
5fde8c1600
3747
data/interests_movie_links.json
Normal file
3747
data/interests_movie_links.json
Normal file
File diff suppressed because it is too large
Load Diff
5139
data/movies_data.csv
5139
data/movies_data.csv
File diff suppressed because it is too large
Load Diff
@ -10,5 +10,6 @@ movie_columns=[
|
||||
"duration",
|
||||
"description",
|
||||
"storyline",
|
||||
"keywords"
|
||||
"keywords",
|
||||
"url"
|
||||
]
|
@ -4,7 +4,9 @@ import config
|
||||
import json
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
import html
|
||||
"""
|
||||
QUICK INFO
|
||||
returns DataFrame with columns from config.movie_columns
|
||||
@ -24,13 +26,30 @@ Return DataFrame with columns from config.movie_columns
|
||||
def get_movie_data(url, save=False):
|
||||
result=pd.DataFrame(columns=config.movie_columns)
|
||||
not_found_flag=False
|
||||
# Version using requests
|
||||
try:
|
||||
site = requests.get(url, headers=config.headers)
|
||||
except requests.exceptions.ConnectionError:
|
||||
print("Connection error. Trying again")
|
||||
return "CONNECTION_ERROR"
|
||||
|
||||
soup=BeautifulSoup(site.text, 'html.parser')
|
||||
|
||||
"""# Version using selenium
|
||||
try:
|
||||
chrome_options = Options()
|
||||
#chrome_options.add_argument("--headless") # Tryb bez interfejsu graficznego
|
||||
chrome_options.add_argument("--window-size=1920,1080") # Ustaw wymiary okna
|
||||
|
||||
driver = webdriver.Chrome(options=chrome_options)
|
||||
|
||||
driver.get(url)
|
||||
site=driver.page_source
|
||||
soup=BeautifulSoup(site, 'html.parser')
|
||||
except:
|
||||
print("Connection error. Trying again")
|
||||
return "CONNECTION_ERROR"
|
||||
"""
|
||||
|
||||
# Title
|
||||
title=soup.find(
|
||||
"h1",{"data-testid":"hero__pageTitle"}
|
||||
@ -82,13 +101,25 @@ def get_movie_data(url, save=False):
|
||||
not_found_flag=True
|
||||
|
||||
# Find description
|
||||
description=soup.find("span",
|
||||
{"data-testid":"plot-xs_to_m"})
|
||||
if description:
|
||||
description=description.text.strip()
|
||||
# Description is sometimes cut due to its length.
|
||||
# Easiest way to get it all is to look for
|
||||
# script that renders it in desired length
|
||||
# and get it all from variable which stores description.
|
||||
|
||||
try:
|
||||
found_script=soup.find("script", {"type":"application/ld+json"})
|
||||
script=json.loads(found_script.text.strip())
|
||||
description=script.get("description")
|
||||
description=html.unescape(description)
|
||||
except:
|
||||
print("Error getting description")
|
||||
description=""
|
||||
if(description!=""):
|
||||
result.loc[0, "description"]=description
|
||||
else:
|
||||
not_found_flag=True
|
||||
|
||||
|
||||
# Find directors
|
||||
directors=[]
|
||||
stars=[]
|
||||
@ -166,6 +197,9 @@ def get_movie_data(url, save=False):
|
||||
else:
|
||||
not_found_flag=True
|
||||
|
||||
# Set url
|
||||
result.loc[0, "url"]=url
|
||||
|
||||
if not_found_flag:
|
||||
print("One (many) of the movie data not found")
|
||||
return None
|
||||
|
@ -6,13 +6,16 @@ import pandas as pd
|
||||
import config
|
||||
import json
|
||||
import time
|
||||
from tqdm import tqdm
|
||||
|
||||
def imdb_scrapper(default_sleep_time=5):
|
||||
# Try to load already scrapped data
|
||||
try:
|
||||
with open(config.data_save_location+"interests_movie_links.json", 'r') as f:
|
||||
interests_movie_links = json.load(f)
|
||||
print("Loaded already scrapper movie links...")
|
||||
except:
|
||||
print("Getting movie links..")
|
||||
interests=get_interests.get_interests()
|
||||
interests_movie_links={}
|
||||
number_of_genres=len(interests.keys())
|
||||
@ -43,6 +46,7 @@ def imdb_scrapper(default_sleep_time=5):
|
||||
json.dump(interests_movie_links, f, indent=4)
|
||||
|
||||
# Now get data for each movie
|
||||
print("Getting movie data...")
|
||||
movies=pd.DataFrame(columns=config.movie_columns)
|
||||
number_of_genres=len(interests_movie_links.keys())
|
||||
i=0
|
||||
@ -51,7 +55,9 @@ def imdb_scrapper(default_sleep_time=5):
|
||||
# Wait 60s after each genre to avoid rate limit
|
||||
if(i>0):
|
||||
time.sleep(60)
|
||||
for movie_link in interests_movie_links[genre]:
|
||||
movies_count=len(interests_movie_links[genre])
|
||||
j=0
|
||||
for movie_link in tqdm(interests_movie_links[genre], desc=f"Processing {genre}", unit="movie"):
|
||||
# Wait 5s after each movie to avoid rate limit
|
||||
time.sleep(default_sleep_time)
|
||||
movie_data=get_movie_data.get_movie_data(movie_link)
|
||||
@ -89,6 +95,7 @@ def imdb_scrapper(default_sleep_time=5):
|
||||
error_saver.error_data_saver(e, movie_data, movie_link)
|
||||
continue
|
||||
movies.to_csv(config.data_save_location+"movies_data.csv", index=False)
|
||||
j+=1
|
||||
i+=1
|
||||
movies.to_csv(config.data_save_location+"movies_data.csv", index=False)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user