Add scripts for scraping

nlitkowski 2021-06-21 23:18:28 +02:00
parent 01350f361d
commit 7b9b02f5fc
5 changed files with 107 additions and 1 deletion

3
.gitignore vendored

@@ -1 +1,2 @@
-**/.vscode/*
+**/.vscode/*
+**.log

BIN
binaries/geckodriver.exe Normal file

Binary file not shown.

0
output/out_en.txt Normal file

0
output/out_hr.txt Normal file

105
[Python scraper script] Normal file

@@ -0,0 +1,105 @@
import os
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

# Hotel listing pages to scrape (Croatian version of the site).
SITES = [
    "https://www.esky.hr/hoteli/ci/spu/hoteli-split",
    "https://www.esky.hr/hoteli/ci/zag/hoteli-zagreb",
]
BASE_LINK = "https://www.esky.hr"
OUTPUT_DIR = "output"
OUT_FILE_NAME_HR = "out_hr.txt"
OUT_FILE_NAME_EN = "out_en.txt"
LINE_SEP = "\n"
WD_DELAY = 1  # Selenium explicit-wait timeout, in seconds


def main():
    res = []
    for s in SITES:
        res.extend(scrape_list(s))

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Write the Croatian and English descriptions line-aligned across the two files.
    with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR), "w", encoding="utf-8") as f_hr, \
            open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN), "w", encoding="utf-8") as f_en:
        for h, e in res:
            f_hr.write(h + LINE_SEP)
            f_en.write(e + LINE_SEP)


def transform_link(link: str) -> str:
    """Map a Croatian esky.hr hotel URL to its English esky.com counterpart."""
    return link.replace("esky.hr/hoteli", "esky.com/hotels")


def get_soup_text(soup: BeautifulSoup) -> str:
    """Extract the plain-text hotel description from a parsed page."""
    return soup.find('dd', {'class': 'hotel-description'}).get_text()


def wait_for_class(driver, class_name: str) -> None:
    """Block until an element with the given class appears, or raise with context."""
    try:
        WebDriverWait(driver, WD_DELAY).until(
            EC.presence_of_element_located((By.CLASS_NAME, class_name)))
    except TimeoutException:
        raise Exception(
            f"Timed out waiting for '{class_name}' on {driver.current_url}")


def scrape_list(website_url):
    opts = Options()
    opts.headless = True  # run Firefox without a visible window
    opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
    dr_path = os.path.join(os.getcwd(), "binaries", "geckodriver.exe")
    driver = webdriver.Firefox(options=opts, executable_path=dr_path)

    res = []
    try:
        driver.get(website_url)
        wait_for_class(driver, 'name-link')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        hotels = soup.find_all('a', {'class': 'name-link'})

        for h in hotels:
            if not h.has_attr('href'):
                continue
            href = h['href']
            # Relative links need the site prefix before they can be fetched.
            if not re.match(r"^https?://", href):
                href = BASE_LINK + href

            # Croatian description.
            driver.get(href)
            wait_for_class(driver, 'hotel-description')
            text_hr = get_soup_text(BeautifulSoup(driver.page_source, 'html.parser'))

            # English description from the transformed esky.com URL.
            driver.get(transform_link(href))
            wait_for_class(driver, 'hotel-description')
            text_en = get_soup_text(BeautifulSoup(driver.page_source, 'html.parser'))

            res.append((text_hr, text_en))
    finally:
        # Always shut the browser down, even when a wait times out.
        driver.quit()
    return res

if __name__ == "__main__":
main()
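
For reference, a minimal sketch of how the new script could be exercised from another module. The committed file's name is not shown in this diff, so the module name `scraper` below is hypothetical, and the sketch assumes Firefox and binaries/geckodriver.exe are in place as the code expects.

# Hypothetical usage sketch: "scraper" stands in for the committed
# script's (unshown) module name.
from scraper import scrape_list, transform_link

# Scrape one listing page and show the first few (hr, en) description pairs.
pairs = scrape_list("https://www.esky.hr/hoteli/ci/spu/hoteli-split")
for text_hr, text_en in pairs[:3]:
    print(text_hr[:60], "||", text_en[:60])

# The URL transformation alone needs no browser:
print(transform_link("https://www.esky.hr/hoteli/ci/spu/hoteli-split"))
# -> https://www.esky.com/hotels/ci/spu/hoteli-split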