Add scripts for scraping

nlitkowski 2021-06-21 23:18:28 +02:00
parent 01350f361d
commit 7b9b02f5fc
5 changed files with 107 additions and 1 deletion

3
.gitignore vendored

@@ -1 +1,2 @@
-**/.vscode/*
+**/.vscode/*
+**.log

BIN
binaries/geckodriver.exe Normal file

Binary file not shown.

0
output/out_en.txt Normal file

0
output/out_hr.txt Normal file

105
[Python scraper script] Normal file

@@ -0,0 +1,105 @@
import os
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

# Hotel listing pages to scrape (Croatian version of the site).
SITES = [
    "https://www.esky.hr/hoteli/ci/spu/hoteli-split",
    "https://www.esky.hr/hoteli/ci/zag/hoteli-zagreb",
]
BASE_LINK = "https://www.esky.hr"
OUTPUT_DIR = "output"
OUT_FILE_NAME_HR = "out_hr.txt"
OUT_FILE_NAME_EN = "out_en.txt"
LINE_SEP = "\n"
WD_DELAY = 1  # Selenium explicit-wait timeout, in seconds


def main():
    res = []
    for s in SITES:
        res.extend(scrape_list(s))

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Write the Croatian and English descriptions line-aligned across the two files.
    with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR), "w", encoding="utf-8") as f_hr, \
            open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN), "w", encoding="utf-8") as f_en:
        for h, e in res:
            f_hr.write(h + LINE_SEP)
            f_en.write(e + LINE_SEP)


def transform_link(link: str) -> str:
    """Map a Croatian esky.hr hotel URL to its English esky.com counterpart."""
    return link.replace("esky.hr/hoteli", "esky.com/hotels")


def get_soup_text(soup: BeautifulSoup) -> str:
    """Extract the plain-text hotel description from a parsed page."""
    return soup.find('dd', {'class': 'hotel-description'}).get_text()


def wait_for_class(driver, class_name: str) -> None:
    """Block until an element with the given class appears, or raise with context."""
    try:
        WebDriverWait(driver, WD_DELAY).until(
            EC.presence_of_element_located((By.CLASS_NAME, class_name)))
    except TimeoutException:
        raise Exception(
            f"Timed out waiting for '{class_name}' on {driver.current_url}")


def scrape_list(website_url):
    opts = Options()
    opts.headless = True  # run Firefox without a visible window
    opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
    dr_path = os.path.join(os.getcwd(), "binaries", "geckodriver.exe")
    driver = webdriver.Firefox(options=opts, executable_path=dr_path)

    res = []
    try:
        driver.get(website_url)
        wait_for_class(driver, 'name-link')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        hotels = soup.find_all('a', {'class': 'name-link'})

        for h in hotels:
            if not h.has_attr('href'):
                continue
            href = h['href']
            # Relative links need the site prefix before they can be fetched.
            if not re.match(r"^https?://", href):
                href = BASE_LINK + href

            # Croatian description.
            driver.get(href)
            wait_for_class(driver, 'hotel-description')
            text_hr = get_soup_text(BeautifulSoup(driver.page_source, 'html.parser'))

            # English description from the transformed esky.com URL.
            driver.get(transform_link(href))
            wait_for_class(driver, 'hotel-description')
            text_en = get_soup_text(BeautifulSoup(driver.page_source, 'html.parser'))

            res.append((text_hr, text_en))
    finally:
        # Always shut the browser down, even when a wait times out.
        driver.quit()
    return res

if __name__ == "__main__":
main()
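
For reference, a minimal sketch of how the new script could be exercised from another module. The committed file's name is not shown in this diff, so the module name `scraper` below is hypothetical, and the sketch assumes Firefox and binaries/geckodriver.exe are in place as the code expects.

# Hypothetical usage sketch: "scraper" stands in for the committed
# script's (unshown) module name.
from scraper import scrape_list, transform_link

# Scrape one listing page and show the first few (hr, en) description pairs.
pairs = scrape_list("https://www.esky.hr/hoteli/ci/spu/hoteli-split")
for text_hr, text_en in pairs[:3]:
    print(text_hr[:60], "||", text_en[:60])

# The URL transformation alone needs no browser:
print(transform_link("https://www.esky.hr/hoteli/ci/spu/hoteli-split"))
# -> https://www.esky.com/hotels/ci/spu/hoteli-split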