Add scripts for scraping

2021-06-21 23:18:28 +02:00 · 2021-06-21 23:18:28 +02:00 · 7b9b02f5fc
commit 7b9b02f5fc
parent 01350f361d
5 changed files with 107 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
 **/.vscode/*
 **.log
--- a/binaries/geckodriver.exe
+++ b/binaries/geckodriver.exe
--- a/output/out_en.txt
+++ b/output/out_en.txt
--- a/output/out_hr.txt
+++ b/output/out_hr.txt
--- a/src/main.py
+++ b/src/main.py
@ -0,0 +1,105 @@
 import requests
 from bs4 import BeautifulSoup
 import re
 import os
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import TimeoutException
 # Hotel listing pages to scrape; main() passes each one to scrape_list().
 SITES = [
    "https://www.esky.hr/hoteli/ci/spu/hoteli-split",
    "https://www.esky.hr/hoteli/ci/zag/hoteli-zagreb"
 ]
 # Prefix prepended to hotel links that do not start with http:// or https://.
 BASE_LINK = "https://www.esky.hr"
 # Directory (relative to the working directory) that receives the output files.
 OUTPUT_DIR = "output"
 # Receives the description scraped from the original (esky.hr) hotel link.
 OUT_FILE_NAME_HR = "out_hr.txt"
 # Receives the description scraped from the link rewritten by transform_link().
 OUT_FILE_NAME_EN = "out_en.txt"
 # Appended after every description written to either output file.
 LINE_SEP = "\n"
 # Timeout handed to WebDriverWait for each page load (seconds, per Selenium's API).
 WD_DELAY = 1
 def main():
    """Scrape every listing in SITES and write the paired descriptions to disk.

    Each pair returned by scrape_list() is written as one entry in the
    Croatian file and one entry in the English file, in the same order, so
    entry N of one file corresponds to entry N of the other.

    The files are written as UTF-8 explicitly: the platform default encoding
    (a legacy code page on Windows) cannot represent every Croatian letter
    and would either fail or produce a locale-dependent file.
    """
    res = []
    for s in SITES:
        res.extend(scrape_list(s))
    # exist_ok makes re-runs into an already existing output directory a no-op.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    hr_path = os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR)
    en_path = os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN)
    with open(hr_path, "w", encoding="utf-8") as f_hr, \
            open(en_path, "w", encoding="utf-8") as f_en:
        for h, e in res:
            f_hr.write(h + LINE_SEP)
            f_en.write(e + LINE_SEP)
 def transform_link(link: str) -> str:
    """Return *link* with every "esky.hr/hoteli" rewritten to "esky.com/hotels".

    Links that do not contain the Croatian fragment are returned unchanged.
    """
    croatian_fragment = "esky.hr/hoteli"
    english_fragment = "esky.com/hotels"
    return link.replace(croatian_fragment, english_fragment)
 def get_soup_text(soup: BeautifulSoup) -> str:
    """Return the text of the first <dd class="hotel-description"> in *soup*.

    No fallback is applied: if the document has no such element, the lookup
    yields None and the attribute access below raises AttributeError.
    """
    description_tag = soup.find('dd', attrs={'class': 'hotel-description'})
    return description_tag.get_text()
 def _wait_for_class(driver, class_name):
    """Block until an element with *class_name* is present in the current page.

    Raises:
        Exception: with the message "Too long!" when the element does not
            appear within WD_DELAY; the Selenium timeout is kept as the cause.
    """
    try:
        WebDriverWait(driver, WD_DELAY).until(
            EC.presence_of_element_located((By.CLASS_NAME, class_name)))
    except TimeoutException as err:
        raise Exception("Too long!") from err


 def _fetch_description(driver, url):
    """Load *url* in *driver* and return the hotel description text it shows."""
    driver.get(url)
    _wait_for_class(driver, 'hotel-description')
    return get_soup_text(BeautifulSoup(driver.page_source, 'html.parser'))


 def scrape_list(website_url):
    """Scrape the hotel descriptions linked from one listing page.

    Opens *website_url* in a headless Firefox, collects every
    <a class="name-link"> that has an href, and visits each hotel twice:
    once at its original link and once at the link rewritten by
    transform_link().

    Args:
        website_url: URL of the listing page to start from.

    Returns:
        A list of (text_hr, text_en) tuples, one per hotel link, in page order.

    Raises:
        Exception: with the message "Too long!" when a page does not show the
            expected element within WD_DELAY.
    """
    opts = Options()
    opts.headless = True
    opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
    dr_path = os.getcwd() + "\\binaries\\geckodriver.exe"
    driver = webdriver.Firefox(options=opts, executable_path=dr_path)
    # Everything after the browser is started runs under try/finally so the
    # browser is shut down even when a page times out or parsing fails.
    try:
        driver.get(website_url)
        _wait_for_class(driver, 'name-link')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        res = []
        for h in soup.find_all('a', {'class': 'name-link'}):
            if not h.has_attr('href'):
                continue
            href = h['href']
            # Links without a scheme are completed with the site's base URL.
            if not href.startswith(("http://", "https://")):
                href = BASE_LINK + href
            text_hr = _fetch_description(driver, href)
            text_en = _fetch_description(driver, transform_link(href))
            res.append((text_hr, text_en))
        return res
    finally:
        driver.quit()
 # Run the scraper only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()