Add scripts for scraping
This commit is contained in:
parent
01350f361d
commit
7b9b02f5fc
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1 +1,2 @@
|
|||||||
**/.vscode/*
|
**/.vscode/*
|
||||||
|
**.log
|
BIN
binaries/geckodriver.exe
Normal file
BIN
binaries/geckodriver.exe
Normal file
Binary file not shown.
0
output/out_en.txt
Normal file
0
output/out_en.txt
Normal file
0
output/out_hr.txt
Normal file
0
output/out_hr.txt
Normal file
105
src/main.py
105
src/main.py
@@ -0,0 +1,105 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.firefox.options import Options
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.common.exceptions import TimeoutException
|
||||||
|
|
||||||
|
|
||||||
|
# Listing pages whose hotel links are followed by scrape_list().
SITES = [
    "https://www.esky.hr/hoteli/ci/spu/hoteli-split",
    "https://www.esky.hr/hoteli/ci/zag/hoteli-zagreb",
]

# Prefix prepended to hrefs that do not already start with http:// or https://.
BASE_LINK = "https://www.esky.hr"

# Output location: one file per site language, written by main().
OUTPUT_DIR = "output"
OUT_FILE_NAME_HR = "out_hr.txt"
OUT_FILE_NAME_EN = "out_en.txt"

# Separator appended after every description written to the output files.
LINE_SEP = "\n"

# Timeout (seconds) passed to WebDriverWait for each page load.
WD_DELAY = 1
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Scrape every listing in SITES and write the paired descriptions to disk.

    Each result pair is split across two files inside OUTPUT_DIR: the first
    element goes to OUT_FILE_NAME_HR and the second to OUT_FILE_NAME_EN, each
    followed by LINE_SEP, in the same order in both files.
    """
    res = []
    for s in SITES:
        res.extend(scrape_list(s))

    # exist_ok makes re-runs safe without a try/except around mkdir.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    hr_path = os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR)
    en_path = os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN)

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # non-ASCII characters in the scraped text and would raise
    # UnicodeEncodeError (or silently produce locale-dependent files).
    with open(hr_path, "w", encoding="utf-8") as f_hr, \
            open(en_path, "w", encoding="utf-8") as f_en:
        # NOTE(review): descriptions are written verbatim; if one contains a
        # line break the two files stop being line-aligned — confirm whether
        # downstream consumers rely on one description per line.
        for h, e in res:
            f_hr.write(h + LINE_SEP)
            f_en.write(e + LINE_SEP)
|
||||||
|
|
||||||
|
|
||||||
|
def transform_link(link: str) -> str:
    """Return *link* with every "esky.hr/hoteli" rewritten to "esky.com/hotels".

    Links that do not contain the esky.hr fragment are returned unchanged.
    """
    return link.replace("esky.hr/hoteli", "esky.com/hotels")
|
||||||
|
|
||||||
|
|
||||||
|
def get_soup_text(soup: BeautifulSoup) -> str:
    """Return the text of the first ``<dd class="hotel-description">`` in *soup*.

    Args:
        soup: Parsed page to search.

    Returns:
        The element's text as produced by ``get_text()``.

    Raises:
        ValueError: If the page contains no matching ``<dd>`` element.
    """
    description = soup.find('dd', {'class': 'hotel-description'})
    if description is None:
        # find() returns None on no match; fail with a message that names the
        # missing element instead of an opaque AttributeError on NoneType.
        raise ValueError(
            "no <dd class='hotel-description'> element found in page")
    return description.get_text()
|
||||||
|
|
||||||
|
|
||||||
|
def _wait_for_class(driver, class_name):
    """Block until an element with *class_name* is present, up to WD_DELAY seconds."""
    try:
        WebDriverWait(driver, WD_DELAY).until(
            EC.presence_of_element_located((By.CLASS_NAME, class_name)))
    except TimeoutException as err:
        # Keep the original exception type/message, but preserve the cause.
        raise Exception("Too long!") from err


def _fetch_description(driver, url):
    """Load *url* in *driver* and return its hotel-description text."""
    driver.get(url)
    _wait_for_class(driver, 'hotel-description')
    return get_soup_text(BeautifulSoup(driver.page_source, 'html.parser'))


def scrape_list(website_url):
    """Scrape the description pairs for every hotel linked from a listing page.

    Opens *website_url* in headless Firefox, collects every
    ``<a class="name-link">`` that has an ``href``, and for each one loads the
    linked page and the page at ``transform_link(href)``, extracting the
    description text from both.

    Args:
        website_url: URL of the listing page to scrape.

    Returns:
        A list of ``(text_hr, text_en)`` tuples, one per hotel link, where
        ``text_hr`` comes from the original link and ``text_en`` from the
        transformed link.

    Raises:
        Exception: With message "Too long!" if an expected element does not
            appear within WD_DELAY seconds.
    """
    opts = Options()
    opts.headless = True
    opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
    # The driver binary is resolved relative to the current working directory.
    dr_path = os.path.join(os.getcwd(), "binaries", "geckodriver.exe")
    driver = webdriver.Firefox(options=opts, executable_path=dr_path)

    # try/finally so the browser process is shut down even when a wait times
    # out or parsing fails; otherwise each failure leaks a Firefox instance.
    try:
        driver.get(website_url)
        _wait_for_class(driver, 'name-link')

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        res = []
        for anchor in soup.find_all('a', {'class': 'name-link'}):
            if not anchor.has_attr('href'):
                continue

            href = anchor['href']
            # Anything without an explicit scheme is prefixed with BASE_LINK.
            if not re.match(r"^https?:\/\/", href):
                href = BASE_LINK + href

            text_hr = _fetch_description(driver, href)
            text_en = _fetch_description(driver, transform_link(href))
            res.append((text_hr, text_en))

        return res
    finally:
        driver.quit()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user