Add scripts for scraping

This commit is contained in:
parent 01350f361d
commit 7b9b02f5fc
.gitignore (vendored, 1 change)
@@ -1 +1,2 @@
 **/.vscode/*
+**.log
binaries/geckodriver.exe (BIN, new file)
Binary file not shown.
output/out_en.txt (new file, 0 lines)
output/out_hr.txt (new file, 0 lines)
src/main.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from bs4 import BeautifulSoup
import re
import os
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException


SITES = [
    "https://www.esky.hr/hoteli/ci/spu/hoteli-split",
    "https://www.esky.hr/hoteli/ci/zag/hoteli-zagreb"
]
BASE_LINK = "https://www.esky.hr"
OUTPUT_DIR = "output"
OUT_FILE_NAME_HR = "out_hr.txt"
OUT_FILE_NAME_EN = "out_en.txt"
LINE_SEP = "\n"
WD_DELAY = 1  # explicit-wait timeout, in seconds


def main():
    # Scrape each listing page, collecting (Croatian, English) description pairs.
    res = []
    for s in SITES:
        res.extend(scrape_list(s))

    try:
        os.mkdir(OUTPUT_DIR)
    except FileExistsError:
        pass

    # Write the two languages to parallel, line-aligned output files.
    with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR), "w") as f_hr:
        with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN), "w") as f_en:
            for h, e in res:
                f_hr.write(h + LINE_SEP)
                f_en.write(e + LINE_SEP)
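# Style note: the two nested with-blocks above could equivalently be written
# as a single statement (same behaviour; a sketch, not part of this commit):
#
# with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR), "w") as f_hr, \
#         open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN), "w") as f_en:
#     ...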


def transform_link(link: str) -> str:
    # Map an esky.hr hotel URL to its English counterpart on esky.com.
    link = link.replace("esky.hr/hoteli", "esky.com/hotels")
    return link
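# Example (follows directly from the replace above):
#   transform_link("https://www.esky.hr/hoteli/ci/spu/hoteli-split")
#   == "https://www.esky.com/hotels/ci/spu/hoteli-split"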


def get_soup_text(soup: BeautifulSoup) -> str:
    # The description text lives in a <dd class="hotel-description"> element.
    t = soup.find('dd', {'class': 'hotel-description'})
    return t.get_text()
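# Note: soup.find returns None when the element is absent, so get_text()
# would raise AttributeError on a page without a description. A defensive
# variant (a sketch, not part of this commit) could be:
#
# def get_soup_text(soup: BeautifulSoup) -> str:
#     t = soup.find('dd', {'class': 'hotel-description'})
#     return t.get_text() if t else ""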


def scrape_list(website_url):
    # Drive a headless Firefox via the geckodriver bundled in binaries/.
    # (Options.headless and the executable_path argument are Selenium 3-era
    # APIs; newer Selenium 4 releases removed both.)
    opts = Options()
    opts.headless = True
    opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
    dr_path = os.path.join(os.getcwd(), "binaries", "geckodriver.exe")
    driver = webdriver.Firefox(options=opts, executable_path=dr_path)
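    # A Selenium 4 equivalent would be roughly (a sketch, untested here):
    #
    # from selenium.webdriver.firefox.service import Service
    # opts = Options()
    # opts.add_argument("-headless")
    # driver = webdriver.Firefox(options=opts, service=Service(dr_path))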
    driver.get(website_url)

    # Wait until at least one hotel link has rendered.
    try:
        WebDriverWait(driver, WD_DELAY).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'name-link')))
    except TimeoutException:
        driver.quit()  # close the browser before bailing out
        raise Exception("Too long!")

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Every hotel on the listing page is linked via <a class="name-link">.
    hotels = soup.find_all('a', {'class': 'name-link'})

    res = []
    for h in hotels:
        if h.has_attr('href'):
            href = h['href']
            # Listing links may be relative; prefix the site root if so.
            if not re.match(r"^https?://", href):
                href = BASE_LINK + href

            # First pass: the Croatian description from the esky.hr page.
            driver.get(href)
            try:
                WebDriverWait(driver, WD_DELAY).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'hotel-description')))
            except TimeoutException:
                driver.quit()
                raise Exception("Too long!")

            sub_soup = BeautifulSoup(driver.page_source, 'html.parser')
            text_hr = get_soup_text(sub_soup)

            # Second pass: the English description from the esky.com page.
            href = transform_link(href)
            driver.get(href)
            try:
                WebDriverWait(driver, WD_DELAY).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'hotel-description')))
            except TimeoutException:
                driver.quit()
                raise Exception("Too long!")

            sub_soup = BeautifulSoup(driver.page_source, 'html.parser')
            text_en = get_soup_text(sub_soup)

            res.append((text_hr, text_en))

    driver.quit()
    return res
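# The wait-then-raise pattern repeats three times above; a small helper
# (hypothetical name, a sketch rather than part of this commit) would
# collapse it:
#
# def wait_for_class(driver, class_name):
#     try:
#         WebDriverWait(driver, WD_DELAY).until(
#             EC.presence_of_element_located((By.CLASS_NAME, class_name)))
#     except TimeoutException:
#         driver.quit()
#         raise Exception("Too long!")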


if __name__ == "__main__":
    main()
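# Usage note: run from the repository root so that binaries\geckodriver.exe
# resolves (the driver path is built from os.getcwd()):
#   python src\main.py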