Add tokenizing, preparing output for fast_align
This commit is contained in:
parent
7b9b02f5fc
commit
97e397cfc9
1
output/out_hr.en.txt
Normal file
1
output/out_hr.en.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
Hotel Atrium smješten je 1 km od Dioklecijanove plače , koja je pod zaštitom UNESCO-a . Objekt uključuje elegantno namještene i klimatizirane sobe te besplatan pristup spa centru Aurelia s unutarnjim bazenom , hidromasažnom kadom i saunama . Prostorija za fitness nedavno je obnovljena . Kockarnica Admiral u sklopu objekta nudi glazbu uživo . Udobne i luksuzne sobe hotela Atrium zvučno su izolirane te sadrže ogrtače , papuče i kozmetički pribor . Smještajne jedinice uređene su u bež i tamnosivim nijansama , a sve uključuju besplatni WiFi i TV ravnog ekrana . U hotelu Atrium dostupan je izdašan američki doručak , koji je moguće poslužiti i u sobama . Restoran s jelima po narudžbi Cardo poslužuje jela mediteranske i međunarodne kuhinje . U modernom baru City gosti mogu popiti kavu ili ukusan koktel . Kockarnica Admiral , otvorena 24 sata dnevno , prostire se na više od 1000 m² i najveća je kockarnica u Dalmaciji . Gostima su 24 sata dnevno na raspolaganju recepcija i posluga u sobu , a u sklopu objekta također su dostupne kemijska čistionica , usluga najma automobila te usluga prijevoza iz/do zračne luke . Osoblje na recepciji rado će gostima pomoći oko organiziranja raznih izleta i obilazaka . Splitska zračna luka udaljena je 22 km , a do Trogira ima svega 24 km . ||| Hotel Atrium is set 1 km from the UNESCO-listed Diocletian 's Palace . Its air-conditioned rooms are elegantly furnished and guests enjoy free access to Aurelia Spa with indoor pool , hot tub and saunas . The fitness room is newly equipped , while the on-site Admiral Casino features live music . The comfortable and luxurious rooms at Atrium Hotel are soundproofed and come with bathrobes , slippers and bathroom cosmetics . They are decorated in shades of beige and dark greys and all rooms are equipped with free Wi-Fi and flat-screen TV . The Atrium serves a rich American breakfast , which can be enjoyed in the room , the à la carte Restaurant Cardo offers Mediterranean and international dishes . 
For a delicious cocktail or coffee , guests can visit the trendy City Bar . Spreading over 1000 m² , the Admiral Casino is the largest casino in Dalmatia operating 24 hours a day . With a 24-hour reception and room service , the Atrium offers additional services such as airport transfers , car rental and dry cleaning . Front desk can organise various trips and excursions . The distance to Split Kaštela Airport is 22 km , while Trogir is only 24 km away .
|
33
src/main.py
33
src/main.py
@@ -8,6 +8,7 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.common.exceptions import TimeoutException
|
from selenium.common.exceptions import TimeoutException
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
|
|
||||||
SITES = [
|
SITES = [
|
||||||
@@ -16,27 +17,27 @@ SITES = [
|
|||||||
]
|
]
|
||||||
BASE_LINK = "https://www.esky.hr"
|
BASE_LINK = "https://www.esky.hr"
|
||||||
OUTPUT_DIR = "output"
|
OUTPUT_DIR = "output"
|
||||||
OUT_FILE_NAME_HR = "out_hr.txt"
|
OUT_FILE_NAME = "out_hr.en.txt"
|
||||||
OUT_FILE_NAME_EN = "out_en.txt"
|
|
||||||
LINE_SEP = "\n"
|
LINE_SEP = "\n"
|
||||||
WD_DELAY = 1
|
WD_DELAY = 3
|
||||||
|
ENC = "utf-8"
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
res = []
|
|
||||||
for s in SITES:
|
|
||||||
res.extend(scrape_list(s))
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
os.mkdir(OUTPUT_DIR)
|
os.mkdir(OUTPUT_DIR)
|
||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR), "w") as f_hr:
|
outf = os.path.join(OUTPUT_DIR, OUT_FILE_NAME)
|
||||||
with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN), "w") as f_en:
|
|
||||||
for h, e in res:
|
os.remove(outf)
|
||||||
f_hr.write(h + LINE_SEP)
|
with open(outf, 'w', encoding=ENC) as _:
|
||||||
f_en.write(e + LINE_SEP)
|
pass # only create new file
|
||||||
|
|
||||||
|
for s in SITES:
|
||||||
|
scrape_list(s, outf)
|
||||||
|
|
||||||
|
|
||||||
def transform_link(link: str) -> str:
|
def transform_link(link: str) -> str:
|
||||||
@@ -49,7 +50,7 @@ def get_soup_text(soup: BeautifulSoup) -> str:
|
|||||||
return t.get_text()
|
return t.get_text()
|
||||||
|
|
||||||
|
|
||||||
def scrape_list(website_url):
|
def scrape_list(website_url, out_filepath):
|
||||||
opts = Options()
|
opts = Options()
|
||||||
opts.headless = True
|
opts.headless = True
|
||||||
opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
|
opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
|
||||||
@@ -67,7 +68,6 @@ def scrape_list(website_url):
|
|||||||
|
|
||||||
hotels = soup.find_all('a', {'class': 'name-link'})
|
hotels = soup.find_all('a', {'class': 'name-link'})
|
||||||
|
|
||||||
res = []
|
|
||||||
for h in hotels:
|
for h in hotels:
|
||||||
if h.has_attr('href'):
|
if h.has_attr('href'):
|
||||||
href = h['href']
|
href = h['href']
|
||||||
@@ -95,10 +95,13 @@ def scrape_list(website_url):
|
|||||||
sub_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
sub_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
||||||
text_en = get_soup_text(sub_soup)
|
text_en = get_soup_text(sub_soup)
|
||||||
|
|
||||||
res.append((text_hr, text_en))
|
text_en_tokenized = ' '.join(word_tokenize(text_en))
|
||||||
|
text_hr_tokenized = ' '.join(word_tokenize(text_hr))
|
||||||
|
|
||||||
|
with open(out_filepath, 'a', encoding=ENC) as f:
|
||||||
|
f.write(f"{text_hr_tokenized} ||| {text_en_tokenized}")
|
||||||
|
|
||||||
driver.quit()
|
driver.quit()
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
Reference in New Issue
Block a user