Add tokenizing, preparing output for fast_align
This commit is contained in:
parent
7b9b02f5fc
commit
97e397cfc9
1
output/out_hr.en.txt
Normal file
1
output/out_hr.en.txt
Normal file
@ -0,0 +1 @@
|
||||
Hotel Atrium smješten je 1 km od Dioklecijanove plače , koja je pod zaštitom UNESCO-a . Objekt uključuje elegantno namještene i klimatizirane sobe te besplatan pristup spa centru Aurelia s unutarnjim bazenom , hidromasažnom kadom i saunama . Prostorija za fitness nedavno je obnovljena . Kockarnica Admiral u sklopu objekta nudi glazbu uživo . Udobne i luksuzne sobe hotela Atrium zvučno su izolirane te sadrže ogrtače , papuče i kozmetički pribor . Smještajne jedinice uređene su u bež i tamnosivim nijansama , a sve uključuju besplatni WiFi i TV ravnog ekrana . U hotelu Atrium dostupan je izdašan američki doručak , koji je moguće poslužiti i u sobama . Restoran s jelima po narudžbi Cardo poslužuje jela mediteranske i međunarodne kuhinje . U modernom baru City gosti mogu popiti kavu ili ukusan koktel . Kockarnica Admiral , otvorena 24 sata dnevno , prostire se na više od 1000 m² i najveća je kockarnica u Dalmaciji . Gostima su 24 sata dnevno na raspolaganju recepcija i posluga u sobu , a u sklopu objekta također su dostupne kemijska čistionica , usluga najma automobila te usluga prijevoza iz/do zračne luke . Osoblje na recepciji rado će gostima pomoći oko organiziranja raznih izleta i obilazaka . Splitska zračna luka udaljena je 22 km , a do Trogira ima svega 24 km . ||| Hotel Atrium is set 1 km from the UNESCO-listed Diocletian 's Palace . Its air-conditioned rooms are elegantly furnished and guests enjoy free access to Aurelia Spa with indoor pool , hot tub and saunas . The fitness room is newly equipped , while the on-site Admiral Casino features live music . The comfortable and luxurious rooms at Atrium Hotel are soundproofed and come with bathrobes , slippers and bathroom cosmetics . They are decorated in shades of beige and dark greys and all rooms are equipped with free Wi-Fi and flat-screen TV . The Atrium serves a rich American breakfast , which can be enjoyed in the room , the à la carte Restaurant Cardo offers Mediterranean and international dishes . For a delicious cocktail or coffee , guests can visit the trendy City Bar . Spreading over 1000 m² , the Admiral Casino is the largest casino in Dalmatia operating 24 hours a day . With a 24-hour reception and room service , the Atrium offers additional services such as airport transfers , car rental and dry cleaning . Front desk can organise various trips and excursions . The distance to Split Kaštela Airport is 22 km , while Trogir is only 24 km away .
|
33
src/main.py
33
src/main.py
@ -8,6 +8,7 @@ from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
|
||||
SITES = [
|
||||
@ -16,27 +17,27 @@ SITES = [
|
||||
]
|
||||
BASE_LINK = "https://www.esky.hr"
|
||||
OUTPUT_DIR = "output"
|
||||
OUT_FILE_NAME_HR = "out_hr.txt"
|
||||
OUT_FILE_NAME_EN = "out_en.txt"
|
||||
OUT_FILE_NAME = "out_hr.en.txt"
|
||||
LINE_SEP = "\n"
|
||||
WD_DELAY = 1
|
||||
WD_DELAY = 3
|
||||
ENC = "utf-8"
|
||||
|
||||
|
||||
def main():
|
||||
res = []
|
||||
for s in SITES:
|
||||
res.extend(scrape_list(s))
|
||||
|
||||
try:
|
||||
os.mkdir(OUTPUT_DIR)
|
||||
except FileExistsError:
|
||||
pass
|
||||
|
||||
with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR), "w") as f_hr:
|
||||
with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN), "w") as f_en:
|
||||
for h, e in res:
|
||||
f_hr.write(h + LINE_SEP)
|
||||
f_en.write(e + LINE_SEP)
|
||||
outf = os.path.join(OUTPUT_DIR, OUT_FILE_NAME)
|
||||
|
||||
os.remove(outf)
|
||||
with open(outf, 'w', encoding=ENC) as _:
|
||||
pass # only create new file
|
||||
|
||||
for s in SITES:
|
||||
scrape_list(s, outf)
|
||||
|
||||
|
||||
def transform_link(link: str) -> str:
|
||||
@ -49,7 +50,7 @@ def get_soup_text(soup: BeautifulSoup) -> str:
|
||||
return t.get_text()
|
||||
|
||||
|
||||
def scrape_list(website_url):
|
||||
def scrape_list(website_url, out_filepath):
|
||||
opts = Options()
|
||||
opts.headless = True
|
||||
opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
|
||||
@ -67,7 +68,6 @@ def scrape_list(website_url):
|
||||
|
||||
hotels = soup.find_all('a', {'class': 'name-link'})
|
||||
|
||||
res = []
|
||||
for h in hotels:
|
||||
if h.has_attr('href'):
|
||||
href = h['href']
|
||||
@ -95,10 +95,13 @@ def scrape_list(website_url):
|
||||
sub_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
||||
text_en = get_soup_text(sub_soup)
|
||||
|
||||
res.append((text_hr, text_en))
|
||||
text_en_tokenized = ' '.join(word_tokenize(text_en))
|
||||
text_hr_tokenized = ' '.join(word_tokenize(text_hr))
|
||||
|
||||
with open(out_filepath, 'a', encoding=ENC) as f:
|
||||
f.write(f"{text_hr_tokenized} ||| {text_en_tokenized}")
|
||||
|
||||
driver.quit()
|
||||
return res
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Loading…
Reference in New Issue
Block a user