Add tokenizing, preparing output for fast_align

This commit is contained in:
nlitkowski 2021-06-21 23:37:51 +02:00
parent 7b9b02f5fc
commit 97e397cfc9
2 changed files with 19 additions and 15 deletions

1
output/out_hr.en.txt Normal file
View File

@ -0,0 +1 @@
Hotel Atrium smješten je 1 km od Dioklecijanove plače , koja je pod zaštitom UNESCO-a . Objekt uključuje elegantno namještene i klimatizirane sobe te besplatan pristup spa centru Aurelia s unutarnjim bazenom , hidromasažnom kadom i saunama . Prostorija za fitness nedavno je obnovljena . Kockarnica Admiral u sklopu objekta nudi glazbu uživo . Udobne i luksuzne sobe hotela Atrium zvučno su izolirane te sadrže ogrtače , papuče i kozmetički pribor . Smještajne jedinice uređene su u bež i tamnosivim nijansama , a sve uključuju besplatni WiFi i TV ravnog ekrana . U hotelu Atrium dostupan je izdašan američki doručak , koji je moguće poslužiti i u sobama . Restoran s jelima po narudžbi Cardo poslužuje jela mediteranske i međunarodne kuhinje . U modernom baru City gosti mogu popiti kavu ili ukusan koktel . Kockarnica Admiral , otvorena 24 sata dnevno , prostire se na više od 1000 m² i najveća je kockarnica u Dalmaciji . Gostima su 24 sata dnevno na raspolaganju recepcija i posluga u sobu , a u sklopu objekta također su dostupne kemijska čistionica , usluga najma automobila te usluga prijevoza iz/do zračne luke . Osoblje na recepciji rado će gostima pomoći oko organiziranja raznih izleta i obilazaka . Splitska zračna luka udaljena je 22 km , a do Trogira ima svega 24 km . ||| Hotel Atrium is set 1 km from the UNESCO-listed Diocletian 's Palace . Its air-conditioned rooms are elegantly furnished and guests enjoy free access to Aurelia Spa with indoor pool , hot tub and saunas . The fitness room is newly equipped , while the on-site Admiral Casino features live music . The comfortable and luxurious rooms at Atrium Hotel are soundproofed and come with bathrobes , slippers and bathroom cosmetics . They are decorated in shades of beige and dark greys and all rooms are equipped with free Wi-Fi and flat-screen TV . The Atrium serves a rich American breakfast , which can be enjoyed in the room , the à la carte Restaurant Cardo offers Mediterranean and international dishes . 
For a delicious cocktail or coffee , guests can visit the trendy City Bar . Spreading over 1000 m² , the Admiral Casino is the largest casino in Dalmatia operating 24 hours a day . With a 24-hour reception and room service , the Atrium offers additional services such as airport transfers , car rental and dry cleaning . Front desk can organise various trips and excursions . The distance to Split Kaštela Airport is 22 km , while Trogir is only 24 km away .

View File

@ -8,6 +8,7 @@ from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from nltk.tokenize import word_tokenize
SITES = [
@ -16,27 +17,27 @@ SITES = [
]
# NOTE(review): this is a rendered diff whose +/- markers were lost, so removed
# and added constants appear side by side below.
BASE_LINK = "https://www.esky.hr"
# Directory that main() creates and writes the output file into.
OUTPUT_DIR = "output"
# NOTE(review): these two per-language names are only referenced by the
# two-file writer in main(); they look like the removed (pre-commit) lines.
OUT_FILE_NAME_HR = "out_hr.txt"
OUT_FILE_NAME_EN = "out_en.txt"
# Single combined output file; scrape_list appends "<hr> ||| <en>" records to it.
OUT_FILE_NAME = "out_hr.en.txt"
# NOTE(review): the record write in scrape_list does not append LINE_SEP, so
# consecutive records are not separated by a newline — confirm this is intended.
LINE_SEP = "\n"
# NOTE(review): WD_DELAY is assigned twice (1, then 3) — presumably the old and
# the new value from the diff; its unit is not visible in this view.
WD_DELAY = 1
WD_DELAY = 3
# Encoding passed to open() when creating and appending to the output file.
ENC = "utf-8"
# Entry point: ensures OUTPUT_DIR exists, resets the combined output file and
# then runs scrape_list for every entry of SITES.
# NOTE(review): this is a rendered diff whose +/- markers and indentation were
# lost, so the old and the new body of main() are interleaved below.
def main():
# NOTE(review): the next three lines look like the removed pre-commit version,
# which collected the scraped pairs in memory — confirm against the repository.
res = []
for s in SITES:
res.extend(scrape_list(s))
# Create the output directory; an already existing directory is not an error.
try:
os.mkdir(OUTPUT_DIR)
except FileExistsError:
pass
# NOTE(review): presumably removed lines — this writer emitted two parallel
# files (one per language), one pair per LINE_SEP-terminated line.
with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR), "w") as f_hr:
with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN), "w") as f_en:
for h, e in res:
f_hr.write(h + LINE_SEP)
f_en.write(e + LINE_SEP)
outf = os.path.join(OUTPUT_DIR, OUT_FILE_NAME)
# NOTE(review): os.remove raises FileNotFoundError when outf does not exist
# (e.g. on the first run, right after OUTPUT_DIR was created). The
# open(outf, 'w') below already truncates an existing file, so this call looks
# unnecessary — consider dropping it or suppressing FileNotFoundError.
os.remove(outf)
with open(outf, 'w', encoding=ENC) as _:
pass # only create new file
# Each scrape_list call receives the output path and appends its records there.
for s in SITES:
scrape_list(s, outf)
def transform_link(link: str) -> str:
@ -49,7 +50,7 @@ def get_soup_text(soup: BeautifulSoup) -> str:
return t.get_text()
def scrape_list(website_url):
def scrape_list(website_url, out_filepath):
opts = Options()
opts.headless = True
opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
@ -67,7 +68,6 @@ def scrape_list(website_url):
hotels = soup.find_all('a', {'class': 'name-link'})
res = []
for h in hotels:
if h.has_attr('href'):
href = h['href']
@ -95,10 +95,13 @@ def scrape_list(website_url):
sub_soup = BeautifulSoup(driver.page_source, 'html.parser')
text_en = get_soup_text(sub_soup)
res.append((text_hr, text_en))
text_en_tokenized = ' '.join(word_tokenize(text_en))
text_hr_tokenized = ' '.join(word_tokenize(text_hr))
with open(out_filepath, 'a', encoding=ENC) as f:
f.write(f"{text_hr_tokenized} ||| {text_en_tokenized}")
driver.quit()
return res
if __name__ == "__main__":