From 97e397cfc9d55720104f1d4788bd46fd31b7aa04 Mon Sep 17 00:00:00 2001 From: nlitkowski Date: Mon, 21 Jun 2021 23:37:51 +0200 Subject: [PATCH] Add tokenizing, preparing output for fast_align --- output/out_hr.en.txt | 1 + src/main.py | 33 ++++++++++++++++++--------------- 2 files changed, 19 insertions(+), 15 deletions(-) create mode 100644 output/out_hr.en.txt diff --git a/output/out_hr.en.txt b/output/out_hr.en.txt new file mode 100644 index 0000000..ae72ac9 --- /dev/null +++ b/output/out_hr.en.txt @@ -0,0 +1 @@ +Hotel Atrium smješten je 1 km od Dioklecijanove plače , koja je pod zaštitom UNESCO-a . Objekt uključuje elegantno namještene i klimatizirane sobe te besplatan pristup spa centru Aurelia s unutarnjim bazenom , hidromasažnom kadom i saunama . Prostorija za fitness nedavno je obnovljena . Kockarnica Admiral u sklopu objekta nudi glazbu uživo . Udobne i luksuzne sobe hotela Atrium zvučno su izolirane te sadrže ogrtače , papuče i kozmetički pribor . Smještajne jedinice uređene su u bež i tamnosivim nijansama , a sve uključuju besplatni WiFi i TV ravnog ekrana . U hotelu Atrium dostupan je izdašan američki doručak , koji je moguće poslužiti i u sobama . Restoran s jelima po narudžbi Cardo poslužuje jela mediteranske i međunarodne kuhinje . U modernom baru City gosti mogu popiti kavu ili ukusan koktel . Kockarnica Admiral , otvorena 24 sata dnevno , prostire se na više od 1000 m² i najveća je kockarnica u Dalmaciji . Gostima su 24 sata dnevno na raspolaganju recepcija i posluga u sobu , a u sklopu objekta također su dostupne kemijska čistionica , usluga najma automobila te usluga prijevoza iz/do zračne luke . Osoblje na recepciji rado će gostima pomoći oko organiziranja raznih izleta i obilazaka . Splitska zračna luka udaljena je 22 km , a do Trogira ima svega 24 km . ||| Hotel Atrium is set 1 km from the UNESCO-listed Diocletian 's Palace . Its air-conditioned rooms are elegantly furnished and guests enjoy free access to Aurelia Spa with indoor pool , hot tub and saunas . The fitness room is newly equipped , while the on-site Admiral Casino features live music . The comfortable and luxurious rooms at Atrium Hotel are soundproofed and come with bathrobes , slippers and bathroom cosmetics . They are decorated in shades of beige and dark greys and all rooms are equipped with free Wi-Fi and flat-screen TV . The Atrium serves a rich American breakfast , which can be enjoyed in the room , the à la carte Restaurant Cardo offers Mediterranean and international dishes . For a delicious cocktail or coffee , guests can visit the trendy City Bar . Spreading over 1000 m² , the Admiral Casino is the largest casino in Dalmatia operating 24 hours a day . With a 24-hour reception and room service , the Atrium offers additional services such as airport transfers , car rental and dry cleaning . Front desk can organise various trips and excursions . The distance to Split Kaštela Airport is 22 km , while Trogir is only 24 km away . \ No newline at end of file diff --git a/src/main.py b/src/main.py index 1931671..93b683c 100644 --- a/src/main.py +++ b/src/main.py @@ -8,6 +8,7 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.common.exceptions import TimeoutException +from nltk.tokenize import word_tokenize SITES = [ @@ -16,27 +17,27 @@ SITES = [ ] BASE_LINK = "https://www.esky.hr" OUTPUT_DIR = "output" -OUT_FILE_NAME_HR = "out_hr.txt" -OUT_FILE_NAME_EN = "out_en.txt" +OUT_FILE_NAME = "out_hr.en.txt" LINE_SEP = "\n" -WD_DELAY = 1 +WD_DELAY = 3 +ENC = "utf-8" def main(): - res = [] - for s in SITES: - res.extend(scrape_list(s)) try: os.mkdir(OUTPUT_DIR) except FileExistsError: pass - with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_HR), "w") as f_hr: - with open(os.path.join(OUTPUT_DIR, OUT_FILE_NAME_EN), "w") as f_en: - for h, e in res: - f_hr.write(h + LINE_SEP) - f_en.write(e + LINE_SEP) + outf = os.path.join(OUTPUT_DIR, OUT_FILE_NAME) + + os.remove(outf) + with open(outf, 'w', encoding=ENC) as _: + pass # only create new file + + for s in SITES: + scrape_list(s, outf) def transform_link(link: str) -> str: @@ -49,7 +50,7 @@ def get_soup_text(soup: BeautifulSoup) -> str: return t.get_text() -def scrape_list(website_url): +def scrape_list(website_url, out_filepath): opts = Options() opts.headless = True opts.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe" @@ -67,7 +68,6 @@ def scrape_list(website_url): hotels = soup.find_all('a', {'class': 'name-link'}) - res = [] for h in hotels: if h.has_attr('href'): href = h['href'] @@ -95,10 +95,13 @@ def scrape_list(website_url): sub_soup = BeautifulSoup(driver.page_source, 'html.parser') text_en = get_soup_text(sub_soup) - res.append((text_hr, text_en)) + text_en_tokenized = ' '.join(word_tokenize(text_en)) + text_hr_tokenized = ' '.join(word_tokenize(text_hr)) + + with open(out_filepath, 'a', encoding=ENC) as f: + f.write(f"{text_hr_tokenized} ||| {text_en_tokenized}") driver.quit() - return res if __name__ == "__main__":