########################################################################
#
# Functions for downloading the Europarl data-set from the internet
# and loading it into memory. This data-set is used for translation
# between English and most European languages.
#
# http://www.statmt.org/europarl/
#
# Implemented in Python 3.6
#
# Usage:
# 1) Set the variable data_dir with the desired storage directory.
# 2) Determine the language-code to use e.g. "da" for Danish.
# 3) Call maybe_download_and_extract() to download the data-set
#    if it is not already located in the given data_dir.
# 4) Call load_data(english=True) and load_data(english=False)
#    to load the two data-files.
# 5) Use the returned data in your own program.
#
# Format:
# The Europarl data-set contains millions of text-pairs between English
# and most European languages. The data is stored in two text-files.
# The data is returned as lists of strings by the load_data() function.
#
# The list of currently supported languages and their codes is as follows:
#
# bg - Bulgarian
# cs - Czech
# da - Danish
# de - German
# el - Greek
# es - Spanish
# et - Estonian
# fi - Finnish
# fr - French
# hu - Hungarian
# it - Italian
# lt - Lithuanian
# lv - Latvian
# nl - Dutch
# pl - Polish
# pt - Portuguese
# ro - Romanian
# sk - Slovak
# sl - Slovene
# sv - Swedish
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2018 by Magnus Erik Hvass Pedersen
#
########################################################################

import os
import download

########################################################################

# Directory where you want to download and save the data-set.
# Set this before you start calling any of the functions below.
data_dir = "data/europarl/"  # A relative path is resolved against the current working directory.

# Base-URL for the data-sets on the internet.
data_url = "http://www.statmt.org/europarl/v7/"


########################################################################
# Public functions that you may call to download the data-set from
# the internet and load the data into memory.


def maybe_download_and_extract(language_code: str = "da") -> None:
    """
    Download and extract the Europarl data-set if the data-file doesn't
    already exist in data_dir. The data-set is for translating between
    English and the given language-code (e.g. 'da' for Danish).

    The archive that is requested is "<data_url><language_code>-en.tgz"
    and the module-level data_dir is passed on as the download-directory.

    :param language_code:
        Two-char code for the other language e.g. "da" for Danish.

    :return:
        Nothing.
    """

    # Helper-module providing the actual download / extract logic.
    # Imported in function-scope so this function is self-contained.
    import download

    # Create the full URL for the file with this data-set.
    url = data_url + language_code + "-en.tgz"

    download.maybe_download_and_extract(url=url, download_dir=data_dir)


def load_data(english: bool = True, language_code: str = "da",
              start: str = "", end: str = "") -> list:
    """
    Load the data-file for either the English-language texts or
    for the other language (e.g. "da" for Danish).

    All lines of the data-file are returned as a list of strings.

    :param english:
        Boolean whether to load the data-file for
        English (True) or the other language (False).

    :param language_code:
        Two-char code for the other language e.g. "da" for Danish.

    :param start:
        Prepend each line with this text e.g. "ssss " to indicate start of line.

    :param end:
        Append each line with this text e.g. " eeee" to indicate end of line.

    :return:
        List of strings with all the lines of the data-file.

    :raises OSError:
        If the data-file cannot be opened, e.g. FileNotFoundError when
        the data-set has not been downloaded and extracted to data_dir.
    """

    # Imported in function-scope so this function is self-contained.
    import os

    # Both data-files are named "europarl-v7.<code>-en.<suffix>" where
    # the suffix is "en" for the English texts and otherwise the
    # language-code of the other language.
    suffix = "en" if english else language_code
    filename = "europarl-v7.{0}-en.{1}".format(language_code, suffix)

    # Full path for the data-file.
    path = os.path.join(data_dir, filename)

    # Open and read all the contents of the data-file.
    with open(path, encoding="utf-8") as file:
        # Read each line from the file, strip leading and trailing
        # whitespace (this also removes the newline), then prepend
        # the start-text and append the end-text.
        texts = [start + line.strip() + end for line in file]

    return texts


########################################################################