commit bcb0975e8c: init

download.py (new file)
@@ -0,0 +1,130 @@
########################################################################
#
# Functions for downloading and extracting data-files from the internet.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################

import sys
import os
import urllib.request
import tarfile
import zipfile

########################################################################


def _print_download_progress(count, block_size, total_size):
    """
    Function used for printing the download progress.
    Used as a callback function in maybe_download_and_extract().
    """

    # Percentage completion.
    pct_complete = float(count * block_size) / total_size

    # Limit it because rounding errors may cause it to exceed 100%.
    pct_complete = min(1.0, pct_complete)

    # Status-message. Note the \r which means the line should overwrite itself.
    msg = "\r- Download progress: {0:.1%}".format(pct_complete)

    # Print it.
    sys.stdout.write(msg)
    sys.stdout.flush()


########################################################################


def download(base_url, filename, download_dir):
    """
    Download the given file if it does not already exist in the download_dir.

    :param base_url: The internet URL without the filename.
    :param filename: The filename that will be added to the base_url.
    :param download_dir: Local directory for storing the file.
    :return: Nothing.
    """

    # Path for local file.
    save_path = os.path.join(download_dir, filename)

    # Check if the file already exists, otherwise we need to download it now.
    if not os.path.exists(save_path):
        # Check if the download directory exists, otherwise create it.
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

        print("Downloading", filename, "...")

        # Download the file from the internet.
        url = base_url + filename
        file_path, _ = urllib.request.urlretrieve(url=url,
                                                  filename=save_path,
                                                  reporthook=_print_download_progress)

        print(" Done!")


def maybe_download_and_extract(url, download_dir):
    """
    Download and extract the data if it doesn't already exist.
    Assumes the url is for a tar-ball (.tar.gz / .tgz) or a zip-file.

    :param url:
        Internet URL for the tar-file to download.
        Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

    :param download_dir:
        Directory where the downloaded file is saved.
        Example: "data/CIFAR-10/"

    :return:
        Nothing.
    """

    # Filename for saving the file downloaded from the internet.
    # Use the filename from the URL and add it to the download_dir.
    filename = url.split('/')[-1]
    file_path = os.path.join(download_dir, filename)

    # Check if the file already exists.
    # If it exists then we assume it has also been extracted,
    # otherwise we need to download and extract it now.
    if not os.path.exists(file_path):
        # Check if the download directory exists, otherwise create it.
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

        # Download the file from the internet.
        file_path, _ = urllib.request.urlretrieve(url=url,
                                                  filename=file_path,
                                                  reporthook=_print_download_progress)

        print()
        print("Download finished. Extracting files.")

        if file_path.endswith(".zip"):
            # Unpack the zip-file.
            zipfile.ZipFile(file=file_path, mode="r").extractall(download_dir)
        elif file_path.endswith((".tar.gz", ".tgz")):
            # Unpack the tar-ball.
            tarfile.open(name=file_path, mode="r:gz").extractall(download_dir)

        print("Done.")
    else:
        print("Data has apparently already been downloaded and unpacked.")


########################################################################
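For illustration, a minimal usage sketch of this module, using the CIFAR-10 URL and directory given in the docstring above:

# Sketch: download and unpack CIFAR-10 with the module above.
# URL and directory are taken from the maybe_download_and_extract() docstring.
import download

download.maybe_download_and_extract(
    url="https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz",
    download_dir="data/CIFAR-10/")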
europarl.py (new file)
@@ -0,0 +1,136 @@
########################################################################
#
# Functions for downloading the Europarl data-set from the internet
# and loading it into memory. This data-set is used for translation
# between English and most European languages.
#
# http://www.statmt.org/europarl/
#
# Implemented in Python 3.6
#
# Usage:
# 1) Set the variable data_dir with the desired storage directory.
# 2) Determine the language-code to use e.g. "da" for Danish.
# 3) Call maybe_download_and_extract() to download the data-set
#    if it is not already located in the given data_dir.
# 4) Call load_data(english=True) and load_data(english=False)
#    to load the two data-files.
# 5) Use the returned data in your own program.
#
# Format:
# The Europarl data-set contains millions of text-pairs between English
# and most European languages. The data is stored in two text-files.
# The data is returned as lists of strings by the load_data() function.
#
# The list of currently supported languages and their codes is as follows:
#
# bg - Bulgarian
# cs - Czech
# da - Danish
# de - German
# el - Greek
# es - Spanish
# et - Estonian
# fi - Finnish
# fr - French
# hu - Hungarian
# it - Italian
# lt - Lithuanian
# lv - Latvian
# nl - Dutch
# pl - Polish
# pt - Portuguese
# ro - Romanian
# sk - Slovak
# sl - Slovene
# sv - Swedish
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2018 by Magnus Erik Hvass Pedersen
#
########################################################################

import os
import download

########################################################################

# Directory where you want to download and save the data-set.
# Set this before you start calling any of the functions below.
data_dir = "data/europarl/"

# Base-URL for the data-sets on the internet.
data_url = "http://www.statmt.org/europarl/v7/"


########################################################################
# Public functions that you may call to download the data-set from
# the internet and load the data into memory.


def maybe_download_and_extract(language_code="da"):
    """
    Download and extract the Europarl data-set if the data-file doesn't
    already exist in data_dir. The data-set is for translating between
    English and the given language-code (e.g. 'da' for Danish, see the
    list of available language-codes above).
    """

    # Create the full URL for the file with this data-set.
    url = data_url + language_code + "-en.tgz"

    download.maybe_download_and_extract(url=url, download_dir=data_dir)


def load_data(english=True, language_code="da", start="", end=""):
    """
    Load the data-file for either the English-language texts or
    for the other language (e.g. "da" for Danish).

    All lines of the data-file are returned as a list of strings.

    :param english:
        Boolean whether to load the data-file for
        English (True) or the other language (False).

    :param language_code:
        Two-char code for the other language e.g. "da" for Danish.
        See the list of available codes above.

    :param start:
        Prepend each line with this text e.g. "ssss " to indicate start of line.

    :param end:
        Append each line with this text e.g. " eeee" to indicate end of line.

    :return:
        List of strings with all the lines of the data-file.
    """

    if english:
        # Load the English data.
        filename = "europarl-v7.{0}-en.en".format(language_code)
    else:
        # Load the other language.
        filename = "europarl-v7.{0}-en.{0}".format(language_code)

    # Full path for the data-file.
    path = os.path.join(data_dir, filename)

    # Open and read all the contents of the data-file.
    with open(path, encoding="utf-8") as file:
        # Read each line, strip leading and trailing whitespace,
        # prepend the start-text and append the end-text.
        texts = [start + line.strip() + end for line in file]

    return texts


########################################################################
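For illustration, a minimal sketch following the five usage steps from the header comment; the start/end markers mirror the examples in the load_data() docstring:

# Sketch: fetch and load the Danish-English parallel texts.
import europarl

# Steps 1-3: data_dir defaults as set above; download if not present.
europarl.maybe_download_and_extract(language_code="da")

# Step 4: load both sides of the parallel corpus.
data_en = europarl.load_data(english=True, language_code="da")
data_da = europarl.load_data(english=False, language_code="da",
                             start="ssss ", end=" eeee")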
kompendium.tsv (new file, 1197 lines)
File diff suppressed because it is too large.
preprare_corpus.py (new file)
@@ -0,0 +1,99 @@
import csv
import os

import pandas as pd
import europarl

from sklearn.model_selection import train_test_split


def inject_translations(corpus, dictionary):
    """For each sentence, find dictionary keys that occur as whole words
    and inject the translation of the longest matching key."""
    llist = []
    corpus = strip_lower(corpus)
    ctr = 0
    for idx, sentence in enumerate(corpus):
        possible_translations = []
        for key in list(dictionary):
            # todo: approximate matching
            if (space_wrap(sentence)).find(space_wrap(key)) != -1:
                possible_translations.append(key)
                ctr += 1

        if len(possible_translations) > 0:
            chosen_key = choose_translation(possible_translations)
            llist.append(add_translation(sentence, chosen_key, dictionary[chosen_key]))
        else:
            llist.append(sentence)
        if idx % 50000 == 0:
            print(idx)
    print(f'found {ctr} dictionary matches.')
    return llist


def strip_lower(corpus):
    # Lower-case each sentence and remove punctuation.
    return [strip_punctuation(sentence.lower()) for sentence in corpus]


def strip_punctuation(sentence):
    # Remove punctuation characters from the sentence.
    chars = '`~!@#$%^&*()-_=+[{]}\\|;:\'\",<.>/?'
    for char in chars:
        sentence = sentence.replace(char, '')
    return sentence


def add_translation(sen, key, value):
    # Insert the translation right after the matched key in the sentence.
    # space_wrap() pads the sentence with one leading space, so find()
    # returns the key's position shifted by one; subtracting 1 compensates.
    pos_end = space_wrap(sen).find(key) + len(key) - 1
    return sen[:pos_end] + ' ' + value + sen[pos_end:]


def choose_translation(translations):
    # Prefer the match with the most words (the longest phrase).
    return sorted(translations, key=lambda x: len(x.split(' ')), reverse=True)[0]


def space_wrap(word):
    # Pad with spaces so that find() only matches whole words.
    return ' ' + word + ' '


mark_start = 'ssss '
mark_end = ' eeee'
language_code = 'pl'

europarl.maybe_download_and_extract(language_code=language_code)
data_src = europarl.load_data(english=True, language_code=language_code)
data_dest = europarl.load_data(english=False,
                               language_code=language_code)

test_size = 0.25

df_dict = pd.read_csv('kompendium.tsv', sep='\t', header=None, index_col=0)
dtr, dts = train_test_split(df_dict, test_size=test_size, shuffle=True, random_state=42)
print('dictionary len: ', len(df_dict))
print('train dictionary len: ', len(dtr))
print('test dictionary len: ', len(dts))

# Ensure the output directories exist before writing any files.
os.makedirs('data/orig', exist_ok=True)

pd.DataFrame(dtr).to_csv('data/dictionary_train.csv', header=False)
pd.DataFrame(dts).to_csv('data/dictionary_test.csv', header=False)

# Read the dictionaries back as plain key/value mappings.
with open('data/dictionary_train.csv', 'r') as f:
    dictionary_train = {k: v for k, v in csv.reader(f)}

with open('data/dictionary_test.csv', 'r') as f:
    dictionary_test = {k: v for k, v in csv.reader(f)}

data_src_train, data_src_test, data_dest_train, data_dest_test = \
    train_test_split(data_src, data_dest, test_size=test_size, random_state=42)

print('data len: ', len(data_src))
print('train len: ', len(data_src_train))
print('test len: ', len(data_src_test))

data_src_train = inject_translations(data_src_train, dictionary_train)
data_src_test = inject_translations(data_src_test, dictionary_test)

pd.DataFrame(data_src_train).to_csv('data/orig/train.en', header=False, index=False)
pd.DataFrame(data_src_test).to_csv('data/orig/test.en', header=False, index=False)
pd.DataFrame(data_dest_train).to_csv('data/orig/train.pl', header=False, index=False)
pd.DataFrame(data_dest_test).to_csv('data/orig/test.pl', header=False, index=False)
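For illustration, a tiny worked run of inject_translations(); the dictionary entries here are hypothetical and only show the matching logic:

# Hypothetical two-entry dictionary; choose_translation() prefers the
# longer phrase, so 'data set' wins over 'data' below.
toy_dict = {'data set': 'zbiór danych', 'data': 'dane'}
print(inject_translations(['The data set is large.'], toy_dict))
# Prints 0 (progress), "found 2 dictionary matches." and then:
# ['the data set zbiór danych is large']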