init

This commit is contained in: commit bcb0975e8c

download.py (new file, 130 lines)
@@ -0,0 +1,130 @@
########################################################################
#
# Functions for downloading and extracting data-files from the internet.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################

import sys
import os
import urllib.request
import tarfile
import zipfile

########################################################################


def _print_download_progress(count, block_size, total_size):
    """
    Function used for printing the download progress.
    Used as a call-back function in maybe_download_and_extract().
    """

    # Percentage completion.
    pct_complete = float(count * block_size) / total_size

    # Limit it because rounding errors may cause it to exceed 100%.
    pct_complete = min(1.0, pct_complete)

    # Status-message. Note the \r which means the line should overwrite itself.
    msg = "\r- Download progress: {0:.1%}".format(pct_complete)

    # Print it.
    sys.stdout.write(msg)
    sys.stdout.flush()


########################################################################


def download(base_url, filename, download_dir):
    """
    Download the given file if it does not already exist in the download_dir.

    :param base_url: The internet URL without the filename.
    :param filename: The filename that will be added to the base_url.
    :param download_dir: Local directory for storing the file.
    :return: Nothing.
    """

    # Path for local file.
    save_path = os.path.join(download_dir, filename)

    # Check if the file already exists, otherwise we need to download it now.
    if not os.path.exists(save_path):
        # Check if the download directory exists, otherwise create it.
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

        print("Downloading", filename, "...")

        # Download the file from the internet.
        url = base_url + filename
        file_path, _ = urllib.request.urlretrieve(url=url,
                                                  filename=save_path,
                                                  reporthook=_print_download_progress)

        print(" Done!")


def maybe_download_and_extract(url, download_dir):
    """
    Download and extract the data if it doesn't already exist.
    Assumes the url is a tar-ball file.

    :param url:
        Internet URL for the tar-file to download.
        Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

    :param download_dir:
        Directory where the downloaded file is saved.
        Example: "data/CIFAR-10/"

    :return:
        Nothing.
    """

    # Filename for saving the file downloaded from the internet.
    # Use the filename from the URL and add it to the download_dir.
    filename = url.split('/')[-1]
    file_path = os.path.join(download_dir, filename)

    # Check if the file already exists.
    # If it exists then we assume it has also been extracted,
    # otherwise we need to download and extract it now.
    if not os.path.exists(file_path):
        # Check if the download directory exists, otherwise create it.
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

        # Download the file from the internet.
        file_path, _ = urllib.request.urlretrieve(url=url,
                                                  filename=file_path,
                                                  reporthook=_print_download_progress)

        print()
        print("Download finished. Extracting files.")

        if file_path.endswith(".zip"):
            # Unpack the zip-file.
            zipfile.ZipFile(file=file_path, mode="r").extractall(download_dir)
        elif file_path.endswith((".tar.gz", ".tgz")):
            # Unpack the tar-ball.
            tarfile.open(name=file_path, mode="r:gz").extractall(download_dir)

        print("Done.")
    else:
        print("Data has apparently already been downloaded and unpacked.")


########################################################################
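
A minimal usage sketch for this module (not part of the commit; the URL and
directory are the examples given in the docstring of maybe_download_and_extract()
above, and the module is assumed to be importable as "download", as europarl.py
below does):

import download

# Fetch and unpack CIFAR-10 unless it is already present locally.
download.maybe_download_and_extract(
    url="https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz",
    download_dir="data/CIFAR-10/")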
europarl.py (new file, 136 lines)
@@ -0,0 +1,136 @@
########################################################################
#
# Functions for downloading the Europarl data-set from the internet
# and loading it into memory. This data-set is used for translation
# between English and most European languages.
#
# http://www.statmt.org/europarl/
#
# Implemented in Python 3.6
#
# Usage:
# 1) Set the variable data_dir with the desired storage directory.
# 2) Determine the language-code to use e.g. "da" for Danish.
# 3) Call maybe_download_and_extract() to download the data-set
#    if it is not already located in the given data_dir.
# 4) Call load_data(english=True) and load_data(english=False)
#    to load the two data-files.
# 5) Use the returned data in your own program.
#
# Format:
# The Europarl data-set contains millions of text-pairs between English
# and most European languages. The data is stored in two text-files.
# The data is returned as lists of strings by the load_data() function.
#
# The list of currently supported languages and their codes is as follows:
#
# bg - Bulgarian
# cs - Czech
# da - Danish
# de - German
# el - Greek
# es - Spanish
# et - Estonian
# fi - Finnish
# fr - French
# hu - Hungarian
# it - Italian
# lt - Lithuanian
# lv - Latvian
# nl - Dutch
# pl - Polish
# pt - Portuguese
# ro - Romanian
# sk - Slovak
# sl - Slovene
# sv - Swedish
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2018 by Magnus Erik Hvass Pedersen
#
########################################################################

import os
import download

########################################################################

# Directory where you want to download and save the data-set.
# Set this before you start calling any of the functions below.
data_dir = "data/europarl/"

# Base-URL for the data-sets on the internet.
data_url = "http://www.statmt.org/europarl/v7/"


########################################################################
# Public functions that you may call to download the data-set from
# the internet and load the data into memory.


def maybe_download_and_extract(language_code="da"):
    """
    Download and extract the Europarl data-set if the data-file doesn't
    already exist in data_dir. The data-set is for translating between
    English and the given language-code (e.g. 'da' for Danish, see the
    list of available language-codes above).
    """

    # Create the full URL for the file with this data-set.
    url = data_url + language_code + "-en.tgz"

    download.maybe_download_and_extract(url=url, download_dir=data_dir)


def load_data(english=True, language_code="da", start="", end=""):
    """
    Load the data-file for either the English-language texts or
    for the other language (e.g. "da" for Danish).

    All lines of the data-file are returned as a list of strings.

    :param english:
        Boolean whether to load the data-file for
        English (True) or the other language (False).

    :param language_code:
        Two-char code for the other language e.g. "da" for Danish.
        See list of available codes above.

    :param start:
        Prepend each line with this text e.g. "ssss " to indicate start of line.

    :param end:
        Append each line with this text e.g. " eeee" to indicate end of line.

    :return:
        List of strings with all the lines of the data-file.
    """

    if english:
        # Load the English data.
        filename = "europarl-v7.{0}-en.en".format(language_code)
    else:
        # Load the other language.
        filename = "europarl-v7.{0}-en.{0}".format(language_code)

    # Full path for the data-file.
    path = os.path.join(data_dir, filename)

    # Open and read all the contents of the data-file.
    with open(path, encoding="utf-8") as file:
        # Read the lines from the file, strip leading and trailing
        # whitespace, prepend the start-text and append the end-text.
        texts = [start + line.strip() + end for line in file]

    return texts


########################################################################
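
A short usage sketch following steps 1)-5) from the header comment (not part of
the commit; "da", "ssss " and " eeee" are the example values used in the
docstrings above):

import europarl

# Step 1) is handled by the module-level data_dir above.
# Steps 2)-3): download the Danish-English data-set if needed.
europarl.maybe_download_and_extract(language_code="da")

# Step 4): load both data-files, with optional start/end markers
# on the target side.
data_en = europarl.load_data(english=True, language_code="da")
data_da = europarl.load_data(english=False, language_code="da",
                             start="ssss ", end=" eeee")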
kompendium.tsv (new file, 1197 lines)
File diff suppressed because it is too large.
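
Although the diff for kompendium.tsv is suppressed, preprare_corpus.py below
reads it as a two-column, tab-separated term dictionary: the source term in
column 0 (used as the index) and its translation in column 1. A minimal sketch
of how it is loaded, mirroring the call in preprare_corpus.py:

import pandas as pd

# No header row; column 0 is the term, column 1 its translation.
df_dict = pd.read_csv('kompendium.tsv', sep='\t', header=None, index_col=0)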
preprare_corpus.py (new file, 99 lines)
@@ -0,0 +1,99 @@
import csv
import os

import pandas as pd
import europarl

from sklearn.model_selection import train_test_split


def inject_translations(corpus, dictionary):
    # For each sentence, collect the dictionary terms it contains and
    # splice in the translation of the longest matching term.
    llist = []
    corpus = strip_lower(corpus)
    ctr = 0
    for idx, sentence in enumerate(corpus):
        possible_translations = []
        for key in list(dictionary):
            # todo: approximate matching
            # Wrap both strings in spaces so only whole words match.
            if space_wrap(sentence).find(space_wrap(key)) != -1:
                possible_translations.append(key)
                ctr += 1  # counts every match found, not only injections

        if len(possible_translations) > 0:
            chosen_key = choose_translation(possible_translations)
            llist.append(add_translation(sentence, chosen_key, dictionary[chosen_key]))
        else:
            llist.append(sentence)
        # Progress report for large corpora.
        if idx % 50000 == 0:
            print(idx)
    print(f'injected {ctr} words.')
    return llist


def strip_lower(corpus):
    # Lower-case every sentence and remove punctuation.
    return [strip(sentence.lower()) for sentence in corpus]


def strip(sentence):
    # Remove all punctuation characters from the sentence.
    chars = '`~!@#$%^&*()-_=+[{]}\\|;:\'\",<.>/?'
    for char in chars:
        sentence = sentence.replace(char, '')
    return sentence


def add_translation(sen, key, value):
    # Insert the translation directly after the matched key. find() is
    # done on the space-wrapped sentence, so "+ len(key) - 1" maps the
    # match back to its end position in the raw sentence.
    return sen[:space_wrap(sen).find(key) + len(key) - 1] + ' ' + value + sen[space_wrap(sen).find(key) + len(key) - 1:]


def choose_translation(translations):
    # Prefer the candidate with the most words, i.e. the longest match.
    return sorted(translations, key=lambda x: len(x.split(' ')), reverse=True)[0]


def space_wrap(word):
    return ' ' + word + ' '


# Start/end-of-sequence markers (see europarl.load_data).
mark_start = 'ssss '
mark_end = ' eeee'
language_code = 'pl'

# Download the English-Polish Europarl data and load both sides.
europarl.maybe_download_and_extract(language_code=language_code)
data_src = europarl.load_data(english=True, language_code=language_code)
data_dest = europarl.load_data(english=False, language_code=language_code)

test_size = 0.25

# Split the term dictionary into train/test parts and save them.
df_dict = pd.read_csv('kompendium.tsv', sep='\t', header=None, index_col=0)
dtr, dts = train_test_split(df_dict, test_size=test_size, shuffle=True, random_state=42)
print('dictionary len: ', len(df_dict))
print('train dictionary len: ', len(dtr))
print('test dictionary len: ', len(dts))

pd.DataFrame(dtr).to_csv('data/dictionary_train.csv', header=False)
pd.DataFrame(dts).to_csv('data/dictionary_test.csv', header=False)

# Read the saved splits back as {term: translation} dicts.
dictionary_train = {}
with open('data/dictionary_train.csv', 'r') as file:
    for k, v in csv.reader(file):
        dictionary_train[k] = v

dictionary_test = {}
with open('data/dictionary_test.csv', 'r') as file:
    for k, v in csv.reader(file):
        dictionary_test[k] = v

# Split the parallel corpus with the same ratio and seed.
data_src_train, data_src_test, data_dest_train, data_dest_test = \
    train_test_split(data_src, data_dest, test_size=test_size, random_state=42)

print('data len: ', len(data_src))
print('train len: ', len(data_src_train))
print('test len: ', len(data_src_test))

# Inject dictionary translations into the English (source) side only.
data_src_train = inject_translations(data_src_train, dictionary_train)
data_src_test = inject_translations(data_src_test, dictionary_test)

# Make sure the output directory exists before writing the files.
os.makedirs('data/orig', exist_ok=True)

pd.DataFrame(data_src_train).to_csv('data/orig/train.en', header=False, index=False)
pd.DataFrame(data_src_test).to_csv('data/orig/test.en', header=False, index=False)
pd.DataFrame(data_dest_train).to_csv('data/orig/train.pl', header=False, index=False)
pd.DataFrame(data_dest_test).to_csv('data/orig/test.pl', header=False, index=False)
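
A hypothetical worked example of the splicing performed by inject_translations()
and add_translation() above (the sentence and dictionary entry are made up, and
the helper functions are assumed to be in scope):

sentence = 'the quick brown fox'
key = 'quick'      # hypothetical matched dictionary term
value = 'szybki'   # hypothetical translation of that term

# space_wrap() pads with spaces so only whole words match; its find()
# result is offset by the leading space, hence "+ len(key) - 1" lands
# just past the matched key inside the raw sentence.
print(add_translation(sentence, key, value))
# -> 'the quick szybki brown fox'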