diff --git a/PH.py b/PH.py
index fdeb1a5..e2f2254 100644
--- a/PH.py
+++ b/PH.py
@@ -34,21 +34,7 @@ IN_LOOKUP = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'
              'u', 'v', 'w', 'x', 'y', 'z', '$', '^']
 
 
-def extract_alphabet():
-    """
-    This function has been used to extract the alphabet but then it was hard-coded directly into
-    code in order to speed up loading time
-    """
-    with open(DATA_FILE) as f:
-        in_alph = set()
-        out_alph = set()
-        for line in f:
-            text, phonemes = line.strip().split("\t")
-            phonemes = phonemes.split(",")[0]
-            for letter in phonemes:
-                out_alph.add(letter)
-                in_alph.add(letter)
-    return in_alph, out_alph
+
 
 
 IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)}
diff --git a/create_dataset.py b/create_dataset.py
new file mode 100644
index 0000000..d71c3c6
--- /dev/null
+++ b/create_dataset.py
@@ -0,0 +1,51 @@
+"""Download the en_US IPA dictionary and build the training dataset.
+
+Writes three files into the working directory:
+
+* ``preprocessed.tsv`` - tab-separated ``^word$`` / ``^phonemes$`` pairs,
+* ``in_alphabet`` - every character seen in the cleaned words,
+* ``out_alphabet`` - every character seen in the cleaned transcriptions.
+"""
+import re
+
+import requests
+
+URL = 'https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/'
+DATA_FILE = 'en_US.txt'
+PREPROCESSED = 'preprocessed.tsv'
+
+# Slashes, apostrophes and the IPA stress marks are dropped from phonemes.
+PHONEME_NOISE = re.compile(r'[/\'ˈˌ]')
+# Anything that is not a lowercase ASCII letter is dropped from the word.
+TEXT_NOISE = re.compile(r'[^a-z]')
+
+# Fail loudly on an HTTP error instead of saving an error page as the dataset.
+response = requests.get(URL + DATA_FILE, timeout=60)
+response.raise_for_status()
+with open(DATA_FILE, 'wb') as raw:
+    raw.write(response.content)
+
+in_alph = set()
+out_alph = set()
+# The transcriptions are IPA, so the encoding must not depend on the locale.
+with open(DATA_FILE, encoding='utf-8') as f, \
+        open(PREPROCESSED, 'w', encoding='utf-8') as p:
+    for line in f:
+        text, phonemes = line.strip().split("\t")
+        # Keep only the first of several comma-separated pronunciations.
+        phonemes = phonemes.split(",")[0]
+        # '^' and '$' mark the start and the end of each sequence.
+        phonemes = '^' + PHONEME_NOISE.sub('', phonemes) + '$'
+        text = '^' + TEXT_NOISE.sub('', text.strip()) + '$'
+        out_alph.update(phonemes)
+        in_alph.update(text)
+        p.write(text + '\t' + phonemes + '\n')
+
+print(in_alph)
+print(out_alph)
+# Sets iterate in a hash-dependent order; sort so every run writes the same
+# alphabet files.
+with open('in_alphabet', 'w', encoding='utf-8') as p:
+    p.write(''.join(sorted(in_alph)))
+with open('out_alphabet', 'w', encoding='utf-8') as p:
+    p.write(''.join(sorted(out_alph)))