split functionality into separate files

This commit is contained in:
Alagris 2021-03-22 13:03:07 +01:00
parent a72ad9e743
commit faaa014167
2 changed files with 30 additions and 15 deletions

16
PH.py
View File

@ -34,21 +34,7 @@ IN_LOOKUP = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'
'u', 'v', 'w', 'x', 'y', 'z', '$', '^'] 'u', 'v', 'w', 'x', 'y', 'z', '$', '^']
def extract_alphabet(data_file=None):
    """Collect the characters used by the spellings and by the phonemes.

    This function was used once to extract the alphabets; the result was then
    hard-coded directly into the code in order to speed up loading time.

    Every non-blank line of the data file must have the form
    ``text<TAB>phonemes``. ``phonemes`` may hold several comma-separated
    variants; only the first one is inspected.

    Args:
        data_file: Path of the tab-separated data file. When omitted, the
            module-level ``DATA_FILE`` is used.

    Returns:
        A tuple ``(in_alph, out_alph)`` of sets: the characters found in the
        text column and the characters found in the first phoneme variant.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    if data_file is None:
        data_file = DATA_FILE
    in_alph = set()
    out_alph = set()
    # The phoneme column holds non-ASCII symbols, so the platform default
    # encoding must not be relied upon.
    # NOTE(review): assumes the data file is UTF-8 encoded - confirm.
    with open(data_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            text, phonemes = line.split("\t")
            # The input alphabet comes from the spelling, the output alphabet
            # from the phonemes.
            in_alph.update(text)
            out_alph.update(phonemes.split(",")[0])
    return in_alph, out_alph
IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)} IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)}

29
create_dataset.py Normal file
View File

@ -0,0 +1,29 @@
import re
import requests
# Location of the raw IPA dictionary and the names of the files this script
# reads and writes (all relative to the current working directory).
URL = 'https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/'
DATA_FILE = 'en_US.txt'
PREPROCESSED = 'preprocessed.tsv'

# Characters stripped from a phoneme string: slashes, apostrophes and the
# primary/secondary stress marks.
PHONEME_NOISE = re.compile(r'[/\'ˈˌ]')
# Everything except lowercase ASCII letters is stripped from the spelling.
TEXT_NOISE = re.compile(r'[^a-z]')


def download(url, path):
    """Download ``url`` and store the response body in the file ``path``.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never saved in place of the dictionary.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed.
    with open(path, 'wb') as f:
        f.write(response.content)


def preprocess_entry(line):
    """Turn one raw dictionary line into a ``(text, phonemes)`` pair.

    The line must have the form ``text<TAB>phonemes`` where ``phonemes`` may
    list several comma-separated variants; only the first variant is kept.
    Both results are wrapped in the ``^`` (start) and ``$`` (end) markers.

    Raises:
        ValueError: If the line does not split into exactly two
            tab-separated fields.
    """
    text, phonemes = line.strip().split("\t")
    phonemes = phonemes.split(",")[0]
    phonemes = '^' + PHONEME_NOISE.sub('', phonemes) + '$'
    text = '^' + TEXT_NOISE.sub('', text.strip()) + '$'
    return text, phonemes


def main():
    """Download the dictionary and write the dataset and alphabet files.

    Writes ``PREPROCESSED`` (one ``text<TAB>phonemes`` row per entry) plus the
    files ``in_alphabet`` and ``out_alphabet`` holding every character that
    occurs in the text column and in the phoneme column respectively.
    """
    download(URL + DATA_FILE, DATA_FILE)
    in_alph = set()
    out_alph = set()
    # The data contains IPA symbols, so every text file is handled as UTF-8
    # regardless of the platform default encoding.
    with open(DATA_FILE, encoding='utf-8') as f, \
            open(PREPROCESSED, 'w', encoding='utf-8') as p:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            text, phonemes = preprocess_entry(line)
            in_alph.update(text)
            out_alph.update(phonemes)
            p.write(text + '\t' + phonemes + '\n')
    print(in_alph)
    print(out_alph)
    # Sets have no stable iteration order; sorting makes the alphabet files
    # reproducible from one run to the next.
    with open('in_alphabet', 'w', encoding='utf-8') as p:
        p.write(''.join(sorted(in_alph)))
    with open('out_alphabet', 'w', encoding='utf-8') as p:
        p.write(''.join(sorted(out_alph)))


if __name__ == '__main__':
    main()