split functionality into separate files
This commit is contained in:
parent
a72ad9e743
commit
faaa014167
16
PH.py
16
PH.py
@@ -34,21 +34,7 @@ IN_LOOKUP = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'
|
||||
'u', 'v', 'w', 'x', 'y', 'z', '$', '^']
|
||||
|
||||
|
||||
def extract_alphabet():
    """
    Collect the input and output alphabets from the dictionary in DATA_FILE.

    Every non-empty line of DATA_FILE must consist of a word and its
    comma-separated pronunciations, separated by a single tab. Only the
    first pronunciation of each word is taken into account.

    This function was used once to extract the alphabets; the result was
    then hard-coded directly into the code in order to speed up loading time.

    Returns:
        A tuple ``(in_alph, out_alph)`` of two sets: the characters that
        occur in the words and the characters that occur in the (first)
        pronunciations.

    Raises:
        ValueError: if a non-empty line does not contain exactly one tab.
    """
    in_alph = set()
    out_alph = set()
    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(DATA_FILE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line carries no entry; skip it instead of failing
                # on the tuple unpacking below.
                continue
            text, phonemes = line.split("\t")
            phonemes = phonemes.split(",")[0]
            out_alph.update(phonemes)
            # The input alphabet is made of the characters of the written
            # word, not of the characters of its pronunciation.
            in_alph.update(text)
    return in_alph, out_alph
|
||||
|
||||
|
||||
|
||||
# Reverse lookup: maps every symbol of IN_LOOKUP to its position in that list.
IN_ALPHABET = {symbol: position for position, symbol in enumerate(IN_LOOKUP)}
|
||||
|
29
create_dataset.py
Normal file
29
create_dataset.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import re
|
||||
|
||||
import requests
|
||||
|
||||
# Location of the ipa-dict data files and the name of the file to fetch; the
# same name is used for the local raw copy.
URL = 'https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/'
DATA_FILE = 'en_US.txt'

# Download the raw dictionary. Fail loudly on an HTTP error instead of saving
# an error page as data, and close the file handle deterministically.
_response = requests.get(URL + DATA_FILE, timeout=60)
_response.raise_for_status()
with open(DATA_FILE, 'wb') as raw:
    raw.write(_response.content)

PREPROCESSED = 'preprocessed.tsv'

# Compiled once, outside of the per-line loop.
_PHONEME_NOISE = re.compile(r'[/\'ˈˌ]')  # slashes, apostrophes, stress marks
_NON_LETTER = re.compile(r'[^a-z]')      # everything except lowercase a-z

in_alph = set()
out_alph = set()

# Rewrite every "word<TAB>pronunciations" line as "^word$<TAB>^phonemes$",
# keeping only the first pronunciation, and collect both alphabets on the way.
# Both files are handled as UTF-8 explicitly, independent of the platform
# default encoding.
with open(DATA_FILE, encoding='utf-8') as f, \
        open(PREPROCESSED, 'w', encoding='utf-8') as p:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line carries no entry; skip it instead of failing on
            # the tuple unpacking below.
            continue
        text, phonemes = line.split("\t")
        phonemes = phonemes.split(",")[0]
        # '^' and '$' mark the start and the end of each sequence.
        phonemes = '^' + _PHONEME_NOISE.sub('', phonemes) + '$'
        text = '^' + _NON_LETTER.sub('', text) + '$'
        out_alph.update(phonemes)
        in_alph.update(text)
        p.write(text + '\t' + phonemes + '\n')

print(in_alph)
print(out_alph)

# Sets have no stable iteration order between runs, so the symbols are sorted
# to make the generated alphabet files reproducible.
with open('in_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(in_alph)))
with open('out_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(out_alph)))
|
Loading…
Reference in New Issue
Block a user