split functionality into separate files
This commit is contained in:
parent
a72ad9e743
commit
faaa014167
16
PH.py
16
PH.py
@@ -34,21 +34,7 @@ IN_LOOKUP = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'
|
|||||||
'u', 'v', 'w', 'x', 'y', 'z', '$', '^']
|
'u', 'v', 'w', 'x', 'y', 'z', '$', '^']
|
||||||
|
|
||||||
|
|
||||||
def extract_alphabet(data_file=None):
    """
    Collect the input (spelling) and output (phoneme) alphabets from the data file.

    This function has been used to extract the alphabet, but the result was then
    hard-coded directly into the code in order to speed up loading time.

    Every non-blank line is expected to look like ``text<TAB>phonemes``. When the
    phoneme field holds several comma-separated transcriptions only the first
    one is considered.

    :param data_file: path of the file to scan; when omitted the module-level
        ``DATA_FILE`` is used, which keeps the original zero-argument call working.
    :return: tuple ``(in_alph, out_alph)`` of character sets: the characters seen
        in the text column and the characters seen in the phoneme column.
    :raises ValueError: if a non-blank line does not split into exactly two
        tab-separated fields.
    """
    if data_file is None:
        data_file = DATA_FILE

    in_alph = set()
    out_alph = set()
    # NOTE(review): the phoneme column is non-ASCII; UTF-8 is assumed so the
    # result does not depend on the platform's default encoding - confirm.
    with open(data_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line carries no data and would break the unpacking below.
                continue
            text, phonemes = line.split("\t")
            phonemes = phonemes.split(",")[0]
            # The input alphabet comes from the spelling, the output alphabet
            # from the transcription (previously both were fed from phonemes).
            in_alph.update(text)
            out_alph.update(phonemes)
    return in_alph, out_alph
|
|
||||||
|
|
||||||
|
|
||||||
IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)}
|
IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)}
|
||||||
|
29
create_dataset.py
Normal file
29
create_dataset.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Location of the raw IPA dictionary and the local name it is saved under.
URL = 'https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/'
DATA_FILE = 'en_US.txt'

# Download the raw dictionary. Fail loudly on an HTTP error instead of saving
# the error page as data, and close the file handle deterministically.
response = requests.get(URL + DATA_FILE, timeout=60)
response.raise_for_status()
with open(DATA_FILE, 'wb') as raw:
    raw.write(response.content)

# Output of the preprocessing step: one "text<TAB>phonemes" pair per line,
# both wrapped in '^' (start) and '$' (end) markers.
PREPROCESSED = 'preprocessed.tsv'

in_alph = set()   # characters that occur in the cleaned spellings
out_alph = set()  # characters that occur in the cleaned transcriptions

# The transcriptions contain IPA symbols, so the encoding is given explicitly
# rather than left to the platform default.
with open(DATA_FILE, encoding='utf-8') as f, \
        open(PREPROCESSED, 'w', encoding='utf-8') as p:
    for line in f:
        line = line.strip()
        if not line:
            # Nothing to split on a blank line.
            continue
        text, phonemes = line.split("\t")
        # Keep only the first of several comma-separated transcriptions.
        phonemes = phonemes.split(",")[0]
        # Drop slashes, apostrophes and stress marks from the transcription.
        phonemes = '^' + re.sub(r'[/\'ˈˌ]', '', phonemes) + '$'
        # Keep only lowercase ASCII letters of the spelling.
        text = '^' + re.sub(r'[^a-z]', '', text.strip()) + '$'
        out_alph.update(phonemes)
        in_alph.update(text)
        p.write(text + '\t' + phonemes + '\n')

print(in_alph)
print(out_alph)

# Sets have no stable order; sort so repeated runs write identical files.
with open('in_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(in_alph)))
with open('out_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(out_alph)))
|
Loading…
Reference in New Issue
Block a user