2021-03-22 13:03:07 +01:00
|
|
|
import re
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
# Base URL of the ipa-dict data directory and the dictionary file to fetch.
URL = 'https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/'
DATA_FILE = 'en_US.txt'

# Download the dictionary and store the raw bytes locally under DATA_FILE.
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being saved as data.
response = requests.get(URL + DATA_FILE, timeout=60)
response.raise_for_status()
with open(DATA_FILE, 'wb') as data_file:
    data_file.write(response.content)
|
|
|
|
|
|
|
|
# Output file: one "<text>\t<phonemes>" pair per line.
PREPROCESSED = 'preprocessed.tsv'

# Characters removed from the phoneme string: slashes, apostrophes and the
# IPA marks ˈ / ˌ.
_PHONEME_NOISE = re.compile(r'[/\'ˈˌ]')
# Everything except lowercase ASCII letters is removed from the text side.
_NON_LETTER = re.compile(r'[^a-z]')

# Sets of every character seen on the text side / phoneme side.
in_alph = set()
out_alph = set()

# The dictionary contains non-ASCII IPA symbols, so the encoding is given
# explicitly instead of relying on the platform default.
with open(DATA_FILE, encoding='utf-8') as f, \
        open(PREPROCESSED, 'w', encoding='utf-8') as p:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines carry no entry and would break the unpacking below.
            continue
        text, phonemes = line.split("\t")
        # Keep only the first of the comma-separated transcriptions.
        phonemes = phonemes.split(",")[0]
        phonemes = _PHONEME_NOISE.sub('', phonemes)
        text = _NON_LETTER.sub('', text.strip())
        # Iterating a string yields its characters, so update() records each one.
        out_alph.update(phonemes)
        in_alph.update(text)
        p.write(f'{text}\t{phonemes}\n')
|
|
|
|
|
2021-04-25 20:55:45 +02:00
|
|
|
|
2021-03-22 13:03:07 +01:00
|
|
|
# Show the collected input and output character sets.
print(in_alph)
print(out_alph)

# Persist each alphabet as a single string of its characters.
# The characters are sorted because set iteration order for strings can change
# between interpreter runs (hash randomization), which would make the files
# differ from run to run for identical input. UTF-8 is given explicitly
# because the output alphabet contains non-ASCII IPA symbols.
with open('in_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(in_alph)))

with open('out_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(out_alph)))
|