# ium_434749/create_dataset.py
# (repository listing: 31 lines, 887 B, Python)

import re
import requests
# Build a text -> phoneme dataset from a downloaded pronunciation word list.
#
# Steps performed when this script runs:
#   1. download URL + DATA_FILE and store it locally as DATA_FILE;
#   2. write one cleaned "text<TAB>phonemes" row per input line to
#      PREPROCESSED;
#   3. print the sets of characters seen on each side and store them in the
#      files 'in_alphabet' and 'out_alphabet'.
URL = 'https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/'
DATA_FILE = 'en_US.txt'

# Fetch the raw list. raise_for_status() aborts on an HTTP error instead of
# saving the error page as if it were data, the timeout keeps a stalled
# connection from hanging the run, and the with-block guarantees the file is
# flushed and closed before it is reopened for reading below.
response = requests.get(URL + DATA_FILE, timeout=60)
response.raise_for_status()
with open(DATA_FILE, 'wb') as raw:
    raw.write(response.content)

PREPROCESSED = 'preprocessed.tsv'

# Compiled once instead of on every line.
# Removed from a transcription: slashes, apostrophes and the two stress marks.
_PHONEME_NOISE = re.compile(r'[/\'ˈˌ]')
# Removed from the spelling: everything except lowercase ASCII letters
# (uppercase letters are dropped, not lowercased).
_NON_LOWERCASE = re.compile(r'[^a-z]')

in_alph = set()   # characters occurring in the cleaned spellings
out_alph = set()  # characters occurring in the cleaned transcriptions

# The transcriptions contain non-ASCII symbols, so the encoding is pinned
# rather than left to the platform default (assumes the download is UTF-8).
with open(DATA_FILE, encoding='utf-8') as f, \
        open(PREPROCESSED, 'w', encoding='utf-8') as p:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing one) instead of crashing.
            continue

        fields = line.split('\t')
        if len(fields) != 2:
            # Same exception type as a failed tuple unpack, but it says where.
            raise ValueError(
                '{}:{}: expected 2 tab-separated fields, got {}'.format(
                    DATA_FILE, line_no, len(fields)))
        text, phonemes = fields

        # Only the first of several comma-separated transcriptions is kept.
        phonemes = _PHONEME_NOISE.sub('', phonemes.split(',')[0])
        text = _NON_LOWERCASE.sub('', text)

        out_alph.update(phonemes)
        in_alph.update(text)
        p.write(text + '\t' + phonemes + '\n')

print(in_alph)
print(out_alph)

# Sets iterate in arbitrary order, which can differ between runs; sorting
# makes the alphabet files reproducible.
with open('in_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(in_alph)))
with open('out_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(out_alph)))