"""Download the en_US IPA dictionary from ipa-dict and preprocess it.

The script performs three steps, all at module level:

1. Download ``en_US.txt`` from ``URL`` and store it as ``DATA_FILE``.
2. Rewrite every entry as ``<letters>\t<phonemes>`` into ``PREPROCESSED``.
   Each input line is expected to hold a spelling and its transcription(s)
   separated by a single tab; when several comma-separated transcriptions
   are present only the first one is kept.
3. Write the set of input letters to ``in_alphabet`` and the set of output
   phoneme symbols to ``out_alphabet``.
"""
import re

import requests

URL = 'https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/'
DATA_FILE = 'en_US.txt'
PREPROCESSED = 'preprocessed.tsv'

# Compiled once instead of being looked up on every dictionary line.
# Removed from a transcription: slashes, apostrophes and the two stress marks.
_PHONEME_NOISE = re.compile(r'[/\'ˈˌ]')
# Everything except lowercase ASCII letters is removed from the spelling.
_NON_LETTER = re.compile(r'[^a-z]')

# Fail loudly on an HTTP error instead of saving an error page as the
# dictionary, and never wait forever on a stalled connection.
response = requests.get(URL + DATA_FILE, timeout=30)
response.raise_for_status()
with open(DATA_FILE, 'wb') as raw:
    raw.write(response.content)

in_alph = set()
out_alph = set()

# The data contains IPA symbols, so the encoding is pinned to UTF-8 rather
# than left to the platform default (which is not UTF-8 everywhere).
with open(DATA_FILE, encoding='utf-8') as f, \
        open(PREPROCESSED, 'w', encoding='utf-8') as p:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line carries no entry; skip it rather than crash on
            # the tuple unpacking below.
            continue
        text, phonemes = line.split('\t')
        # Keep only the first of several comma-separated transcriptions.
        phonemes = _PHONEME_NOISE.sub('', phonemes.split(',')[0])
        text = _NON_LETTER.sub('', text)
        out_alph.update(phonemes)
        in_alph.update(text)
        p.write(text + '\t' + phonemes + '\n')

print(in_alph)
print(out_alph)

# Sets of strings iterate in an order that varies between interpreter runs,
# so the symbols are sorted to make the alphabet files reproducible.
with open('in_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(in_alph)))
with open('out_alphabet', 'w', encoding='utf-8') as p:
    p.write(''.join(sorted(out_alph)))