split functionality into separate files

2021-03-22 13:03:07 +01:00 · 2021-03-22 13:03:07 +01:00 · faaa014167
commit faaa014167
parent a72ad9e743
2 changed files with 30 additions and 15 deletions
--- a/PH.py
+++ b/PH.py
@@ -34,21 +34,7 @@ IN_LOOKUP = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'
             'u', 'v', 'w', 'x', 'y', 'z', '$', '^']


-def extract_alphabet():
-    """
-    This function has been used to extract the alphabet but then it was hard-coded directly into
-    code in order to speed up loading time
-    """
-    with open(DATA_FILE) as f:
-        in_alph = set()
-        out_alph = set()
-        for line in f:
-            text, phonemes = line.strip().split("\t")
-            phonemes = phonemes.split(",")[0]
-            for letter in phonemes:
-                out_alph.add(letter)
-                in_alph.add(letter)
-        return in_alph, out_alph
+


 IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)}
--- a/create_dataset.py
+++ b/create_dataset.py
@@ -0,0 +1,29 @@
+import re
+
+import requests
+
+URL = 'https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/'
+DATA_FILE = 'en_US.txt'
+open(DATA_FILE, 'wb').write(requests.get(URL + DATA_FILE).content)
+
+PREPROCESSED = 'preprocessed.tsv'
+with open(DATA_FILE) as f, open(PREPROCESSED, 'w+') as p:
+    in_alph = set()
+    out_alph = set()
+    for line in f:
+        text, phonemes = line.strip().split("\t")
+        phonemes = phonemes.split(",")[0]
+        phonemes = '^' + re.sub(r'[/\'ˈˌ]', '', phonemes) + '$'
+        text = '^' + re.sub(r'[^a-z]', '', text.strip()) + '$'
+        for letter in phonemes:
+            out_alph.add(letter)
+        for letter in text:
+            in_alph.add(letter)
+        p.write(text + '\t' + phonemes+'\n')
+
+print(in_alph)
+print(out_alph)
+with open('in_alphabet', 'w+') as p:
+    p.write(''.join(in_alph))
+with open('out_alphabet', 'w+') as p:
+    p.write(''.join(out_alph))