Created vocabulary

2020-04-02 20:01:33 +02:00 · 2020-04-02 20:01:33 +02:00 · 2dcb39fdde
commit 2dcb39fdde
parent a546cd9958
1 changed file with 22 additions and 10 deletions
--- a/code_regression.py
+++ b/code_regression.py
@@ -4,19 +4,31 @@ import pickle
 import re
 vocabulary=[]
-file_to_save=open("test.tsv","wb")
+#word_to_index_mapping={}
 #index_to_word_mapping={}
-def define_vocabulary(file_to_learn_new_words):
+file_to_save=open("test.tsv","w",encoding='utf-8')
-    with open(file_to_learn_new_words,encoding='utf-8') as file:
+def define_vocabulary(file_to_learn_new_words,expected_path):
-        for line in file:
+    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
-            #for word in re.findall(r"([a-zA-Z\-]+)", line):
+    with open(file_to_learn_new_words, encoding='utf-8') as in_file, open(expected_path, encoding='utf-8')  as  expected_file:
-            for word in line.split():
+        for line, exp in zip(in_file, expected_file):
-                vocabulary.append(word)
+            class_ = exp.rstrip('\n').replace(' ', '')
-    return vocabulary
+            text, timestamp = line.rstrip('\n').split('\t')
+            tokens = text.lower().split(' ')
+            for token in tokens:
+                if class_ == 'P':
+                    word_counts['paranormal'][token] += 1
+                elif class_ == 'S':
+                    word_counts['skeptic'][token] += 1
+    return word_counts
 def main():
-    vocabulary=define_vocabulary('train/in.tsv')
+    vocabulary=define_vocabulary('train/in.tsv','train/expected.tsv')
-    file_to_save=vocabulary
+    ix=1
+    #for word in vocabulary:
+        #word_to_index_mapping(word)=ix
+        #index_to_word_mapping(ix)=word
 main()