diff --git a/code_regression.py b/code_regression.py
index d31c8ff..34afa61 100644
--- a/code_regression.py
+++ b/code_regression.py
@@ -4,19 +4,31 @@ import pickle
 import re
 
 vocabulary=[]
-file_to_save=open("test.tsv","wb")
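+# TODO: word_to_index_mapping = {}  (word -> index; placeholder, not populated yet)
+# TODO: index_to_word_mapping = {}  (index -> word; placeholder, not populated yet)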
 
-def define_vocabulary(file_to_learn_new_words):
-    with open(file_to_learn_new_words,encoding='utf-8') as file:
-        for line in file:
-            #for word in re.findall(r"([a-zA-Z\-]+)", line):
-            for word in line.split():
-                vocabulary.append(word)
-    return vocabulary
+file_to_save=open("test.tsv","w",encoding='utf-8')  # NOTE(review): opened for writing at import time (mode "w" truncates test.tsv); nothing visible in this hunk writes to or closes it
+def define_vocabulary(file_to_learn_new_words, expected_path):
+    """Return {'paranormal': counts, 'skeptic': counts}: lower-cased token counts for rows labelled 'P' / 'S'."""
+    from collections import defaultdict  # function-scope import: the file's import block lies outside this hunk
+    class_names = {'P': 'paranormal', 'S': 'skeptic'}
+    word_counts = {name: defaultdict(int) for name in class_names.values()}
+    with open(file_to_learn_new_words, encoding='utf-8') as in_file, open(expected_path, encoding='utf-8') as expected_file:
+        for line, exp in zip(in_file, expected_file):  # rows are paired by position; zip stops at the shorter file
+            class_name = class_names.get(exp.rstrip('\n').replace(' ', ''))
+            if class_name is not None:  # rows carrying any other label are ignored
+                # partition() keeps only the first tab-separated column, so a row with a missing or extra column no longer raises ValueError
+                for token in line.rstrip('\n').partition('\t')[0].lower().split(' '):
+                    word_counts[class_name][token] += 1
+    return word_counts
 
 def main():
-    vocabulary=define_vocabulary('train/in.tsv')
-    file_to_save=vocabulary
+    """Build the per-class word counts from the training files; the result is not used further yet."""
+    vocabulary=define_vocabulary('train/in.tsv','train/expected.tsv')
+    # TODO: enumerate the counted words (starting at 1) and fill both lookup tables using
+    #       subscript assignment: word_to_index_mapping[word] = ix ; index_to_word_mapping[ix] = word
+    #       (iterating `vocabulary` directly would yield the two class names, not the words).
+
 
 main()