"""Build per-class token counts from a labelled, tab-separated text corpus."""

import math
import pickle
import re
from collections import defaultdict

# Module-level placeholder kept for backward compatibility; main() does not
# store its result here.
vocabulary = []

# TODO: build word_to_index_mapping / index_to_word_mapping (word <-> integer
# index, presumably starting at 1) from the counted vocabulary.


def define_vocabulary(file_to_learn_new_words, expected_path):
    """Count how often each token occurs in each class of the corpus.

    The two files are read in lockstep: line *i* of ``expected_path`` is the
    label of line *i* of ``file_to_learn_new_words``.

    Args:
        file_to_learn_new_words: Path to a UTF-8 file whose lines hold the
            text in the first tab-separated field. Any further fields
            (the original code unpacked a second one as a timestamp) are
            ignored.
        expected_path: Path to a UTF-8 file with one label per line:
            ``P`` (counted under ``'paranormal'``) or ``S`` (counted under
            ``'skeptic'``). Spaces in the label are ignored; lines with any
            other label are skipped.

    Returns:
        A dict with the keys ``'paranormal'`` and ``'skeptic'``, each mapping
        to a ``defaultdict(int)`` of lower-cased token -> occurrence count.

    Raises:
        OSError: If either file cannot be opened.
    """
    label_to_class = {'P': 'paranormal', 'S': 'skeptic'}
    word_counts = {name: defaultdict(int) for name in label_to_class.values()}

    with open(file_to_learn_new_words, encoding='utf-8') as in_file, \
            open(expected_path, encoding='utf-8') as expected_file:
        # NOTE: zip() stops at the shorter file, so surplus lines in either
        # file are silently ignored.
        for line, exp in zip(in_file, expected_file):
            label = exp.rstrip('\n').replace(' ', '')
            class_name = label_to_class.get(label)
            if class_name is None:
                # Unknown label: nothing to count for this line.
                continue

            # Take only the first field so that a line with a missing or
            # extra tab-separated column does not abort the whole run.
            text = line.rstrip('\n').partition('\t')[0]

            counts = word_counts[class_name]
            for token in text.lower().split(' '):
                # Consecutive spaces (or empty text) yield '' from
                # split(' '); that is not a word, so do not count it.
                if token:
                    counts[token] += 1

    return word_counts


def main():
    """Count the words of the training set.

    Creates (or truncates) ``test.tsv`` in the working directory, then counts
    the tokens of ``train/in.tsv`` using the labels in ``train/expected.tsv``.

    Returns:
        The per-class word counts produced by ``define_vocabulary``.
    """
    # The output file is opened in a context manager so the handle is always
    # closed. Nothing is written to it yet; opening it in "w" mode keeps the
    # script's existing effect of creating/truncating the file.
    with open("test.tsv", "w", encoding='utf-8'):
        return define_vocabulary('train/in.tsv', 'train/expected.tsv')


# Run only when executed as a script, so importing this module performs no
# file I/O.
if __name__ == "__main__":
    main()