From 2dcb39fdde6b4637f7c3ce36aecdf688b3f314ac Mon Sep 17 00:00:00 2001 From: Bartusiak Date: Thu, 2 Apr 2020 20:01:33 +0200 Subject: [PATCH] Created vocabulary --- code_regression.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/code_regression.py b/code_regression.py index d31c8ff..34afa61 100644 --- a/code_regression.py +++ b/code_regression.py @@ -4,19 +4,31 @@ import pickle import re vocabulary=[] -file_to_save=open("test.tsv","wb") +#word_to_index_mapping={} +#index_to_word_mapping={} -def define_vocabulary(file_to_learn_new_words): - with open(file_to_learn_new_words,encoding='utf-8') as file: - for line in file: - #for word in re.findall(r"([a-zA-Z\-]+)", line): - for word in line.split(): - vocabulary.append(word) - return vocabulary +file_to_save=open("test.tsv","w",encoding='utf-8') +def define_vocabulary(file_to_learn_new_words,expected_path): + word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)} + with open(file_to_learn_new_words, encoding='utf-8') as in_file, open(expected_path, encoding='utf-8') as expected_file: + for line, exp in zip(in_file, expected_file): + class_ = exp.rstrip('\n').replace(' ', '') + text, timestamp = line.rstrip('\n').split('\t') + tokens = text.lower().split(' ') + for token in tokens: + if class_ == 'P': + word_counts['paranormal'][token] += 1 + elif class_ == 'S': + word_counts['skeptic'][token] += 1 + return word_counts def main(): - vocabulary=define_vocabulary('train/in.tsv') - file_to_save=vocabulary + vocabulary=define_vocabulary('train/in.tsv','train/expected.tsv') + ix=1 + #for word in vocabulary: + #word_to_index_mapping(word)=ix + #index_to_word_mapping(ix)=word + main()