Created vocabulary

This commit is contained in:
Bartusiak 2020-04-02 20:01:33 +02:00
parent a546cd9958
commit 2dcb39fdde

View File

@ -4,19 +4,31 @@ import pickle
import re import re
vocabulary=[] vocabulary=[]
file_to_save=open("test.tsv","wb") #word_to_index_mapping={}
#index_to_word_mapping={}
def define_vocabulary(file_to_learn_new_words): file_to_save=open("test.tsv","w",encoding='utf-8')
with open(file_to_learn_new_words,encoding='utf-8') as file: def define_vocabulary(file_to_learn_new_words,expected_path):
for line in file: word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
#for word in re.findall(r"([a-zA-Z\-]+)", line): with open(file_to_learn_new_words, encoding='utf-8') as in_file, open(expected_path, encoding='utf-8') as expected_file:
for word in line.split(): for line, exp in zip(in_file, expected_file):
vocabulary.append(word) class_ = exp.rstrip('\n').replace(' ', '')
return vocabulary text, timestamp = line.rstrip('\n').split('\t')
tokens = text.lower().split(' ')
for token in tokens:
if class_ == 'P':
word_counts['paranormal'][token] += 1
elif class_ == 'S':
word_counts['skeptic'][token] += 1
return word_counts
def main(): def main():
vocabulary=define_vocabulary('train/in.tsv') vocabulary=define_vocabulary('train/in.tsv','train/expected.tsv')
file_to_save=vocabulary ix=1
#for word in vocabulary:
#word_to_index_mapping(word)=ix
#index_to_word_mapping(ix)=word
main() main()