# Reconstructed from a git format-patch that had been collapsed onto one line:
#   commit:  a546cd9958b9a8b50795b4e876fc03d22efc43f8
#   author:  Bartusiak
#   date:    Thu, 2 Apr 2020 18:29:06 +0200
#   subject: [PATCH] Created vocabulary
#   change:  new file code_regression.py (22 insertions)
"""Build a word vocabulary from a UTF-8 text file and persist it to disk."""
from collections import defaultdict
import math
import pickle
import re

# Output path used by the original script (it was opened there in "wb" mode).
VOCABULARY_OUTPUT_PATH = "test.tsv"


def define_vocabulary(file_to_learn_new_words):
    """Return every whitespace-separated token found in a text file.

    Tokens are returned in file order and duplicates are kept. A fresh list
    is built on every call, so repeated calls do not accumulate words.

    Args:
        file_to_learn_new_words: Path of the file to read; decoded as UTF-8.

    Returns:
        A list of str tokens (empty when the file has no non-blank content).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    words = []
    with open(file_to_learn_new_words, encoding='utf-8') as file:
        for line in file:
            # Tokens are split on any whitespace (spaces, tabs, newlines).
            # An earlier, disabled variant restricted tokens to the regex
            # r"([a-zA-Z\-]+)" instead.
            words.extend(line.split())
    return words


def save_vocabulary(words, path=VOCABULARY_OUTPUT_PATH):
    """Serialize ``words`` to ``path`` with pickle, closing the file afterwards.

    Args:
        words: The vocabulary object to store.
        path: Destination file; it is created or truncated.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    with open(path, "wb") as output_file:
        pickle.dump(words, output_file)


def main():
    """Build the vocabulary from ``train/in.tsv`` and write it to disk."""
    vocabulary = define_vocabulary('train/in.tsv')
    # NOTE(review): the original only rebound a local name here
    # (`file_to_save = vocabulary`), so nothing was ever written. Pickling is
    # inferred from the unused `pickle` import and the binary "wb" open mode;
    # confirm that this is the intended on-disk format for test.tsv.
    save_vocabulary(vocabulary)


# Guarded so that importing this module neither reads nor writes any files.
if __name__ == "__main__":
    main()