From 51435aace9777e7baccbb474951acf33c74964ca Mon Sep 17 00:00:00 2001
From: Jakub Konieczny
Date: Tue, 18 Jan 2022 23:27:23 +0000
Subject: [PATCH] save test.py

---
Review note: the test.py hunk below was revised during review (output file
closed via a context manager, monotonic timer, added documentation).  The
blob id on the "index" line still refers to the originally submitted content.

 test.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 test.py

diff --git a/test.py b/test.py
new file mode 100644
index 0000000..92994ed
--- /dev/null
+++ b/test.py
@@ -0,0 +1,41 @@
+"""Lemmatize the training corpus with NLTK and time the run.
+
+Reads mt-summit-corpora/train/in.tsv line by line, tokenizes each line,
+lemmatizes every token with WordNet, prints the elapsed time in seconds
+and writes the lemmatized lines to the file "temp".
+"""
+import time
+
+import nltk
+from nltk.stem import WordNetLemmatizer
+
+# NOTE(review): nltk.word_tokenize() needs the 'punkt' models; the two
+# downloads below are presumably disabled because the data is already
+# installed locally - re-enable them on a fresh machine.
+# nltk.download('omw-1.4')
+# nltk.download('punkt')
+nltk.download('wordnet')
+
+wl = WordNetLemmatizer()
+
+# Monotonic clock: unlike time.time_ns(), it cannot jump when the system
+# time is adjusted in the middle of a long run.
+start_time = time.perf_counter_ns()
+filex = []
+with open('mt-summit-corpora/train/in.tsv', 'r') as file:
+    for line in file:
+        # Progress indicator, rewritten in place every 50000 lines.
+        if len(filex) % 50000 == 0:
+            print(len(filex), end='\r')
+        tokens = nltk.word_tokenize(line)
+        filex.append(' '.join(wl.lemmatize(token) for token in tokens))
+
+
+# The clock stops before the output is written, so the reported time
+# covers reading, tokenizing and lemmatizing only.
+stop = time.perf_counter_ns()
+timex = (stop - start_time) / 1000000000  # nanoseconds -> seconds
+print(timex)
+# The context manager guarantees the output file is flushed and closed.
+with open('temp', 'w') as f:
+    f.writelines(line + '\n' for line in filex)