"""Lemmatize a tokenized copy of the training corpus and report the elapsed time.

Reads 'mt-summit-corpora/train/in.tsv' line by line, tokenizes each line with
NLTK, lemmatizes every token with WordNet, and writes the space-joined result
to the file 'temp' (one output line per input line). The time spent on
tokenizing and lemmatizing is printed in seconds.
"""
import time
import nltk
from nltk.stem import WordNetLemmatizer

# Presumably also required on a machine without these NLTK resources
# (word_tokenize relies on tokenizer models) -- confirm and enable if needed.
# nltk.download('omw-1.4')
# nltk.download('punkt')
nltk.download('wordnet')

wl = WordNetLemmatizer()

# The timed region covers reading, tokenizing and lemmatizing only; the
# resource download above and the output write below are excluded.
start_time = time.time_ns()
filex = []
# NOTE(review): both files use the platform default encoding, as before;
# consider an explicit encoding once the corpus encoding is confirmed.
with open('mt-summit-corpora/train/in.tsv', 'r') as corpus:
    for raw_line in corpus:
        # Lightweight progress indicator, overwritten in place via '\r'.
        if len(filex) % 50000 == 0:
            print(len(filex), end='\r')
        tokens = nltk.word_tokenize(raw_line)
        filex.append(' '.join([wl.lemmatize(token) for token in tokens]))

stop = time.time_ns()
timex = (stop - start_time) / 1000000000  # nanoseconds -> seconds
print(timex)

# The context manager guarantees the output is flushed and the handle closed,
# which the previous bare open() never did.
with open('temp', 'w') as f:
    f.writelines(lemmatized + '\n' for lemmatized in filex)