transfix-mt/random-scripts/test.py

27 lines
612 B
Python
Raw Normal View History

2022-01-22 00:04:56 +01:00
import time
import nltk
from nltk.stem import WordNetLemmatizer
# Lemmatization timing script: reads the training corpus line by line,
# tokenizes and lemmatizes every line with NLTK, prints how long that took
# (in seconds), and finally writes the lemmatized lines to the file 'temp'.

# Extra NLTK resources, left disabled as in the original script.
# NOTE(review): presumably these were already installed locally; on a fresh
# machine nltk.word_tokenize needs tokenizer data -- confirm and enable.
# nltk.download('omw-1.4')
# nltk.download('punkt')
nltk.download('wordnet')
wl = WordNetLemmatizer()

# Only the read + tokenize + lemmatize pass is timed; the download above and
# the output write below are outside the measured interval.
# perf_counter_ns() is a monotonic clock, so the elapsed value cannot be
# skewed by system clock adjustments the way time_ns() deltas can.
start_time = time.perf_counter_ns()
filex = []
# NOTE(review): no explicit encoding is passed, so the platform default is
# used for both files -- confirm it matches the corpus encoding.
with open('mt-summit-corpora/train/in.tsv', 'r') as file:
    for line in file:
        # Progress indicator: every 50000 processed lines, print the running
        # count and return the cursor so the next update overwrites it.
        if len(filex) % 50000 == 0:
            print(len(filex), end='\r')
        tokens = nltk.word_tokenize(line)
        # Re-join the lemmatized tokens with single spaces; the original
        # spacing and the trailing newline of the input line are not kept.
        filex.append(' '.join([wl.lemmatize(token) for token in tokens]))
stop = time.perf_counter_ns()

# Convert nanoseconds to seconds.
timex = (stop - start_time) / 1_000_000_000
print(timex)

# The context manager guarantees the output file is flushed and closed,
# even if a write fails part-way through.
with open('temp', 'w') as f:
    f.writelines(lemmatized + '\n' for lemmatized in filex)