27 lines
612 B
Python
27 lines
612 B
Python
"""Lemmatize each line of the training corpus with WordNet and time the run.

Reads 'mt-summit-corpora/train/in.tsv' line by line, tokenizes each line with
NLTK, lemmatizes every token with ``WordNetLemmatizer``, prints the elapsed
time in seconds, and writes the lemmatized lines to a file named 'temp'.
"""
import time

import nltk
from nltk.stem import WordNetLemmatizer

# Optional NLTK resources, left disabled exactly as in the original script.
# NOTE(review): nltk.word_tokenize presumably needs the 'punkt' tokenizer data
# to be installed already - enable the download below if it is missing.
# nltk.download('omw-1.4')
# nltk.download('punkt')
nltk.download('wordnet')

wl = WordNetLemmatizer()

# Use a monotonic, high-resolution clock so the measured interval cannot be
# skewed by system clock adjustments during a long run.
start_time = time.perf_counter_ns()
filex = []
# NOTE(review): no explicit encoding is given, so the locale default is used;
# consider encoding='utf-8' once the corpus encoding is confirmed.
with open('mt-summit-corpora/train/in.tsv', 'r') as file:
    for line in file:
        # Progress indicator: overwrite the same console line every 50000
        # processed lines (also fires once at 0, before the first line).
        if len(filex) % 50000 == 0:
            print(len(filex), end='\r')
        tokens = nltk.word_tokenize(line)
        filex.append(' '.join([wl.lemmatize(x) for x in tokens]))

stop = time.perf_counter_ns()
# Convert nanoseconds to seconds; only tokenization + lemmatization is timed,
# not the output write below.
timex = (stop - start_time) / 1000000000
print(timex)

# The context manager guarantees the output is flushed and the handle closed,
# even if a write fails part-way through.
with open('temp', 'w') as f:
    f.writelines(line + '\n' for line in filex)