transfix-mt/scripts/lemmatize_in.py

26 lines
875 B
Python
Raw Normal View History

2022-01-23 16:01:44 +01:00
import nltk
2022-01-23 16:39:11 +01:00
import os
2022-01-23 16:01:44 +01:00
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance shared by the whole script; it is applied to
# every token of every input line further down.
wl = WordNetLemmatizer()
2022-01-23 16:39:11 +01:00
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
2022-01-23 16:01:44 +01:00
2022-01-23 16:39:11 +01:00
# Input and expected-output files of the dev-0 split, located under the
# current user's home directory. NOTE: the variables keep their `train_`
# names even though they point at dev-0 (the train paths are commented out).
_home_dir = os.path.expanduser('~')
train_in_path = os.path.join(_home_dir, 'mt-summit-corpora/dev-0/in.tsv')
train_expected_path = os.path.join(_home_dir, 'mt-summit-corpora/dev-0/expected.tsv')
2022-01-23 16:01:44 +01:00
# Lemmatize the input file line by line and write the result next to it as
# '<input path>.lemmatized', one output line per input line.
#
# Both files are opened with an explicit UTF-8 encoding so the script does
# not depend on the platform's locale default (assumes the corpus is UTF-8
# -- TODO confirm against the data set).
file_lemmatized = []
with open(train_in_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Progress indicator every 50000 lines; '\r' makes each update
        # overwrite the previous one on the same terminal line.
        if len(file_lemmatized) % 50000 == 0:
            print(f'lemmatizing file: {train_in_path}: {len(file_lemmatized)}', end='\r')
        # Tokenize, lemmatize each token (no POS tag is passed, so the
        # lemmatizer's default is used), and re-join with single spaces.
        tokens = nltk.word_tokenize(line)
        file_lemmatized.append(' '.join(wl.lemmatize(token) for token in tokens))

# The output is written only after the whole input has been processed, so a
# failure while reading or lemmatizing never leaves a partial output file.
with open(train_in_path + '.lemmatized', 'w', encoding='utf-8') as file_write:
    file_write.writelines(lemmatized + '\n' for lemmatized in file_lemmatized)