transfix-mt/scripts/lemmatize_glossary.py

24 lines
681 B
Python
Raw Normal View History

2022-01-23 16:01:44 +01:00
import nltk
2022-01-23 16:58:40 +01:00
import os
2022-01-23 16:01:44 +01:00
import pandas as pd
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet data through NLTK's downloader and create the lemmatizer
# that is applied to every glossary term.
# NOTE(review): only 'wordnet' is downloaded here, while the script also calls
# nltk.word_tokenize, which needs NLTK tokenizer data of its own -- confirm
# that data is installed on the machine running this script.
nltk.download('wordnet')
wl = WordNetLemmatizer()

# The glossary is a headerless, tab-separated file under the user's home
# directory; its two columns are named 'source' and 'result' on load.
glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
# Read every cell as a plain string and switch off pandas' default NA-marker
# detection: otherwise terms such as "null", "NA" or "n/a" are replaced by a
# float NaN, which the tokenizer cannot process and which would be written
# back out as an empty cell.
glossary = pd.read_csv(
    glossary_path,
    sep='\t',
    header=None,
    names=['source', 'result'],
    dtype=str,
    keep_default_na=False,
)
2022-01-23 16:01:44 +01:00
# Lemmatize each source term token by token: the term is split with NLTK's
# word tokenizer, every token goes through the WordNet lemmatizer, and the
# resulting lemmas are re-joined with single spaces.
source_lemmatized = [
    ' '.join(wl.lemmatize(token) for token in nltk.word_tokenize(term))
    for term in glossary['source']
]
glossary['source_lem'] = source_lemmatized
# Place the lemmatized form between the original term and its result.
# The frame keeps its default index: 'source_lem' has to stay an ordinary
# column so that it is part of the file written with index=False.
glossary = glossary[['source', 'source_lem', 'result']]
2022-01-23 16:39:11 +01:00
# Save the augmented glossary next to the input file, under the input path
# with '.lemmatized' appended; tab-separated, without the index column.
lemmatized_path = f'{glossary_path}.lemmatized'
glossary.to_csv(lemmatized_path, index=False, sep='\t')