20 lines
607 B
Python
20 lines
607 B
Python
|
import nltk
|
||
|
import pandas as pd
|
||
|
|
||
|
from nltk.stem import WordNetLemmatizer
|
||
|
|
||
|
# Fetch the NLTK data this script depends on. WordNet backs the lemmatizer;
# nltk.word_tokenize additionally needs the Punkt models, which are shipped
# as 'punkt' in older NLTK releases and 'punkt_tab' in newer ones.
# nltk.download() reports an unknown package id and returns False instead of
# raising, so requesting both ids is safe on either release line.
for resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

wl = WordNetLemmatizer()

# Two-column, header-less TSV: source term and its target rendering.
# dtype=str + keep_default_na=False keep every cell as the literal text in
# the file; with pandas' defaults, terms such as "NA", "null" or "None"
# would be read as NaN (a float) and break tokenization below, and
# numeric-looking terms would be coerced to numbers.
glossary = pd.read_csv(
    'mt-summit-corpora/glossary.tsv',
    sep='\t',
    header=None,
    names=['source', 'result'],
    dtype=str,
    keep_default_na=False,
)

# Lemmatize each source term token by token and re-join with single spaces.
source_lemmatized = [
    ' '.join(wl.lemmatize(token) for token in nltk.word_tokenize(term))
    for term in glossary['source']
]

glossary['source_lem'] = source_lemmatized
glossary = glossary[['source', 'source_lem', 'result']]

# NOTE: the previous `glossary.set_index('source_lem')` call discarded its
# return value and therefore had no effect; it has been dropped. The file is
# written without an index so that 'source_lem' stays a regular column.
glossary.to_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t', index=False)
|