import nltk
import os
import pandas as pd
import sys
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')  # nltk.word_tokenize below needs the Punkt model ('punkt_tab' on newer NLTK versions)


def read_arguments():
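    """Return the glossary path given on the command line; exit if it is missing."""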
    try:
        # sys.argv[0] is the script itself; the glossary path is the first real argument
        glossary_arg_path = sys.argv[1]
        return glossary_arg_path
    except IndexError:
        print("ERROR: Wrong argument.")
        sys.exit(1)
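
# Usage sketch (the script filename below is an assumption, not given in the source):
#   python lemmatize_glossary.py glossaries/glossary.tsv
# The argument is resolved relative to the user's home directory.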
wl = WordNetLemmatizer()
glossary_path = os.path.join(os.path.expanduser('~'), read_arguments())
glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
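# Expected input: a headerless two-column TSV, one source/result pair per line, e.g.
#   dogs<TAB>Hunde   (illustrative row, not taken from the source)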
source_lemmatized = []
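# Tokenize each source term and lemmatize it token by token,
# e.g. "dogs barking" -> "dog barking" (WordNetLemmatizer defaults to noun POS).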
for term in glossary['source']:
    tokens = nltk.word_tokenize(term)
    source_lemmatized.append(' '.join(wl.lemmatize(t) for t in tokens))

glossary['source_lem'] = source_lemmatized
glossary = glossary[['source', 'source_lem', 'result']]
# set_index returns a new DataFrame, so assign the result (the bare call was a no-op)
glossary = glossary.set_index('source_lem')
# Write the lemmatized table next to the input; keep the index so 'source_lem' is retained
glossary.to_csv(glossary_path + '.lemmatized', sep='\t')
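# The output can be read back with, e.g.:
#   pd.read_csv(glossary_path + '.lemmatized', sep='\t', index_col='source_lem')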