2022-01-23 16:01:44 +01:00
|
|
|
import nltk
|
2022-01-23 16:39:11 +01:00
|
|
|
import os
|
2022-01-24 14:50:52 +01:00
|
|
|
import sys
|
2022-01-23 16:39:11 +01:00
|
|
|
|
2022-01-23 16:01:44 +01:00
|
|
|
from nltk.stem import WordNetLemmatizer
|
|
|
|
|
|
|
|
|
2022-01-24 14:50:52 +01:00
|
|
|
def read_arguments():
    """Return the two positional command-line arguments.

    ``sys.argv[0]`` holds the script path, so it is skipped and only the
    remaining entries are unpacked.

    Returns:
        A ``(path_in, path_expected)`` tuple of strings taken from
        ``sys.argv[1]`` and ``sys.argv[2]``.

    Exits:
        Prints ``ERROR: Wrong argument.`` and calls ``sys.exit(1)`` unless
        exactly two arguments were supplied.
    """
    try:
        # Unpacking raises ValueError for any argument count other than two.
        path_in, path_expected = sys.argv[1:]
    except ValueError:
        print("ERROR: Wrong argument.")
        sys.exit(1)
    return path_in, path_expected
|
|
|
|
|
|
|
|
|
2022-01-23 16:01:44 +01:00
|
|
|
# Lemmatizer instance shared by the rest of the script.
wl = WordNetLemmatizer()

# Both command-line paths are joined onto the current user's home directory.
in_arg_path, expected_arg_path = read_arguments()
home_dir = os.path.expanduser('~')
train_in_path = os.path.join(home_dir, in_arg_path)
train_expected_path = os.path.join(home_dir, expected_arg_path)
|
2022-01-23 16:01:44 +01:00
|
|
|
|
|
|
|
# Lemmatize the input file line by line, keeping every result in memory.
file_lemmatized = []
# Explicit UTF-8 so decoding does not depend on the platform's locale default.
with open(train_in_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Progress is redrawn in place (carriage return) every 1000 lines.
        if len(file_lemmatized) % 1000 == 0:
            print(f'lemmatizing file: {train_in_path}: {len(file_lemmatized)}', end='\r')
        tokens = nltk.word_tokenize(line)
        file_lemmatized.append(' '.join(wl.lemmatize(token) for token in tokens))
# Move past the in-place progress line once the whole file has been read.
print('\n')

# Write one lemmatized line per input line to "<input path>.lemmatized",
# using the same encoding that was used for reading.
with open(train_in_path + '.lemmatized', 'w', encoding='utf-8') as file_write:
    file_write.writelines(line + '\n' for line in file_lemmatized)
|