# Lemmatize a text file line by line and write the result to "<input>.lemmatized".
# Usage: python lemmatize.py <input path> <expected path>  (both relative to the home directory)
# Requires the NLTK 'punkt' tokenizer and 'wordnet' corpus:
#   nltk.download('punkt'); nltk.download('wordnet')
import os
import sys

import nltk
from nltk.stem import WordNetLemmatizer


def read_arguments():
    """Read the input and expected-output paths from the command line."""
    try:
        # sys.argv[0] is the script name; the two path arguments follow it.
        _, path_in, path_expected = sys.argv
        return path_in, path_expected
    except ValueError:
        print("ERROR: expected two arguments: <input path> <expected path>")
        sys.exit(1)


wl = WordNetLemmatizer()

in_arg_path, expected_arg_path = read_arguments()
# Both paths are interpreted relative to the user's home directory.
train_in_path = os.path.join(os.path.expanduser('~'), in_arg_path)
train_expected_path = os.path.join(os.path.expanduser('~'), expected_arg_path)

# Tokenize and lemmatize the input file line by line.
file_lemmatized = []
with open(train_in_path, 'r') as file:
    for line in file:
        if len(file_lemmatized) % 1000 == 0:
            # Progress indicator, overwritten in place every 1000 lines.
            print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
        tokens = nltk.word_tokenize(line)
        file_lemmatized.append(' '.join(wl.lemmatize(token) for token in tokens))
print()

# Write the lemmatized lines next to the original file.
with open(train_in_path + '.lemmatized', 'w') as file_write:
    for line in file_lemmatized:
        file_write.write(line + '\n')