diff --git a/fast-aligner/Makefile b/fast-aligner/Makefile index 8d7ed2a..4f2e9e1 100644 --- a/fast-aligner/Makefile +++ b/fast-aligner/Makefile @@ -1,6 +1,6 @@ SRC_LANG=pl TRG_LANG=en -CORPUS_NAME=opensubtitles +CORPUS_NAME=opensubtitles_sample SEPARATOR=@\#@ DICTIONARY_WEIGHT=3 @@ -9,6 +9,7 @@ all: corpora/$(CORPUS_NAME)/alignments.txt corpora/$(CORPUS_NAME)/src_clean.txt clean: + rm -f corpora/$(CORPUS_NAME)/*.norm rm -f corpora/$(CORPUS_NAME)/*.lem rm -f corpora/$(CORPUS_NAME)/*.dict rm -f corpora/$(CORPUS_NAME)/src_clean.txt diff --git a/fast-aligner/sentence_lemmatizer.py b/fast-aligner/sentence_lemmatizer.py index 2e506a8..33abd1d 100755 --- a/fast-aligner/sentence_lemmatizer.py +++ b/fast-aligner/sentence_lemmatizer.py @@ -17,7 +17,6 @@ def lemmatize_sentences(language_code, sentences): response_json = json.loads(response.text) result = {'normalized':[], 'lemmatized':[]} - print(response_json) for processed_sentence in response_json['processed_sentences']: result['normalized'].append(processed_sentence['normalized']) result['lemmatized'].append(processed_sentence['tokens']) @@ -33,7 +32,7 @@ def write_result(result, norm_file, lem_file): file_name = sys.argv[1] language_code = sys.argv[2] norm_output_name = sys.argv[3] -lem_output_name = sys.argv[3] +lem_output_name = sys.argv[4] sentences_buffer = [] with open(file_name) as in_file, open(norm_output_name, 'w') as out_norm, open(lem_output_name, 'w') as out_lem: