adjustments

This commit is contained in:
rjawor 2019-06-12 17:42:46 +02:00
parent e00f27d62f
commit 1e6d9dfa89
2 changed files with 3 additions and 3 deletions

View File

@@ -1,6 +1,6 @@
SRC_LANG=pl SRC_LANG=pl
TRG_LANG=en TRG_LANG=en
CORPUS_NAME=opensubtitles CORPUS_NAME=opensubtitles_sample
SEPARATOR=@\#@ SEPARATOR=@\#@
DICTIONARY_WEIGHT=3 DICTIONARY_WEIGHT=3
@@ -9,6 +9,7 @@ all: corpora/$(CORPUS_NAME)/alignments.txt corpora/$(CORPUS_NAME)/src_clean.txt
clean: clean:
rm -f corpora/$(CORPUS_NAME)/*.norm
rm -f corpora/$(CORPUS_NAME)/*.lem rm -f corpora/$(CORPUS_NAME)/*.lem
rm -f corpora/$(CORPUS_NAME)/*.dict rm -f corpora/$(CORPUS_NAME)/*.dict
rm -f corpora/$(CORPUS_NAME)/src_clean.txt rm -f corpora/$(CORPUS_NAME)/src_clean.txt

View File

@@ -17,7 +17,6 @@ def lemmatize_sentences(language_code, sentences):
response_json = json.loads(response.text) response_json = json.loads(response.text)
result = {'normalized':[], 'lemmatized':[]} result = {'normalized':[], 'lemmatized':[]}
print(response_json)
for processed_sentence in response_json['processed_sentences']: for processed_sentence in response_json['processed_sentences']:
result['normalized'].append(processed_sentence['normalized']) result['normalized'].append(processed_sentence['normalized'])
result['lemmatized'].append(processed_sentence['tokens']) result['lemmatized'].append(processed_sentence['tokens'])
@@ -33,7 +32,7 @@ def write_result(result, norm_file, lem_file):
file_name = sys.argv[1] file_name = sys.argv[1]
language_code = sys.argv[2] language_code = sys.argv[2]
norm_output_name = sys.argv[3] norm_output_name = sys.argv[3]
lem_output_name = sys.argv[3] lem_output_name = sys.argv[4]
sentences_buffer = [] sentences_buffer = []
with open(file_name) as in_file, open(norm_output_name, 'w') as out_norm, open(lem_output_name, 'w') as out_lem: with open(file_name) as in_file, open(norm_output_name, 'w') as out_norm, open(lem_output_name, 'w') as out_lem: