adjustments
This commit is contained in:
parent
e00f27d62f
commit
1e6d9dfa89
@ -1,6 +1,6 @@
|
|||||||
SRC_LANG=pl
|
SRC_LANG=pl
|
||||||
TRG_LANG=en
|
TRG_LANG=en
|
||||||
CORPUS_NAME=opensubtitles
|
CORPUS_NAME=opensubtitles_sample
|
||||||
SEPARATOR=@\#@
|
SEPARATOR=@\#@
|
||||||
|
|
||||||
DICTIONARY_WEIGHT=3
|
DICTIONARY_WEIGHT=3
|
||||||
@ -9,6 +9,7 @@ all: corpora/$(CORPUS_NAME)/alignments.txt corpora/$(CORPUS_NAME)/src_clean.txt
|
|||||||
|
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
|
rm -f corpora/$(CORPUS_NAME)/*.norm
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.lem
|
rm -f corpora/$(CORPUS_NAME)/*.lem
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.dict
|
rm -f corpora/$(CORPUS_NAME)/*.dict
|
||||||
rm -f corpora/$(CORPUS_NAME)/src_clean.txt
|
rm -f corpora/$(CORPUS_NAME)/src_clean.txt
|
||||||
|
@ -17,7 +17,6 @@ def lemmatize_sentences(language_code, sentences):
|
|||||||
response_json = json.loads(response.text)
|
response_json = json.loads(response.text)
|
||||||
|
|
||||||
result = {'normalized':[], 'lemmatized':[]}
|
result = {'normalized':[], 'lemmatized':[]}
|
||||||
print(response_json)
|
|
||||||
for processed_sentence in response_json['processed_sentences']:
|
for processed_sentence in response_json['processed_sentences']:
|
||||||
result['normalized'].append(processed_sentence['normalized'])
|
result['normalized'].append(processed_sentence['normalized'])
|
||||||
result['lemmatized'].append(processed_sentence['tokens'])
|
result['lemmatized'].append(processed_sentence['tokens'])
|
||||||
@ -33,7 +32,7 @@ def write_result(result, norm_file, lem_file):
|
|||||||
file_name = sys.argv[1]
|
file_name = sys.argv[1]
|
||||||
language_code = sys.argv[2]
|
language_code = sys.argv[2]
|
||||||
norm_output_name = sys.argv[3]
|
norm_output_name = sys.argv[3]
|
||||||
lem_output_name = sys.argv[3]
|
lem_output_name = sys.argv[4]
|
||||||
|
|
||||||
sentences_buffer = []
|
sentences_buffer = []
|
||||||
with open(file_name) as in_file, open(norm_output_name, 'w') as out_norm, open(lem_output_name, 'w') as out_lem:
|
with open(file_name) as in_file, open(norm_output_name, 'w') as out_norm, open(lem_output_name, 'w') as out_lem:
|
||||||
|
Loading…
Reference in New Issue
Block a user