lemmatizer

parent 021c483a2c
commit 8b9deb906f

Makefile (13 changed lines)
@@ -34,17 +34,20 @@ corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt
 
-corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.txt
-	./sentence_lemmatizer.py $< $(SRC_LANG) corpora/$(CORPUS_NAME)/src.norm corpora/$(CORPUS_NAME)/src.lem
-
-corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
-	./sentence_lemmatizer.py $< $(TRG_LANG) corpora/$(CORPUS_NAME)/trg.norm corpora/$(CORPUS_NAME)/trg.lem
-
 corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict
 	./prepare_corpus.py corpora/$(CORPUS_NAME)/src.norm corpora/$(CORPUS_NAME)/trg.norm corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG)
 
+corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok
+	./sentence_lemmatizer.py $< $(SRC_LANG) $@
+
+corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.tok
+	./sentence_lemmatizer.py $< $(TRG_LANG) $@
+
+corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
+	/usr/local/bin/concordia-sentence-tokenizer -c /code/prod/resources/concordia-config/concordia.cfg < $< > $@
 
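For orientation only: after this change each corpus side is tokenized first and the lemmatizer writes a single output file (tokenized input, language code, lemmatized output). A minimal Python sketch of the equivalent invocation, where the corpus name and the language codes are placeholders and not values taken from this repository:

import subprocess

# Mirrors the new Makefile recipes; "example_corpus", "pl" and "en" are assumptions.
corpus = "corpora/example_corpus"
for side, lang in (("src", "pl"), ("trg", "en")):
    subprocess.run(
        ["./sentence_lemmatizer.py", f"{corpus}/{side}.tok", lang, f"{corpus}/{side}.lem"],
        check=True,
    )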
sentence_lemmatizer.py
@@ -9,40 +9,32 @@ BUFFER_SIZE = 500
 
 
 def lemmatize_sentences(language_code, sentences):
     data = {
         'lemmatize': True,
         'language':language_code,
         'sentences':sentences
     }
-    response = requests.post(url = 'http://concordia-preprocessor:9001/preprocess', json = data, timeout = 120)
+    response = requests.post(url = 'http://concordia-preprocessor:9001/lemmatize', json = data, timeout = 120)
     response_json = json.loads(response.text)
-
-    result = {'normalized':[], 'lemmatized':[]}
-    for processed_sentence in response_json['processed_sentences']:
-        result['normalized'].append(processed_sentence['normalized'])
-        result['lemmatized'].append(processed_sentence['tokens'])
-    return result
+    return response_json['processed_sentences']
 
-def write_result(result, norm_file, lem_file):
-    for s in result['normalized']:
-        norm_file.write(s+'\n')
-    for s in result['lemmatized']:
-        lem_file.write(s+'\n')
+def write_result(result, lem_file):
+    for s in result:
+        lem_file.write(s['tokens']+'\n')
 
 
 file_name = sys.argv[1]
 language_code = sys.argv[2]
-norm_output_name = sys.argv[3]
-lem_output_name = sys.argv[4]
+lem_output_name = sys.argv[3]
 
 sentences_buffer = []
-with open(file_name) as in_file, open(norm_output_name, 'w') as out_norm, open(lem_output_name, 'w') as out_lem:
+with open(file_name) as in_file, open(lem_output_name, 'w') as out_lem:
     for line in in_file:
         sentences_buffer.append(line.rstrip())
         if len(sentences_buffer) == BUFFER_SIZE:
-            write_result(lemmatize_sentences(language_code,sentences_buffer), out_norm, out_lem)
+            write_result(lemmatize_sentences(language_code,sentences_buffer), out_lem)
             sentences_buffer = []
 
     if len(sentences_buffer) > 0:
-        write_result(lemmatize_sentences(language_code,sentences_buffer), out_norm, out_lem)
+        write_result(lemmatize_sentences(language_code,sentences_buffer), out_lem)
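As a rough illustration of the request/response shape the updated lemmatize_sentences() relies on: the endpoint and field names come from the code above, while the sentence and token values below are invented for the example.

# Payload built by lemmatize_sentences(); 'pl' is an example language code (assumption).
request_payload = {
    'lemmatize': True,
    'language': 'pl',
    'sentences': ['Ala ma kota .'],
}
# POSTed to http://concordia-preprocessor:9001/lemmatize; the script then expects a JSON
# body whose 'processed_sentences' entries each carry a 'tokens' string.
example_response = {
    'processed_sentences': [
        {'tokens': 'Ala mieć kot .'},
    ],
}
for s in example_response['processed_sentences']:
    print(s['tokens'])  # write_result() writes exactly this string, one line per sentence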