modified aligner

parent 4a21673352
commit e00f27d62f

fast-aligner/.gitignore (vendored)
@@ -1 +1,2 @@
 corpora/
+fast_align
fast-aligner/Makefile
@@ -30,16 +30,16 @@ corpora/$(CORPUS_NAME)/trg.dict:
 	./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@
 
-corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.txt
-	/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(SRC_LANG) > $@
+corpora/$(CORPUS_NAME)/src.norm corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.txt
+	./sentence_lemmatizer.py $< $(SRC_LANG) corpora/$(CORPUS_NAME)/src.norm corpora/$(CORPUS_NAME)/src.lem
 
-corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
-	/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(TRG_LANG) > $@
+corpora/$(CORPUS_NAME)/trg.norm corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
+	./sentence_lemmatizer.py $< $(TRG_LANG) corpora/$(CORPUS_NAME)/trg.norm corpora/$(CORPUS_NAME)/trg.lem
 
 corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict
-	./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG)
+	./prepare_corpus.py corpora/$(CORPUS_NAME)/src.norm corpora/$(CORPUS_NAME)/trg.norm corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG)
 
 corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt
-	fast_align -i $< -d -o -v > $@
+	./fast_align -i $< -d -o -v > $@
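
The lemmatizer is no longer fed tokenized text on stdin; it now receives the raw corpus file and explicit output paths, producing the normalized and lemmatized corpora in one pass. A minimal Python sketch of what the new recipe runs; the concrete corpus paths and the 'en' language code are illustrative assumptions, not taken from the repo:

    import subprocess

    # Rough equivalent of the new src.norm/src.lem recipe.
    subprocess.run(
        [
            './sentence_lemmatizer.py',
            'corpora/example/src.txt',   # $<  : raw source corpus (hypothetical path)
            'en',                        # $(SRC_LANG) (hypothetical value)
            'corpora/example/src.norm',  # normalized sentences, one per line
            'corpora/example/src.lem',   # lemmatized sentences, one per line
        ],
        check=True,
    )
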
fast-aligner/sentence_lemmatizer.py
@@ -9,29 +9,41 @@ BUFFER_SIZE = 500
 
 def lemmatize_sentences(language_code, sentences):
     data = {
-        'operation': 'lemmatizeAll',
-        'languageCode':language_code,
+        'lemmatize': True,
+        'language':language_code,
         'sentences':sentences
     }
-    address = 'http://localhost:8800'
-
-    response = requests.post(address, data = json.dumps(data))
-    response.encoding = 'utf-8'
-
+    response = requests.post(url = 'http://127.0.0.1:10002/preprocess', json = data)
     response_json = json.loads(response.text)
-    return '\n'.join(response_json['lemmatizedSentences'])
+    result = {'normalized':[], 'lemmatized':[]}
+    print(response_json)
+    for processed_sentence in response_json['processed_sentences']:
+        result['normalized'].append(processed_sentence['normalized'])
+        result['lemmatized'].append(processed_sentence['tokens'])
+    return result
+
+def write_result(result, norm_file, lem_file):
+    for s in result['normalized']:
+        norm_file.write(s+'\n')
+    for s in result['lemmatized']:
+        lem_file.write(s+'\n')
 
-language_code = sys.argv[1]
+file_name = sys.argv[1]
+language_code = sys.argv[2]
+norm_output_name = sys.argv[3]
+lem_output_name = sys.argv[4]
 
 sentences_buffer = []
-for line in sys.stdin:
+with open(file_name) as in_file, open(norm_output_name, 'w') as out_norm, open(lem_output_name, 'w') as out_lem:
+    for line in in_file:
         sentences_buffer.append(line.rstrip())
         if len(sentences_buffer) == BUFFER_SIZE:
-            print(lemmatize_sentences(language_code,sentences_buffer))
+            write_result(lemmatize_sentences(language_code,sentences_buffer), out_norm, out_lem)
             sentences_buffer = []
 
-if len(sentences_buffer) > 0:
-    print(lemmatize_sentences(language_code,sentences_buffer))
+    if len(sentences_buffer) > 0:
+        write_result(lemmatize_sentences(language_code,sentences_buffer), out_norm, out_lem)
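
The rewritten client assumes the preprocess service returns one entry per input sentence, each carrying a normalized form and a lemmatized/tokenized form. A minimal round-trip sketch: the endpoint and payload keys are taken from the diff above, while the language code, the example sentence, and the commented response values are assumptions inferred from the parsing loop rather than from service documentation:

    import requests

    data = {
        'lemmatize': True,
        'language': 'en',                         # assumed example language code
        'sentences': ['The cats are sleeping.'],  # assumed example input
    }
    response = requests.post(url='http://127.0.0.1:10002/preprocess', json=data)
    response_json = response.json()

    # Assumed response shape, inferred from the new parsing code:
    # {'processed_sentences': [{'normalized': '...', 'tokens': '...'}]}
    for processed_sentence in response_json['processed_sentences']:
        print(processed_sentence['normalized'], '=>', processed_sentence['tokens'])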