concordia-aligner/sentence_lemmatizer.py

41 lines
1.1 KiB
Python
Raw Normal View History

2019-06-13 12:34:19 +02:00
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import requests
import sys
BUFFER_SIZE = 500
def lemmatize_sentences(language_code, sentences):
    """Lemmatize a batch of sentences via the concordia-preprocessor HTTP service.

    :param language_code: language identifier understood by the service
    :param sentences: list of raw sentence strings to lemmatize
    :return: the service's ``processed_sentences`` list (each entry is a dict;
             callers here read its ``'tokens'`` field — see ``write_result``)
    :raises requests.HTTPError: if the service answers with an error status
    :raises requests.Timeout: if the service does not answer within 120 s
    """
    data = {
        'language': language_code,
        'sentences': sentences,
    }
    response = requests.post(url='http://concordia-preprocessor:9001/lemmatize',
                             json=data, timeout=120)
    # Fail loudly on HTTP errors instead of crashing later with a confusing
    # KeyError/JSONDecodeError while parsing an error page's body.
    response.raise_for_status()
    # response.json() decodes using the charset declared by the server,
    # unlike json.loads(response.text) with a guessed encoding.
    return response.json()['processed_sentences']
2019-06-13 12:34:19 +02:00
2019-06-26 09:08:00 +02:00
def write_result(result, lem_file):
    """Append one line per processed sentence (its 'tokens' string) to lem_file."""
    lem_file.writelines(entry['tokens'] + '\n' for entry in result)
2019-06-13 12:34:19 +02:00
# CLI: sentence_lemmatizer.py <input_file> <language_code> <lemmatized_output>
# Streams the input file, lemmatizing sentences in batches of BUFFER_SIZE
# to keep requests to the preprocessor service reasonably sized.
input_path = sys.argv[1]
language = sys.argv[2]
lem_output_path = sys.argv[3]

batch = []

with open(input_path) as source, open(lem_output_path, 'w') as lem_out:
    for raw_line in source:
        # Strip the trailing newline (and any trailing whitespace).
        batch.append(raw_line.rstrip())
        if len(batch) == BUFFER_SIZE:
            write_result(lemmatize_sentences(language, batch), lem_out)
            batch = []
    # Flush whatever is left after the last full batch.
    if batch:
        write_result(lemmatize_sentences(language, batch), lem_out)