49 lines
1.5 KiB
Python
49 lines
1.5 KiB
Python
|
#!/usr/bin/python3
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
import json
|
||
|
import requests
|
||
|
import sys
|
||
|
|
||
|
BUFFER_SIZE = 500
|
||
|
|
||
|
def lemmatize_sentences(language_code, sentences):
    """Send a batch of sentences to the local preprocessing service.

    Posts ``sentences`` together with ``language_code`` and the
    ``lemmatize`` flag to the ``/preprocess`` endpoint on
    ``127.0.0.1:10002`` and collects the per-sentence results.

    Args:
        language_code: Value forwarded to the service as ``language``.
        sentences: List of sentences forwarded as ``sentences``.

    Returns:
        A dict with two parallel lists: ``'normalized'`` holds the
        ``normalized`` field and ``'lemmatized'`` holds the ``tokens``
        field of every entry in the response's ``processed_sentences``,
        in the order the service returned them.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        KeyError: If the response lacks one of the expected fields.
    """
    payload = {
        'lemmatize': True,
        'language': language_code,
        'sentences': sentences,
    }
    response = requests.post(
        url='http://127.0.0.1:10002/preprocess',
        json=payload,
    )
    # Fail with a clear HTTP error instead of a confusing KeyError or JSON
    # decode error further down when the service rejects the request.
    response.raise_for_status()

    # response.json() detects the JSON encoding from the raw bytes, whereas
    # response.text depends on a charset guess when the server declares none.
    processed = response.json()['processed_sentences']

    return {
        'normalized': [item['normalized'] for item in processed],
        'lemmatized': [item['tokens'] for item in processed],
    }
|
||
|
|
||
|
def write_result(result, norm_file, lem_file):
    """Write one batch of results to the two output files, one entry per line.

    Every item of ``result['normalized']`` is written to ``norm_file`` and
    then every item of ``result['lemmatized']`` is written to ``lem_file``;
    each item is followed by a newline character.

    Args:
        result: Dict with the keys ``'normalized'`` and ``'lemmatized'``,
            each mapping to an iterable of strings.
        norm_file: Writable text file object for the normalized entries.
        lem_file: Writable text file object for the lemmatized entries.
    """
    # Normalized output is handled first, then lemmatized.
    destinations = ((norm_file, 'normalized'), (lem_file, 'lemmatized'))
    for out_file, key in destinations:
        for entry in result[key]:
            out_file.write(entry + '\n')
|
||
|
|
||
|
|
||
|
# Script entry: read sentences line by line from the input file, send them
# to the preprocessing service in batches of BUFFER_SIZE, and append the
# normalized and lemmatized results to the two output files.
if len(sys.argv) < 5:
    # Report missing arguments with a usage message instead of an IndexError.
    sys.exit(
        'usage: {} <input_file> <language_code> '
        '<normalized_output> <lemmatized_output>'.format(sys.argv[0])
    )

file_name = sys.argv[1]
language_code = sys.argv[2]
norm_output_name = sys.argv[3]
lem_output_name = sys.argv[4]

sentences_buffer = []
# Explicit UTF-8 keeps the text handling independent of the platform's
# locale default, which would otherwise corrupt non-ASCII sentences.
with open(file_name, encoding='utf-8') as in_file, \
        open(norm_output_name, 'w', encoding='utf-8') as out_norm, \
        open(lem_output_name, 'w', encoding='utf-8') as out_lem:
    for line in in_file:
        sentences_buffer.append(line.rstrip())
        if len(sentences_buffer) == BUFFER_SIZE:
            write_result(
                lemmatize_sentences(language_code, sentences_buffer),
                out_norm,
                out_lem,
            )
            sentences_buffer = []

    # Flush the final, partially filled batch while the outputs are open.
    if sentences_buffer:
        write_result(
            lemmatize_sentences(language_code, sentences_buffer),
            out_norm,
            out_lem,
        )
|
||
|
|
||
|
|