#!/usr/bin/python3
# -*- coding: utf-8 -*-

import requests
import sys

# Number of sentences sent to the preprocessing service in each request.
BUFFER_SIZE = 500


def lemmatize_sentences(language_code, sentences):
    """Send a batch of sentences to the local preprocessing service and
    collect the normalized and lemmatized forms from its reply."""
    data = {
        'lemmatize': True,
        'language': language_code,
        'sentences': sentences,
    }
    response = requests.post(url='http://127.0.0.1:10002/preprocess', json=data)
    response.raise_for_status()  # fail loudly if the service returned an error
    response_json = response.json()
    result = {'normalized': [], 'lemmatized': []}
    for processed_sentence in response_json['processed_sentences']:
        result['normalized'].append(processed_sentence['normalized'])
        # 'tokens' is presumably a list of lemmas; join it into one
        # whitespace-separated line so it can be written as plain text.
        tokens = processed_sentence['tokens']
        if isinstance(tokens, list):
            tokens = ' '.join(tokens)
        result['lemmatized'].append(tokens)
    return result


def write_result(result, norm_file, lem_file):
    """Append one normalized and one lemmatized line per input sentence."""
    for s in result['normalized']:
        norm_file.write(s + '\n')
    for s in result['lemmatized']:
        lem_file.write(s + '\n')


if len(sys.argv) != 5:
    sys.exit(f'usage: {sys.argv[0]} <input_file> <language_code> '
             '<norm_output> <lem_output>')

file_name = sys.argv[1]
language_code = sys.argv[2]
norm_output_name = sys.argv[3]
lem_output_name = sys.argv[4]

sentences_buffer = []
with open(file_name) as in_file, \
        open(norm_output_name, 'w') as out_norm, \
        open(lem_output_name, 'w') as out_lem:
    for line in in_file:
        sentences_buffer.append(line.rstrip())
        # Flush the buffer once it holds BUFFER_SIZE sentences.
        if len(sentences_buffer) == BUFFER_SIZE:
            write_result(lemmatize_sentences(language_code, sentences_buffer),
                         out_norm, out_lem)
            sentences_buffer = []
    # Process whatever is left over after the last full buffer.
    if sentences_buffer:
        write_result(lemmatize_sentences(language_code, sentences_buffer),
                     out_norm, out_lem)
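
# A note on the service contract: the request fields ('lemmatize',
# 'language', 'sentences') and the reply keys ('processed_sentences',
# 'normalized', 'tokens') are taken from this script itself; the concrete
# values below are only an illustrative guess at what the /preprocess
# endpoint exchanges, not its documented specification.
#
# Request:
#   {"lemmatize": true, "language": "en",
#    "sentences": ["The cats sat on the mats."]}
#
# Assumed reply:
#   {"processed_sentences": [
#       {"normalized": "the cats sat on the mats .",
#        "tokens": ["the", "cat", "sit", "on", "the", "mat", "."]}]}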