concordia-server/tests/lemmatizer-test/test_corpus.py

37 lines
869 B
Python
Raw Permalink Normal View History

2018-12-31 11:13:16 +01:00
#!/usr/bin/python3
import unittest
import json
import requests
import sys
def lemmatizeSentence(lang, sentence, address='http://localhost:8800', timeout=None):
    """Request the lemmatized form of a sentence from the lemmatizer service.

    Sends a JSON-encoded 'lemmatize' operation via HTTP POST and returns the
    'lemmatizedSentence' field of the JSON reply.

    Args:
        lang: Value sent as 'languageCode'.
        sentence: Value sent as 'sentence'.
        address: URL the request is posted to. Defaults to the address that
            used to be hard-coded in this function.
        timeout: Passed through to requests.post. The default of None keeps
            the previous behaviour of waiting without a time limit.

    Returns:
        The value stored under 'lemmatizedSentence' in the response JSON.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        KeyError: If the reply has no 'lemmatizedSentence' field.
    """
    data = {
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence,
    }
    response = requests.post(address, data=json.dumps(data), timeout=timeout)
    # Surface HTTP-level failures directly instead of letting them show up
    # later as an unrelated JSON decoding error or KeyError.
    response.raise_for_status()
    return response.json()['lemmatizedSentence']
# Driver: lemmatize every line of a corpus file and print the lines whose
# whitespace-separated token count differs between original and lemmatized
# text. Progress is reported on stderr every 1000 lines.
# Usage: <script> <corpus_file_path> <language_code>
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("Usage: %s <corpus_file_path> <language_code>" % sys.argv[0])
corpus_file_path = sys.argv[1]
lang = sys.argv[2]
line_count = 0
# Decode explicitly so the result does not depend on the platform locale.
# NOTE(review): assumes the corpus is UTF-8 encoded -- confirm.
with open(corpus_file_path, encoding='utf-8') as corpus_file:
    for line_count, line in enumerate(corpus_file, start=1):
        orig = line.rstrip()
        lemmatized = lemmatizeSentence(lang, orig)
        # Compare token counts only; the token contents are not inspected.
        if len(orig.split()) != len(lemmatized.split()):
            print("Different length in:")
            print(orig)
            print(lemmatized)
        if line_count % 1000 == 0:
            sys.stderr.write("Done %d lines\n" % line_count)