37 lines
869 B
Python
37 lines
869 B
Python
|
#!/usr/bin/python3
|
||
|
|
||
|
import unittest
|
||
|
import json
|
||
|
import requests
|
||
|
import sys
|
||
|
|
||
|
|
||
|
|
||
|
def lemmatizeSentence(lang, sentence, address='http://localhost:8800', timeout=None):
    """Send *sentence* to the lemmatization HTTP service and return the result.

    The request is a POST whose body is a JSON object with the keys
    ``operation`` (always ``'lemmatize'``), ``languageCode`` and ``sentence``.

    Args:
        lang: Value sent as ``languageCode``.
        sentence: Text sent as ``sentence``.
        address: URL of the service; defaults to the local instance on
            port 8800.
        timeout: Passed through to ``requests.post``. ``None`` (the default)
            waits indefinitely.

    Returns:
        The ``lemmatizedSentence`` field of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        KeyError: If the response JSON has no ``lemmatizedSentence`` field.
    """
    payload = {
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence,
    }

    # The body is sent as a raw JSON string (not form-encoded, no explicit
    # Content-Type header).
    response = requests.post(address, data=json.dumps(payload), timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    return response.json()['lemmatizedSentence']
|
||
|
|
||
|
# Driver: lemmatize every line of a corpus file and report each line whose
# whitespace-separated token count changes after lemmatization.
#
# Command line: <corpus_file_path> <language_code>
if len(sys.argv) < 3:
    # Exit with a usage message rather than an IndexError traceback.
    sys.exit("Usage: %s <corpus_file_path> <language_code>" % sys.argv[0])

corpus_file_path = sys.argv[1]
lang = sys.argv[2]

# Stays 0 if the corpus file is empty; otherwise holds the number of the
# last line processed.
line_count = 0
# NOTE(review): assumes the corpus is UTF-8 encoded; the explicit encoding
# avoids depending on the platform locale — confirm against real corpora.
with open(corpus_file_path, encoding="utf-8") as corpus_file:
    for line_count, line in enumerate(corpus_file, start=1):
        orig = line.rstrip()
        lemmatized = lemmatizeSentence(lang, orig)
        # Compare token counts of the original and lemmatized sentence.
        if len(orig.split()) != len(lemmatized.split()):
            print("Different length in:")
            print(orig)
            print(lemmatized)
        # Progress goes to stderr so it does not mix with the report on stdout.
        if line_count % 1000 == 0:
            sys.stderr.write("Done %d lines\n" % line_count)
|