2019-06-26 12:17:07 +02:00
|
|
|
from flask import request
|
|
|
|
from flask_restful import Resource
|
|
|
|
|
|
|
|
|
|
|
|
import requests, json, re, pickle
|
|
|
|
|
|
|
|
class LemmatizerResource(Resource):
    """REST resource that lemmatizes whitespace-separated sentences.

    The resource is constructed with two keyword arguments:

    * ``lemmatizer`` -- object whose ``stem(word)`` method is used for
      English (``'en'``) words.
    * ``dictionary`` -- mapping whose ``get(word)`` method is used to look
      up the lemma of Polish (``'pl'``) words.
    """

    # A tuple (not a set) so that membership is tested with ``==`` and an
    # unhashable JSON value (list/dict) sent as "language" cannot raise.
    SUPPORTED_LANGUAGES = ('pl', 'en')

    # Text types accepted as a sentence.  ``type(u'')`` is ``str`` on
    # Python 3 and ``unicode`` on Python 2, so the check works on either.
    _STRING_TYPES = (str, type(u''))

    def __init__(self, **kwargs):
        """Store the collaborators passed in as keyword arguments.

        Raises:
            KeyError: if ``lemmatizer`` or ``dictionary`` is not supplied.
        """
        self.lemmatizer = kwargs['lemmatizer']
        self.dictionary = kwargs['dictionary']

    def post(self):
        """Lemmatize the sentences sent in the JSON request body.

        The body must be a JSON object with the keys ``language``
        (``'pl'`` or ``'en'``) and ``sentences`` (a list of strings).

        Returns:
            A ``(payload, status)`` tuple.  On success the payload is
            ``{'processed_sentences': [...]}`` with one entry per input
            sentence (see ``processSentence``) and the status is 200.
            On invalid input the payload is ``{'error': message}`` and
            the status is 400.
        """
        json_data = request.get_json(force=True)

        # A body such as ``null`` or ``42`` is valid JSON but cannot hold
        # the required keys; reject it instead of failing on the lookups.
        if not isinstance(json_data, dict):
            return {'error': 'Request body must be a JSON object'}, 400

        if 'language' not in json_data:
            return {'error': 'Missing parameter: language'}, 400
        if 'sentences' not in json_data:
            return {'error': 'Missing parameter: sentences'}, 400

        language = json_data['language']
        if language not in self.SUPPORTED_LANGUAGES:
            return {'error': 'Unsupported language: %s' % (language,)}, 400

        raw_sentences = json_data['sentences']
        # A bare string would otherwise be iterated character by character,
        # and a non-string element would fail on ``.split()``.
        if not isinstance(raw_sentences, list) or not all(
                isinstance(item, self._STRING_TYPES) for item in raw_sentences):
            return {
                'error': 'Invalid parameter: sentences must be a list of strings'
            }, 400

        processed = [
            self.processSentence(sentence, language)
            for sentence in raw_sentences
        ]
        return {'processed_sentences': processed}, 200

    def processSentence(self, sentence, language):
        """Lemmatize every whitespace-separated token of ``sentence``.

        Args:
            sentence: text to process; split with ``str.split()``.
            language: language code passed on to ``lemmatizeWord``.

        Returns:
            A dict with ``'tokens'`` (the lemmatized tokens joined by a
            single space) and ``'isFirstLemmatized'`` (the result of
            ``isFirstLemmatized`` for the raw tokens).
        """
        raw_tokens = sentence.split()
        lemmas = [self.lemmatizeWord(token, language) for token in raw_tokens]
        return {
            'tokens': ' '.join(lemmas),
            'isFirstLemmatized': self.isFirstLemmatized(raw_tokens, language),
        }

    def isFirstLemmatized(self, raw_tokens, language):
        """Report whether lemmatization leaves the first token unchanged.

        Returns ``False`` only when ``language`` is ``'pl'``, there is at
        least one token, and ``lemmatizeWord`` maps the first token to a
        different string.  In every other case (other language, empty
        token list, unchanged first token) it returns ``True``.

        NOTE(review): the name suggests the opposite polarity; the logic
        is kept as-is because callers may rely on it -- confirm intent.
        """
        if language == 'pl' and len(raw_tokens) > 0:
            first_token = raw_tokens[0]
            if self.lemmatizeWord(first_token, language) != first_token:
                return False
        return True

    def lemmatizeWord(self, word, language):
        """Return the lemma of ``word`` for ``language``.

        Single-character words are returned unchanged.  Polish words are
        looked up in ``self.dictionary``; English words are passed to
        ``self.lemmatizer.stem``.  Whenever no non-empty lemma is obtained
        (unknown word, empty result, or a language other than ``'pl'`` /
        ``'en'``) the original word is returned, so the result is always
        a usable string.
        """
        if len(word) == 1:
            return word

        if language == 'pl':
            lemma = self.dictionary.get(word)
        elif language == 'en':
            lemma = self.lemmatizer.stem(word)
        else:
            # Unsupported language: fall through to the word itself rather
            # than returning None, which would break ' '.join() upstream.
            lemma = None

        return lemma if lemma else word
|
|
|
|
|