concordia-preprocessor/server/lemmatizer_resource.py

50 lines
1.5 KiB
Python
Raw Normal View History

2019-06-26 12:17:07 +02:00
from flask import request
from flask_restful import Resource
import requests, json, re, pickle
class LemmatizerResource(Resource):
    """REST resource that lemmatizes whitespace-tokenized sentences.

    Words in Polish ('pl') are looked up in the injected dictionary;
    words in English ('en') are passed to the injected lemmatizer's
    ``stem`` method. In both cases the original word is kept when no
    lemma is available.
    """

    # Languages accepted by ``post``.
    SUPPORTED_LANGUAGES = ('pl', 'en')

    def __init__(self, **kwargs):
        """Store the collaborators injected as keyword arguments.

        Keyword Args:
            lemmatizer: object exposing ``stem(word)``; used for 'en'.
            dictionary: mapping exposing ``get(word, default)``; used
                for 'pl'.

        Raises:
            KeyError: if either keyword argument is missing.
        """
        self.lemmatizer = kwargs['lemmatizer']
        self.dictionary = kwargs['dictionary']

    def post(self):
        """Lemmatize the sentences carried by a JSON request body.

        The body must be a JSON object holding a ``language`` ('pl' or
        'en') and a ``sentences`` list.

        Returns:
            ``({'processed_sentences': [...]}, 200)`` on success, with
            one ``processSentence`` result per input sentence, or
            ``({'error': message}, 400)`` when the request is invalid.
        """
        json_data = request.get_json(force=True)

        # Valid JSON may still be a list, string, number or null; reject
        # it here instead of crashing on the key lookups below.
        if not isinstance(json_data, dict):
            return {'error': 'Request body must be a JSON object'}, 400

        for parameter in ('language', 'sentences'):
            if parameter not in json_data:
                return {'error': 'Missing parameter: %s' % parameter}, 400

        language = json_data['language']
        if language not in self.SUPPORTED_LANGUAGES:
            return {'error': 'Unsupported language: %s' % language}, 400

        sentences = json_data['sentences']
        # A bare string would otherwise be iterated character by
        # character and yield one bogus "sentence" per character.
        if not isinstance(sentences, list):
            return {'error': 'Parameter sentences must be a list'}, 400

        result = {
            'processed_sentences': [
                self.processSentence(sentence, language)
                for sentence in sentences
            ]
        }
        return result, 200

    def processSentence(self, sentence, language):
        """Lemmatize every whitespace-separated token of ``sentence``.

        Args:
            sentence: text to process; split with ``str.split()``.
            language: language code forwarded to ``lemmatizeWord``.

        Returns:
            ``{'tokens': text}`` where ``text`` is the lemmas joined by
            single spaces.
        """
        lemmas = [
            self.lemmatizeWord(token, language)
            for token in sentence.split()
        ]
        return {'tokens': ' '.join(lemmas)}

    def lemmatizeWord(self, word, language):
        """Return the lemma of ``word``, falling back to ``word`` itself.

        Args:
            word: a single token.
            language: 'pl' for a dictionary lookup, 'en' for stemming.

        Returns:
            The lemma, or ``word`` unchanged when it is one character
            long, when no lemma is found, or when ``language`` is not
            handled.
        """
        if len(word) == 1:
            return word

        if language == 'pl':
            lemma = self.dictionary.get(word, None)
            return word if lemma is None else lemma

        if language == 'en':
            lemma = self.lemmatizer.stem(word)
            # Keep the original word when the stemmer yields nothing.
            return lemma if lemma else word

        # Unhandled language: return the word unchanged rather than
        # None, which would make the join in processSentence fail.
        return word