# Patch summary (commentary only; text before the first "diff --git" header
# is not part of any hunk).
#
# Purpose: switch the English branch of the preprocessor service from a spaCy
# pipeline to NLTK's PorterStemmer, and add a small timing script.
#
# server/concordia_preprocessor.py
#   - `import pickle, spacy` becomes `import pickle` plus
#     `from nltk.stem import PorterStemmer`.
#   - The module-level `lemmatizer` is now `PorterStemmer()` instead of
#     `spacy.load('en', disable=['parser', 'ner'])`. The variable keeps the
#     name `lemmatizer` even though it now holds a stemmer.
#
# server/lemmatizer_resource.py (inside class LemmatizerResource)
#   - For `language == 'en'`, the lemma is now `self.lemmatizer.stem(word)`
#     instead of `self.lemmatizer(word)[0].lemma_`.
#   - If the stem comes back as an empty string, the original `word` is
#     returned unchanged.
#   - NOTE(review): a Porter stemmer produces stems, which may differ from the
#     lemmas spaCy returned -- confirm that consumers of this endpoint (and any
#     data built with the old output) tolerate the change.
#
# tests/lemmatize_speed.py (new file, mode 100755)
#   - Posts one JSON request with language 'en' and 100 copies of a fixed
#     sentence to http://127.0.0.1:10002/lemmatize, then prints the elapsed
#     wall-clock time in seconds.
#   - The script has no assertions; it only measures timing and needs the
#     service to be reachable at that address.
diff --git a/server/concordia_preprocessor.py b/server/concordia_preprocessor.py index bb80bb1..9ffcff8 100755 --- a/server/concordia_preprocessor.py +++ b/server/concordia_preprocessor.py @@ -3,13 +3,14 @@ from flask import Flask from flask_restful import Api from server.lemmatizer_resource import LemmatizerResource -import pickle, spacy +import pickle +from nltk.stem import PorterStemmer app = Flask('preprocessor') api = Api(app) -lemmatizer = spacy.load('en', disable=['parser', 'ner']) +lemmatizer = PorterStemmer() print("Lemmatizer initialized") pickle_in = open("dictionaries/dict.pickle","rb") diff --git a/server/lemmatizer_resource.py b/server/lemmatizer_resource.py index dc9a6d9..90ffbb3 100644 --- a/server/lemmatizer_resource.py +++ b/server/lemmatizer_resource.py @@ -42,7 +42,8 @@ class LemmatizerResource(Resource): if lemma is None: lemma = word elif language == 'en': - doc = self.lemmatizer(word) - lemma = doc[0].lemma_ + lemma = self.lemmatizer.stem(word) + if len(lemma) == 0: + lemma = word return lemma diff --git a/tests/lemmatize_speed.py b/tests/lemmatize_speed.py new file mode 100755 index 0000000..6d772e1 --- /dev/null +++ b/tests/lemmatize_speed.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import requests, json, time + + +def do_lemmatize(data): + response = requests.post(url = 'http://127.0.0.1:10002/lemmatize', json = data) + return json.loads(response.text) + + +start = time.time() +data = {'language':'en', 'sentences':100*['this is just one of the sentences for testing']} +do_lemmatize(data) +end = time.time() +print("The operation took %.4f s" % (end - start))