From 2fb17e2bed2f437a189ec3db6311fee6172067ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Jaworski?= Date: Thu, 27 Jun 2019 12:54:16 +0200 Subject: [PATCH] json lemmatizer --- CMakeLists.txt | 25 ++++++++++++++++++ concordia-server/CMakeLists.txt | 2 +- concordia-server/json_lemmatizer.cpp | 36 ++++++++++++++++++++++++++ concordia-server/json_lemmatizer.hpp | 24 +++++++++++++++++ concordia-server/lemmatizer_facade.cpp | 14 ++++------ concordia-server/lemmatizer_facade.hpp | 3 ++- scripts/cmake_stubs/simplestart.sh.in | 1 - tests/lemmatizeSentence.py | 1 + 8 files changed, 94 insertions(+), 12 deletions(-) create mode 100644 concordia-server/json_lemmatizer.cpp create mode 100644 concordia-server/json_lemmatizer.hpp mode change 100644 => 100755 scripts/cmake_stubs/simplestart.sh.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f3c411..f11de11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,6 +150,31 @@ if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) +# ---------------------------------------------------- +# restclient-cpp +# ---------------------------------------------------- +find_library(RESTCLIENT_CPP_LIB NAMES restclient-cpp REQUIRED) +find_path(RESTCLIENT_CPP_INCLUDE restclient-cpp) + +if(EXISTS ${RESTCLIENT_CPP_LIB} AND EXISTS ${RESTCLIENT_CPP_INCLUDE}) + message(STATUS "Found restclient-cpp") + include_directories(${RESTCLIENT_CPP_INCLUDE}) + link_directories(${RESTCLIENT_CPP_LIB}) +endif(EXISTS ${RESTCLIENT_CPP_LIB} AND EXISTS ${RESTCLIENT_CPP_INCLUDE}) + +# ---------------------------------------------------- +# curl +# ---------------------------------------------------- +find_library(CURL_LIB NAMES curl REQUIRED) +find_path(CURL_INCLUDE curl) + +if(EXISTS ${CURL_LIB} AND EXISTS ${CURL_INCLUDE}) + message(STATUS "Found curl") + include_directories(${CURL_INCLUDE}) + link_directories(${CURL_LIB}) +endif(EXISTS ${CURL_LIB} AND EXISTS ${CURL_INCLUDE}) + + # ---------------------------------------------------- # Logging # ---------------------------------------------------- diff --git a/concordia-server/CMakeLists.txt b/concordia-server/CMakeLists.txt index ef392b8..d7b798d 100644 --- a/concordia-server/CMakeLists.txt +++ b/concordia-server/CMakeLists.txt @@ -3,4 +3,4 @@ file(GLOB main_sources "*.cpp") add_executable(concordia_server_process ${main_sources} ) -target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc) +target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc restclient-cpp curl) diff --git a/concordia-server/json_lemmatizer.cpp b/concordia-server/json_lemmatizer.cpp new file mode 100644 index 0000000..2c2848e --- /dev/null +++ b/concordia-server/json_lemmatizer.cpp @@ -0,0 +1,36 @@ +#include "json_lemmatizer.hpp" +#include "config.hpp" +#include "restclient-cpp/restclient.h" +#include "rapidjson/rapidjson.h" +#include "rapidjson/document.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" +#include "rapidjson/error/en.h" +#include + +JsonLemmatizer::JsonLemmatizer() throw(ConcordiaException) { +} + +JsonLemmatizer::~JsonLemmatizer() { +} + + +std::string JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) { + rapidjson::StringBuffer paramsJson; + rapidjson::Writer jsonWriter(paramsJson); + jsonWriter.StartObject(); + jsonWriter.String("language"); + jsonWriter.String(languageCode.c_str()); + jsonWriter.String("sentences"); + jsonWriter.StartArray(); + jsonWriter.String(sentence.c_str()); + jsonWriter.EndArray(); + jsonWriter.EndObject(); + + + RestClient::Response r = RestClient::post("http://concordia-preprocessor:9001/lemmatize", "application/json", paramsJson.GetString()); + rapidjson::Document d; + d.Parse(r.body.c_str()); + std::string lemmatized = d["processed_sentences"][0]["tokens"].GetString(); + return lemmatized; +} diff --git a/concordia-server/json_lemmatizer.hpp b/concordia-server/json_lemmatizer.hpp new file mode 100644 index 0000000..84e9efa --- /dev/null +++ b/concordia-server/json_lemmatizer.hpp @@ -0,0 +1,24 @@ +#ifndef JSON_LEMMATIZER_HDR +#define JSON_LEMMATIZER_HDR + +#include + +#include + +#include "logger.hpp" + +class JsonLemmatizer { +public: + /*! Constructor. + */ + explicit JsonLemmatizer() throw(ConcordiaException); + /*! Destructor. + */ + virtual ~JsonLemmatizer(); + + std::string lemmatizeSentence(std::string languageCode, std::string sentence); +private: + Logger _logger; +}; + +#endif \ No newline at end of file diff --git a/concordia-server/lemmatizer_facade.cpp b/concordia-server/lemmatizer_facade.cpp index 1c406e0..1cfdd13 100644 --- a/concordia-server/lemmatizer_facade.cpp +++ b/concordia-server/lemmatizer_facade.cpp @@ -4,19 +4,15 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) { - _lemmatizersMap = boost::ptr_map(); + _lemmatizersMap = boost::ptr_map(); // todo: extract this to configuration, especially when new lemmatizers ConstMemberIterator - SocketLemmatizer * socketLemmatizer1 = new SocketLemmatizer(11000); + JsonLemmatizer * jsonLemmatizer1 = new JsonLemmatizer(); std::string plCode = "pl"; std::string enCode = "en"; - std::string hrCode = "hr"; - std::string frCode = "fr"; - _lemmatizersMap.insert(plCode, socketLemmatizer1); - _lemmatizersMap.insert(enCode, socketLemmatizer1); - _lemmatizersMap.insert(hrCode, socketLemmatizer1); - _lemmatizersMap.insert(frCode, socketLemmatizer1); + _lemmatizersMap.insert(plCode, jsonLemmatizer1); + _lemmatizersMap.insert(enCode, jsonLemmatizer1); } LemmatizerFacade::~LemmatizerFacade() { @@ -24,7 +20,7 @@ LemmatizerFacade::~LemmatizerFacade() { std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) { - boost::ptr_map::iterator it = _lemmatizersMap.find(languageCode); + boost::ptr_map::iterator it = _lemmatizersMap.find(languageCode); if (it != _lemmatizersMap.end()) { return it->second->lemmatizeSentence(languageCode, sentence); } else { diff --git a/concordia-server/lemmatizer_facade.hpp b/concordia-server/lemmatizer_facade.hpp index 149f53f..0039568 100644 --- a/concordia-server/lemmatizer_facade.hpp +++ b/concordia-server/lemmatizer_facade.hpp @@ -2,6 +2,7 @@ #define LEMMATIZER_FACADE_HDR #include "socket_lemmatizer.hpp" +#include "json_lemmatizer.hpp" #include "tm_dao.hpp" #include @@ -27,7 +28,7 @@ public: std::vector lemmatizeSentencesIfNeeded(std::vector patterns, int tmId); private: - boost::ptr_map _lemmatizersMap; + boost::ptr_map _lemmatizersMap; TmDAO _tmDAO; }; diff --git a/scripts/cmake_stubs/simplestart.sh.in b/scripts/cmake_stubs/simplestart.sh.in old mode 100644 new mode 100755 index 3f79940..ea2f7cd --- a/scripts/cmake_stubs/simplestart.sh.in +++ b/scripts/cmake_stubs/simplestart.sh.in @@ -1,5 +1,4 @@ #!/bin/sh -mono "@LEMMAGEN_BINARIES_PATH@"/LemmaGenSockets.exe & spawn-fcgi -p 8000 -n "@COMPILED_BINARIES_PATH@"/concordia_server_process diff --git a/tests/lemmatizeSentence.py b/tests/lemmatizeSentence.py index 4874f7c..3bdadd4 100755 --- a/tests/lemmatizeSentence.py +++ b/tests/lemmatizeSentence.py @@ -27,3 +27,4 @@ end = time.time() print "Execution time: %.4f seconds." % (end-start) print "Result: " print response +print response['lemmatizedSentence']