diff --git a/.gitignore b/.gitignore
index 70a29fc..b3ee04f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ db/pgbouncer.pid
 db/pgbouncer.ini
 upstart/concordia-server.conf
 upstart/pgbouncer.conf
+upstart/lemmagen.conf
 cat/host.cfg
 mgiza-aligner/mgiza/mgizapp/CMakeCache.txt
 mgiza-aligner/mgiza/mgizapp/CMakeFiles/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1403d70..f4b6818 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,10 @@ configure_file (
     )
 
 set(COMPILED_BINARIES_PATH "${concordia-server_SOURCE_DIR}/build/concordia-server")
+# Directory that holds LemmaGenSockets.exe and the LemmaSharp DLLs. It is a
+# cache entry so that a deployment can point at another build without editing
+# this file, e.g. -DLEMMAGEN_BINARIES_PATH=/path/to/bin/Release.
+set(LEMMAGEN_BINARIES_PATH "${concordia-server_SOURCE_DIR}/LemmaGenSockets/LemmaGenSockets/bin/Debug" CACHE PATH "Directory containing LemmaGenSockets.exe")
 set(SCRIPTS_PATH "${concordia-server_SOURCE_DIR}/scripts")
 configure_file (
     "${concordia-server_SOURCE_DIR}/scripts/cmake_stubs/start.sh.in"
@@ -55,6 +59,14 @@ configure_file (
     "${concordia-server_SOURCE_DIR}/upstart/pgbouncer.conf"
     )
 
+# Upstart job for the LemmaGen lemmatizer. @ONLY restricts substitution to
+# @VAR@ references, so any ${...} in the job file is left untouched.
+configure_file (
+    "${concordia-server_SOURCE_DIR}/upstart/cmake_stubs/lemmagen.conf.in"
+    "${concordia-server_SOURCE_DIR}/upstart/lemmagen.conf"
+    @ONLY
+    )
+
 configure_file (
     "${concordia-server_SOURCE_DIR}/db/pgbouncer.ini.in"
     "${concordia-server_SOURCE_DIR}/db/pgbouncer.ini"
@@ -119,7 +131,7 @@ endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
 # ----------------------------------------------------
 set(Boost_USE_STATIC_LIBS OFF)
 set(Boost_USE_STATIC_RUNTIME OFF)
-find_package(Boost COMPONENTS 
+find_package(Boost COMPONENTS
     serialization unit_test_framework system filesystem program_options iostreams regex locale
     REQUIRED)
 # ----------------------------------------------------
@@ -214,8 +226,3 @@ if(EXISTS ${UTF8CASE_LIB})
 endif(EXISTS ${UTF8CASE_LIB})
 
 add_subdirectory(concordia-server)
-
-
-
-
-
diff --git a/LemmaGenSockets/LemmaGen/LemmaSharp.dll b/LemmaGenSockets/LemmaGen/LemmaSharp.dll
new file mode 100644
index 0000000..5d8380d
Binary files /dev/null and b/LemmaGenSockets/LemmaGen/LemmaSharp.dll differ
diff --git a/LemmaGenSockets/LemmaGen/LemmaSharpPrebuilt.dll
b/LemmaGenSockets/LemmaGen/LemmaSharpPrebuilt.dll new file mode 100644 index 0000000..dde85f7 Binary files /dev/null and b/LemmaGenSockets/LemmaGen/LemmaSharpPrebuilt.dll differ diff --git a/LemmaGenSockets/LemmaGen/LemmaSharpPrebuiltCompact.dll b/LemmaGenSockets/LemmaGen/LemmaSharpPrebuiltCompact.dll new file mode 100644 index 0000000..c1b2a38 Binary files /dev/null and b/LemmaGenSockets/LemmaGen/LemmaSharpPrebuiltCompact.dll differ diff --git a/LemmaGenSockets/LemmaGen/Lzma#.dll b/LemmaGenSockets/LemmaGen/Lzma#.dll new file mode 100644 index 0000000..2bb9990 Binary files /dev/null and b/LemmaGenSockets/LemmaGen/Lzma#.dll differ diff --git a/LemmaGenSockets/LemmaGenSockets.sln b/LemmaGenSockets/LemmaGenSockets.sln new file mode 100644 index 0000000..970942e --- /dev/null +++ b/LemmaGenSockets/LemmaGenSockets.sln @@ -0,0 +1,22 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.25420.1 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaGenSockets", "LemmaGenSockets\LemmaGenSockets.csproj", "{3098BC55-2CC9-4612-9F79-8C812B3BE539}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3098BC55-2CC9-4612-9F79-8C812B3BE539}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {3098BC55-2CC9-4612-9F79-8C812B3BE539}.Debug|Any CPU.Build.0 = Debug|Any CPU + {3098BC55-2CC9-4612-9F79-8C812B3BE539}.Release|Any CPU.ActiveCfg = Release|Any CPU + {3098BC55-2CC9-4612-9F79-8C812B3BE539}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/LemmaGenSockets/LemmaGenSockets/App.config b/LemmaGenSockets/LemmaGenSockets/App.config new file mode 100644 index 0000000..d740e88 --- 
/dev/null +++ b/LemmaGenSockets/LemmaGenSockets/App.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file
diff --git a/LemmaGenSockets/LemmaGenSockets/LemmaGenSockets.csproj b/LemmaGenSockets/LemmaGenSockets/LemmaGenSockets.csproj new file mode 100644 index 0000000..eaae3e2 --- /dev/null +++ b/LemmaGenSockets/LemmaGenSockets/LemmaGenSockets.csproj @@ -0,0 +1,73 @@ + + + + + Debug + AnyCPU + {3098BC55-2CC9-4612-9F79-8C812B3BE539} + Exe + Properties + LemmaGenSockets + LemmaGenSockets + v4.5.2 + 512 + true + + + AnyCPU + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + AnyCPU + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + + ..\LemmaGen\LemmaSharp.dll + + + ..\LemmaGen\LemmaSharpPrebuilt.dll + + + ..\LemmaGen\LemmaSharpPrebuiltCompact.dll + + + ..\LemmaGen\Lzma#.dll + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file
diff --git a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs
new file mode 100644
index 0000000..9eee9bf
--- /dev/null
+++ b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs
@@ -0,0 +1,192 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Net;
+using System.Net.Sockets;
+using System.Threading.Tasks;
+using LemmaSharp;
+
+namespace LemmaGenSockets
+{
+    // Blocking, single-threaded TCP server that lemmatizes sentences with LemmaGen.
+    //
+    // Protocol (one request per connection, UTF-8 text):
+    //   request:  [2-character language code][sentence]@#@
+    //   response: [lemmatized sentence]@#@
+    // The server closes the connection after sending the response.
+    class LemmatizerListener
+    {
+        // Marks the end of both the request and the response.
+        private const string MessageTerminator = "@#@";
+
+        // Every request starts with a fixed-width language code ("pl", "en").
+        private const int LanguageCodeLength = 2;
+
+        // TCP port the server listens on (loopback interface only).
+        private const int Port = 11000;
+
+        private const int ReceiveBufferSize = 1024;
+
+        // Language code -> lemmatizer for that language.
+        private Dictionary<string, LemmatizerPrebuiltCompact> lemmatizersDict =
+            new Dictionary<string, LemmatizerPrebuiltCompact>();
+
+        private char[] wordInnerSeparator = { '-' };
+
+
+        private void initializeLemmatizers()
+        {
+            lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish));
+            lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));
+        }
+
+        public LemmatizerListener()
+        {
+            initializeLemmatizers();
+        }
+
+        // Lemmatizes every whitespace-separated token of the sentence and
+        // joins the results with single spaces.
+        private string lemmatizeSentence(string languageCode, string sentence)
+        {
+            // Split(null) splits on any whitespace character.
+            string[] tokens = sentence.Split(null);
+            IEnumerable<string> lemmas = tokens.Select(token => lemmatizeWord(languageCode, token));
+            return string.Join(" ", lemmas).Trim();
+        }
+
+        // Lemmatizes a single token. A token consisting of exactly two
+        // hyphen-separated parts is lemmatized part by part, except that a
+        // first part ending in "o" is kept as it is.
+        private string lemmatizeWord(string languageCode, string word)
+        {
+            LemmatizerPrebuiltCompact lemmatizer = lemmatizersDict[languageCode];
+            string[] parts = word.Split(wordInnerSeparator);
+            if (parts.Length != 2)
+            {
+                return lemmatizer.Lemmatize(word);
+            }
+
+            string firstPart = parts[0];
+            if (!firstPart.EndsWith("o"))
+            {
+                firstPart = lemmatizer.Lemmatize(firstPart);
+            }
+            return firstPart + "-" + lemmatizer.Lemmatize(parts[1]);
+        }
+
+        // Reads from the client until the terminator arrives. Returns the text
+        // that precedes the terminator, or null if the client closes the
+        // connection before sending one.
+        private string receiveRequest(Socket handler)
+        {
+            byte[] buffer = new byte[ReceiveBufferSize];
+            // A stateful decoder keeps the leading bytes of a UTF-8 character that
+            // is split across two Receive calls; decoding each chunk on its own
+            // would turn both halves into replacement characters.
+            Decoder decoder = Encoding.UTF8.GetDecoder();
+            char[] chars = new char[Encoding.UTF8.GetMaxCharCount(buffer.Length)];
+            StringBuilder data = new StringBuilder();
+
+            while (true)
+            {
+                int bytesRec = handler.Receive(buffer);
+                if (bytesRec == 0)
+                {
+                    // Receive returns 0 once the peer has shut the connection down;
+                    // without this check the loop would spin forever.
+                    return null;
+                }
+
+                int charCount = decoder.GetChars(buffer, 0, bytesRec, chars, 0);
+                data.Append(chars, 0, charCount);
+
+                int terminatorIndex = data.ToString().IndexOf(MessageTerminator, StringComparison.Ordinal);
+                if (terminatorIndex > -1)
+                {
+                    return data.ToString(0, terminatorIndex);
+                }
+            }
+        }
+
+        // Serves one connection: reads the request, validates it and sends the
+        // lemmatized sentence back. An invalid request is logged and left
+        // unanswered; the caller closes the socket.
+        private void handleClient(Socket handler)
+        {
+            string request = receiveRequest(handler);
+            if (request == null)
+            {
+                Console.WriteLine("Client disconnected before sending a complete request.");
+                return;
+            }
+            if (request.Length < LanguageCodeLength)
+            {
+                Console.WriteLine("Request is too short to contain a language code.");
+                return;
+            }
+
+            string languageCode = request.Substring(0, LanguageCodeLength);
+            string sentence = request.Substring(LanguageCodeLength);
+            if (!lemmatizersDict.ContainsKey(languageCode))
+            {
+                Console.WriteLine("Unsupported language code: " + languageCode);
+                return;
+            }
+
+            // Send lemmatized data back to client.
+            byte[] msg = Encoding.UTF8.GetBytes(lemmatizeSentence(languageCode, sentence) + MessageTerminator);
+            handler.Send(msg);
+            handler.Shutdown(SocketShutdown.Both);
+        }
+
+        // Accepts connections forever, serving one request per connection.
+        // A failure while serving a client is logged and does not stop the
+        // server; a failure of the listening socket itself ends the method.
+        public void DoListening()
+        {
+            // Establish the local endpoint for the socket.
+            IPEndPoint localEndPoint = new IPEndPoint(IPAddress.Loopback, Port);
+
+            // Create a TCP/IP socket.
+            Socket listener = new Socket(AddressFamily.InterNetwork,
+                SocketType.Stream, ProtocolType.Tcp);
+
+            try
+            {
+                // Bind the socket to the local endpoint and
+                // listen for incoming connections.
+                listener.Bind(localEndPoint);
+                listener.Listen(10);
+
+                while (true)
+                {
+                    // Program is suspended while waiting for an incoming connection.
+                    Socket handler = listener.Accept();
+                    try
+                    {
+                        handleClient(handler);
+                    }
+                    catch (Exception e)
+                    {
+                        // Bad input or a broken connection affects this client only.
+                        Console.WriteLine(e.ToString());
+                    }
+                    finally
+                    {
+                        handler.Close();
+                    }
+                }
+            }
+            catch (Exception e)
+            {
+                Console.WriteLine(e.ToString());
+            }
+            finally
+            {
+                listener.Close();
+            }
+        }
+    }
+}
diff --git a/LemmaGenSockets/LemmaGenSockets/Program.cs b/LemmaGenSockets/LemmaGenSockets/Program.cs
new file mode 100644
index 0000000..b7ce8a5
--- /dev/null
+++ b/LemmaGenSockets/LemmaGenSockets/Program.cs
@@ -0,0 +1,26 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Net;
+using System.Net.Sockets;
+using System.Threading.Tasks;
+using LemmaSharp;
+
+namespace LemmaGenSockets
+{
+    class Program
+    {
+
+        // Incoming data from the client.
+        public static string data = null;
+
+
+
+        static void Main(string[] args)
+        {
+            LemmatizerListener listener = new LemmatizerListener();
+            listener.DoListening();
+        }
+    }
+}
diff --git a/LemmaGenSockets/LemmaGenSockets/Properties/AssemblyInfo.cs b/LemmaGenSockets/LemmaGenSockets/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..a865875
--- /dev/null
+++ b/LemmaGenSockets/LemmaGenSockets/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("LemmaGenSockets")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("LemmaGenSockets")]
+[assembly: AssemblyCopyright("Copyright © 2017")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components.
If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("3098bc55-2cc9-4612-9f79-8c812b3be539")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe new file mode 100644 index 0000000..8cb06db Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe differ diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe.config b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe.config new file mode 100644 index 0000000..d740e88 --- /dev/null +++ b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb new file mode 100644 index 0000000..1ab947c Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb differ diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe new file mode 100644 index 0000000..681ab77 Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe differ diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.config 
b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.config new file mode 100644 index 0000000..d740e88 --- /dev/null +++ b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.manifest b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.manifest new file mode 100644 index 0000000..f96b1d6 --- /dev/null +++ b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.manifest @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharp.dll b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharp.dll new file mode 100644 index 0000000..5d8380d Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharp.dll differ diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharpPrebuilt.dll b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharpPrebuilt.dll new file mode 100644 index 0000000..dde85f7 Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharpPrebuilt.dll differ diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharpPrebuiltCompact.dll b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharpPrebuiltCompact.dll new file mode 100644 index 0000000..c1b2a38 Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharpPrebuiltCompact.dll differ diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/Lzma#.dll b/LemmaGenSockets/LemmaGenSockets/bin/Debug/Lzma#.dll new file mode 100644 index 0000000..2bb9990 Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/Lzma#.dll differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache b/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache new file mode 100644 index 0000000..5a81c4d Binary files 
/dev/null and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt new file mode 100644 index 0000000..3df4681 --- /dev/null +++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt @@ -0,0 +1,10 @@ +j:\documents\visual studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache new file mode 100644 index 0000000..18ff92c Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe 
b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe
new file mode 100644
index 0000000..8cb06db
Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe differ
diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb
new file mode 100644
index 0000000..1ab947c
Binary files /dev/null and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb differ
diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs b/LemmaGenSockets/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs
new file mode 100644
index 0000000..e69de29
diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1.cs b/LemmaGenSockets/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1.cs
new file mode 100644
index 0000000..e69de29
diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3.cs b/LemmaGenSockets/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3.cs
new file mode 100644
index 0000000..e69de29
diff --git a/concordia-server/CMakeLists.txt b/concordia-server/CMakeLists.txt
index 82c95d8..ef392b8 100644
--- a/concordia-server/CMakeLists.txt
+++ b/concordia-server/CMakeLists.txt
@@ -1,19 +1,23 @@
 
-add_executable(concordia_server_process
-               concordia_server_process.cpp
-               concordia_server.cpp
-               index_controller.cpp
-               searcher_controller.cpp
-               json_generator.cpp
-               unit_dao.cpp
-               db_connection.cpp
-               query_param.cpp
-               string_param.cpp
-               int_param.cpp
-               logger.cpp
-               int_array_param.cpp
-               simple_search_result.cpp
-               complete_concordia_search_result.cpp
-               tm_dao.cpp
-               )
-target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
+# Sources are listed explicitly rather than with file(GLOB): a glob is only
+# evaluated when CMake runs, so a newly added file would be silently missing
+# from the build until the next re-configure.
+add_executable(concordia_server_process
+    concordia_server_process.cpp
+    concordia_server.cpp
+    index_controller.cpp
+    searcher_controller.cpp
+    json_generator.cpp
+    unit_dao.cpp
+    db_connection.cpp
+    query_param.cpp
+    string_param.cpp
+    int_param.cpp
+    logger.cpp
+    int_array_param.cpp
+    simple_search_result.cpp
+    complete_concordia_search_result.cpp
+    tm_dao.cpp
+    socket_lemmatizer.cpp
+    )
+target_link_libraries(concordia_server_process PRIVATE fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
diff --git a/concordia-server/concordia_server.cpp b/concordia-server/concordia_server.cpp index 8bdf5ee..9d45eaf 100644 --- a/concordia-server/concordia_server.cpp +++ b/concordia-server/concordia_server.cpp @@ -11,6 +11,7 @@ #include "json_generator.hpp" #include "config.hpp" #include "logger.hpp" +#include "socket_lemmatizer.hpp" #include "rapidjson/rapidjson.h" #include #include @@ -26,7 +27,7 @@ ConcordiaServer::ConcordiaServer(const std::string & configFilePath) _addTm(tmId); } _indexController = boost::shared_ptr (new IndexController(_concordiasMap)); - _searcherController = boost::shared_ptr (new SearcherController(_concordiasMap)); + _searcherController = boost::shared_ptr (new SearcherController(_concordiasMap)); } ConcordiaServer::~ConcordiaServer() { @@ -50,7 +51,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) { JsonGenerator::signalError(jsonWriter, errorstream.str()); } else { // json parsed std::string operation = _getStringParameter(d, OPERATION_PARAM); - if (operation == ADD_SENTENCE_OP) { + if (operation == ADD_SENTENCE_OP) { std::string sourceSentence = _getStringParameter(d, SOURCE_SENTENCE_PARAM); std::string targetSentence = _getStringParameter(d, TARGET_SENTENCE_PARAM); int tmId = _getIntParameter(d, TM_ID_PARAM); @@ -93,6 +94,15 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) { } } _indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId); + } else if (operation == "lemmatize") { + std::string sentence = _getStringParameter(d, "sentence"); + std::string languageCode = _getStringParameter(d, "languageCode"); + SocketLemmatizer lemmatizer; + std::string lemmatizedSentence = lemmatizer.lemmatizeSentence(languageCode,
sentence); + jsonWriter.StartObject(); + jsonWriter.String("lemmatizedSentence"); + jsonWriter.String(lemmatizedSentence.c_str()); + jsonWriter.EndObject(); } else if (operation == REFRESH_INDEX_OP) { int tmId = _getIntParameter(d, TM_ID_PARAM); _indexController->refreshIndexFromRAM(jsonWriter, tmId); @@ -104,7 +114,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) { std::string pattern = _getStringParameter(d, PATTERN_PARAM); int tmId = _getIntParameter(d, TM_ID_PARAM); Logger::logString("concordia search pattern", pattern); - _searcherController->concordiaSearch(jsonWriter, pattern, tmId); + _searcherController->concordiaSearch(jsonWriter, pattern, tmId); } else if (operation == CONCORDIA_PHRASE_SEARCH_OP) { std::string pattern = _getStringParameter(d, PATTERN_PARAM); int tmId = _getIntParameter(d, TM_ID_PARAM); @@ -114,31 +124,31 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) { const rapidjson::Value & intervalsArray = d[INTERVALS_PARAM]; for (rapidjson::SizeType i = 0; i < intervalsArray.Size(); i++) { intervals.push_back(Interval(intervalsArray[i][0].GetInt(), intervalsArray[i][1].GetInt())); - } - _searcherController->concordiaPhraseSearch(jsonWriter, pattern, intervals, tmId); + } + _searcherController->concordiaPhraseSearch(jsonWriter, pattern, intervals, tmId); } else if (operation == ADD_TM_OP) { int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM); int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM); std::string name = _getStringParameter(d, NAME_PARAM); int newId = _tmDAO.addTm(sourceLangId, targetLangId, name); _addTm(newId); - + jsonWriter.StartObject(); jsonWriter.String("status"); jsonWriter.String("success"); jsonWriter.String("newTmId"); jsonWriter.Int(newId); jsonWriter.EndObject(); - + } else { JsonGenerator::signalError(jsonWriter, "no such operation: " + operation); } } - + } catch (ConcordiaException & e) { std::stringstream errorstream; errorstream << "concordia error: " << 
e.what(); - JsonGenerator::signalError(jsonWriter, errorstream.str()); + JsonGenerator::signalError(jsonWriter, errorstream.str()); } outputString << outputJson.GetString(); @@ -182,5 +192,5 @@ void ConcordiaServer::_logPhrase(std::string phraseString) { std::ofstream logFile; logFile.open(PHRASE_LOG_FILE_PATH, std::ios::out | std::ios::app); logFile << phraseString.substr(0, phraseString.size()-1) << ", \"timestamp\":" << std::time(0) << "}\n"; - logFile.close(); + logFile.close(); }
diff --git a/concordia-server/db_connection.hpp b/concordia-server/db_connection.hpp index 5b821af..c65fb35 100644 --- a/concordia-server/db_connection.hpp +++ b/concordia-server/db_connection.hpp @@ -1,5 +1,5 @@ -#ifndef DB_MANAGER_HDR -#define DB_MANAGER_HDR +#ifndef DB_CONNECTION_HDR +#define DB_CONNECTION_HDR #include #include @@ -17,7 +17,7 @@ public: /*! Destructor. */ virtual ~DBconnection(); - + void startTransaction() throw(ConcordiaException); void endTransaction() throw(ConcordiaException); @@ -28,16 +28,16 @@ public: std::vector params) throw(ConcordiaException); void clearResult(PGresult * result); - + int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException); - + std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException); int getRowCount(PGresult * result) throw (ConcordiaException); private: void close(); - + PGconn * _connection; };
diff --git a/concordia-server/socket_lemmatizer.cpp b/concordia-server/socket_lemmatizer.cpp
new file mode 100644
index 0000000..f6170a8
--- /dev/null
+++ b/concordia-server/socket_lemmatizer.cpp
@@ -0,0 +1,138 @@
+#include "socket_lemmatizer.hpp"
+
+#include <cstring>   // memset, memcpy
+#include <unistd.h>  // close
+
+namespace {
+
+/** Message terminator of the lemmatizer wire protocol. */
+const char * const MESSAGE_TERMINATOR = "@#@";
+
+/** Closes the descriptor, if open, and marks it as closed. */
+void closeSocket(int & sock) {
+    if (sock != -1) {
+        ::close(sock);
+        sock = -1;
+    }
+}
+
+}  // namespace
+
+/**
+    Connects to the lemmatizer server on 127.0.0.1:11000.
+    The server answers one request per connection, so an instance is
+    good for a single lemmatizeSentence() call. Instances own the socket
+    descriptor and must not be copied.
+*/
+SocketLemmatizer::SocketLemmatizer() throw(ConcordiaException) :
+                                     _sock(-1) {
+    _connect("127.0.0.1" , 11000);
+}
+
+SocketLemmatizer::~SocketLemmatizer() {
+    // Release the descriptor; otherwise every request leaks one.
+    closeSocket(_sock);
+}
+
+/**
+    Connect to a host on a certain port number.
+    Throws ConcordiaException (after closing the socket) on failure.
+*/
+bool SocketLemmatizer::_connect(std::string address , int port)
+{
+    //create socket if it is not already created
+    if(_sock == -1) {
+        _sock = socket(AF_INET , SOCK_STREAM , 0);
+        if (_sock == -1) {
+            throw ConcordiaException("Could not create socket for the lemmatizer.");
+        }
+    }
+
+    //setup address structure; zero it first so that padding is defined
+    memset(&_server, 0, sizeof(_server));
+    _server.sin_family = AF_INET;
+    _server.sin_port = htons(port);
+
+    in_addr_t numericAddress = inet_addr(address.c_str());
+    if(numericAddress != INADDR_NONE) { //plain ip address
+        _server.sin_addr.s_addr = numericAddress;
+    } else {
+        //resolve the hostname, its not an ip address
+        struct hostent * he = gethostbyname(address.c_str());
+        if (he == NULL || he->h_addr_list[0] == NULL) {
+            closeSocket(_sock);
+            throw ConcordiaException("gethostbyname: Failed to resolve hostname");
+        }
+        //use the first address returned by the resolver
+        memcpy(&_server.sin_addr, he->h_addr_list[0], sizeof(_server.sin_addr));
+    }
+
+    //Connect to remote server
+    if (connect(_sock , (struct sockaddr *) & _server , sizeof(_server)) < 0) {
+        closeSocket(_sock);
+        throw ConcordiaException("connect failed. Error");
+    }
+
+    return true;
+}
+
+/**
+    Send data to the connected host.
+    Loops because send() may accept only part of the buffer.
+*/
+bool SocketLemmatizer::_send_data(std::string data)
+{
+    std::string::size_type sent = 0;
+    while (sent < data.size()) {
+        ssize_t written = send(_sock , data.data() + sent , data.size() - sent , 0);
+        if (written < 0) {
+            throw ConcordiaException("Send failed");
+        }
+        sent += static_cast<std::string::size_type>(written);
+    }
+    return true;
+}
+
+/**
+    Receive at most `size` bytes from the connected host.
+    Returns exactly the bytes read by one recv() call; the result is
+    empty when the peer has closed the connection.
+*/
+std::string SocketLemmatizer::_receive(int size=512)
+{
+    if (size <= 0) {
+        throw ConcordiaException("Receive failed");
+    }
+    std::string buffer(static_cast<std::string::size_type>(size), '\0');
+
+    //Receive a reply from the server
+    ssize_t received = recv(_sock , &buffer[0] , buffer.size() , 0);
+    if(received < 0) {
+        throw ConcordiaException("Receive failed");
+    }
+    //keep only what was read: recv() does not NUL-terminate the buffer
+    buffer.resize(static_cast<std::string::size_type>(received));
+    return buffer;
+}
+
+/**
+    Sends languageCode + sentence + terminator and returns the reply
+    up to (not including) the terminator. Keeps reading until the
+    terminator arrives, so replies longer than one recv() are not cut.
+    Throws ConcordiaException if the server closes the connection first.
+*/
+std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
+    _send_data(languageCode+sentence+MESSAGE_TERMINATOR);
+
+    std::string reply;
+    std::string::size_type terminatorPos = std::string::npos;
+    while (terminatorPos == std::string::npos) {
+        std::string chunk = _receive(512);
+        if (chunk.empty()) {
+            throw ConcordiaException("Lemmatizer closed the connection before sending a complete reply.");
+        }
+        reply += chunk;
+        terminatorPos = reply.find(MESSAGE_TERMINATOR);
+    }
+    return reply.substr(0, terminatorPos);
+}
diff --git a/concordia-server/socket_lemmatizer.hpp b/concordia-server/socket_lemmatizer.hpp new file mode 100644 index 0000000..7f20255 --- /dev/null +++ b/concordia-server/socket_lemmatizer.hpp @@ -0,0 +1,35 @@ +#ifndef SOCKET_LEMMATIZER_HDR +#define SOCKET_LEMMATIZER_HDR + +#include +#include //socket +#include //inet_addr +#include //hostent + +#include + + +class SocketLemmatizer { +public: + /*! Constructor. + */ + SocketLemmatizer() throw(ConcordiaException); + /*! Destructor.
+ */ + virtual ~SocketLemmatizer(); + + std::string lemmatizeSentence(std::string languageCode, std::string sentence); +private: + bool _connect(std::string, int); + + bool _send_data(std::string data); + + std::string _receive(int); + + int _sock; + + struct sockaddr_in _server; + +}; + +#endif
diff --git a/tests/lemmatizeSentence.py b/tests/lemmatizeSentence.py
new file mode 100755
index 0000000..4874f7c
--- /dev/null
+++ b/tests/lemmatizeSentence.py
@@ -0,0 +1,36 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+# Sends a "lemmatize" request to a running Concordia server and prints the
+# JSON response together with the round-trip time.
+#
+# Usage: lemmatizeSentence.py <languageCode> <sentence>
+
+import json
+import urllib2
+import sys
+import time
+import host
+
+if len(sys.argv) < 3:
+    sys.exit("Usage: %s <languageCode> <sentence>" % sys.argv[0])
+
+data = {
+    'operation': 'lemmatize',
+    'languageCode': sys.argv[1],
+    'sentence': sys.argv[2]
+}
+
+address = 'http://'+host.concordia_host
+if len(host.concordia_port) > 0:
+    address += ':'+host.concordia_port
+
+start = time.time()
+req = urllib2.Request(address)
+req.add_header('Content-Type', 'application/json')
+response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
+end = time.time()
+
+print "Execution time: %.4f seconds." % (end-start)
+print "Result: "
+print response
diff --git a/upstart/README.txt b/upstart/README.txt index 894ef48..7be3284 100644 --- a/upstart/README.txt +++ b/upstart/README.txt @@ -1,4 +1,4 @@ -In order to configure Concordia as upstart job, copy the 2 .conf files into your /etc/init and run: +In order to configure Concordia as upstart job, copy the 3 .conf files into your /etc/init and run: sudo initctl reload-configuration
diff --git a/upstart/cmake_stubs/lemmagen.conf.in b/upstart/cmake_stubs/lemmagen.conf.in
new file mode 100644
index 0000000..86a7714
--- /dev/null
+++ b/upstart/cmake_stubs/lemmagen.conf.in
@@ -0,0 +1,20 @@
+# lemmagen
+#
+# This service runs the LemmaGen lemmatizer
+# via mono. The process listens on 127.0.0.1, port 11000.
+
+description LemmaGen
+
+# When to start the service
+start on started concordia-server
+
+# When to stop the service
+stop on runlevel [016]
+
+# Automatically restart process if crashed
+respawn
+
+# mono stays in the foreground, so upstart supervises it directly:
+# no "expect" stanza and no backgrounding with "&" are needed.
+# The path is quoted in case the source tree location contains spaces.
+exec mono "@LEMMAGEN_BINARIES_PATH@/LemmaGenSockets.exe"