diff --git a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs index 77a321f..28afaf9 100644 --- a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs +++ b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs @@ -21,6 +21,7 @@ namespace LemmaGenSockets lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish)); lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English)); lemmatizersDict.Add("hr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Serbian)); + lemmatizersDict.Add("fr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French)); } public LemmatizerListener() @@ -52,13 +53,37 @@ namespace LemmaGenSockets private string lemmatizeWord(string languageCode, string word) { - if (word.StartsWith("ne_") || word == "i" || word == "o" || word == "do") + // exceptions + if (word.StartsWith("ne_")) { return word; } - string[] parts = word.Split(wordInnerSeparator); + + + Dictionary> exceptions = new Dictionary>(); + + HashSet plExceptions = new HashSet(); + plExceptions.Add("i"); + plExceptions.Add("o"); + plExceptions.Add("do"); + exceptions.Add("pl", plExceptions); + + HashSet enExceptions = new HashSet(); + enExceptions.Add("d"); + exceptions.Add("en", enExceptions); + + HashSet languageExceptions; + if (exceptions.TryGetValue(languageCode, out languageExceptions)) + { + if(languageExceptions.Contains(word)) + { + return word; + } + } + string result = ""; + string[] parts = word.Split(wordInnerSeparator); if (parts.Length == 2) { string firstPart = parts[0]; @@ -74,7 +99,7 @@ namespace LemmaGenSockets result = lemmatizersDict[languageCode].Lemmatize(word); } - if (result == "" || result.Contains(" ")) + if (result == "") { return word; } diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe index 1a297bb..dfb6538 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe differ diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb index 46bce95..e85154f 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache b/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache index 5a81c4d..41637e9 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt index 3df4681..dc882e2 100644 --- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt +++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt @@ -8,3 +8,14 @@ j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Deb j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csproj.CoreCompileInputs.cache +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache index 877d59a..58d7cf5 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe index 1a297bb..dfb6538 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb index 46bce95..e85154f 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb differ diff --git a/cat/versions_available/jrc_enes.cfg b/cat/versions_available/jrc_enes.cfg index 07f8e46..ae3c864 100644 --- a/cat/versions_available/jrc_enes.cfg +++ b/cat/versions_available/jrc_enes.cfg @@ -1,7 +1,7 @@ dir@#@jrc_enes concordia_host@#@concordia.vm.wmi.amu.edu.pl concordia_port@#@8800 -tmid@#@6 +tmid@#@1 desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from English-Spanish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context. enjoy@#@Enjoy your work with the system! prompt@#@Enter search pattern (English sentence): diff --git a/cat/versions_available/pytania_odpowiedzi_logistyka.cfg b/cat/versions_available/pytania_odpowiedzi_logistyka.cfg new file mode 100644 index 0000000..6ca35e0 --- /dev/null +++ b/cat/versions_available/pytania_odpowiedzi_logistyka.cfg @@ -0,0 +1,10 @@ +dir@#@pytania_odpowiedzi_logistyka +concordia_host@#@localhost +concordia_port@#@8800 +tmid@#@12 +desc@#@Wyszukiwarka pytań +enjoy@#@Wybierz przykładowe pytanie: +prompt@#@Wprowadź pytanie (po polsku): +suggestion@#@chciałbym zakupić samochód specjalistyczny +suggestion@#@czy są jakieś zlecenia od spedytorów z terminala? +suggestion@#@potrzebuję oprogramowania do zarządzania korporacją taksówkarską diff --git a/cat/versions_enabled/emea_plen.cfg b/cat/versions_enabled/emea_plen.cfg deleted file mode 120000 index 6081551..0000000 --- a/cat/versions_enabled/emea_plen.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/emea_plen.cfg \ No newline at end of file diff --git a/cat/versions_enabled/europarl_sample.cfg b/cat/versions_enabled/europarl_sample.cfg deleted file mode 120000 index c90ed2e..0000000 --- a/cat/versions_enabled/europarl_sample.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/europarl_sample.cfg \ No newline at end of file diff --git a/cat/versions_enabled/icd_filtered_plen.cfg b/cat/versions_enabled/icd_filtered_plen.cfg deleted file mode 120000 index e6dd218..0000000 --- a/cat/versions_enabled/icd_filtered_plen.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/icd_filtered_plen.cfg \ No newline at end of file diff --git a/cat/versions_enabled/icd_plen.cfg b/cat/versions_enabled/icd_plen.cfg deleted file mode 120000 index 447fa8d..0000000 --- a/cat/versions_enabled/icd_plen.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/icd_plen.cfg \ No newline at end of file diff --git a/cat/versions_enabled/logofag_enpl.cfg b/cat/versions_enabled/logofag_enpl.cfg deleted file mode 120000 index c1c7f55..0000000 --- a/cat/versions_enabled/logofag_enpl.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/logofag_enpl.cfg \ No newline at end of file diff --git a/cat/versions_enabled/logofag_plen.cfg b/cat/versions_enabled/logofag_plen.cfg deleted file mode 120000 index 370b68b..0000000 --- a/cat/versions_enabled/logofag_plen.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/logofag_plen.cfg \ No newline at end of file diff --git a/cat/versions_enabled/opus_medicine_plen.cfg b/cat/versions_enabled/opus_medicine_plen.cfg deleted file mode 120000 index 8461a8f..0000000 --- a/cat/versions_enabled/opus_medicine_plen.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/opus_medicine_plen.cfg \ No newline at end of file diff --git a/cat/versions_enabled/tmrepository_enhr.cfg b/cat/versions_enabled/tmrepository_enhr.cfg deleted file mode 120000 index de131be..0000000 --- a/cat/versions_enabled/tmrepository_enhr.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/tmrepository_enhr.cfg \ No newline at end of file diff --git a/concordia-server/lemmatizer_facade.cpp b/concordia-server/lemmatizer_facade.cpp index 43b0aae..c853024 100644 --- a/concordia-server/lemmatizer_facade.cpp +++ b/concordia-server/lemmatizer_facade.cpp @@ -11,10 +11,12 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) { std::string plCode = "pl"; std::string enCode = "en"; std::string hrCode = "hr"; + std::string frCode = "fr"; _lemmatizersMap.insert(plCode, socketLemmatizer1); _lemmatizersMap.insert(enCode, socketLemmatizer1); _lemmatizersMap.insert(hrCode, socketLemmatizer1); + _lemmatizersMap.insert(frCode, socketLemmatizer1); } LemmatizerFacade::~LemmatizerFacade() { diff --git a/concordia-server/socket_lemmatizer.cpp b/concordia-server/socket_lemmatizer.cpp index 0cd6aee..137f4ab 100644 --- a/concordia-server/socket_lemmatizer.cpp +++ b/concordia-server/socket_lemmatizer.cpp @@ -1,6 +1,9 @@ #include "socket_lemmatizer.hpp" +#include + #include "config.hpp" + #include SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) : @@ -79,20 +82,38 @@ bool SocketLemmatizer::_send_data(std::string data) std::string SocketLemmatizer::_receive(int size=512) { char buffer[size]; - std::string reply; + std::string reply = ""; //Receive a reply from the server - if(recv(_sock , buffer , sizeof(buffer) , 0) < 0) { - throw ConcordiaException("Receive failed"); + bool dataAvailable = true; + while (dataAvailable) { + int amountReceived = recv(_sock , buffer , sizeof(buffer) , 0); + if (amountReceived < 0) { + throw ConcordiaException("Lemmatizer: recv failed"); + } else if (amountReceived == 0) { + dataAvailable = false; + } else { + buffer[amountReceived] = '\0'; + reply += buffer; + } } - reply = buffer; return reply; } std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) { - _connect(); - _send_data(languageCode+sentence+LEMMATIZER_DELIMITER); - std::string reply = _receive(512); - _disconnect(); - return reply.substr(0,reply.find(LEMMATIZER_DELIMITER)); + for (int i=0;i<5;i++) { + try { + _connect(); + _send_data(languageCode+sentence+LEMMATIZER_DELIMITER); + std::string reply = _receive(512); + _disconnect(); + return reply.substr(0,reply.find(LEMMATIZER_DELIMITER)); + } catch (std::exception & e) { + _logger.logString("Problem with lemmatization of the sentence", sentence); + _logger.log("Waiting 2 seconds and retrying..."); + sleep(2); + } + } + + throw ConcordiaException("Can not lemmatize sentence: "+sentence); } diff --git a/concordia-server/socket_lemmatizer.hpp b/concordia-server/socket_lemmatizer.hpp index 4f5e9e9..13d5949 100644 --- a/concordia-server/socket_lemmatizer.hpp +++ b/concordia-server/socket_lemmatizer.hpp @@ -9,6 +9,7 @@ #include +#include "logger.hpp" class SocketLemmatizer { public: @@ -34,6 +35,8 @@ private: int _sock; struct sockaddr_in _server; + + Logger _logger; }; #endif diff --git a/tests/addLemmatizedTM.sh b/tests/addLemmatizedTM.sh index 8cb99b0..b581e3c 100755 --- a/tests/addLemmatizedTM.sh +++ b/tests/addLemmatizedTM.sh @@ -1,7 +1,7 @@ #!/bin/sh -CORPUS_NAME="europarl_sample" +CORPUS_NAME="jrc_enes" SRC_LANG_ID=2 -TRG_LANG_ID=1 +TRG_LANG_ID=4 ./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt diff --git a/tests/lemmatizer-test/.gitignore b/tests/lemmatizer-test/.gitignore new file mode 100644 index 0000000..b87b494 --- /dev/null +++ b/tests/lemmatizer-test/.gitignore @@ -0,0 +1,2 @@ +differences.log +corpora/ diff --git a/tests/lemmatizer-test/test.sh b/tests/lemmatizer-test/test.sh new file mode 100755 index 0000000..1e4aa0b --- /dev/null +++ b/tests/lemmatizer-test/test.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +./test_corpus.py corpora/A_en.txt en >> differences.log +./test_corpus.py corpora/B_en.txt en >> differences.log +./test_corpus.py corpora/C_en.txt en >> differences.log +./test_corpus.py corpora/D_en.txt en >> differences.log +./test_corpus.py corpora/A_fr.txt fr >> differences.log +./test_corpus.py corpora/B_fr.txt fr >> differences.log +./test_corpus.py corpora/C_fr.txt fr >> differences.log +./test_corpus.py corpora/D_fr.txt fr >> differences.log diff --git a/tests/lemmatizer-test/test_corpus.py b/tests/lemmatizer-test/test_corpus.py new file mode 100755 index 0000000..8c986bb --- /dev/null +++ b/tests/lemmatizer-test/test_corpus.py @@ -0,0 +1,36 @@ +#!/usr/bin/python3 + +import unittest +import json +import requests +import sys + + + +def lemmatizeSentence(lang, sentence): + data = { + 'operation': 'lemmatize', + 'languageCode':lang, + 'sentence':sentence + } + + address = 'http://localhost:8800' + response = requests.post(address, data=json.dumps(data)) + return response.json()['lemmatizedSentence'] + +corpus_file_path = sys.argv[1] +lang = sys.argv[2] + + +line_count = 0 +with open(corpus_file_path) as corpus_file: + for line in corpus_file: + line_count += 1 + orig = line.rstrip() + lemmatized = lemmatizeSentence(lang,orig) + if len(orig.split()) != len(lemmatized.split()): + print("Different length in:") + print(orig) + print(lemmatized) + if line_count % 1000 == 0: + sys.stderr.write("Done %d lines\n" % line_count) diff --git a/tests/lemmatizer-test/tokenize.sh b/tests/lemmatizer-test/tokenize.sh new file mode 100755 index 0000000..442a2f8 --- /dev/null +++ b/tests/lemmatizer-test/tokenize.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +for corpus_file in `ls /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt` +do + a=`basename $corpus_file` + concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < $corpus_file > corpora/$a +done