Merge branch 'master' into two-step-concordia-search

2019-01-04 14:15:00 +01:00 · 2019-01-04 14:15:00 +01:00 · aa544051dc
commit aa544051dc
parent b446c15faa c800fa7b57
26 changed files with 142 additions and 23 deletions
--- a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs
+++ b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs
@@ -21,6 +21,7 @@ namespace LemmaGenSockets
            lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish));
            lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));
            lemmatizersDict.Add("hr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Serbian));
+            lemmatizersDict.Add("fr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French));
        }

        public LemmatizerListener()
@@ -52,13 +53,37 @@ namespace LemmaGenSockets

        private string lemmatizeWord(string languageCode, string word)
        {
-            if (word.StartsWith("ne_") || word == "i" || word == "o" || word == "do")
+            // exceptions
+            if (word.StartsWith("ne_"))
            {
                return word;
            }
-            string[] parts = word.Split(wordInnerSeparator);
+
+
+            Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
+
+            HashSet<String> plExceptions = new HashSet<string>();
+            plExceptions.Add("i");
+            plExceptions.Add("o");
+            plExceptions.Add("do");
+            exceptions.Add("pl", plExceptions);
+
+            HashSet<String> enExceptions = new HashSet<string>();
+            enExceptions.Add("d");
+            exceptions.Add("en", enExceptions);
+
+            HashSet<String> languageExceptions;
+            if (exceptions.TryGetValue(languageCode, out languageExceptions))
+            {
+                if(languageExceptions.Contains(word))
+                {
+                    return word;
+                }
+            }
+

            string result = "";
+            string[] parts = word.Split(wordInnerSeparator);
            if (parts.Length == 2)
            {
                string firstPart = parts[0];
@@ -74,7 +99,7 @@ namespace LemmaGenSockets
                result = lemmatizersDict[languageCode].Lemmatize(word);
            }

-            if (result == "" || result.Contains(" "))
+            if (result == "")
            {
                return word;
            }
--- a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe
+++ b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe
--- a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb
+++ b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb
--- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache
+++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache
--- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt
+++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt
@@ -8,3 +8,14 @@ j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Deb
 j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
 j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
 j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csproj.CoreCompileInputs.cache
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb
--- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache
+++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache
--- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe
+++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe
--- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb
+++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb
--- a/cat/versions_available/jrc_enes.cfg
+++ b/cat/versions_available/jrc_enes.cfg
@@ -1,7 +1,7 @@
 dir@#@jrc_enes
 concordia_host@#@concordia.vm.wmi.amu.edu.pl
 concordia_port@#@8800
-tmid@#@6
+tmid@#@1
 desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from English-Spanish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context.
 enjoy@#@Enjoy your work with the system!
 prompt@#@Enter search pattern (English sentence):
--- a/cat/versions_available/pytania_odpowiedzi_logistyka.cfg
+++ b/cat/versions_available/pytania_odpowiedzi_logistyka.cfg
@@ -0,0 +1,10 @@
+dir@#@pytania_odpowiedzi_logistyka
+concordia_host@#@localhost
+concordia_port@#@8800
+tmid@#@12
+desc@#@Wyszukiwarka pytań
+enjoy@#@Wybierz przykładowe pytanie:
+prompt@#@Wprowadź pytanie (po polsku):
+suggestion@#@chciałbym zakupić samochód specjalistyczny
+suggestion@#@czy są jakieś zlecenia od spedytorów z terminala?
+suggestion@#@potrzebuję oprogramowania do zarządzania korporacją taksówkarską
--- a/cat/versions_enabled/emea_plen.cfg
+++ b/cat/versions_enabled/emea_plen.cfg
@@ -1 +0,0 @@
-../versions_available/emea_plen.cfg
--- a/cat/versions_enabled/europarl_sample.cfg
+++ b/cat/versions_enabled/europarl_sample.cfg
@@ -1 +0,0 @@
-../versions_available/europarl_sample.cfg
--- a/cat/versions_enabled/icd_filtered_plen.cfg
+++ b/cat/versions_enabled/icd_filtered_plen.cfg
@@ -1 +0,0 @@
-../versions_available/icd_filtered_plen.cfg
--- a/cat/versions_enabled/icd_plen.cfg
+++ b/cat/versions_enabled/icd_plen.cfg
@@ -1 +0,0 @@
-../versions_available/icd_plen.cfg
--- a/cat/versions_enabled/logofag_enpl.cfg
+++ b/cat/versions_enabled/logofag_enpl.cfg
@@ -1 +0,0 @@
-../versions_available/logofag_enpl.cfg
--- a/cat/versions_enabled/logofag_plen.cfg
+++ b/cat/versions_enabled/logofag_plen.cfg
@@ -1 +0,0 @@
-../versions_available/logofag_plen.cfg
--- a/cat/versions_enabled/opus_medicine_plen.cfg
+++ b/cat/versions_enabled/opus_medicine_plen.cfg
@@ -1 +0,0 @@
-../versions_available/opus_medicine_plen.cfg
--- a/cat/versions_enabled/tmrepository_enhr.cfg
+++ b/cat/versions_enabled/tmrepository_enhr.cfg
@@ -1 +0,0 @@
-../versions_available/tmrepository_enhr.cfg
--- a/concordia-server/lemmatizer_facade.cpp
+++ b/concordia-server/lemmatizer_facade.cpp
@@ -11,10 +11,12 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
    std::string plCode = "pl";
    std::string enCode = "en";
    std::string hrCode = "hr";
+    std::string frCode = "fr";

    _lemmatizersMap.insert(plCode, socketLemmatizer1);
    _lemmatizersMap.insert(enCode, socketLemmatizer1);
    _lemmatizersMap.insert(hrCode, socketLemmatizer1);
+    _lemmatizersMap.insert(frCode, socketLemmatizer1);
 }

 LemmatizerFacade::~LemmatizerFacade() {
--- a/concordia-server/socket_lemmatizer.cpp
+++ b/concordia-server/socket_lemmatizer.cpp
@@ -1,6 +1,9 @@
 #include "socket_lemmatizer.hpp"

+#include <time.h>
+
 #include "config.hpp"
+
 #include <boost/lexical_cast.hpp>

 SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) :
@@ -79,20 +82,38 @@ bool SocketLemmatizer::_send_data(std::string data)
 std::string SocketLemmatizer::_receive(int size=512)
 {
    char buffer[size];
-    std::string reply;
+    std::string reply = "";

    //Receive a reply from the server
-    if(recv(_sock , buffer , sizeof(buffer) , 0) < 0) {
-        throw ConcordiaException("Receive failed");
+    bool dataAvailable = true;
+    while (dataAvailable) {
+        int amountReceived = recv(_sock , buffer , sizeof(buffer) , 0);
+        if (amountReceived < 0) {
+            throw ConcordiaException("Lemmatizer: recv failed");
+        } else  if (amountReceived == 0) {
+            dataAvailable = false;
+        } else {
+            buffer[amountReceived] = '\0';
+            reply += buffer;
+        }
    }
-    reply = buffer;
    return reply;
 }

 std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
-    _connect();
-    _send_data(languageCode+sentence+LEMMATIZER_DELIMITER);
-    std::string reply = _receive(512);
-    _disconnect();
-    return reply.substr(0,reply.find(LEMMATIZER_DELIMITER));
+    for (int i=0;i<5;i++) {
+        try {
+            _connect();
+            _send_data(languageCode+sentence+LEMMATIZER_DELIMITER);
+            std::string reply = _receive(512);
+            _disconnect();
+            return reply.substr(0,reply.find(LEMMATIZER_DELIMITER));
+        } catch (std::exception & e) {
+            _logger.logString("Problem with lemmatization of the sentence", sentence);
+            _logger.log("Waiting 2 seconds and retrying...");
+            sleep(2);
+        }
+    }
+
+    throw ConcordiaException("Can not lemmatize sentence: "+sentence);
 }
--- a/concordia-server/socket_lemmatizer.hpp
+++ b/concordia-server/socket_lemmatizer.hpp
@@ -9,6 +9,7 @@

 #include <concordia/concordia_exception.hpp>

+#include "logger.hpp"

 class SocketLemmatizer {
 public:
@@ -34,6 +35,8 @@ private:
    int _sock;

    struct sockaddr_in _server;
+
+    Logger _logger;
 };

 #endif
--- a/tests/addLemmatizedTM.sh
+++ b/tests/addLemmatizedTM.sh
@@ -1,7 +1,7 @@
 #!/bin/sh

-CORPUS_NAME="europarl_sample"
+CORPUS_NAME="jrc_enes"
 SRC_LANG_ID=2
-TRG_LANG_ID=1
+TRG_LANG_ID=4

 ./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt
--- a/tests/lemmatizer-test/.gitignore
+++ b/tests/lemmatizer-test/.gitignore
@@ -0,0 +1,2 @@
+differences.log
+corpora/
--- a/tests/lemmatizer-test/test.sh
+++ b/tests/lemmatizer-test/test.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+./test_corpus.py corpora/A_en.txt en >> differences.log
+./test_corpus.py corpora/B_en.txt en >> differences.log
+./test_corpus.py corpora/C_en.txt en >> differences.log
+./test_corpus.py corpora/D_en.txt en >> differences.log
+./test_corpus.py corpora/A_fr.txt fr >> differences.log
+./test_corpus.py corpora/B_fr.txt fr >> differences.log
+./test_corpus.py corpora/C_fr.txt fr >> differences.log
+./test_corpus.py corpora/D_fr.txt fr >> differences.log
--- a/tests/lemmatizer-test/test_corpus.py
+++ b/tests/lemmatizer-test/test_corpus.py
@@ -0,0 +1,36 @@
+#!/usr/bin/python3
+
+import unittest
+import json
+import requests
+import sys
+
+
+
+def lemmatizeSentence(lang, sentence):
+    data = {
+        'operation': 'lemmatize',
+        'languageCode':lang,
+        'sentence':sentence
+    }
+
+    address = 'http://localhost:8800'
+    response = requests.post(address, data=json.dumps(data))
+    return response.json()['lemmatizedSentence']
+
+corpus_file_path = sys.argv[1]
+lang = sys.argv[2]
+
+
+line_count = 0
+with open(corpus_file_path) as corpus_file:
+    for line in corpus_file:
+        line_count += 1
+        orig = line.rstrip()
+        lemmatized = lemmatizeSentence(lang,orig)
+        if len(orig.split()) != len(lemmatized.split()):
+            print("Different length in:")
+            print(orig)
+            print(lemmatized)
+        if line_count % 1000 == 0:
+            sys.stderr.write("Done %d lines\n" % line_count)
--- a/tests/lemmatizer-test/tokenize.sh
+++ b/tests/lemmatizer-test/tokenize.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+for corpus_file in `ls /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt`
+do
+    a=`basename $corpus_file`
+    concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < $corpus_file > corpora/$a
+done