Merge branch 'master' into two-step-concordia-search

commit aa544051dc
Rafał Jaworski, 2019-01-04 14:15:00 +01:00
26 changed files with 142 additions and 23 deletions

View File

@@ -21,6 +21,7 @@ namespace LemmaGenSockets
lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish));
lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));
lemmatizersDict.Add("hr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Serbian));
+lemmatizersDict.Add("fr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French));
}
public LemmatizerListener()
@@ -52,13 +53,37 @@ namespace LemmaGenSockets
private string lemmatizeWord(string languageCode, string word)
{
-if (word.StartsWith("ne_") || word == "i" || word == "o" || word == "do") // exceptions
+if (word.StartsWith("ne_"))
{
return word;
}
-string[] parts = word.Split(wordInnerSeparator);
+Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
+HashSet<String> plExceptions = new HashSet<string>();
+plExceptions.Add("i");
+plExceptions.Add("o");
+plExceptions.Add("do");
+exceptions.Add("pl", plExceptions);
+HashSet<String> enExceptions = new HashSet<string>();
+enExceptions.Add("d");
+exceptions.Add("en", enExceptions);
+HashSet<String> languageExceptions;
+if (exceptions.TryGetValue(languageCode, out languageExceptions))
+{
+if(languageExceptions.Contains(word))
+{
+return word;
+}
+}
string result = "";
+string[] parts = word.Split(wordInnerSeparator);
if (parts.Length == 2)
{
string firstPart = parts[0];
@@ -74,7 +99,7 @@ namespace LemmaGenSockets
result = lemmatizersDict[languageCode].Lemmatize(word);
}
-if (result == "" || result.Contains(" "))
+if (result == "")
{
return word;
}
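Taken together, the rework of lemmatizeWord above replaces the hard-coded word exceptions with a per-language lookup ("i", "o", "do" for Polish, "d" for English) that short-circuits lemmatization, while still falling back to the original word when the lemmatizer returns an empty result. A condensed sketch of that logic, written in Python rather than the committed C# purely for illustration (the lemmatize callback stands in for the LemmaSharp call):

# Sketch of the per-language exception lookup introduced above (illustrative only).
EXCEPTIONS = {
    "pl": {"i", "o", "do"},
    "en": {"d"},
}

def lemmatize_word(language_code, word, lemmatize):
    # Named-entity placeholders and per-language exception words bypass lemmatization.
    if word.startswith("ne_") or word in EXCEPTIONS.get(language_code, set()):
        return word
    result = lemmatize(language_code, word)
    # Fall back to the original word when the lemmatizer returns nothing.
    return result if result else word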

View File

@@ -8,3 +8,14 @@ j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Deb
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csproj.CoreCompileInputs.cache
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb

View File

@@ -1,7 +1,7 @@
dir@#@jrc_enes
concordia_host@#@concordia.vm.wmi.amu.edu.pl
concordia_port@#@8800
-tmid@#@6
+tmid@#@1
desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from English-Spanish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context.
enjoy@#@Enjoy your work with the system!
prompt@#@Enter search pattern (English sentence):

View File

@@ -0,0 +1,10 @@
dir@#@pytania_odpowiedzi_logistyka
concordia_host@#@localhost
concordia_port@#@8800
tmid@#@12
desc@#@Wyszukiwarka pytań
enjoy@#@Wybierz przykładowe pytanie:
prompt@#@Wprowadź pytanie (po polsku):
suggestion@#@chciałbym zakupić samochód specjalistyczny
suggestion@#@czy są jakieś zlecenia od spedytorów z terminala?
suggestion@#@potrzebuję oprogramowania do zarządzania korporacją taksówkarską

View File

@@ -1 +0,0 @@
../versions_available/emea_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/europarl_sample.cfg

View File

@@ -1 +0,0 @@
../versions_available/icd_filtered_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/icd_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/logofag_enpl.cfg

View File

@@ -1 +0,0 @@
../versions_available/logofag_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/opus_medicine_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/tmrepository_enhr.cfg

View File

@@ -11,10 +11,12 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
std::string plCode = "pl";
std::string enCode = "en";
std::string hrCode = "hr";
+std::string frCode = "fr";
_lemmatizersMap.insert(plCode, socketLemmatizer1);
_lemmatizersMap.insert(enCode, socketLemmatizer1);
_lemmatizersMap.insert(hrCode, socketLemmatizer1);
+_lemmatizersMap.insert(frCode, socketLemmatizer1);
}
LemmatizerFacade::~LemmatizerFacade() {

View File

@@ -1,6 +1,9 @@
#include "socket_lemmatizer.hpp"
+#include <time.h>
#include "config.hpp"
#include <boost/lexical_cast.hpp>
SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) :
@@ -79,20 +82,38 @@ bool SocketLemmatizer::_send_data(std::string data)
std::string SocketLemmatizer::_receive(int size=512)
{
char buffer[size];
-std::string reply;
+std::string reply = "";
//Receive a reply from the server
-if(recv(_sock , buffer , sizeof(buffer) , 0) < 0) {
-throw ConcordiaException("Receive failed");
+bool dataAvailable = true;
+while (dataAvailable) {
+int amountReceived = recv(_sock , buffer , sizeof(buffer) , 0);
+if (amountReceived < 0) {
+throw ConcordiaException("Lemmatizer: recv failed");
+} else if (amountReceived == 0) {
+dataAvailable = false;
+} else {
+buffer[amountReceived] = '\0';
+reply += buffer;
+}
+}
}
-reply = buffer;
return reply;
}
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
+for (int i=0;i<5;i++) {
+try {
_connect();
_send_data(languageCode+sentence+LEMMATIZER_DELIMITER);
std::string reply = _receive(512);
_disconnect();
return reply.substr(0,reply.find(LEMMATIZER_DELIMITER));
+} catch (std::exception & e) {
+_logger.logString("Problem with lemmatization of the sentence", sentence);
+_logger.log("Waiting 2 seconds and retrying...");
+sleep(2);
+}
+}
+throw ConcordiaException("Can not lemmatize sentence: "+sentence);
}
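Both changes to the socket lemmatizer above follow one pattern: _receive now loops on recv until the lemmatizer closes the connection (recv returns 0), and lemmatizeSentence retries the whole connect/send/receive cycle up to five times, sleeping two seconds between failed attempts before giving up. A minimal Python sketch of that pattern, assuming a plain TCP lemmatizer service; the port and delimiter below are placeholders, the real values come from the server configuration and LEMMATIZER_DELIMITER:

import socket
import time

LEMMATIZER_PORT = 5555   # assumption: the actual port is taken from the server config
DELIMITER = "@#@"        # assumption: stands in for LEMMATIZER_DELIMITER

def receive_all(sock, chunk_size=512):
    # Keep reading until the server closes the connection (recv returns b"").
    reply = b""
    while True:
        chunk = sock.recv(chunk_size)
        if not chunk:
            return reply.decode("utf-8")
        reply += chunk

def lemmatize_sentence(language_code, sentence, retries=5):
    # Retry the whole connect/send/receive cycle, pausing between attempts.
    for _ in range(retries):
        try:
            with socket.create_connection(("localhost", LEMMATIZER_PORT)) as sock:
                sock.sendall((language_code + sentence + DELIMITER).encode("utf-8"))
                reply = receive_all(sock)
                return reply.split(DELIMITER, 1)[0]
        except OSError:
            time.sleep(2)
    raise RuntimeError("Can not lemmatize sentence: " + sentence)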

View File

@@ -9,6 +9,7 @@
#include <concordia/concordia_exception.hpp>
+#include "logger.hpp"
class SocketLemmatizer {
public:
@@ -34,6 +35,8 @@ private:
int _sock;
struct sockaddr_in _server;
+Logger _logger;
};
#endif

View File

@@ -1,7 +1,7 @@
#!/bin/sh
-CORPUS_NAME="europarl_sample"
+CORPUS_NAME="jrc_enes"
SRC_LANG_ID=2
-TRG_LANG_ID=1
+TRG_LANG_ID=4
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt

tests/lemmatizer-test/.gitignore (new file)
View File

@@ -0,0 +1,2 @@
differences.log
corpora/

tests/lemmatizer-test/test.sh (new executable file)
View File

@@ -0,0 +1,10 @@
#!/bin/bash
./test_corpus.py corpora/A_en.txt en >> differences.log
./test_corpus.py corpora/B_en.txt en >> differences.log
./test_corpus.py corpora/C_en.txt en >> differences.log
./test_corpus.py corpora/D_en.txt en >> differences.log
./test_corpus.py corpora/A_fr.txt fr >> differences.log
./test_corpus.py corpora/B_fr.txt fr >> differences.log
./test_corpus.py corpora/C_fr.txt fr >> differences.log
./test_corpus.py corpora/D_fr.txt fr >> differences.log

View File

@@ -0,0 +1,36 @@
#!/usr/bin/python3
import unittest
import json
import requests
import sys
def lemmatizeSentence(lang, sentence):
data = {
'operation': 'lemmatize',
'languageCode':lang,
'sentence':sentence
}
address = 'http://localhost:8800'
response = requests.post(address, data=json.dumps(data))
return response.json()['lemmatizedSentence']
corpus_file_path = sys.argv[1]
lang = sys.argv[2]
line_count = 0
with open(corpus_file_path) as corpus_file:
for line in corpus_file:
line_count += 1
orig = line.rstrip()
lemmatized = lemmatizeSentence(lang,orig)
if len(orig.split()) != len(lemmatized.split()):
print("Different length in:")
print(orig)
print(lemmatized)
if line_count % 1000 == 0:
sys.stderr.write("Done %d lines\n" % line_count)
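The script above exercises the server's lemmatize operation over HTTP and logs sentences whose lemmatized form has a different token count than the original. For a one-off check from an interactive session, the same request can be issued directly (a sketch assuming, as in the script, that the server listens on localhost:8800):

import json
import requests

# One-off lemmatization request mirroring the payload used in the test script above.
payload = {
    'operation': 'lemmatize',
    'languageCode': 'en',
    'sentence': 'The cats were sitting on the mats',
}
response = requests.post('http://localhost:8800', data=json.dumps(payload))
print(response.json()['lemmatizedSentence'])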

View File

@@ -0,0 +1,7 @@
#!/bin/bash
for corpus_file in `ls /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt`
do
a=`basename $corpus_file`
concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < $corpus_file > corpora/$a
done