working lemmatization
parent 8b0666c34d
commit 89fb77bf58
concordia-server/bool_param.cpp (new file, 24 lines)
@@ -0,0 +1,24 @@
#include "bool_param.hpp"


BoolParam::BoolParam(bool value):_value(value) {
}

BoolParam::~BoolParam() {
}

const char * BoolParam::getValue() {
    if (_value) {
        return "t";
    } else {
        return "f";
    }
}

const int BoolParam::getLength() {
    return 1;
}

const int BoolParam::isBinary() {
    return 0;
}
concordia-server/bool_param.hpp (new file, 24 lines)
@@ -0,0 +1,24 @@
#ifndef BOOL_PARAM_HDR
#define BOOL_PARAM_HDR

#include "query_param.hpp"

class BoolParam : public QueryParam {
public:
    /*! Constructor.
    */
    BoolParam(bool value);
    /*! Destructor.
    */
    virtual ~BoolParam();

    const char * getValue();

    const int getLength();

    const int isBinary();
private:
    bool _value;
};

#endif
@@ -5,6 +5,7 @@
#include <iostream>
#include <fstream>
#include <ctime>
#include <utility>

#include <concordia/interval.hpp>


@@ -19,16 +20,17 @@
ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
                                  throw(ConcordiaException) :
                                  _configFilePath(configFilePath) {

    std::vector<int> tmIds = _tmDAO.getTmIds();
    _concordiasMap = boost::shared_ptr<boost::ptr_map<int,Concordia> >(new boost::ptr_map<int,Concordia>());

    BOOST_FOREACH(int & tmId, tmIds) {
        _addTm(tmId);
    }
    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));

    _lemmatizerFacade = boost::shared_ptr<LemmatizerFacade> (new LemmatizerFacade());

    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap, _lemmatizerFacade));
    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap, _lemmatizerFacade));
}

ConcordiaServer::~ConcordiaServer() {

@@ -95,6 +97,27 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
                }
            }
            _indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
        } else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
            std::vector<std::string> sourceSentences;
            std::vector<std::string> targetSentences;
            std::vector<std::string> alignmentStrings;
            int tmId = d[TM_ID_PARAM].GetInt();
            // loading data from json
            const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
            Logger::log("addAlignedLemmatizedSentences");
            Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
            Logger::logInt("tm id", tmId);
            for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
                if (sentencesArray[i].Size() != 3) {
                    JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
                    break;
                } else {
                    sourceSentences.push_back(sentencesArray[i][0].GetString());
                    targetSentences.push_back(sentencesArray[i][1].GetString());
                    alignmentStrings.push_back(sentencesArray[i][2].GetString());
                }
            }
            _indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
        } else if (operation == "lemmatize") {
            std::string sentence = _getStringParameter(d, "sentence");
            std::string languageCode = _getStringParameter(d, "languageCode");

@@ -130,7 +153,8 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
            int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
            int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
            std::string name = _getStringParameter(d, NAME_PARAM);
            int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
            bool lemmatized = _getBoolParameter(d, TM_LEMMATIZED_PARAM);
            int newId = _tmDAO.addTm(sourceLangId, targetLangId, name, lemmatized);
            _addTm(newId);

            jsonWriter.StartObject();

@@ -179,6 +203,17 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name
    }
}

int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
                                              throw (ConcordiaException) {
    rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
    if (itr != d.MemberEnd()) {
        bool value = itr->value.GetBool();
        return value;
    } else {
        throw ConcordiaException("missing parameter: " + std::string(name));
    }
}

void ConcordiaServer::_addTm(int tmId) {
    std::stringstream indexPath;
    indexPath << INDEX_DIRECTORY << "/tm_" << tmId;
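With the new TM_LEMMATIZED_PARAM flag, a client can mark a translation memory as lemmatized when it creates it, and the server will then route that memory's sentences through the lemmatizer. A minimal request sketch follows; the host and port are assumptions (use whatever address your concordia-server instance listens on), while the field names mirror the addTm branch of handleRequest above.

# Sketch only (Python 3): create a lemmatized TM over the JSON API.
import json
import urllib.request

ADDRESS = "http://localhost:8800"  # assumed address of a running concordia-server

def post(payload):
    # POST a JSON document and decode the JSON reply.
    req = urllib.request.Request(
        ADDRESS,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))

# 'tmLemmatized' is the new boolean read by _getBoolParameter().
print(post({
    "operation": "addTm",
    "sourceLangId": 2,
    "targetLangId": 1,
    "name": "europarl_sample",
    "tmLemmatized": True,
}))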
@@ -38,6 +38,8 @@ private:

    int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);

    int _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);

    void _addTm(int tmId);

    std::string _configFilePath;
@@ -16,7 +16,9 @@
#define SOURCE_SENTENCE_PARAM "sourceSentence"
#define TARGET_SENTENCE_PARAM "targetSentence"
#define TM_ID_PARAM "tmId"
#define TM_LEMMATIZED_PARAM "tmLemmatized"
#define SENTENCES_PARAM "sentences"
#define EXAMPLES_PARAM "examples"
#define SOURCE_LANG_PARAM "sourceLangId"
#define TARGET_LANG_PARAM "targetLangId"
#define NAME_PARAM "name"

@@ -25,6 +27,7 @@
#define ADD_SENTENCE_OP "addSentence"
#define ADD_SENTENCES_OP "addSentences"
#define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
#define REFRESH_INDEX_OP "refreshIndex"
#define SIMPLE_SEARCH_OP "simpleSearch"
#define CONCORDIA_SEARCH_OP "concordiaSearch"
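The new ADD_ALIGNED_LEMMATIZED_SENTENCES_OP expects EXAMPLES_PARAM to hold an array of three-element arrays: raw source sentence, raw target sentence, and the lemmatized source with alignment markup. A hedged payload sketch (the sentences and the GIZA-style "({ ... })" markup are invented for illustration; the keys come from the defines above):

# Sketch of an "addAlignedLemmatizedSentences" payload; each example is
# [sourceSentence, targetSentence, alignmentString], matching the
# three-element check in ConcordiaServer::handleRequest.
payload = {
    "operation": "addAlignedLemmatizedSentences",
    "tmId": 1,  # id returned by an earlier addTm call
    "examples": [
        [
            "This is a test.",   # raw source, stored in the database
            "To jest test.",     # raw target, stored in the database
            "NULL ({ }) this ({ 1 }) be ({ 2 }) a ({ }) test ({ 3 }) . ({ 4 })",  # lemmatized aligned source, used for indexing
        ],
    ],
}
# POSTed as JSON (for example with the post() helper sketched above), this
# makes the server store the raw pair while indexing the lemmatized source.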
@@ -17,7 +17,7 @@ DBconnection::DBconnection() throw(ConcordiaException) {
        ss << "Connection string: " << connectionInfo;
        throw ConcordiaException(ss.str());
    }

}

DBconnection::~DBconnection() {

@@ -90,8 +90,8 @@ PGresult * DBconnection::execute(std::string query,
        paramFormats[index] = param->isBinary();
        index++;
    }


    PGresult * result = PQexecParams(_connection,
                                     query.c_str(),
                                     params.size(),

@@ -129,7 +129,18 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (Concor
    } catch (std::exception & e) {
        std::stringstream ss;
        ss << "Error getting int value. Message: " << e.what();
        throw ConcordiaException(ss.str());
        throw ConcordiaException(ss.str());
    }
}

bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
    try {
        char * valueStr = PQgetvalue(result,row,col);
        return std::string(valueStr) == "t";
    } catch (std::exception & e) {
        std::stringstream ss;
        ss << "Error getting bool value. Message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}

@@ -150,7 +161,6 @@ int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
    } catch (std::exception & e) {
        std::stringstream ss;
        ss << "Error getting int value. Message: " << e.what();
        throw ConcordiaException(ss.str());
        throw ConcordiaException(ss.str());
    }
}

@@ -31,6 +31,8 @@ public:

    int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);

    bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);

    std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);

    int getRowCount(PGresult * result) throw (ConcordiaException);
@@ -14,9 +14,11 @@
#include "json_generator.hpp"
#include "logger.hpp"

IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
                                 boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                             throw(ConcordiaException):
                                             _concordiasMap(concordiasMap) {
                                             _concordiasMap(concordiasMap),
                                             _lemmatizerFacade(lemmatizerFacade) {
}

IndexController::~IndexController() {

@@ -32,9 +34,10 @@ void IndexController::addSentence(
    try {
        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
            TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
            TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
            int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
            it->second->addTokenizedExample(tokenizedSentence, sentenceId);
            int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
            it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
            it->second->refreshSAfromRAM();

            jsonWriter.StartObject();

@@ -42,20 +45,20 @@ void IndexController::addSentence(
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (std::exception & e) {
        std::stringstream errorstream;
        errorstream << "general error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (...) {
        std::stringstream errorstream;
        errorstream << "unexpected error occurred";
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}

@@ -67,21 +70,22 @@ void IndexController::addSentences(
    try {
        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
            std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
            std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
            it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
            it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);

            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}

@@ -96,28 +100,66 @@ void IndexController::addAlignedSentences(
            std::vector<std::string> sourceSentences;
            std::vector<std::vector<std::vector<int> > > allAlignments;
            _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);

            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);

            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);

            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
            for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
                it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
            }
            }
            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}

void IndexController::addAlignedLemmatizedSentences(
                             rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                             const std::vector<std::string> & sourceSentences,
                             const std::vector<std::string> & targetSentences,
                             const std::vector<std::string> & alignmentStrings,
                             const int tmId) {
    try {
        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
            std::vector<std::string> lemmatizedSourceSentences;
            std::vector<std::vector<std::vector<int> > > allAlignments;
            _getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);

            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);

            std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
                    _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
            for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
                it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
            }
            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}


void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                          const int tmId) {
    try {

@@ -130,12 +172,12 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }

}

@@ -147,10 +189,10 @@ void IndexController::_getSourceSentencesAndAlignments(

    for (int i = 0; i<rawSourceSentences.size(); i++) {
        std::string rawSourceSentence = rawSourceSentences[i];

        std::string sourceSentence = "";
        std::vector<std::vector<int> > alignments;

        UnicodeString s(rawSourceSentence.c_str());
        boost::u32regex_iterator<const UChar*> begin(
            boost::make_u32regex_iterator(

@@ -159,21 +201,21 @@ void IndexController::_getSourceSentencesAndAlignments(
            )
        );
        boost::u32regex_iterator<const UChar*> end;

        for (; begin != end; ++begin) {
            UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
            std::string token;
            tokenUTF8.toUTF8String(token);

            if (token != "NULL") {
                std::string numbers((*begin)[2].first, (*begin)[2].second);
                std::string numbers((*begin)[2].first, (*begin)[2].second);
                std::istringstream iss(numbers);
                std::vector<std::string> numberStrings;
                std::copy(std::istream_iterator<std::string>(iss),
                          std::istream_iterator<std::string>(),
                          std::back_inserter(numberStrings));

                std::vector<int> tokenAlignments;
                std::vector<int> tokenAlignments;
                for (int j=0;j<numberStrings.size();j++) {
                    int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
                    tokenAlignments.push_back(n);

@@ -182,11 +224,10 @@ void IndexController::_getSourceSentencesAndAlignments(
                sourceSentence += token + " ";
            }
        }

        sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);

        sourceSentences.push_back(sourceSentence);
        allAlignments.push_back(alignments);
    }
}

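_getSourceSentencesAndAlignments drops the special NULL token and turns the 1-based numbers inside each "({ ... })" group into 0-based target positions. The exact boost::u32regex is not visible in this hunk, so the following Python re-implementation is only a sketch of the same logic, handy for sanity-checking alignment strings before sending them to the server.

import re

# Assumed token pattern: "token ({ 1 2 })"; the real code uses a boost::u32regex
# over UChar*, this only mirrors the data flow visible above.
TOKEN_RE = re.compile(r"(\S+) \(\{([\d ]*)\}\)")

def parse_aligned_line(raw):
    source_tokens, alignments = [], []
    for token, numbers in TOKEN_RE.findall(raw):
        if token == "NULL":   # NULL alignments are skipped, as in the C++ loop
            continue
        source_tokens.append(token)
        # 1-based GIZA positions become 0-based, mirroring "atoi(...) - 1"
        alignments.append([int(n) - 1 for n in numbers.split()])
    return " ".join(source_tokens), alignments

print(parse_aligned_line("NULL ({ }) this ({ 1 }) be ({ 2 }) test ({ 3 4 })"))
# -> ('this be test', [[0], [1], [2, 3]])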
@@ -10,6 +10,8 @@

#include "unit_dao.hpp"
#include "lemmatizer_facade.hpp"

#include "rapidjson/writer.h"

@@ -17,7 +19,8 @@ class IndexController {
public:
    /*! Constructor.
    */
    explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
    explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
                             boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                    throw(ConcordiaException);
    /*! Destructor.
    */

@@ -38,9 +41,16 @@ public:
                     const std::vector<std::string> & targetSentences,
                     const int tmId);

    void addAlignedLemmatizedSentences(
                     rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                     const std::vector<std::string> & sourceSentences,
                     const std::vector<std::string> & targetSentences,
                     const std::vector<std::string> & alignmentStrings,
                     const int tmId);

    void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                     const int tmId);

private:
    void _getSourceSentencesAndAlignments(
        std::vector<std::string> & sourceSentences,

@@ -48,7 +58,9 @@ private:
        const std::vector<std::string> & rawSourceSentences);

    boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;

    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;

    UnitDAO _unitDAO;
};

@@ -1,5 +1,7 @@
#include "lemmatizer_facade.hpp"

#include <boost/foreach.hpp>


LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
    _lemmatizersMap = boost::ptr_map<std::string,SocketLemmatizer>();

@@ -28,3 +30,26 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
    }

}

std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
    if (tmInfo.first) {
        return lemmatizeSentence(tmInfo.second, pattern);
    } else {
        return pattern;
    }
}

std::vector<std::string> LemmatizerFacade::lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId) {
    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
    if (tmInfo.first) {
        std::vector<std::string> result;
        BOOST_FOREACH(std::string & pattern, patterns) {
            result.push_back(lemmatizeSentence(tmInfo.second, pattern));
        }
        return result;
    } else {
        return patterns;
    }

}

@@ -2,6 +2,7 @@
#define LEMMATIZER_FACADE_HDR

#include "socket_lemmatizer.hpp"
#include "tm_dao.hpp"

#include <string>
#include <concordia/concordia_exception.hpp>

@@ -18,8 +19,15 @@ public:
    virtual ~LemmatizerFacade();

    std::string lemmatizeSentence(std::string languageCode, std::string sentence);

    std::string lemmatizeIfNeeded(std::string pattern, int tmId);

    std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);

private:
    boost::ptr_map<std::string,SocketLemmatizer> _lemmatizersMap;

    TmDAO _tmDAO;
};

#endif

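The facade also serves the standalone "lemmatize" operation visible in handleRequest, which takes a raw sentence and a language code. A minimal payload sketch (the parameter names are the ones read in ConcordiaServer::handleRequest; the values are invented):

# Hypothetical "lemmatize" request body; it would be POSTed like the addTm
# sketch earlier in this diff.
lemmatize_request = {
    "operation": "lemmatize",
    "languageCode": "pl",        # assumed language code value
    "sentence": "Ala ma kota",
}
print(lemmatize_request)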
@@ -8,9 +8,11 @@
#include "logger.hpp"


SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
                                       boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                                throw(ConcordiaException):
                                                _concordiasMap(concordiasMap) {
                                                _concordiasMap(concordiasMap),
                                                _lemmatizerFacade(lemmatizerFacade) {
}

SearcherController::~SearcherController() {

@@ -22,6 +24,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
                                      const int tmId) {
    boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
    if (it != _concordiasMap->end()) {
        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
        std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));

        jsonWriter.StartObject();

@@ -30,48 +33,49 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
        jsonWriter.String("results");
        jsonWriter.StartArray();
        BOOST_FOREACH(SimpleSearchResult & result, results) {
            JsonGenerator::writeSearchResult(jsonWriter, result);
        }
            JsonGenerator::writeSearchResult(jsonWriter, result);
        }
        jsonWriter.EndArray();
        jsonWriter.EndObject();
    } else {
        JsonGenerator::signalError(jsonWriter, "no such tm!");
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
}

void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                               std::string & pattern,
                                               const std::vector<Interval> & intervals,
                                               const int tmId) {
                                               const int tmId) {
    boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
    if (it != _concordiasMap->end()) {
        if (intervals.size() > 0) {
            // std::string shortPattern = pattern.substr(intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
            pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
            std::string shortPattern = _substrUTF8(pattern, intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());

            Logger::log("concordiaPhraseSearch");
            Logger::logString("short pattern", shortPattern);
            std::vector<SimpleSearchResult> shortPatternResults = _unitDAO.getSearchResults(it->second->simpleSearch(shortPattern));

            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.String("found");
            if (shortPatternResults.size() > 0) {
                jsonWriter.Bool(true);

                std::vector<SimpleSearchResult> bestOverlay;

                int currStart = 0;
                BOOST_FOREACH(const Interval & interval, intervals) {
                    CompleteConcordiaSearchResult restResult = _unitDAO.getConcordiaResult(
                        it->second->concordiaSearch(pattern.substr(currStart, interval.getStart()-currStart)));
                    restResult.offsetPattern(currStart);
                    bestOverlay.insert(bestOverlay.end(), restResult.getBestOverlay().begin(), restResult.getBestOverlay().end());

                    SimpleSearchResult shortPatternresult = shortPatternResults[0];
                    shortPatternresult.setMatchedPatternStart(interval.getStart());
                    shortPatternresult.setMatchedPatternEnd(interval.getEnd());

@@ -82,26 +86,26 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
                    it->second->concordiaSearch(_substrUTF8(pattern,currStart,INT_MAX)));
                lastRestResult.offsetPattern(currStart);
                bestOverlay.insert(bestOverlay.end(), lastRestResult.getBestOverlay().begin(), lastRestResult.getBestOverlay().end());

                jsonWriter.String("result");
                jsonWriter.StartObject();
                jsonWriter.String("bestOverlay");
                jsonWriter.StartArray();
                BOOST_FOREACH(SimpleSearchResult & simpleResult, bestOverlay) {
                    JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
                }
                    JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
                }
                jsonWriter.EndArray();
                jsonWriter.EndObject();
            } else {
                jsonWriter.Bool(false);
                jsonWriter.Bool(false);
            }
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no intervals for phrase search");
        }
    } else {
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
}


@@ -111,8 +115,9 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff

    boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
    if (it != _concordiasMap->end()) {
        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
        CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));

        jsonWriter.StartObject();
        jsonWriter.String("status");
        jsonWriter.String("success");

@@ -123,16 +128,16 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
        jsonWriter.String("bestOverlay");
        jsonWriter.StartArray();
        BOOST_FOREACH(SimpleSearchResult & simpleResult, result.getBestOverlay()) {
            JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
        }
            JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
        }
        jsonWriter.EndArray();
        jsonWriter.EndObject();

        jsonWriter.EndObject();
    } else {
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
}

std::string SearcherController::_substrUTF8(std::string source, int start, int length) {

@@ -146,6 +151,3 @@ std::string SearcherController::_substrUTF8(std::string source, int start, int l

    return result;
}

@@ -10,6 +10,7 @@

#include "unit_dao.hpp"
#include "simple_search_result.hpp"
#include "lemmatizer_facade.hpp"
#include "rapidjson/writer.h"


@@ -17,8 +18,9 @@ class SearcherController {
public:
    /*! Constructor.
    */
    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
                    throw(ConcordiaException);
    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
                                boost::shared_ptr<LemmatizerFacade> LemmatizerFacade)
                    throw(ConcordiaException);
    /*! Destructor.
    */
    virtual ~SearcherController();

@@ -40,7 +42,9 @@ private:
    std::string _substrUTF8(std::string source, int start, int length);

    boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;

    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;

    UnitDAO _unitDAO;
};

@@ -3,6 +3,7 @@
#include "query_param.hpp"
#include "string_param.hpp"
#include "int_param.hpp"
#include "bool_param.hpp"
#include "int_array_param.hpp"
#include "logger.hpp"

@@ -27,20 +28,25 @@ std::vector<int> TmDAO::getTmIds() {
    }
    connection.clearResult(dbResult);
    connection.endTransaction();

    return result;
}

int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
    addTm(sourceLangId, targetLangId, name, false);
}

int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
    DBconnection connection;
    connection.startTransaction();

    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name) values($1::integer,$2::integer,$3::text) RETURNING id";
    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name, lemmatized) values($1::integer,$2::integer,$3::text,$4::bool) RETURNING id";
    std::vector<QueryParam*> params;
    params.push_back(new IntParam(sourceLangId));
    params.push_back(new IntParam(targetLangId));
    params.push_back(new StringParam(name));

    params.push_back(new BoolParam(lemmatized));

    PGresult * result = connection.execute(query, params);
    int newId = connection.getIntValue(result, 0, 0);
    connection.clearResult(result);

@@ -48,8 +54,23 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::stri
    BOOST_FOREACH (QueryParam * param, params) {
        delete param;
    }

    return newId;

}

std::pair<bool, std::string> TmDAO::getTmInfo(int tmId) {
    DBconnection connection;
    connection.startTransaction();
    std::string query = "select tm.id, tm.lemmatized, language.code from tm inner join language on language.id = tm.source_lang_id where tm.id = $1::integer;";
    std::vector<QueryParam*> params;
    params.push_back(new IntParam(tmId));
    PGresult * dbResult = connection.execute(query, params);
    bool lemmatized = connection.getBoolValue(dbResult, 0, 1);
    std::string languageCode = connection.getStringValue(dbResult, 0, 2);
    connection.clearResult(dbResult);
    connection.endTransaction();

    return std::pair<bool, std::string>(lemmatized, languageCode);

}

@@ -3,6 +3,7 @@

#include <string>
#include <vector>
#include <utility>

#include <concordia/common/config.hpp>
#include "db_connection.hpp"

@@ -18,8 +19,12 @@ public:

    int addTm(const int sourceLangId, const int targetLangId, const std::string name);

    int addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized);

    std::vector<int> getTmIds();

    std::pair<bool, std::string> getTmInfo(int tmId);

private:

};

@@ -22,7 +22,7 @@ int UnitDAO::addSentence(
        const TokenizedSentence & sourceSentence,
        const std::string & targetSentence,
        const int tmId) {

    DBconnection connection;
    connection.startTransaction();
    int newId = _addSingleSentence(connection, sourceSentence, targetSentence, tmId);

@@ -38,7 +38,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
    std::vector<SUFFIX_MARKER_TYPE> newIds;
    connection.startTransaction();
    int index = 0;
    BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {
    BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {
        newIds.push_back(_addSingleSentence(connection, sourceSentence, targetSentences.at(index), tmId));
        index++;
    }

@@ -50,7 +50,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
    const std::vector<TokenizedSentence> & sourceSentences,
    const std::vector<TokenizedSentence> & targetSentences,
    const std::vector<std::vector<std::vector<int> > > & allAlignments,
    const int tmId) {
    const int tmId) throw (ConcordiaException) {

    DBconnection connection;
    std::vector<SUFFIX_MARKER_TYPE> newIds;

@@ -59,9 +59,9 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
    for (int i=0; i< sourceSentences.size(); i++) {
        newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
    }

    connection.endTransaction();
    return newIds;
    return newIds;
}

std::vector<SimpleSearchResult> UnitDAO::getSearchResults(const std::vector<MatchedPatternFragment> & fragments) {

@@ -83,7 +83,7 @@ void UnitDAO::_getResultsFromFragments(
    std::vector<SimpleSearchResult> & results,
    const std::vector<MatchedPatternFragment> & fragments,
    const TokenizedSentence & tokenizedPattern) {

    DBconnection connection;
    connection.startTransaction();

@@ -95,9 +95,9 @@ void UnitDAO::_getResultsFromFragments(
        matchedPatternStart = tokenizedPattern.getTokens().at(fragment.getStart()).getStart();
        matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
    }

    std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
    std::vector<QueryParam*> params;
    params.push_back(new IntParam(2*fragment.getExampleOffset()+1));

@@ -116,7 +116,7 @@ void UnitDAO::_getResultsFromFragments(
        delete param;
    }

    // now add all target fragments matched with this fragment
    // now add all target fragments matched with this fragment
    std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
    std::vector<QueryParam*> targetParams;
    targetParams.push_back(new IntParam(fragment.getExampleId()));

@@ -127,12 +127,12 @@ void UnitDAO::_getResultsFromFragments(
    int prevPos = -2;
    int currStart = -1;
    int currEnd = -1;

    for (int i=0;i<connection.getRowCount(targetResult);i++) {
        int targetPos = connection.getIntValue(targetResult, i, 0);
        int targetStart = connection.getIntValue(targetResult, i, 1);
        int targetEnd = connection.getIntValue(targetResult, i, 2);

        if (prevPos < targetPos - 1) { // beginning of detached fragment
            // check if there is a fragment to end
            if (currStart >= 0) {

@@ -141,7 +141,7 @@ void UnitDAO::_getResultsFromFragments(
            currStart = targetStart;
        }

        currEnd = targetEnd;
        currEnd = targetEnd;
        prevPos = targetPos;
    }

@@ -154,9 +154,9 @@ void UnitDAO::_getResultsFromFragments(
    BOOST_FOREACH (QueryParam * param, targetParams) {
        delete param;
    }

        results.push_back(ssResult);
    }
    }
    connection.endTransaction();
}

@@ -181,25 +181,29 @@ int UnitDAO::_addSingleSentence(
    params.push_back(new StringParam(targetSentence));
    params.push_back(new IntParam(tmId));
    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));

    PGresult * result = connection.execute(query, params);
    int newId = connection.getIntValue(result, 0, 0);
    connection.clearResult(result);
    BOOST_FOREACH (QueryParam * param, params) {
        delete param;
    }

    return newId;
}


int UnitDAO::_addAlignedUnit(
int UnitDAO::_addAlignedUnit (
            DBconnection & connection,
            const TokenizedSentence & sourceSentence,
            const TokenizedSentence & targetSentence,
            const std::vector<std::vector<int> > & alignments,
            const int tmId) {

            const int tmId) throw(ConcordiaException) {

    if (sourceSentence.getTokens().size() != alignments.size()) {
        throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
    }

    std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
    std::vector<QueryParam*> params;
    params.push_back(new StringParam(sourceSentence.getSentence()));

@@ -207,14 +211,14 @@ int UnitDAO::_addAlignedUnit(
    params.push_back(new IntParam(tmId));
    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));

    PGresult * result = connection.execute(query, params);
    int newId = connection.getIntValue(result, 0, 0);
    connection.clearResult(result);
    BOOST_FOREACH (QueryParam * param, params) {
        delete param;
    }

    // add alignments
    bool nonEmpty = false;
    std::stringstream alignmentsQuery;

@@ -230,10 +234,8 @@ int UnitDAO::_addAlignedUnit(
    query = alignmentsQuery.str();
    query = query.substr(0, query.length()-1);
    PGresult * result = connection.execute(query);
    connection.clearResult(result);
    connection.clearResult(result);
    }

    return newId;
}

@@ -9,6 +9,7 @@
#include <concordia/substring_occurence.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/concordia_search_result.hpp>
#include <concordia/concordia_exception.hpp>
#include <boost/shared_ptr.hpp>

#include "simple_search_result.hpp"

@@ -33,13 +34,13 @@ public:
        const std::vector<TokenizedSentence> & sourceSentences,
        const std::vector<std::string> & targetSentences,
        const int tmId);

    std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
        const std::vector<TokenizedSentence> & sourceSentences,
        const std::vector<TokenizedSentence> & targetSentences,
        const std::vector<std::vector<std::vector<int> > > & allAlignments,
        const int tmId);

        const int tmId) throw (ConcordiaException);

    std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);

    CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);

@@ -50,7 +51,7 @@ private:
        const TokenizedSentence & tokenizedPattern);

    std::vector<int> _getTokenPositions(const TokenizedSentence & ts);

    int _addSingleSentence(
        DBconnection & connection,
        const TokenizedSentence & sourceSentence,

@@ -62,7 +63,7 @@ private:
        const TokenizedSentence & sourceSentence,
        const TokenizedSentence & targetSentence,
        const std::vector<std::vector<int> > & alignments,
        const int tmId);
        const int tmId) throw(ConcordiaException);
};

#endif

@@ -3,7 +3,8 @@ CREATE TABLE tm (
    id SERIAL PRIMARY KEY,
    source_lang_id integer,
    target_lang_id integer,
    name varchar(40)
    name varchar(40),
    lemmatized bool DEFAULT false
);

DROP TABLE IF EXISTS language;

Binary file not shown.
@@ -12,12 +12,20 @@ namespace LemmaGenSentenceLemmatizer
        {
            if (args.Length == 1)
            {
                SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
                string line = Console.ReadLine();
                while (!string.IsNullOrEmpty(line))
                try
                {
                    Console.WriteLine(lemmatizer.lemmatizeSentence(line));
                    line = Console.ReadLine();
                    SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
                    string line = Console.ReadLine();
                    while (line != null)
                    {
                        Console.WriteLine(lemmatizer.lemmatizeSentence(line));
                        line = Console.ReadLine();
                    }

                }
                catch (Exception ex)
                {
                    Console.WriteLine("Exception occurred: " + ex.Message);
                }

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,10 +1,22 @@
SRC_LANG=en
TRG_LANG=pl
CORPUS_NAME=europarl
CORPUS_NAME=europarljrc

all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt

clean-intermediate-files:
	rm -f corpora/$(CORPUS_NAME)/*.lem
	rm -f corpora/$(CORPUS_NAME)/*.low
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
	rm -f corpora/$(CORPUS_NAME)/*.vcb
	rm -f corpora/$(CORPUS_NAME)/*.snt
	rm -f corpora/$(CORPUS_NAME)/*.cooc
	rm -f corpora/$(CORPUS_NAME)/aligned*part*
	rm -f corpora/$(CORPUS_NAME)/giza.cfg


clean:
	rm -f corpora/$(CORPUS_NAME)/*.tok

mgiza-aligner/clean-corpus-n.perl (new executable file, 168 lines)
@@ -0,0 +1,168 @@
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
use warnings;
use strict;
use Getopt::Long;
my $help;
my $lc = 0; # lowercase the corpus?
my $ignore_ratio = 0;
my $ignore_xml = 0;
my $enc = "utf8"; # encoding of the input and output files
                  # set to anything else you wish, but I have not tested it yet
my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
                            # is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000)
                            # and crashes if it encounters a word that exceeds it
my $ratio = 9;

GetOptions(
  "help" => \$help,
  "lowercase|lc" => \$lc,
  "encoding=s" => \$enc,
  "ratio=f" => \$ratio,
  "ignore-ratio" => \$ignore_ratio,
  "ignore-xml" => \$ignore_xml,
  "max-word-length|mwl=s" => \$max_word_length
) or exit(1);

if (scalar(@ARGV) < 6 || $help) {
  print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n";
  exit;
}

my $corpus = $ARGV[0];
my $l1 = $ARGV[1];
my $l2 = $ARGV[2];
my $out = $ARGV[3];
my $min = $ARGV[4];
my $max = $ARGV[5];

my $linesRetainedFile = "";
if (scalar(@ARGV) > 6) {
  $linesRetainedFile = $ARGV[6];
  open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile";
}

print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n";

my $opn = undef;
my $l1input = "$corpus.$l1";
if (-e $l1input) {
  $opn = $l1input;
} elsif (-e $l1input.".gz") {
  $opn = "gunzip -c $l1input.gz |";
} else {
  die "Error: $l1input does not exist";
}
open(F,$opn) or die "Can't open '$opn'";
$opn = undef;
my $l2input = "$corpus.$l2";
if (-e $l2input) {
  $opn = $l2input;
} elsif (-e $l2input.".gz") {
  $opn = "gunzip -c $l2input.gz |";
} else {
  die "Error: $l2input does not exist";
}

open(E,$opn) or die "Can't open '$opn'";

open(FO,">$out.$l1") or die "Can't write $out.$l1";
open(EO,">$out.$l2") or die "Can't write $out.$l2";

# necessary for proper lowercasing
my $binmode;
if ($enc eq "utf8") {
  $binmode = ":utf8";
} else {
  $binmode = ":encoding($enc)";
}
binmode(F, $binmode);
binmode(E, $binmode);
binmode(FO, $binmode);
binmode(EO, $binmode);

my $innr = 0;
my $outnr = 0;
my $factored_flag;
while(my $f = <F>) {
  $innr++;
  print STDERR "." if $innr % 10000 == 0;
  print STDERR "($innr)" if $innr % 100000 == 0;
  my $e = <E>;
  die "$corpus.$l2 is too short!" if !defined $e;
  chomp($e);
  chomp($f);
  if ($innr == 1) {
    $factored_flag = ($e =~ /\|/ || $f =~ /\|/);
  }

  #if lowercasing, lowercase
  if ($lc) {
    $e = lc($e);
    $f = lc($f);
  }

  $e =~ s/\|//g unless $factored_flag;
  $e =~ s/\s+/ /g;
  $e =~ s/^ //;
  $e =~ s/ $//;
  $f =~ s/\|//g unless $factored_flag;
  $f =~ s/\s+/ /g;
  $f =~ s/^ //;
  $f =~ s/ $//;
  next if $f eq '';
  next if $e eq '';

  my $ec = &word_count($e);
  my $fc = &word_count($f);
  next if $ec > $max;
  next if $fc > $max;
  next if $ec < $min;
  next if $fc < $min;
  next if !$ignore_ratio && $ec/$fc > $ratio;
  next if !$ignore_ratio && $fc/$ec > $ratio;
  # Skip this segment if any factor is longer than $max_word_length
  my $max_word_length_plus_one = $max_word_length + 1;
  next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
  next if $f =~ /[^\s\|]{$max_word_length_plus_one}/;

  # An extra check: none of the factors can be blank!
  die "There is a blank factor in $corpus.$l1 on line $innr: $f"
    if $f =~ /[ \|]\|/;
  die "There is a blank factor in $corpus.$l2 on line $innr: $e"
    if $e =~ /[ \|]\|/;

  $outnr++;
  print FO $f."\n";
  print EO $e."\n";

  if ($linesRetainedFile ne "") {
    print LINES_RETAINED $innr."\n";
  }
}

if ($linesRetainedFile ne "") {
  close LINES_RETAINED;
}

print STDERR "\n";
my $e = <E>;
die "$corpus.$l2 is too long!" if defined $e;

print STDERR "Input sentences: $innr Output sentences: $outnr\n";

sub word_count {
  my ($line) = @_;
  if ($ignore_xml) {
    $line =~ s/<\S[^>]*\S>/ /g;
    $line =~ s/\s+/ /g;
    $line =~ s/^ //g;
    $line =~ s/ $//g;
  }
  my @w = split(/ /,$line);
  return scalar @w;
}
mgiza-aligner/sortGizaAlignments.py (new executable file, 26 lines)
@@ -0,0 +1,26 @@
#!/usr/bin/python3

import sys, re

examples_dict = {}
p = re.compile("# Sentence pair \((\d+)\)")

i = 0
for line in sys.stdin:
    line = line.strip()
    if i % 3 == 0:
        current_example = [line]
        m = p.match(line)
        if m:
            current_key = int(m.group(1))
        else:
            raise Exception("Wrong line: "+line)
    elif i % 3 == 1:
        current_example.append(line)
    else:
        current_example.append(line)
        examples_dict[current_key] = current_example
    i+=1

for key in sorted(examples_dict.keys()):
    print ('\n'.join(examples_dict[key]))
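sortGizaAlignments.py assumes the concatenated mgiza output is made of three-line blocks and reorders them by sentence-pair number, since chunks produced by parallel mgiza workers are concatenated out of order. A hedged illustration of one block (standard GIZA++/mgiza formatting is assumed here; the script itself only requires the "# Sentence pair (N)" prefix on the first line):

# Illustration only: one three-line block in the shape the script expects.
block = "\n".join([
    "# Sentence pair (2) source length 3 target length 3 alignment score : 0.01",  # header; fields after the pair number are assumed
    "to jest test",                                                                # target sentence
    "NULL ({ }) this ({ 1 }) be ({ 2 }) test ({ 3 })",                             # source tokens with 1-based target positions
])
print(block)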
tests/addAlignedLemmatizedTM.py (60 changed lines, mode changed from normal to executable)
@@ -21,10 +21,15 @@ def file_len(fname):
        pass
    return i + 1

def add_data(data):
def add_examples(examplesData):
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    json.loads(urllib2.urlopen(req, json.dumps(data)).read())
    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read())
    if response['status'] == 'error':
        raise Exception(response['message'])

if len(sys.argv) != 7:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]

@@ -40,13 +45,14 @@ if (file_len(alignmentsFile) != 3*file_len(sourceFile)):
    raise Exception("alignments file is not exactly 3 times longer than source and target")


totalLines = file_len(sourceFile)
totalExamples = file_len(sourceFile)

data = {
    'operation': 'addTm',
    'sourceLangId':sourceLangId,
    'targetLangId':targetLangId,
    'name':name
    'name':name,
    'tmLemmatized':True
}

req = urllib2.Request(address)

@@ -60,35 +66,35 @@ data = {
    'tmId':tmId
}

sentences = []
examples = []
start = time.time()
with open(sourceFile) as sourceLines, open(targetFile) as targetLines, open(alignmentsFile) as alignmentsLines:
with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
    for lineNumber in range(totalExamples):
        sourceSentence = sf.readline().strip()
        targetSentence = tf.readline().strip()

    lineNumber = 0
    for line in sourceLines:
        line = line.strip()
        if lineNumber % 3 == 1:
            currSentence.append(line)
        elif lineNumber % 3 == 2:
            currSentence.append(line)
            currSentence.reverse()
            sentences.append(currSentence)
            currSentence = []
            if len(sentences) >= BUFFER_SIZE:
                data['sentences'] = sentences
                add_data(data)
                mark = time.time()
                print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/(3*(mark-start)))
                sentences = []
        lineNumber += 1
        # skip to lines of the alignments file, these are lemmatized and we need the raw sentences from the source and target files.
        af.readline()
        af.readline()

        alignmentString = af.readline().strip()

        examples.append([sourceSentence, targetSentence, alignmentString])

        if len(examples) >= BUFFER_SIZE:
            data['examples'] = examples
            add_examples(data)
            mark = time.time()
            print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
            examples = []


if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)
if len(examples) > 0:
    data['examples'] = examples
    add_examples(data)

end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))

print "Generating index..."
start = time.time()

tests/addLemmatizedTM.sh (new executable file, 7 lines)
@@ -0,0 +1,7 @@
#!/bin/sh

CORPUS_NAME="europarl_sample"
SRC_LANG_ID=2
TRG_LANG_ID=1

./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
@@ -16,7 +16,8 @@ data = {
    'operation': 'addTm',
    'sourceLangId':int(sys.argv[1]),
    'targetLangId':int(sys.argv[2]),
    'name':sys.argv[3]
    'name':sys.argv[3],
    'tmLemmatized':bool(int(sys.argv[4]))
}

req = urllib2.Request(address)