working lemmatization
This commit is contained in:
parent 8b0666c34d
commit 89fb77bf58
24  concordia-server/bool_param.cpp  Normal file
@@ -0,0 +1,24 @@
+#include "bool_param.hpp"
+
+
+BoolParam::BoolParam(bool value):_value(value) {
+}
+
+BoolParam::~BoolParam() {
+}
+
+const char * BoolParam::getValue() {
+    if (_value) {
+        return "t";
+    } else {
+        return "f";
+    }
+}
+
+const int BoolParam::getLength() {
+    return 1;
+}
+
+const int BoolParam::isBinary() {
+    return 0;
+}
24  concordia-server/bool_param.hpp  Normal file
@@ -0,0 +1,24 @@
+#ifndef BOOL_PARAM_HDR
+#define BOOL_PARAM_HDR
+
+#include "query_param.hpp"
+
+class BoolParam : public QueryParam {
+public:
+    /*! Constructor.
+    */
+    BoolParam(bool value);
+    /*! Destructor.
+    */
+    virtual ~BoolParam();
+
+    const char * getValue();
+
+    const int getLength();
+
+    const int isBinary();
+private:
+    bool _value;
+};
+
+#endif
@@ -5,6 +5,7 @@
 #include <iostream>
 #include <fstream>
 #include <ctime>
+#include <utility>
 
 #include <concordia/interval.hpp>
 
@@ -19,16 +20,17 @@
 ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
                                   throw(ConcordiaException) :
                                   _configFilePath(configFilePath) {
 
     std::vector<int> tmIds = _tmDAO.getTmIds();
     _concordiasMap = boost::shared_ptr<boost::ptr_map<int,Concordia> >(new boost::ptr_map<int,Concordia>());
 
     BOOST_FOREACH(int & tmId, tmIds) {
         _addTm(tmId);
     }
-    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
-    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));
 
     _lemmatizerFacade = boost::shared_ptr<LemmatizerFacade> (new LemmatizerFacade());
+
+    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap, _lemmatizerFacade));
+    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap, _lemmatizerFacade));
 }
 
 ConcordiaServer::~ConcordiaServer() {
@@ -95,6 +97,27 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
             }
         }
         _indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
+    } else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
+        std::vector<std::string> sourceSentences;
+        std::vector<std::string> targetSentences;
+        std::vector<std::string> alignmentStrings;
+        int tmId = d[TM_ID_PARAM].GetInt();
+        // loading data from json
+        const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
+        Logger::log("addAlignedLemmatizedSentences");
+        Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
+        Logger::logInt("tm id", tmId);
+        for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
+            if (sentencesArray[i].Size() != 3) {
+                JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
+                break;
+            } else {
+                sourceSentences.push_back(sentencesArray[i][0].GetString());
+                targetSentences.push_back(sentencesArray[i][1].GetString());
+                alignmentStrings.push_back(sentencesArray[i][2].GetString());
+            }
+        }
+        _indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
     } else if (operation == "lemmatize") {
         std::string sentence = _getStringParameter(d, "sentence");
         std::string languageCode = _getStringParameter(d, "languageCode");
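For reference, a request to the new operation could look like the following sketch, modeled on the parsing code above and on tests/addAlignedLemmatizedTM.py (Python 2). The server address, the TM id and the GIZA-style alignment line are placeholder values; each example is a triple of raw source sentence, raw target sentence and an alignment line built over the lemmatized source.

import json, urllib2

address = 'http://localhost:8800'  # placeholder, depends on deployment
data = {
    'operation': 'addAlignedLemmatizedSentences',
    'tmId': 1,  # placeholder TM id
    'examples': [
        # [raw source, raw target, GIZA-style alignment line over the lemmatized source]
        ['I like cats', 'lubie koty', 'NULL ({ }) i ({ 1 }) like ({ 2 }) cat ({ 3 })']
    ]
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
print json.loads(urllib2.urlopen(req, json.dumps(data)).read())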
@@ -130,7 +153,8 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
         int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
         int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
         std::string name = _getStringParameter(d, NAME_PARAM);
-        int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
+        bool lemmatized = _getBoolParameter(d, TM_LEMMATIZED_PARAM);
+        int newId = _tmDAO.addTm(sourceLangId, targetLangId, name, lemmatized);
         _addTm(newId);
 
         jsonWriter.StartObject();
@@ -179,6 +203,17 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name
     }
 }
 
+int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
+                                                  throw (ConcordiaException) {
+    rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
+    if (itr != d.MemberEnd()) {
+        bool value = itr->value.GetBool();
+        return value;
+    } else {
+        throw ConcordiaException("missing parameter: " + std::string(name));
+    }
+}
+
 void ConcordiaServer::_addTm(int tmId) {
     std::stringstream indexPath;
     indexPath << INDEX_DIRECTORY << "/tm_" << tmId;
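Creating a TM that uses lemmatization now requires the new tmLemmatized flag, read as a JSON boolean by _getBoolParameter above. A minimal request sketch, with placeholder language ids and name:

data = {
    'operation': 'addTm',
    'sourceLangId': 2,  # placeholder language ids
    'targetLangId': 1,
    'name': 'my_lemmatized_tm',  # placeholder name
    'tmLemmatized': True
}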
@@ -38,6 +38,8 @@ private:
 
     int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
 
+    int _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
+
     void _addTm(int tmId);
 
     std::string _configFilePath;
@@ -16,7 +16,9 @@
 #define SOURCE_SENTENCE_PARAM "sourceSentence"
 #define TARGET_SENTENCE_PARAM "targetSentence"
 #define TM_ID_PARAM "tmId"
+#define TM_LEMMATIZED_PARAM "tmLemmatized"
 #define SENTENCES_PARAM "sentences"
+#define EXAMPLES_PARAM "examples"
 #define SOURCE_LANG_PARAM "sourceLangId"
 #define TARGET_LANG_PARAM "targetLangId"
 #define NAME_PARAM "name"
@@ -25,6 +27,7 @@
 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
 #define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
+#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
 #define REFRESH_INDEX_OP "refreshIndex"
 #define SIMPLE_SEARCH_OP "simpleSearch"
 #define CONCORDIA_SEARCH_OP "concordiaSearch"
@@ -133,6 +133,17 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (Concor
     }
 }
 
+bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
+    try {
+        char * valueStr = PQgetvalue(result,row,col);
+        return std::string(valueStr) == "t";
+    } catch (std::exception & e) {
+        std::stringstream ss;
+        ss << "Error getting bool value. Message: " << e.what();
+        throw ConcordiaException(ss.str());
+    }
+}
+
 std::string DBconnection::getStringValue(PGresult * result, int row, int col) throw (ConcordiaException) {
     try {
         char * valueStr = PQgetvalue(result,row,col);
@@ -153,4 +164,3 @@ int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
         throw ConcordiaException(ss.str());
     }
 }
-
@@ -31,6 +31,8 @@ public:
 
     int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
 
+    bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);
+
     std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
 
     int getRowCount(PGresult * result) throw (ConcordiaException);
@@ -14,9 +14,11 @@
 #include "json_generator.hpp"
 #include "logger.hpp"
 
-IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+                                 boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                              throw(ConcordiaException):
-                                             _concordiasMap(concordiasMap) {
+                                             _concordiasMap(concordiasMap),
+                                             _lemmatizerFacade(lemmatizerFacade) {
 }
 
 IndexController::~IndexController() {
@@ -32,9 +34,10 @@ void IndexController::addSentence(
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
+            TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
             TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
             int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
-            it->second->addTokenizedExample(tokenizedSentence, sentenceId);
+            it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
             it->second->refreshSAfromRAM();
 
             jsonWriter.StartObject();
@@ -67,9 +70,10 @@ void IndexController::addSentences(
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
+            std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
             std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
             std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
-            it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
+            it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);
 
             jsonWriter.StartObject();
             jsonWriter.String("status");
@@ -118,6 +122,44 @@ void IndexController::addAlignedSentences(
     }
 }
 
+void IndexController::addAlignedLemmatizedSentences(
+                 rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
+                 const std::vector<std::string> & sourceSentences,
+                 const std::vector<std::string> & targetSentences,
+                 const std::vector<std::string> & alignmentStrings,
+                 const int tmId) {
+    try {
+        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
+        if (it != _concordiasMap->end()) {
+            std::vector<std::string> lemmatizedSourceSentences;
+            std::vector<std::vector<std::vector<int> > > allAlignments;
+            _getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);
+
+            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
+
+            std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
+                    _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
+            for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
+                it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
+            }
+            jsonWriter.StartObject();
+            jsonWriter.String("status");
+            jsonWriter.String("success");
+            jsonWriter.EndObject();
+        } else {
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
+        }
+    } catch (ConcordiaException & e) {
+        std::stringstream errorstream;
+        errorstream << "concordia error: " << e.what();
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
+    }
+}
+
+
 void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                           const int tmId) {
     try {
@@ -189,4 +231,3 @@ void IndexController::_getSourceSentencesAndAlignments(
         allAlignments.push_back(alignments);
     }
 }
-
@@ -10,6 +10,8 @@
 
 
 #include "unit_dao.hpp"
+#include "lemmatizer_facade.hpp"
+
 
 #include "rapidjson/writer.h"
 
@@ -17,7 +19,8 @@ class IndexController {
 public:
     /*! Constructor.
     */
-    explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+    explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+                             boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                              throw(ConcordiaException);
     /*! Destructor.
     */
@@ -38,6 +41,13 @@ public:
                 const std::vector<std::string> & targetSentences,
                 const int tmId);
 
+    void addAlignedLemmatizedSentences(
+                rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
+                const std::vector<std::string> & sourceSentences,
+                const std::vector<std::string> & targetSentences,
+                const std::vector<std::string> & alignmentStrings,
+                const int tmId);
+
     void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                              const int tmId);
 
@@ -49,6 +59,8 @@ private:
 
     boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
 
+    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
+
     UnitDAO _unitDAO;
 };
 
@@ -1,5 +1,7 @@
 #include "lemmatizer_facade.hpp"
 
+#include <boost/foreach.hpp>
+
 
 LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
     _lemmatizersMap = boost::ptr_map<std::string,SocketLemmatizer>();
@@ -28,3 +30,26 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
     }
 
 }
+
+std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
+    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
+    if (tmInfo.first) {
+        return lemmatizeSentence(tmInfo.second, pattern);
+    } else {
+        return pattern;
+    }
+}
+
+std::vector<std::string> LemmatizerFacade::lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId) {
+    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
+    if (tmInfo.first) {
+        std::vector<std::string> result;
+        BOOST_FOREACH(std::string & pattern, patterns) {
+            result.push_back(lemmatizeSentence(tmInfo.second, pattern));
+        }
+        return result;
+    } else {
+        return patterns;
+    }
+
+}
@@ -2,6 +2,7 @@
 #define LEMMATIZER_FACADE_HDR
 
 #include "socket_lemmatizer.hpp"
+#include "tm_dao.hpp"
 
 #include <string>
 #include <concordia/concordia_exception.hpp>
@@ -18,8 +19,15 @@ public:
     virtual ~LemmatizerFacade();
 
     std::string lemmatizeSentence(std::string languageCode, std::string sentence);
 
+    std::string lemmatizeIfNeeded(std::string pattern, int tmId);
+
+    std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);
+
 private:
     boost::ptr_map<std::string,SocketLemmatizer> _lemmatizersMap;
 
+    TmDAO _tmDAO;
 };
 
 #endif
@@ -8,9 +8,11 @@
 #include "logger.hpp"
 
 
-SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+                                       boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                                 throw(ConcordiaException):
-                                                _concordiasMap(concordiasMap) {
+                                                _concordiasMap(concordiasMap),
+                                                _lemmatizerFacade(lemmatizerFacade) {
 }
 
 SearcherController::~SearcherController() {
@@ -22,6 +24,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
                                       const int tmId) {
     boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
         std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
 
         jsonWriter.StartObject();
@@ -47,6 +50,7 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
     if (it != _concordiasMap->end()) {
         if (intervals.size() > 0) {
             // std::string shortPattern = pattern.substr(intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
+            pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
             std::string shortPattern = _substrUTF8(pattern, intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
 
             Logger::log("concordiaPhraseSearch");
@@ -111,6 +115,7 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
 
     boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
         CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
 
         jsonWriter.StartObject();
@@ -146,6 +151,3 @@ std::string SearcherController::_substrUTF8(std::string source, int start, int l
 
     return result;
 }
-
-
-
@@ -10,6 +10,7 @@
 
 #include "unit_dao.hpp"
 #include "simple_search_result.hpp"
+#include "lemmatizer_facade.hpp"
 #include "rapidjson/writer.h"
 
 
@@ -17,7 +18,8 @@ class SearcherController {
 public:
     /*! Constructor.
     */
-    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
+                                boost::shared_ptr<LemmatizerFacade> LemmatizerFacade)
                                 throw(ConcordiaException);
     /*! Destructor.
     */
@@ -41,6 +43,8 @@ private:
 
     boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
 
+    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
+
     UnitDAO _unitDAO;
 };
 
@@ -3,6 +3,7 @@
 #include "query_param.hpp"
 #include "string_param.hpp"
 #include "int_param.hpp"
+#include "bool_param.hpp"
 #include "int_array_param.hpp"
 #include "logger.hpp"
 
@@ -32,14 +33,19 @@ std::vector<int> TmDAO::getTmIds() {
 }
 
 int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
+    addTm(sourceLangId, targetLangId, name, false);
+}
+
+int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
     DBconnection connection;
     connection.startTransaction();
-    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name) values($1::integer,$2::integer,$3::text) RETURNING id";
+    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name, lemmatized) values($1::integer,$2::integer,$3::text,$4::bool) RETURNING id";
     std::vector<QueryParam*> params;
     params.push_back(new IntParam(sourceLangId));
     params.push_back(new IntParam(targetLangId));
     params.push_back(new StringParam(name));
+    params.push_back(new BoolParam(lemmatized));
 
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
@@ -53,3 +59,18 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::stri
 
 }
 
+std::pair<bool, std::string> TmDAO::getTmInfo(int tmId) {
+    DBconnection connection;
+    connection.startTransaction();
+    std::string query = "select tm.id, tm.lemmatized, language.code from tm inner join language on language.id = tm.source_lang_id where tm.id = $1::integer;";
+    std::vector<QueryParam*> params;
+    params.push_back(new IntParam(tmId));
+    PGresult * dbResult = connection.execute(query, params);
+    bool lemmatized = connection.getBoolValue(dbResult, 0, 1);
+    std::string languageCode = connection.getStringValue(dbResult, 0, 2);
+    connection.clearResult(dbResult);
+    connection.endTransaction();
+
+    return std::pair<bool, std::string>(lemmatized, languageCode);
+
+}
@@ -3,6 +3,7 @@
 
 #include <string>
 #include <vector>
+#include <utility>
 
 #include <concordia/common/config.hpp>
 #include "db_connection.hpp"
@@ -18,8 +19,12 @@ public:
 
     int addTm(const int sourceLangId, const int targetLangId, const std::string name);
 
+    int addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized);
+
     std::vector<int> getTmIds();
 
+    std::pair<bool, std::string> getTmInfo(int tmId);
+
 private:
 
 };
@@ -50,7 +50,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
         const std::vector<TokenizedSentence> & sourceSentences,
         const std::vector<TokenizedSentence> & targetSentences,
         const std::vector<std::vector<std::vector<int> > > & allAlignments,
-        const int tmId) {
+        const int tmId) throw (ConcordiaException) {
 
     DBconnection connection;
     std::vector<SUFFIX_MARKER_TYPE> newIds;
@@ -198,7 +198,11 @@ int UnitDAO::_addAlignedUnit(
             const TokenizedSentence & sourceSentence,
             const TokenizedSentence & targetSentence,
             const std::vector<std::vector<int> > & alignments,
-            const int tmId) {
+            const int tmId) throw(ConcordiaException) {
+
+    if (sourceSentence.getTokens().size() != alignments.size()) {
+        throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
+    }
 
     std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
     std::vector<QueryParam*> params;
@@ -235,5 +239,3 @@ int UnitDAO::_addAlignedUnit(
 
     return newId;
 }
-
-
@@ -9,6 +9,7 @@
 #include <concordia/substring_occurence.hpp>
 #include <concordia/matched_pattern_fragment.hpp>
 #include <concordia/concordia_search_result.hpp>
+#include <concordia/concordia_exception.hpp>
 #include <boost/shared_ptr.hpp>
 
 #include "simple_search_result.hpp"
@@ -38,7 +39,7 @@ public:
         const std::vector<TokenizedSentence> & sourceSentences,
         const std::vector<TokenizedSentence> & targetSentences,
         const std::vector<std::vector<std::vector<int> > > & allAlignments,
-        const int tmId);
+        const int tmId) throw (ConcordiaException);
 
     std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
 
@@ -62,7 +63,7 @@ private:
         const TokenizedSentence & sourceSentence,
         const TokenizedSentence & targetSentence,
         const std::vector<std::vector<int> > & alignments,
-        const int tmId);
+        const int tmId) throw(ConcordiaException);
 };
 
 #endif
@@ -3,7 +3,8 @@ CREATE TABLE tm (
     id SERIAL PRIMARY KEY,
     source_lang_id integer,
     target_lang_id integer,
-    name varchar(40)
+    name varchar(40),
+    lemmatized bool DEFAULT false
 );
 
 DROP TABLE IF EXISTS language;
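Note that this changes concordia_server.sql, which (judging from the DROP TABLE statements) recreates the schema from scratch; an existing database would need the column added by hand. A hypothetical one-off migration using psycopg2, with placeholder connection parameters:

import psycopg2

conn = psycopg2.connect(dbname='concordia_server', user='concordia')  # placeholders
with conn, conn.cursor() as cur:
    cur.execute('ALTER TABLE tm ADD COLUMN lemmatized bool DEFAULT false;')
conn.close()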
Binary file not shown.
@@ -11,15 +11,23 @@ namespace LemmaGenSentenceLemmatizer
         static void Main(string[] args)
         {
             if (args.Length == 1)
+            {
+                try
                 {
                     SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
                     string line = Console.ReadLine();
-                    while (!string.IsNullOrEmpty(line))
+                    while (line != null)
                     {
                         Console.WriteLine(lemmatizer.lemmatizeSentence(line));
                         line = Console.ReadLine();
                     }
 
+                }
+                catch (Exception ex)
+                {
+                    Console.WriteLine("Exception occurred: " + ex.Message);
+                }
+
+
             }
             else
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,10 +1,22 @@
 SRC_LANG=en
 TRG_LANG=pl
-CORPUS_NAME=europarl
+CORPUS_NAME=europarljrc
 
 all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
 	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
-	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
+	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
+
+clean-intermediate-files:
+	rm -f corpora/$(CORPUS_NAME)/*.lem
+	rm -f corpora/$(CORPUS_NAME)/*.low
+	rm -f corpora/$(CORPUS_NAME)/*.classes
+	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
+	rm -f corpora/$(CORPUS_NAME)/*.vcb
+	rm -f corpora/$(CORPUS_NAME)/*.snt
+	rm -f corpora/$(CORPUS_NAME)/*.cooc
+	rm -f corpora/$(CORPUS_NAME)/aligned*part*
+	rm -f corpora/$(CORPUS_NAME)/giza.cfg
+
 
 clean:
 	rm -f corpora/$(CORPUS_NAME)/*.tok
168  mgiza-aligner/clean-corpus-n.perl  Executable file
@@ -0,0 +1,168 @@
+#!/usr/bin/env perl
+#
+# This file is part of moses. Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
+use warnings;
+use strict;
+use Getopt::Long;
+my $help;
+my $lc = 0; # lowercase the corpus?
+my $ignore_ratio = 0;
+my $ignore_xml = 0;
+my $enc = "utf8"; # encoding of the input and output files
+# set to anything else you wish, but I have not tested it yet
+my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
+# is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000)
+# and crashes if it encounters a word that exceeds it
+my $ratio = 9;
+
+GetOptions(
+  "help" => \$help,
+  "lowercase|lc" => \$lc,
+  "encoding=s" => \$enc,
+  "ratio=f" => \$ratio,
+  "ignore-ratio" => \$ignore_ratio,
+  "ignore-xml" => \$ignore_xml,
+  "max-word-length|mwl=s" => \$max_word_length
+) or exit(1);
+
+if (scalar(@ARGV) < 6 || $help) {
+    print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n";
+    exit;
+}
+
+my $corpus = $ARGV[0];
+my $l1 = $ARGV[1];
+my $l2 = $ARGV[2];
+my $out = $ARGV[3];
+my $min = $ARGV[4];
+my $max = $ARGV[5];
+
+my $linesRetainedFile = "";
+if (scalar(@ARGV) > 6) {
+  $linesRetainedFile = $ARGV[6];
+  open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile";
+}
+
+print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n";
+
+my $opn = undef;
+my $l1input = "$corpus.$l1";
+if (-e $l1input) {
+  $opn = $l1input;
+} elsif (-e $l1input.".gz") {
+  $opn = "gunzip -c $l1input.gz |";
+} else {
+    die "Error: $l1input does not exist";
+}
+open(F,$opn) or die "Can't open '$opn'";
+$opn = undef;
+my $l2input = "$corpus.$l2";
+if (-e $l2input) {
+  $opn = $l2input;
+} elsif (-e $l2input.".gz") {
+  $opn = "gunzip -c $l2input.gz |";
+} else {
+  die "Error: $l2input does not exist";
+}
+
+open(E,$opn) or die "Can't open '$opn'";
+
+open(FO,">$out.$l1") or die "Can't write $out.$l1";
+open(EO,">$out.$l2") or die "Can't write $out.$l2";
+
+# necessary for proper lowercasing
+my $binmode;
+if ($enc eq "utf8") {
+  $binmode = ":utf8";
+} else {
+  $binmode = ":encoding($enc)";
+}
+binmode(F, $binmode);
+binmode(E, $binmode);
+binmode(FO, $binmode);
+binmode(EO, $binmode);
+
+my $innr = 0;
+my $outnr = 0;
+my $factored_flag;
+while(my $f = <F>) {
+  $innr++;
+  print STDERR "." if $innr % 10000 == 0;
+  print STDERR "($innr)" if $innr % 100000 == 0;
+  my $e = <E>;
+  die "$corpus.$l2 is too short!" if !defined $e;
+  chomp($e);
+  chomp($f);
+  if ($innr == 1) {
+    $factored_flag = ($e =~ /\|/ || $f =~ /\|/);
+  }
+
+  #if lowercasing, lowercase
+  if ($lc) {
+    $e = lc($e);
+    $f = lc($f);
+  }
+
+  $e =~ s/\|//g unless $factored_flag;
+  $e =~ s/\s+/ /g;
+  $e =~ s/^ //;
+  $e =~ s/ $//;
+  $f =~ s/\|//g unless $factored_flag;
+  $f =~ s/\s+/ /g;
+  $f =~ s/^ //;
+  $f =~ s/ $//;
+  next if $f eq '';
+  next if $e eq '';
+
+  my $ec = &word_count($e);
+  my $fc = &word_count($f);
+  next if $ec > $max;
+  next if $fc > $max;
+  next if $ec < $min;
+  next if $fc < $min;
+  next if !$ignore_ratio && $ec/$fc > $ratio;
+  next if !$ignore_ratio && $fc/$ec > $ratio;
+  # Skip this segment if any factor is longer than $max_word_length
+  my $max_word_length_plus_one = $max_word_length + 1;
+  next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
+  next if $f =~ /[^\s\|]{$max_word_length_plus_one}/;
+
+  # An extra check: none of the factors can be blank!
+  die "There is a blank factor in $corpus.$l1 on line $innr: $f"
+    if $f =~ /[ \|]\|/;
+  die "There is a blank factor in $corpus.$l2 on line $innr: $e"
+    if $e =~ /[ \|]\|/;
+
+  $outnr++;
+  print FO $f."\n";
+  print EO $e."\n";
+
+  if ($linesRetainedFile ne "") {
+    print LINES_RETAINED $innr."\n";
+  }
+}
+
+if ($linesRetainedFile ne "") {
+  close LINES_RETAINED;
+}
+
+print STDERR "\n";
+my $e = <E>;
+die "$corpus.$l2 is too long!" if defined $e;
+
+print STDERR "Input sentences: $innr  Output sentences: $outnr\n";
+
+sub word_count {
+  my ($line) = @_;
+  if ($ignore_xml) {
+    $line =~ s/<\S[^>]*\S>/ /g;
+    $line =~ s/\s+/ /g;
+    $line =~ s/^ //g;
+    $line =~ s/ $//g;
+  }
+  my @w = split(/ /,$line);
+  return scalar @w;
+}
26  mgiza-aligner/sortGizaAlignments.py  Executable file
@@ -0,0 +1,26 @@
+#!/usr/bin/python3
+
+import sys, re
+
+examples_dict = {}
+p = re.compile("# Sentence pair \((\d+)\)")
+
+i = 0
+for line in sys.stdin:
+    line = line.strip()
+    if i % 3 == 0:
+        current_example = [line]
+        m = p.match(line)
+        if m:
+            current_key = int(m.group(1))
+        else:
+            raise Exception("Wrong line: "+line)
+    elif i % 3 == 1:
+        current_example.append(line)
+    else:
+        current_example.append(line)
+        examples_dict[current_key] = current_example
+    i+=1
+
+for key in sorted(examples_dict.keys()):
+    print ('\n'.join(examples_dict[key]))
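mgiza writes its output in several part files, and their concatenation is not ordered by sentence pair id; this is why the Makefile above now pipes the concatenated parts through this script. A small self-contained demonstration (Python 3) with mock GIZA-style input, where the sentences and scores are made up:

import subprocess

mock = (
    '# Sentence pair (2) source length 1 target length 1 alignment score : 0.5\n'
    'kot\n'
    'NULL ({ }) cat ({ 1 })\n'
    '# Sentence pair (1) source length 1 target length 1 alignment score : 0.5\n'
    'pies\n'
    'NULL ({ }) dog ({ 1 })\n'
)
out = subprocess.run(['./sortGizaAlignments.py'], input=mock,
                     capture_output=True, text=True).stdout
print(out)  # pair (1) now precedes pair (2)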
58  tests/addAlignedLemmatizedTM.py  Normal file → Executable file
@@ -21,10 +21,15 @@ def file_len(fname):
         pass
     return i + 1
 
-def add_data(data):
+def add_examples(examplesData):
     req = urllib2.Request(address)
     req.add_header('Content-Type', 'application/json')
-    json.loads(urllib2.urlopen(req, json.dumps(data)).read())
+    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read())
+    if response['status'] == 'error':
+        raise Exception(response['message'])
+
+if len(sys.argv) != 7:
+    raise Exception("wrong number of arguments")
 
 name = sys.argv[1]
 sourceFile = sys.argv[2]
@@ -40,13 +45,14 @@ if (file_len(alignmentsFile) != 3*file_len(sourceFile)):
     raise Exception("alignments file is not exactly 3 times longer than source and target")
 
 
-totalLines = file_len(sourceFile)
+totalExamples = file_len(sourceFile)
 
 data = {
     'operation': 'addTm',
     'sourceLangId':sourceLangId,
     'targetLangId':targetLangId,
-    'name':name
+    'name':name,
+    'tmLemmatized':True
 }
 
 req = urllib2.Request(address)
@@ -60,35 +66,35 @@ data = {
     'tmId':tmId
 }
 
-sentences = []
+examples = []
 start = time.time()
-with open(sourceFile) as sourceLines, open(targetFile) as targetLines, open(alignmentsFile) as alignmentsLines:
-
-    lineNumber = 0
-    for line in sourceLines:
-        line = line.strip()
-        if lineNumber % 3 == 1:
-            currSentence.append(line)
-        elif lineNumber % 3 == 2:
-            currSentence.append(line)
-            currSentence.reverse()
-            sentences.append(currSentence)
-            currSentence = []
-        if len(sentences) >= BUFFER_SIZE:
-            data['sentences'] = sentences
-            add_data(data)
+with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
+    for lineNumber in range(totalExamples):
+        sourceSentence = sf.readline().strip()
+        targetSentence = tf.readline().strip()
+
+        # skip two lines of the alignments file, these are lemmatized and we need the raw sentences from the source and target files.
+        af.readline()
+        af.readline()
+
+        alignmentString = af.readline().strip()
+
+        examples.append([sourceSentence, targetSentence, alignmentString])
+
+        if len(examples) >= BUFFER_SIZE:
+            data['examples'] = examples
+            add_examples(data)
             mark = time.time()
-            print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/(3*(mark-start)))
-            sentences = []
-        lineNumber += 1
+            print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
+            examples = []
 
-if len(sentences) > 0:
-    data['sentences'] = sentences
-    add_data(data)
+if len(examples) > 0:
+    data['examples'] = examples
+    add_examples(data)
 
 end = time.time()
-print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
+print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))
 
 print "Generating index..."
 start = time.time()
7  tests/addLemmatizedTM.sh  Executable file
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+CORPUS_NAME="europarl_sample"
+SRC_LANG_ID=2
+TRG_LANG_ID=1
+
+./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
@@ -16,7 +16,8 @@ data = {
     'operation': 'addTm',
     'sourceLangId':int(sys.argv[1]),
     'targetLangId':int(sys.argv[2]),
-    'name':sys.argv[3]
+    'name':sys.argv[3],
+    'tmLemmatized':bool(int(sys.argv[4]))
 }
 
 req = urllib2.Request(address)