diff --git a/concordia-server/bool_param.cpp b/concordia-server/bool_param.cpp
new file mode 100644
index 0000000..8029c8f
--- /dev/null
+++ b/concordia-server/bool_param.cpp
@@ -0,0 +1,24 @@
+#include "bool_param.hpp"
+
+
+BoolParam::BoolParam(bool value):_value(value) {
+}
+
+BoolParam::~BoolParam() {
+}
+
+const char * BoolParam::getValue() {
+    if (_value) {
+        return "t";
+    } else {
+        return "f";
+    }
+}
+
+const int BoolParam::getLength() {
+    return 1;
+}
+
+const int BoolParam::isBinary() {
+    return 0;
+}
diff --git a/concordia-server/bool_param.hpp b/concordia-server/bool_param.hpp
new file mode 100644
index 0000000..ddb08f5
--- /dev/null
+++ b/concordia-server/bool_param.hpp
@@ -0,0 +1,24 @@
+#ifndef BOOL_PARAM_HDR
+#define BOOL_PARAM_HDR
+
+#include "query_param.hpp"
+
+class BoolParam : public QueryParam {
+public:
+    /*! Constructor.
+    */
+    BoolParam(bool value);
+    /*! Destructor.
+    */
+    virtual ~BoolParam();
+
+    const char * getValue();
+
+    const int getLength();
+
+    const int isBinary();
+private:
+    bool _value;
+};
+
+#endif
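Note: BoolParam follows the existing QueryParam convention of handing every value to libpq in text format; PostgreSQL's text representation of bool is the single character "t" or "f", hence getValue()/getLength() above. A minimal usage sketch follows (the function name and the UPDATE statement are illustrative only; DBconnection, IntParam and the cleanup idiom are the ones used throughout this patch):

    #include <string>
    #include <vector>
    #include <boost/foreach.hpp>
    #include "db_connection.hpp"
    #include "bool_param.hpp"
    #include "int_param.hpp"

    // Illustrative sketch: flip the lemmatized flag of an existing TM,
    // passing the bool in text format ("t"/"f") via BoolParam.
    void setTmLemmatized(int tmId, bool lemmatized) {
        DBconnection connection;
        connection.startTransaction();
        std::string query = "UPDATE tm SET lemmatized = $1::bool WHERE id = $2::integer";
        std::vector<QueryParam*> params;
        params.push_back(new BoolParam(lemmatized));
        params.push_back(new IntParam(tmId));
        PGresult * result = connection.execute(query, params);
        connection.clearResult(result);
        connection.endTransaction();
        BOOST_FOREACH (QueryParam * param, params) {  // same cleanup idiom as tm_dao.cpp
            delete param;
        }
    }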
diff --git a/concordia-server/concordia_server.cpp b/concordia-server/concordia_server.cpp
index d33fba2..a5689d7 100644
--- a/concordia-server/concordia_server.cpp
+++ b/concordia-server/concordia_server.cpp
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -19,16 +20,18 @@
 ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
                                   throw(ConcordiaException) :
                                   _configFilePath(configFilePath) {
+    std::vector<int> tmIds = _tmDAO.getTmIds();
     _concordiasMap = boost::shared_ptr<boost::ptr_map<int, Concordia> >(new boost::ptr_map<int, Concordia>());
     BOOST_FOREACH(int & tmId, tmIds) {
         _addTm(tmId);
     }
-    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
-    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));
-    _lemmatizerFacade = boost::shared_ptr<LemmatizerFacade> (new LemmatizerFacade());
+    _lemmatizerFacade = boost::shared_ptr<LemmatizerFacade> (new LemmatizerFacade());
+
+    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap, _lemmatizerFacade));
+    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap, _lemmatizerFacade));
 }
 
 ConcordiaServer::~ConcordiaServer() {
@@ -95,6 +97,27 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
                 }
             }
             _indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
+        } else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
+            std::vector<std::string> sourceSentences;
+            std::vector<std::string> targetSentences;
+            std::vector<std::string> alignmentStrings;
+            int tmId = d[TM_ID_PARAM].GetInt();
+            // loading data from json
+            const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
+            Logger::log("addAlignedLemmatizedSentences");
+            Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
+            Logger::logInt("tm id", tmId);
+            for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
+                if (sentencesArray[i].Size() != 3) {
+                    JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
+                    break;
+                } else {
+                    sourceSentences.push_back(sentencesArray[i][0].GetString());
+                    targetSentences.push_back(sentencesArray[i][1].GetString());
+                    alignmentStrings.push_back(sentencesArray[i][2].GetString());
+                }
+            }
+            _indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
         } else if (operation == "lemmatize") {
             std::string sentence = _getStringParameter(d, "sentence");
             std::string languageCode = _getStringParameter(d, "languageCode");
@@ -130,7 +153,8 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
             int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
             int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
             std::string name = _getStringParameter(d, NAME_PARAM);
-            int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
+            bool lemmatized = _getBoolParameter(d, TM_LEMMATIZED_PARAM);
+            int newId = _tmDAO.addTm(sourceLangId, targetLangId, name, lemmatized);
             _addTm(newId);
 
             jsonWriter.StartObject();
@@ -179,6 +203,17 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name)
     }
 }
 
+bool ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
+                                  throw (ConcordiaException) {
+    rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
+    if (itr != d.MemberEnd()) {
+        bool value = itr->value.GetBool();
+        return value;
+    } else {
+        throw ConcordiaException("missing parameter: " + std::string(name));
+    }
+}
+
 void ConcordiaServer::_addTm(int tmId) {
     std::stringstream indexPath;
     indexPath << INDEX_DIRECTORY << "/tm_" << tmId;
diff --git a/concordia-server/concordia_server.hpp b/concordia-server/concordia_server.hpp
index c0e11c2..2822a9e 100644
--- a/concordia-server/concordia_server.hpp
+++ b/concordia-server/concordia_server.hpp
@@ -38,6 +38,8 @@ private:
     int _getIntParameter(rapidjson::Document & d, const char * name)
         throw (ConcordiaException);
 
+    bool _getBoolParameter(rapidjson::Document & d, const char * name)
+        throw (ConcordiaException);
+
     void _addTm(int tmId);
 
     std::string _configFilePath;
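For reference, a request routed to the new ADD_ALIGNED_LEMMATIZED_SENTENCES_OP branch looks roughly like this (parameter names are the ones defined in config.hpp.in below; the sentences and the alignment string are illustrative). Each element of "examples" is a three-element array: raw source sentence, raw target sentence, and a GIZA-style alignment line over the lemmatized source:

    {
        "operation": "addAlignedLemmatizedSentences",
        "tmId": 1,
        "examples": [
            ["Wczoraj kupiłem książkę.",
             "Yesterday I bought a book.",
             "NULL ({ }) wczoraj ({ 1 }) kupić ({ 3 4 }) książka ({ 6 })"]
        ]
    }

The addTm operation likewise gains the boolean "tmLemmatized" parameter read by _getBoolParameter().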
diff --git a/concordia-server/config.hpp.in b/concordia-server/config.hpp.in
index dac7ae6..571d18a 100644
--- a/concordia-server/config.hpp.in
+++ b/concordia-server/config.hpp.in
@@ -16,7 +16,9 @@
 #define SOURCE_SENTENCE_PARAM "sourceSentence"
 #define TARGET_SENTENCE_PARAM "targetSentence"
 #define TM_ID_PARAM "tmId"
+#define TM_LEMMATIZED_PARAM "tmLemmatized"
 #define SENTENCES_PARAM "sentences"
+#define EXAMPLES_PARAM "examples"
 #define SOURCE_LANG_PARAM "sourceLangId"
 #define TARGET_LANG_PARAM "targetLangId"
 #define NAME_PARAM "name"
@@ -25,6 +27,7 @@
 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
 #define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
+#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
 #define REFRESH_INDEX_OP "refreshIndex"
 #define SIMPLE_SEARCH_OP "simpleSearch"
 #define CONCORDIA_SEARCH_OP "concordiaSearch"
diff --git a/concordia-server/db_connection.cpp b/concordia-server/db_connection.cpp
index c46516c..8b26eeb 100644
--- a/concordia-server/db_connection.cpp
+++ b/concordia-server/db_connection.cpp
@@ -17,7 +17,7 @@ DBconnection::DBconnection() throw(ConcordiaException) {
         ss << "Connection string: " << connectionInfo;
         throw ConcordiaException(ss.str());
     }
-    
+
 }
 
 DBconnection::~DBconnection() {
@@ -90,8 +90,8 @@ PGresult * DBconnection::execute(std::string query,
         paramFormats[index] = param->isBinary();
         index++;
     }
-    
-    
+
+
     PGresult * result = PQexecParams(_connection,
                            query.c_str(),
                            params.size(),
@@ -129,7 +129,18 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (ConcordiaException) {
     } catch (std::exception & e) {
         std::stringstream ss;
         ss << "Error getting int value. Message: " << e.what();
-        throw ConcordiaException(ss.str());        
+        throw ConcordiaException(ss.str());
+    }
+}
+
+bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
+    try {
+        char * valueStr = PQgetvalue(result,row,col);
+        return std::string(valueStr) == "t";
+    } catch (std::exception & e) {
+        std::stringstream ss;
+        ss << "Error getting bool value. Message: " << e.what();
+        throw ConcordiaException(ss.str());
     }
 }
 
@@ -150,7 +161,6 @@ int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
     } catch (std::exception & e) {
         std::stringstream ss;
         ss << "Error getting int value. Message: " << e.what();
-        throw ConcordiaException(ss.str());        
+        throw ConcordiaException(ss.str());
     }
 }
-
diff --git a/concordia-server/db_connection.hpp b/concordia-server/db_connection.hpp
index c65fb35..9542fb8 100644
--- a/concordia-server/db_connection.hpp
+++ b/concordia-server/db_connection.hpp
@@ -31,6 +31,8 @@ public:
     int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
 
+    bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);
+
     std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
 
     int getRowCount(PGresult * result) throw (ConcordiaException);
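One caveat about getBoolValue(): PQgetvalue() does not throw (the catch block is defensive) and it returns an empty string, not a null pointer, for SQL NULL, so a NULL column silently reads as false. Since tm.lemmatized gets DEFAULT false in db/concordia_server.sql below, false is the right answer here, but a NULL-aware variant would look like this (sketch; the function name is illustrative):

    #include <libpq-fe.h>
    #include <string>

    // Sketch: read a bool column, but report SQL NULL as a caller-chosen fallback.
    bool getBoolValueOrDefault(PGresult * result, int row, int col, bool fallback) {
        if (PQgetisnull(result, row, col)) {
            return fallback;
        }
        return std::string(PQgetvalue(result, row, col)) == "t";
    }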
diff --git a/concordia-server/index_controller.cpp b/concordia-server/index_controller.cpp
index 60d65f0..37de410 100644
--- a/concordia-server/index_controller.cpp
+++ b/concordia-server/index_controller.cpp
@@ -14,9 +14,11 @@
 #include "json_generator.hpp"
 #include "logger.hpp"
 
-IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap)
+IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
+                                 boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                                throw(ConcordiaException):
-                            _concordiasMap(concordiasMap) {
+                            _concordiasMap(concordiasMap),
+                            _lemmatizerFacade(lemmatizerFacade) {
 }
 
 IndexController::~IndexController() {
@@ -32,9 +34,10 @@ void IndexController::addSentence(
     try {
         boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
+            TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
             TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
-            int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);            
-            it->second->addTokenizedExample(tokenizedSentence, sentenceId);
+            int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
+            it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
             it->second->refreshSAfromRAM();
 
             jsonWriter.StartObject();
@@ -42,20 +45,20 @@ void IndexController::addSentence(
             jsonWriter.String("success");
             jsonWriter.EndObject();
         } else {
-            JsonGenerator::signalError(jsonWriter, "no such tm!");            
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
         }
     } catch (ConcordiaException & e) {
         std::stringstream errorstream;
         errorstream << "concordia error: " << e.what();
-        JsonGenerator::signalError(jsonWriter, errorstream.str());        
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
     } catch (std::exception & e) {
         std::stringstream errorstream;
         errorstream << "general error: " << e.what();
-        JsonGenerator::signalError(jsonWriter, errorstream.str());        
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
     } catch (...) {
         std::stringstream errorstream;
         errorstream << "unexpected error occurred";
-        JsonGenerator::signalError(jsonWriter, errorstream.str());        
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
     }
 }
 
@@ -67,21 +70,22 @@ void IndexController::addSentences(
     try {
         boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
+            std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
             std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
             std::vector<int> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
-            it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
+            it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);
 
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
             jsonWriter.EndObject();
         } else {
-            JsonGenerator::signalError(jsonWriter, "no such tm!");            
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
         }
     } catch (ConcordiaException & e) {
         std::stringstream errorstream;
         errorstream << "concordia error: " << e.what();
-        JsonGenerator::signalError(jsonWriter, errorstream.str());        
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
     }
 }
 
@@ -96,28 +100,64 @@ void IndexController::addAlignedSentences(
             std::vector<std::string> sourceSentences;
             std::vector<std::vector<std::vector<int> > > allAlignments;
             _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
-            
+
             std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
             std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
 
             std::vector<int> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
             for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
                 it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
-            }            
+            }
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
             jsonWriter.EndObject();
         } else {
-            JsonGenerator::signalError(jsonWriter, "no such tm!");            
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
         }
     } catch (ConcordiaException & e) {
         std::stringstream errorstream;
         errorstream << "concordia error: " << e.what();
-        JsonGenerator::signalError(jsonWriter, errorstream.str());        
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
     }
 }
 
+void IndexController::addAlignedLemmatizedSentences(
+                rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
+                const std::vector<std::string> & sourceSentences,
+                const std::vector<std::string> & targetSentences,
+                const std::vector<std::string> & alignmentStrings,
+                const int tmId) {
+    try {
+        boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
+        if (it != _concordiasMap->end()) {
+            std::vector<std::string> lemmatizedSourceSentences;
+            std::vector<std::vector<std::vector<int> > > allAlignments;
+            _getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);
+
+            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
+
+            std::vector<int> sentenceIds =
+                _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
+            for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
+                it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
+            }
+            jsonWriter.StartObject();
+            jsonWriter.String("status");
+            jsonWriter.String("success");
+            jsonWriter.EndObject();
+        } else {
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
+        }
+    } catch (ConcordiaException & e) {
+        std::stringstream errorstream;
+        errorstream << "concordia error: " << e.what();
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
+    }
+}
+
@@ -159,21 +201,21 @@ void IndexController::_getSourceSentencesAndAlignments(
     std::vector<std::vector<int> > alignments;
-    
+
     UnicodeString s(rawSourceSentence.c_str());
     boost::u32regex_iterator begin(
         boost::make_u32regex_iterator(
             s,
         )
     );
     boost::u32regex_iterator end;
-    
+
     for (; begin != end; ++begin) {
         UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
         std::string token;
         tokenUTF8.toUTF8String(token);
         if (token != "NULL") {
-            std::string numbers((*begin)[2].first, (*begin)[2].second);            
+            std::string numbers((*begin)[2].first, (*begin)[2].second);
             std::istringstream iss(numbers);
             std::vector<std::string> numberStrings;
             std::copy(std::istream_iterator<std::string>(iss),
                       std::istream_iterator<std::string>(),
                       std::back_inserter(numberStrings));
 
-            std::vector<int> tokenAlignments;            
+            std::vector<int> tokenAlignments;
             for (int j=0;j<numberStrings.size();j++) {
diff --git a/concordia-server/index_controller.hpp b/concordia-server/index_controller.hpp
--- a/concordia-server/index_controller.hpp
+++ b/concordia-server/index_controller.hpp
@@ -20,7 +20,8 @@ public:
     /*! Constructor.
     */
-    explicit IndexController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap)
+    explicit IndexController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
+                             boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                              throw(ConcordiaException);
     /*! Destructor.
     */
@@ -38,9 +41,16 @@ public:
         const std::vector<std::string> & targetSentences,
         const int tmId);
 
+    void addAlignedLemmatizedSentences(
+        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
+        const std::vector<std::string> & sourceSentences,
+        const std::vector<std::string> & targetSentences,
+        const std::vector<std::string> & alignmentStrings,
+        const int tmId);
+
     void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                              const int tmId);
-    
+
 private:
     void _getSourceSentencesAndAlignments(
         std::vector<std::string> & sourceSentences,
         std::vector<std::vector<std::vector<int> > > & allAlignments,
@@ -48,7 +58,9 @@ private:
         const std::vector<std::string> & rawSourceSentences);
 
     boost::shared_ptr<boost::ptr_map<int, Concordia> > _concordiasMap;
-    
+
+    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
+
     UnitDAO _unitDAO;
 };
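To make _getSourceSentencesAndAlignments concrete: each raw line is a sequence of token ({ numbers }) groups as produced by GIZA/mgiza. NULL groups are dropped, the surviving tokens are concatenated into the recovered (here: lemmatized) source sentence, and the numbers in the braces become that token's list of aligned target positions. An illustrative line:

    raw line:            NULL ({ }) tomorrow ({ 1 }) be ({ }) holiday ({ 3 4 })
    recovered sentence:  tomorrow be holiday
    per-token targets:   tomorrow -> { 1 }, be -> { }, holiday -> { 3 4 }

Note that _addAlignedUnit in unit_dao.cpp (further below) rejects a unit whose source token count differs from the number of alignment groups, so in addAlignedLemmatizedSentences the raw source text must tokenize to exactly as many tokens as the lemmatized alignment line has groups.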
diff --git a/concordia-server/lemmatizer_facade.cpp b/concordia-server/lemmatizer_facade.cpp
index f6adc31..43b0aae 100644
--- a/concordia-server/lemmatizer_facade.cpp
+++ b/concordia-server/lemmatizer_facade.cpp
@@ -1,5 +1,7 @@
 #include "lemmatizer_facade.hpp"
 
+#include <boost/foreach.hpp>
+
 LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
     _lemmatizersMap = boost::ptr_map<std::string, SocketLemmatizer>();
@@ -28,3 +30,26 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) {
     }
 
 }
+
+std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
+    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
+    if (tmInfo.first) {
+        return lemmatizeSentence(tmInfo.second, pattern);
+    } else {
+        return pattern;
+    }
+}
+
+std::vector<std::string> LemmatizerFacade::lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId) {
+    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
+    if (tmInfo.first) {
+        std::vector<std::string> result;
+        BOOST_FOREACH(std::string & pattern, patterns) {
+            result.push_back(lemmatizeSentence(tmInfo.second, pattern));
+        }
+        return result;
+    } else {
+        return patterns;
+    }
+}
diff --git a/concordia-server/lemmatizer_facade.hpp b/concordia-server/lemmatizer_facade.hpp
index 7eea156..e9f5c3e 100644
--- a/concordia-server/lemmatizer_facade.hpp
+++ b/concordia-server/lemmatizer_facade.hpp
@@ -2,6 +2,7 @@
 #define LEMMATIZER_FACADE_HDR
 
 #include "socket_lemmatizer.hpp"
+#include "tm_dao.hpp"
 
 #include 
 #include 
@@ -18,8 +19,15 @@ public:
     virtual ~LemmatizerFacade();
 
     std::string lemmatizeSentence(std::string languageCode, std::string sentence);
+
+    std::string lemmatizeIfNeeded(std::string pattern, int tmId);
+
+    std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);
+
 private:
     boost::ptr_map<std::string, SocketLemmatizer> _lemmatizersMap;
+
+    TmDAO _tmDAO;
 };
 
 #endif
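lemmatizeIfNeeded() and lemmatizeSentencesIfNeeded() call TmDAO::getTmInfo() on every invocation, which means one extra database round-trip per search request and per indexing batch. The lemmatized flag and source-language code do not change for an existing TM, so they are natural candidates for memoization if that query ever shows up in profiles. A sketch (the cache member and helper are hypothetical, not part of this patch; the member declaration would live in lemmatizer_facade.hpp):

    #include <map>
    #include <utility>

    // Hypothetical addition to LemmatizerFacade: cache getTmInfo() results per tmId.
    std::map<int, std::pair<bool, std::string> > _tmInfoCache;

    std::pair<bool, std::string> LemmatizerFacade::_getTmInfoCached(int tmId) {
        std::map<int, std::pair<bool, std::string> >::iterator it = _tmInfoCache.find(tmId);
        if (it == _tmInfoCache.end()) {
            // First lookup for this TM: fetch from the database and remember it.
            it = _tmInfoCache.insert(std::make_pair(tmId, _tmDAO.getTmInfo(tmId))).first;
        }
        return it->second;
    }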
diff --git a/concordia-server/searcher_controller.cpp b/concordia-server/searcher_controller.cpp
index 11d36ac..dd7eb03 100644
--- a/concordia-server/searcher_controller.cpp
+++ b/concordia-server/searcher_controller.cpp
@@ -8,9 +8,11 @@
 #include "logger.hpp"
 
-SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap)
+SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
+                                       boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                                throw(ConcordiaException):
-                            _concordiasMap(concordiasMap) {
+                            _concordiasMap(concordiasMap),
+                            _lemmatizerFacade(lemmatizerFacade) {
 }
 
 SearcherController::~SearcherController() {
@@ -22,6 +24,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                       const int tmId) {
     boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
         std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
 
         jsonWriter.StartObject();
@@ -30,48 +33,49 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
         jsonWriter.String("results");
         jsonWriter.StartArray();
         BOOST_FOREACH(SimpleSearchResult & result, results) {
-            JsonGenerator::writeSearchResult(jsonWriter, result);            
-        }        
+            JsonGenerator::writeSearchResult(jsonWriter, result);
+        }
         jsonWriter.EndArray();
         jsonWriter.EndObject();
     } else {
-        JsonGenerator::signalError(jsonWriter, "no such tm!");        
+        JsonGenerator::signalError(jsonWriter, "no such tm!");
     }
 }
 
 void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                                std::string & pattern,
                                                const std::vector<Interval> & intervals,
-                                               const int tmId) {                                                   
+                                               const int tmId) {
     boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
         if (intervals.size() > 0) {
             // std::string shortPattern = pattern.substr(intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
+            pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
             std::string shortPattern = _substrUTF8(pattern, intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
-            
+
             Logger::log("concordiaPhraseSearch");
             Logger::logString("short pattern", shortPattern);
             std::vector<SimpleSearchResult> shortPatternResults = _unitDAO.getSearchResults(it->second->simpleSearch(shortPattern));
-            
-            
-            
+
+
+
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
             jsonWriter.String("found");
             if (shortPatternResults.size() > 0) {
                 jsonWriter.Bool(true);
-                
-                
+
+
                 std::vector<SimpleSearchResult> bestOverlay;
-                
+
                 int currStart = 0;
                 BOOST_FOREACH(const Interval & interval, intervals) {
                     CompleteConcordiaSearchResult restResult = _unitDAO.getConcordiaResult(
                              it->second->concordiaSearch(pattern.substr(currStart, interval.getStart()-currStart)));
                     restResult.offsetPattern(currStart);
                     bestOverlay.insert(bestOverlay.end(), restResult.getBestOverlay().begin(), restResult.getBestOverlay().end());
-                    
+
                     SimpleSearchResult shortPatternresult = shortPatternResults[0];
                     shortPatternresult.setMatchedPatternStart(interval.getStart());
                     shortPatternresult.setMatchedPatternEnd(interval.getEnd());
@@ -82,26 +86,26 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                 CompleteConcordiaSearchResult lastRestResult = _unitDAO.getConcordiaResult(
                          it->second->concordiaSearch(_substrUTF8(pattern,currStart,INT_MAX)));
                 lastRestResult.offsetPattern(currStart);
                 bestOverlay.insert(bestOverlay.end(), lastRestResult.getBestOverlay().begin(), lastRestResult.getBestOverlay().end());
-                
+
                 jsonWriter.String("result");
                 jsonWriter.StartObject();
                 jsonWriter.String("bestOverlay");
                 jsonWriter.StartArray();
                 BOOST_FOREACH(SimpleSearchResult & simpleResult, bestOverlay) {
-                    JsonGenerator::writeSearchResult(jsonWriter, simpleResult);                    
-                }                
+                    JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
+                }
                 jsonWriter.EndArray();
                 jsonWriter.EndObject();
             } else {
-                jsonWriter.Bool(false);                
+                jsonWriter.Bool(false);
             }
             jsonWriter.EndObject();
         } else {
             JsonGenerator::signalError(jsonWriter, "no intervals for phrase search");
         }
     } else {
-        JsonGenerator::signalError(jsonWriter, "no such tm!");        
-    }    
+        JsonGenerator::signalError(jsonWriter, "no such tm!");
+    }
 }
 
@@ -111,8 +115,9 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                          const int tmId) {
     boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
         CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
-        
+
         jsonWriter.StartObject();
         jsonWriter.String("status");
         jsonWriter.String("success");
diff --git a/concordia-server/searcher_controller.hpp b/concordia-server/searcher_controller.hpp
--- a/concordia-server/searcher_controller.hpp
+++ b/concordia-server/searcher_controller.hpp
@@ -20,8 +21,9 @@ public:
     /*! Constructor.
     */
-    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap)
-                               throw(ConcordiaException);
+    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
+                                boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
+                               throw(ConcordiaException);
     /*! Destructor.
     */
     virtual ~SearcherController();
@@ -40,7 +42,9 @@ private:
     std::string _substrUTF8(std::string source, int start, int length);
 
     boost::shared_ptr<boost::ptr_map<int, Concordia> > _concordiasMap;
-    
+
+    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
+
     UnitDAO _unitDAO;
 };
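A subtlety in concordiaPhraseSearch(): the pattern is swapped for its lemmatized form before _substrUTF8() is applied, but the intervals were computed by the caller against the original pattern. Whenever lemmatization changes a token's length, the character offsets drift. A hypothetical English illustration:

    original pattern:    "these books are heavy"    interval [0,11) = "these books"
    lemmatized pattern:  "this book be heavy"       interval [0,11) = "this book b"

If the phrase boundaries are meant to be exact, the intervals would have to be recomputed on the lemmatized text, or the short-pattern extraction performed before lemmatization.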
diff --git a/concordia-server/tm_dao.cpp b/concordia-server/tm_dao.cpp
index 1319907..4b2e2da 100644
--- a/concordia-server/tm_dao.cpp
+++ b/concordia-server/tm_dao.cpp
@@ -3,6 +3,7 @@
 #include "query_param.hpp"
 #include "string_param.hpp"
 #include "int_param.hpp"
+#include "bool_param.hpp"
 #include "int_array_param.hpp"
 
 #include "logger.hpp"
@@ -27,20 +28,25 @@ std::vector<int> TmDAO::getTmIds() {
     }
     connection.clearResult(dbResult);
     connection.endTransaction();
-    
+
     return result;
 }
 
 int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
+    return addTm(sourceLangId, targetLangId, name, false);
+}
+
+int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
     DBconnection connection;
     connection.startTransaction();
-    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name) values($1::integer,$2::integer,$3::text) RETURNING id";
+    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name, lemmatized) values($1::integer,$2::integer,$3::text,$4::bool) RETURNING id";
     std::vector<QueryParam*> params;
     params.push_back(new IntParam(sourceLangId));
     params.push_back(new IntParam(targetLangId));
     params.push_back(new StringParam(name));
-    
+    params.push_back(new BoolParam(lemmatized));
+
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
     connection.clearResult(result);
@@ -48,8 +54,25 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
     BOOST_FOREACH (QueryParam * param, params) {
         delete param;
     }
-    
+
     return newId;
 }
 
+std::pair<bool, std::string> TmDAO::getTmInfo(int tmId) {
+    DBconnection connection;
+    connection.startTransaction();
+    std::string query = "select tm.id, tm.lemmatized, language.code from tm inner join language on language.id = tm.source_lang_id where tm.id = $1::integer;";
+    std::vector<QueryParam*> params;
+    params.push_back(new IntParam(tmId));
+    PGresult * dbResult = connection.execute(query, params);
+    bool lemmatized = connection.getBoolValue(dbResult, 0, 1);
+    std::string languageCode = connection.getStringValue(dbResult, 0, 2);
+    connection.clearResult(dbResult);
+    connection.endTransaction();
+    BOOST_FOREACH (QueryParam * param, params) {
+        delete param;
+    }
+
+    return std::pair<bool, std::string>(lemmatized, languageCode);
+}
diff --git a/concordia-server/tm_dao.hpp b/concordia-server/tm_dao.hpp
index e43822a..4db8097 100644
--- a/concordia-server/tm_dao.hpp
+++ b/concordia-server/tm_dao.hpp
@@ -3,6 +3,7 @@
 #include 
 #include 
+#include <utility>
 #include 
 
 #include "db_connection.hpp"
@@ -18,8 +19,12 @@ public:
     int addTm(const int sourceLangId, const int targetLangId, const std::string name);
 
+    int addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized);
+
     std::vector<int> getTmIds();
 
+    std::pair<bool, std::string> getTmInfo(int tmId);
+
 private:
 
 };
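TmDAO::getTmInfo() assumes the tm row exists: for an unknown tmId the result set is empty and row 0 is read anyway. A guarded variant could bail out early (sketch; the method name is illustrative, the helpers are the DBconnection ones from this patch):

    // Sketch: refuse to read row 0 of an empty result.
    std::pair<bool, std::string> TmDAO::getTmInfoChecked(int tmId) {
        DBconnection connection;
        connection.startTransaction();
        std::string query = "SELECT tm.lemmatized, language.code FROM tm "
                            "INNER JOIN language ON language.id = tm.source_lang_id "
                            "WHERE tm.id = $1::integer;";
        std::vector<QueryParam*> params;
        params.push_back(new IntParam(tmId));
        PGresult * dbResult = connection.execute(query, params);
        bool hasRow = connection.getRowCount(dbResult) > 0;
        bool lemmatized = hasRow ? connection.getBoolValue(dbResult, 0, 0) : false;
        std::string languageCode = hasRow ? connection.getStringValue(dbResult, 0, 1) : "";
        connection.clearResult(dbResult);
        connection.endTransaction();
        BOOST_FOREACH (QueryParam * param, params) {
            delete param;
        }
        if (!hasRow) {
            throw ConcordiaException("getTmInfo: no such tm");
        }
        return std::pair<bool, std::string>(lemmatized, languageCode);
    }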
diff --git a/concordia-server/unit_dao.cpp b/concordia-server/unit_dao.cpp
index 16a1e92..6a1a68c 100644
--- a/concordia-server/unit_dao.cpp
+++ b/concordia-server/unit_dao.cpp
@@ -22,7 +22,7 @@ int UnitDAO::addSentence(
     const TokenizedSentence & sourceSentence,
     const std::string & targetSentence,
     const int tmId) {
-    
+
     DBconnection connection;
     connection.startTransaction();
     int newId = _addSingleSentence(connection, sourceSentence, targetSentence, tmId);
@@ -38,7 +38,7 @@ std::vector<int> UnitDAO::addSentences(
     std::vector<int> newIds;
     connection.startTransaction();
     int index = 0;
-    BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {        
+    BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {
         newIds.push_back(_addSingleSentence(connection, sourceSentence, targetSentences.at(index), tmId));
         index++;
     }
@@ -50,7 +50,7 @@ std::vector<int> UnitDAO::addAlignedSentences(
     const std::vector<TokenizedSentence> & sourceSentences,
     const std::vector<TokenizedSentence> & targetSentences,
     const std::vector<std::vector<std::vector<int> > > & allAlignments,
-    const int tmId) {
+    const int tmId) throw (ConcordiaException) {
 
     DBconnection connection;
     std::vector<int> newIds;
@@ -59,9 +59,9 @@ std::vector<int> UnitDAO::addAlignedSentences(
     for (int i=0; i< sourceSentences.size(); i++) {
         newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
     }
-    
+
     connection.endTransaction();
-    return newIds;    
+    return newIds;
 }
 
 std::vector<SimpleSearchResult> UnitDAO::getSearchResults(const std::vector<MatchedPatternFragment> & fragments) {
@@ -83,7 +83,7 @@ void UnitDAO::_getResultsFromFragments(
     std::vector<SimpleSearchResult> & results,
     const std::vector<MatchedPatternFragment> & fragments,
     const TokenizedSentence & tokenizedPattern) {
-    
+
     DBconnection connection;
     connection.startTransaction();
 
@@ -95,9 +95,9 @@ void UnitDAO::_getResultsFromFragments(
         matchedPatternStart = tokenizedPattern.getTokens().at(fragment.getStart()).getStart();
         matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
     }
-    
-    
-    
+
+
+
     std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
     std::vector<QueryParam*> params;
     params.push_back(new IntParam(2*fragment.getExampleOffset()+1));
@@ -116,7 +116,7 @@ void UnitDAO::_getResultsFromFragments(
         delete param;
     }
 
-    // now add all target fragments matched with this fragment    
+    // now add all target fragments matched with this fragment
     std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
     std::vector<QueryParam*> targetParams;
     targetParams.push_back(new IntParam(fragment.getExampleId()));
@@ -127,12 +127,12 @@ void UnitDAO::_getResultsFromFragments(
     int prevPos = -2;
     int currStart = -1;
     int currEnd = -1;
-    
+
     for (int i=0;i<connection.getRowCount(targetResult);i++) {
@@ -141,7 +141,7 @@ void UnitDAO::_getResultsFromFragments(
             currStart = targetStart;
         }
 
-        currEnd = targetEnd;        
+        currEnd = targetEnd;
         prevPos = targetPos;
     }
@@ -154,9 +154,9 @@ void UnitDAO::_getResultsFromFragments(
     BOOST_FOREACH (QueryParam * param, targetParams) {
         delete param;
     }
-    
+
     results.push_back(ssResult);
-    }    
+    }
 
     connection.endTransaction();
 }
@@ -181,25 +181,29 @@ int UnitDAO::_addSingleSentence(
     params.push_back(new StringParam(targetSentence));
     params.push_back(new IntParam(tmId));
     params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
-    
+
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
     connection.clearResult(result);
     BOOST_FOREACH (QueryParam * param, params) {
         delete param;
     }
-    
+
     return newId;
 }
 
-int UnitDAO::_addAlignedUnit(
+int UnitDAO::_addAlignedUnit (
     DBconnection & connection,
     const TokenizedSentence & sourceSentence,
     const TokenizedSentence & targetSentence,
     const std::vector<std::vector<int> > & alignments,
-    const int tmId) {
-    
+    const int tmId) throw(ConcordiaException) {
+
+    if (sourceSentence.getTokens().size() != alignments.size()) {
+        throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
+    }
+
     std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
     std::vector<QueryParam*> params;
     params.push_back(new StringParam(sourceSentence.getSentence()));
@@ -207,14 +211,14 @@ int UnitDAO::_addAlignedUnit (
     params.push_back(new IntParam(tmId));
     params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
     params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
-    
+
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
     connection.clearResult(result);
     BOOST_FOREACH (QueryParam * param, params) {
         delete param;
     }
-    
+
     // add alignments
     bool nonEmpty = false;
     std::stringstream alignmentsQuery;
@@ -230,10 +234,8 @@ int UnitDAO::_addAlignedUnit (
         query = alignmentsQuery.str();
         query = query.substr(0, query.length()-1);
         PGresult * result = connection.execute(query);
-        connection.clearResult(result);        
+        connection.clearResult(result);
     }
     return newId;
 }
-
-
diff --git a/concordia-server/unit_dao.hpp b/concordia-server/unit_dao.hpp
index 875fa0a..7159320 100644
--- a/concordia-server/unit_dao.hpp
+++ b/concordia-server/unit_dao.hpp
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "simple_search_result.hpp"
@@ -33,13 +34,13 @@ public:
         const std::vector<std::string> & sourceSentences,
         const std::vector<std::string> & targetSentences,
         const int tmId);
-    
+
     std::vector<int> addAlignedSentences(
         const std::vector<TokenizedSentence> & sourceSentences,
         const std::vector<TokenizedSentence> & targetSentences,
         const std::vector<std::vector<std::vector<int> > > & allAlignments,
-        const int tmId);
-    
+        const int tmId) throw (ConcordiaException);
+
     std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
 
     CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);
@@ -50,7 +51,7 @@ private:
         const TokenizedSentence & tokenizedPattern);
 
     std::vector<int> _getTokenPositions(const TokenizedSentence & ts);
-    
+
     int _addSingleSentence(
         DBconnection & connection,
         const TokenizedSentence & sourceSentence,
@@ -62,7 +63,7 @@ private:
         const TokenizedSentence & sourceSentence,
         const TokenizedSentence & targetSentence,
         const std::vector<std::vector<int> > & alignments,
-        const int tmId);
+        const int tmId) throw(ConcordiaException);
 };
 
 #endif
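The source_tokens / target_tokens columns store token boundaries as one flat integer array of (start, end) pairs, which is why the query above reads source_tokens[2*offset+1] and source_tokens[2*offset+2]: PostgreSQL arrays are 1-based, so the pair for token number "offset" (0-based) sits at those two slots. An illustrative example:

    sentence:          "ala ma kota"
    source_tokens:     {0,3,4,6,7,11}
    token #1 ("ma"):   start = source_tokens[3] = 4, end = source_tokens[4] = 6

The same layout is what _getTokenPositions() produces when units are inserted.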
diff --git a/db/concordia_server.sql b/db/concordia_server.sql
index 125df22..c8a8a21 100644
--- a/db/concordia_server.sql
+++ b/db/concordia_server.sql
@@ -3,7 +3,8 @@ CREATE TABLE tm (
     id SERIAL PRIMARY KEY,
     source_lang_id integer,
     target_lang_id integer,
-    name varchar(40)
+    name varchar(40),
+    lemmatized bool DEFAULT false
 );
 
 DROP TABLE IF EXISTS language;
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/.vs/LemmaGenSentenceLemmatizer/v14/.suo b/mgiza-aligner/LemmaGenSentenceLemmatizer/.vs/LemmaGenSentenceLemmatizer/v14/.suo
index ef1ddee..b0abdc9 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/.vs/LemmaGenSentenceLemmatizer/v14/.suo and b/mgiza-aligner/LemmaGenSentenceLemmatizer/.vs/LemmaGenSentenceLemmatizer/v14/.suo differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/Program.cs b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/Program.cs
index 4c18358..f53b436 100644
--- a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/Program.cs
+++ b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/Program.cs
@@ -12,12 +12,20 @@ namespace LemmaGenSentenceLemmatizer
         {
             if (args.Length == 1)
             {
-                SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
-                string line = Console.ReadLine();
-                while (!string.IsNullOrEmpty(line))
+                try
                 {
-                    Console.WriteLine(lemmatizer.lemmatizeSentence(line));
-                    line = Console.ReadLine();
+                    SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
+                    string line = Console.ReadLine();
+                    while (line != null)
+                    {
+                        Console.WriteLine(lemmatizer.lemmatizeSentence(line));
+                        line = Console.ReadLine();
+                    }
+                }
+                catch (Exception ex)
+                {
+                    Console.WriteLine("Exception occurred: " + ex.Message);
                 }
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe
index 3f36045..d33ee58 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.pdb b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.pdb
index 7fafbe7..92ae405 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.pdb and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.pdb differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/DesignTimeResolveAssemblyReferences.cache b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/DesignTimeResolveAssemblyReferences.cache
new file mode 100644
index 0000000..5767a4e
Binary files /dev/null and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/DesignTimeResolveAssemblyReferences.cache differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache
index b26246f..70c7d05 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.exe b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.exe
index 3f36045..d33ee58 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.exe and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.exe differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.pdb b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.pdb
index 7fafbe7..92ae405 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.pdb and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.pdb differ
diff --git a/mgiza-aligner/Makefile b/mgiza-aligner/Makefile
index 7e384c0..aebcaeb 100644
--- a/mgiza-aligner/Makefile
+++ b/mgiza-aligner/Makefile
@@ -1,10 +1,22 @@
 SRC_LANG=en
 TRG_LANG=pl
-CORPUS_NAME=europarl
+CORPUS_NAME=europarljrc
 
 all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
 	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
-	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
+	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
+
+clean-intermediate-files:
+	rm -f corpora/$(CORPUS_NAME)/*.lem
+	rm -f corpora/$(CORPUS_NAME)/*.low
+	rm -f corpora/$(CORPUS_NAME)/*.classes
+	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
+	rm -f corpora/$(CORPUS_NAME)/*.vcb
+	rm -f corpora/$(CORPUS_NAME)/*.snt
+	rm -f corpora/$(CORPUS_NAME)/*.cooc
+	rm -f corpora/$(CORPUS_NAME)/aligned*part*
+	rm -f corpora/$(CORPUS_NAME)/giza.cfg
+
 clean:
 	rm -f corpora/$(CORPUS_NAME)/*.tok
diff --git a/mgiza-aligner/clean-corpus-n.perl b/mgiza-aligner/clean-corpus-n.perl
new file mode 100755
index 0000000..76a09e5
--- /dev/null
+++ b/mgiza-aligner/clean-corpus-n.perl
@@ -0,0 +1,168 @@
+#!/usr/bin/env perl
+#
+# This file is part of moses. Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
+use warnings;
+use strict;
+use Getopt::Long;
+my $help;
+my $lc = 0; # lowercase the corpus?
+my $ignore_ratio = 0;
+my $ignore_xml = 0;
+my $enc = "utf8"; # encoding of the input and output files
+    # set to anything else you wish, but I have not tested it yet
+my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
+    # is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000)
+    # and crashes if it encounters a word that exceeds it
+my $ratio = 9;
+
+GetOptions(
+  "help" => \$help,
+  "lowercase|lc" => \$lc,
+  "encoding=s" => \$enc,
+  "ratio=f" => \$ratio,
+  "ignore-ratio" => \$ignore_ratio,
+  "ignore-xml" => \$ignore_xml,
+  "max-word-length|mwl=s" => \$max_word_length
+) or exit(1);
+
+if (scalar(@ARGV) < 6 || $help) {
+    print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n";
+    exit;
+}
+
+my $corpus = $ARGV[0];
+my $l1 = $ARGV[1];
+my $l2 = $ARGV[2];
+my $out = $ARGV[3];
+my $min = $ARGV[4];
+my $max = $ARGV[5];
+
+my $linesRetainedFile = "";
+if (scalar(@ARGV) > 6) {
+  $linesRetainedFile = $ARGV[6];
+  open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile";
+}
+
+print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n";
+
+my $opn = undef;
+my $l1input = "$corpus.$l1";
+if (-e $l1input) {
+  $opn = $l1input;
+} elsif (-e $l1input.".gz") {
+  $opn = "gunzip -c $l1input.gz |";
+} else {
+  die "Error: $l1input does not exist";
+}
+open(F,$opn) or die "Can't open '$opn'";
+$opn = undef;
+my $l2input = "$corpus.$l2";
+if (-e $l2input) {
+  $opn = $l2input;
+} elsif (-e $l2input.".gz") {
+  $opn = "gunzip -c $l2input.gz |";
+} else {
+  die "Error: $l2input does not exist";
+}
+
+open(E,$opn) or die "Can't open '$opn'";
+
+open(FO,">$out.$l1") or die "Can't write $out.$l1";
+open(EO,">$out.$l2") or die "Can't write $out.$l2";
+
+# necessary for proper lowercasing
+my $binmode;
+if ($enc eq "utf8") {
+  $binmode = ":utf8";
+} else {
+  $binmode = ":encoding($enc)";
+}
+binmode(F, $binmode);
+binmode(E, $binmode);
+binmode(FO, $binmode);
+binmode(EO, $binmode);
+
+my $innr = 0;
+my $outnr = 0;
+my $factored_flag;
+while(my $f = <F>) {
+  $innr++;
+  print STDERR "." if $innr % 10000 == 0;
+  print STDERR "($innr)" if $innr % 100000 == 0;
+  my $e = <E>;
+  die "$corpus.$l2 is too short!" if !defined $e;
+  chomp($e);
+  chomp($f);
+  if ($innr == 1) {
+    $factored_flag = ($e =~ /\|/ || $f =~ /\|/);
+  }
+
+  #if lowercasing, lowercase
+  if ($lc) {
+    $e = lc($e);
+    $f = lc($f);
+  }
+
+  $e =~ s/\|//g unless $factored_flag;
+  $e =~ s/\s+/ /g;
+  $e =~ s/^ //;
+  $e =~ s/ $//;
+  $f =~ s/\|//g unless $factored_flag;
+  $f =~ s/\s+/ /g;
+  $f =~ s/^ //;
+  $f =~ s/ $//;
+  next if $f eq '';
+  next if $e eq '';
+
+  my $ec = &word_count($e);
+  my $fc = &word_count($f);
+  next if $ec > $max;
+  next if $fc > $max;
+  next if $ec < $min;
+  next if $fc < $min;
+  next if !$ignore_ratio && $ec/$fc > $ratio;
+  next if !$ignore_ratio && $fc/$ec > $ratio;
+  # Skip this segment if any factor is longer than $max_word_length
+  my $max_word_length_plus_one = $max_word_length + 1;
+  next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
+  next if $f =~ /[^\s\|]{$max_word_length_plus_one}/;
+ die "There is a blank factor in $corpus.$l1 on line $innr: $f" + if $f =~ /[ \|]\|/; + die "There is a blank factor in $corpus.$l2 on line $innr: $e" + if $e =~ /[ \|]\|/; + + $outnr++; + print FO $f."\n"; + print EO $e."\n"; + + if ($linesRetainedFile ne "") { + print LINES_RETAINED $innr."\n"; + } +} + +if ($linesRetainedFile ne "") { + close LINES_RETAINED; +} + +print STDERR "\n"; +my $e = ; +die "$corpus.$l2 is too long!" if defined $e; + +print STDERR "Input sentences: $innr Output sentences: $outnr\n"; + +sub word_count { + my ($line) = @_; + if ($ignore_xml) { + $line =~ s/<\S[^>]*\S>/ /g; + $line =~ s/\s+/ /g; + $line =~ s/^ //g; + $line =~ s/ $//g; + } + my @w = split(/ /,$line); + return scalar @w; +} diff --git a/mgiza-aligner/sortGizaAlignments.py b/mgiza-aligner/sortGizaAlignments.py new file mode 100755 index 0000000..e6762ca --- /dev/null +++ b/mgiza-aligner/sortGizaAlignments.py @@ -0,0 +1,26 @@ +#!/usr/bin/python3 + +import sys, re + +examples_dict = {} +p = re.compile("# Sentence pair \((\d+)\)") + +i = 0 +for line in sys.stdin: + line = line.strip() + if i % 3 == 0: + current_example = [line] + m = p.match(line) + if m: + current_key = int(m.group(1)) + else: + raise Exception("Wrong line: "+line) + elif i % 3 == 1: + current_example.append(line) + else: + current_example.append(line) + examples_dict[current_key] = current_example + i+=1 + +for key in sorted(examples_dict.keys()): + print ('\n'.join(examples_dict[key])) diff --git a/tests/addAlignedLemmatizedTM.py b/tests/addAlignedLemmatizedTM.py old mode 100644 new mode 100755 index ee8a246..aecc7dd --- a/tests/addAlignedLemmatizedTM.py +++ b/tests/addAlignedLemmatizedTM.py @@ -21,10 +21,15 @@ def file_len(fname): pass return i + 1 -def add_data(data): +def add_examples(examplesData): req = urllib2.Request(address) req.add_header('Content-Type', 'application/json') - json.loads(urllib2.urlopen(req, json.dumps(data)).read()) + response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read()) + if response['status'] == 'error': + raise Exception(response['message']) + +if len(sys.argv) != 7: + raise Exception("wrong number of arguments") name = sys.argv[1] sourceFile = sys.argv[2] @@ -40,13 +45,14 @@ if (file_len(alignmentsFile) != 3*file_len(sourceFile)): raise Exception("alignments file is not exactly 3 times longer than source and target") -totalLines = file_len(sourceFile) +totalExamples = file_len(sourceFile) data = { 'operation': 'addTm', 'sourceLangId':sourceLangId, 'targetLangId':targetLangId, - 'name':name + 'name':name, + 'tmLemmatized':True } req = urllib2.Request(address) @@ -60,35 +66,35 @@ data = { 'tmId':tmId } -sentences = [] +examples = [] start = time.time() -with open(sourceFile) as sourceLines, open(targetFile) as targetLines, open(alignmentsFile) as alignmentsLines: +with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af: + for lineNumber in range(totalExamples): + sourceSentence = sf.readline().strip() + targetSentence = tf.readline().strip() - lineNumber = 0 - for line in sourceLines: - line = line.strip() - if lineNumber % 3 == 1: - currSentence.append(line) - elif lineNumber % 3 == 2: - currSentence.append(line) - currSentence.reverse() - sentences.append(currSentence) - currSentence = [] - if len(sentences) >= BUFFER_SIZE: - data['sentences'] = sentences - add_data(data) - mark = time.time() - print "Added %d of %d sentences. 
diff --git a/tests/addAlignedLemmatizedTM.py b/tests/addAlignedLemmatizedTM.py
old mode 100644
new mode 100755
index ee8a246..aecc7dd
--- a/tests/addAlignedLemmatizedTM.py
+++ b/tests/addAlignedLemmatizedTM.py
@@ -21,10 +21,15 @@ def file_len(fname):
             pass
     return i + 1
 
-def add_data(data):
+def add_examples(examplesData):
     req = urllib2.Request(address)
     req.add_header('Content-Type', 'application/json')
-    json.loads(urllib2.urlopen(req, json.dumps(data)).read())
+    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read())
+    if response['status'] == 'error':
+        raise Exception(response['message'])
+
+if len(sys.argv) != 7:
+    raise Exception("wrong number of arguments")
 
 name = sys.argv[1]
 sourceFile = sys.argv[2]
@@ -40,13 +45,14 @@ if (file_len(alignmentsFile) != 3*file_len(sourceFile)):
     raise Exception("alignments file is not exactly 3 times longer than source and target")
 
-totalLines = file_len(sourceFile)
+totalExamples = file_len(sourceFile)
 
 data = {
     'operation': 'addTm',
     'sourceLangId':sourceLangId,
     'targetLangId':targetLangId,
-    'name':name
+    'name':name,
+    'tmLemmatized':True
 }
 
 req = urllib2.Request(address)
@@ -60,35 +66,35 @@ data = {
     'tmId':tmId
 }
 
-sentences = []
+examples = []
 start = time.time()
-with open(sourceFile) as sourceLines, open(targetFile) as targetLines, open(alignmentsFile) as alignmentsLines:
+with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
+    for lineNumber in range(totalExamples):
+        sourceSentence = sf.readline().strip()
+        targetSentence = tf.readline().strip()
 
-    lineNumber = 0
-    for line in sourceLines:
-        line = line.strip()
-        if lineNumber % 3 == 1:
-            currSentence.append(line)
-        elif lineNumber % 3 == 2:
-            currSentence.append(line)
-            currSentence.reverse()
-            sentences.append(currSentence)
-            currSentence = []
-            if len(sentences) >= BUFFER_SIZE:
-                data['sentences'] = sentences
-                add_data(data)
-                mark = time.time()
-                print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/(3*(mark-start)))
-                sentences = []
-        lineNumber += 1
+        # skip two lines of the alignments file; these are lemmatized and we need the raw sentences from the source and target files.
+        af.readline()
+        af.readline()
+
+        alignmentString = af.readline().strip()
+
+        examples.append([sourceSentence, targetSentence, alignmentString])
+
+        if len(examples) >= BUFFER_SIZE:
+            data['examples'] = examples
+            add_examples(data)
+            mark = time.time()
+            print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
+            examples = []
 
-if len(sentences) > 0:
-    data['sentences'] = sentences
-    add_data(data)
+if len(examples) > 0:
+    data['examples'] = examples
+    add_examples(data)
 
 end = time.time()
-print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
+print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))
 
 print "Generating index..."
 start = time.time()
diff --git a/tests/addLemmatizedTM.sh b/tests/addLemmatizedTM.sh
new file mode 100755
index 0000000..ba53f7f
--- /dev/null
+++ b/tests/addLemmatizedTM.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+CORPUS_NAME="europarl_sample"
+SRC_LANG_ID=2
+TRG_LANG_ID=1
+
+./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
diff --git a/tests/addTm.py b/tests/addTm.py
index e3bfaa3..c36e791 100755
--- a/tests/addTm.py
+++ b/tests/addTm.py
@@ -16,7 +16,8 @@ data = {
     'operation': 'addTm',
     'sourceLangId':int(sys.argv[1]),
     'targetLangId':int(sys.argv[2]),
-    'name':sys.argv[3]
+    'name':sys.argv[3],
+    'tmLemmatized':bool(int(sys.argv[4]))
 }
 
 req = urllib2.Request(address)