From 89fb77bf5873abf367144b2076ae2456ed51d046 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Jaworski?=
Date: Fri, 10 Mar 2017 14:52:01 +0100
Subject: [PATCH] working lemmatization

---
 concordia-server/bool_param.cpp               |  24 +++
 concordia-server/bool_param.hpp               |  24 +++
 concordia-server/concordia_server.cpp         |  43 ++++-
 concordia-server/concordia_server.hpp         |   2 +
 concordia-server/config.hpp.in                |   3 +
 concordia-server/db_connection.cpp            |  22 ++-
 concordia-server/db_connection.hpp            |   2 +
 concordia-server/index_controller.cpp         |  93 +++++++---
 concordia-server/index_controller.hpp         |  18 +-
 concordia-server/lemmatizer_facade.cpp        |  25 +++
 concordia-server/lemmatizer_facade.hpp        |   8 +
 concordia-server/searcher_controller.cpp      |  62 +++----
 concordia-server/searcher_controller.hpp      |  10 +-
 concordia-server/tm_dao.cpp                   |  29 ++-
 concordia-server/tm_dao.hpp                   |   5 +
 concordia-server/unit_dao.cpp                 |  52 +++---
 concordia-server/unit_dao.hpp                 |  11 +-
 db/concordia_server.sql                       |   3 +-
 .../.vs/LemmaGenSentenceLemmatizer/v14/.suo   | Bin 38912 -> 52736 bytes
 .../LemmaGenSentenceLemmatizer/Program.cs     |  18 +-
 .../bin/Debug/LemmaGenSentenceLemmatizer.exe  | Bin 6656 -> 6656 bytes
 .../bin/Debug/LemmaGenSentenceLemmatizer.pdb  | Bin 15872 -> 15872 bytes
 .../DesignTimeResolveAssemblyReferences.cache | Bin 0 -> 713 bytes
 ...tizer.csprojResolveAssemblyReference.cache | Bin 13306 -> 22553 bytes
 .../obj/Debug/LemmaGenSentenceLemmatizer.exe  | Bin 6656 -> 6656 bytes
 .../obj/Debug/LemmaGenSentenceLemmatizer.pdb  | Bin 15872 -> 15872 bytes
 mgiza-aligner/Makefile                        |  16 +-
 mgiza-aligner/clean-corpus-n.perl             | 168 ++++++++++++++++++
 mgiza-aligner/sortGizaAlignments.py           |  26 +++
 tests/addAlignedLemmatizedTM.py               |  60 ++++---
 tests/addLemmatizedTM.sh                      |   7 +
 tests/addTm.py                                |   3 +-
 32 files changed, 592 insertions(+), 142 deletions(-)
 create mode 100644 concordia-server/bool_param.cpp
 create mode 100644 concordia-server/bool_param.hpp
 create mode 100644 mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/DesignTimeResolveAssemblyReferences.cache
 create mode 100755 mgiza-aligner/clean-corpus-n.perl
 create mode 100755 mgiza-aligner/sortGizaAlignments.py
 mode change 100644 => 100755 tests/addAlignedLemmatizedTM.py
 create mode 100755 tests/addLemmatizedTM.sh

diff --git a/concordia-server/bool_param.cpp b/concordia-server/bool_param.cpp
new file mode 100644
index 0000000..8029c8f
--- /dev/null
+++ b/concordia-server/bool_param.cpp
@@ -0,0 +1,24 @@
+#include "bool_param.hpp"
+
+
+BoolParam::BoolParam(bool value):_value(value) {
+}
+
+BoolParam::~BoolParam() {
+}
+
+const char * BoolParam::getValue() {
+    if (_value) {
+        return "t";
+    } else {
+        return "f";
+    }
+}
+
+const int BoolParam::getLength() {
+    return 1;
+}
+
+const int BoolParam::isBinary() {
+    return 0;
+}
diff --git a/concordia-server/bool_param.hpp b/concordia-server/bool_param.hpp
new file mode 100644
index 0000000..ddb08f5
--- /dev/null
+++ b/concordia-server/bool_param.hpp
@@ -0,0 +1,24 @@
+#ifndef BOOL_PARAM_HDR
+#define BOOL_PARAM_HDR
+
+#include "query_param.hpp"
+
+class BoolParam : public QueryParam {
+public:
+    /*! Constructor.
+    */
+    BoolParam(bool value);
+    /*! Destructor.
+    */
+    virtual ~BoolParam();
+
+    const char * getValue();
+
+    const int getLength();
+
+    const int isBinary();
+private:
+    bool _value;
+};
+
+#endif
diff --git a/concordia-server/concordia_server.cpp b/concordia-server/concordia_server.cpp
index d33fba2..a5689d7 100644
--- a/concordia-server/concordia_server.cpp
+++ b/concordia-server/concordia_server.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -19,16 +20,17 @@ ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
                                              throw(ConcordiaException) :
                                              _configFilePath(configFilePath) {
+
     std::vector<int> tmIds = _tmDAO.getTmIds();
     _concordiasMap = boost::shared_ptr<boost::ptr_map<int, Concordia> >(new boost::ptr_map<int, Concordia>());
     BOOST_FOREACH(int & tmId, tmIds) {
         _addTm(tmId);
     }
-    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
-    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));
-    _lemmatizerFacade = boost::shared_ptr<LemmatizerFacade> (new LemmatizerFacade());
+
+    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap, _lemmatizerFacade));
+    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap, _lemmatizerFacade));
 }
 
 ConcordiaServer::~ConcordiaServer() {
@@ -95,6 +97,27 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
                 }
             }
             _indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
+        } else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
+            std::vector<std::string> sourceSentences;
+            std::vector<std::string> targetSentences;
+            std::vector<std::string> alignmentStrings;
+            int tmId = d[TM_ID_PARAM].GetInt();
+            // loading data from json
+            const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
+            Logger::log("addAlignedLemmatizedSentences");
+            Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
+            Logger::logInt("tm id", tmId);
+            for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
+                if (sentencesArray[i].Size() != 3) {
+                    JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
+                    break;
+                } else {
+                    sourceSentences.push_back(sentencesArray[i][0].GetString());
+                    targetSentences.push_back(sentencesArray[i][1].GetString());
+                    alignmentStrings.push_back(sentencesArray[i][2].GetString());
+                }
+            }
+            _indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
         } else if (operation == "lemmatize") {
             std::string sentence = _getStringParameter(d, "sentence");
             std::string languageCode = _getStringParameter(d, "languageCode");
@@ -130,7 +153,8 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
             int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
             int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
             std::string name = _getStringParameter(d, NAME_PARAM);
-            int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
+            bool lemmatized = _getBoolParameter(d, TM_LEMMATIZED_PARAM);
+            int newId = _tmDAO.addTm(sourceLangId, targetLangId, name, lemmatized);
             _addTm(newId);
 
             jsonWriter.StartObject();
@@ -179,6 +203,17 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name)
     }
 }
 
+bool ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
+                                                 throw (ConcordiaException) {
+    rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
+    if (itr != d.MemberEnd()) {
+        bool value = itr->value.GetBool();
+        return value;
+    } else {
+        throw ConcordiaException("missing parameter: " + std::string(name));
+    }
+}
+
 void ConcordiaServer::_addTm(int tmId) {
     std::stringstream indexPath;
     indexPath << INDEX_DIRECTORY << "/tm_" << tmId;
diff --git a/concordia-server/concordia_server.hpp b/concordia-server/concordia_server.hpp
index c0e11c2..2822a9e 100644
--- a/concordia-server/concordia_server.hpp
+++ b/concordia-server/concordia_server.hpp
@@ -38,6 +38,8 @@ private:
     int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
 
+    bool _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
+
     void _addTm(int tmId);
 
     std::string _configFilePath;
diff --git a/concordia-server/config.hpp.in b/concordia-server/config.hpp.in
index dac7ae6..571d18a 100644
--- a/concordia-server/config.hpp.in
+++ b/concordia-server/config.hpp.in
@@ -16,7 +16,9 @@
 #define SOURCE_SENTENCE_PARAM "sourceSentence"
 #define TARGET_SENTENCE_PARAM "targetSentence"
 #define TM_ID_PARAM "tmId"
+#define TM_LEMMATIZED_PARAM "tmLemmatized"
 #define SENTENCES_PARAM "sentences"
+#define EXAMPLES_PARAM "examples"
 #define SOURCE_LANG_PARAM "sourceLangId"
 #define TARGET_LANG_PARAM "targetLangId"
 #define NAME_PARAM "name"
@@ -25,6 +27,7 @@
 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
 #define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
+#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
 #define REFRESH_INDEX_OP "refreshIndex"
 #define SIMPLE_SEARCH_OP "simpleSearch"
 #define CONCORDIA_SEARCH_OP "concordiaSearch"
diff --git a/concordia-server/db_connection.cpp b/concordia-server/db_connection.cpp
index c46516c..8b26eeb 100644
--- a/concordia-server/db_connection.cpp
+++ b/concordia-server/db_connection.cpp
@@ -17,7 +17,7 @@ DBconnection::DBconnection() throw(ConcordiaException) {
         ss << "Connection string: " << connectionInfo;
         throw ConcordiaException(ss.str());
     }
-
+
 }
 
 DBconnection::~DBconnection() {
@@ -90,8 +90,8 @@ PGresult * DBconnection::execute(std::string query,
             paramFormats[index] = param->isBinary();
             index++;
         }
-
-
+
+
         PGresult * result = PQexecParams(_connection,
                               query.c_str(),
                               params.size(),
@@ -129,7 +129,18 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (ConcordiaException) {
     } catch (std::exception & e) {
         std::stringstream ss;
         ss << "Error getting int value. Message: " << e.what();
-        throw ConcordiaException(ss.str());
+        throw ConcordiaException(ss.str());
+    }
+}
+
+bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
+    try {
+        char * valueStr = PQgetvalue(result,row,col);
+        return std::string(valueStr) == "t";
+    } catch (std::exception & e) {
+        std::stringstream ss;
+        ss << "Error getting bool value. Message: " << e.what();
+        throw ConcordiaException(ss.str());
     }
 }
 
@@ -150,7 +161,6 @@ int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
     } catch (std::exception & e) {
         std::stringstream ss;
         ss << "Error getting int value. Message: " << e.what();
-        throw ConcordiaException(ss.str());
+        throw ConcordiaException(ss.str());
     }
 }
-
diff --git a/concordia-server/db_connection.hpp b/concordia-server/db_connection.hpp
index c65fb35..9542fb8 100644
--- a/concordia-server/db_connection.hpp
+++ b/concordia-server/db_connection.hpp
@@ -31,6 +31,8 @@ public:
     int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
 
+    bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);
+
     std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
 
     int getRowCount(PGresult * result) throw (ConcordiaException);
diff --git a/concordia-server/index_controller.cpp b/concordia-server/index_controller.cpp
index 60d65f0..37de410 100644
--- a/concordia-server/index_controller.cpp
+++ b/concordia-server/index_controller.cpp
@@ -14,9 +14,11 @@
 #include "json_generator.hpp"
 #include "logger.hpp"
 
-IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap)
+IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
+                                 boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                               throw(ConcordiaException):
-                                              _concordiasMap(concordiasMap) {
+                                              _concordiasMap(concordiasMap),
+                                              _lemmatizerFacade(lemmatizerFacade) {
 }
 
 IndexController::~IndexController() {
@@ -32,9 +34,10 @@ void IndexController::addSentence(
     try {
         boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
+            TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
             TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
-            int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
-            it->second->addTokenizedExample(tokenizedSentence, sentenceId);
+            int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
+            it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
             it->second->refreshSAfromRAM();
 
             jsonWriter.StartObject();
@@ -42,20 +45,20 @@ void IndexController::addSentence(
             jsonWriter.String("success");
             jsonWriter.EndObject();
         } else {
-            JsonGenerator::signalError(jsonWriter, "no such tm!");
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
         }
     } catch (ConcordiaException & e) {
         std::stringstream errorstream;
         errorstream << "concordia error: " << e.what();
-        JsonGenerator::signalError(jsonWriter, errorstream.str());
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
     } catch (std::exception & e) {
         std::stringstream errorstream;
         errorstream << "general error: " << e.what();
-        JsonGenerator::signalError(jsonWriter, errorstream.str());
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (...) {
        std::stringstream errorstream;
        errorstream << "unexpected error occurred";
-        JsonGenerator::signalError(jsonWriter, errorstream.str());
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
 }
 
@@ -67,21 +70,22 @@ void IndexController::addSentences(
     try {
         boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
+            std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
             std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
             std::vector<int> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
-            it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
+            it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);
 
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
             jsonWriter.EndObject();
         } else {
-            JsonGenerator::signalError(jsonWriter, "no such tm!");
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
         }
     } catch (ConcordiaException & e) {
         std::stringstream errorstream;
         errorstream << "concordia error: " << e.what();
-        JsonGenerator::signalError(jsonWriter, errorstream.str());
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
     }
 }
 
@@ -96,28 +100,66 @@ void IndexController::addAlignedSentences(
             std::vector<std::string> sourceSentences;
             std::vector<std::vector<std::vector<int> > > allAlignments;
             _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
-
-            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
+
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
             std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
 
             std::vector<int> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
             for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
                 it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
-            }
+            }
 
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
             jsonWriter.EndObject();
         } else {
-            JsonGenerator::signalError(jsonWriter, "no such tm!");
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
         }
     } catch (ConcordiaException & e) {
         std::stringstream errorstream;
         errorstream << "concordia error: " << e.what();
-        JsonGenerator::signalError(jsonWriter, errorstream.str());
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
     }
 }
 
+void IndexController::addAlignedLemmatizedSentences(
+                rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
+                const std::vector<std::string> & sourceSentences,
+                const std::vector<std::string> & targetSentences,
+                const std::vector<std::string> & alignmentStrings,
+                const int tmId) {
+    try {
+        boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
+        if (it != _concordiasMap->end()) {
+            std::vector<std::string> lemmatizedSourceSentences;
+            std::vector<std::vector<std::vector<int> > > allAlignments;
+            _getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);
+
+            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
+
+            std::vector<int> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
+            for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
+                it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
+            }
+            jsonWriter.StartObject();
+            jsonWriter.String("status");
+            jsonWriter.String("success");
+            jsonWriter.EndObject();
+        } else {
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
+        }
+    } catch (ConcordiaException & e) {
+        std::stringstream errorstream;
+        errorstream << "concordia error: " << e.what();
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
+    }
+}
+
+
 void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                           const int tmId) {
     try {
@@ -130,12 +172,12 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
@@ -159,21 +201,21 @@ void IndexController::_getSourceSentencesAndAlignments(
         std::vector<std::vector<int> > alignments;
-
+
         UnicodeString s(rawSourceSentence.c_str());
         boost::u32regex_iterator<const UChar*> begin(
             boost::make_u32regex_iterator(
                 s,
                 boost::make_u32regex("(\\S+) \\(\\{([\\d ]*)\\}\\)")
             )
         );
         boost::u32regex_iterator<const UChar*> end;
-
+
         for (; begin != end; ++begin) {
             UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
             std::string token;
             tokenUTF8.toUTF8String(token);
             if (token != "NULL") {
-                std::string numbers((*begin)[2].first, (*begin)[2].second);
+                std::string numbers((*begin)[2].first, (*begin)[2].second);
                 std::istringstream iss(numbers);
                 std::vector<std::string> numberStrings;
                 std::copy(std::istream_iterator<std::string>(iss),
                           std::istream_iterator<std::string>(),
                           std::back_inserter(numberStrings));
-                std::vector<int> tokenAlignments;
+                std::vector<int> tokenAlignments;
                 for (int j=0;j<numberStrings.size();j++) {
                     tokenAlignments.push_back(atoi(numberStrings.at(j).c_str()));
                 }
diff --git a/concordia-server/index_controller.hpp b/concordia-server/index_controller.hpp
--- a/concordia-server/index_controller.hpp
+++ b/concordia-server/index_controller.hpp
@@ -24,7 +24,10 @@ public:
-    explicit IndexController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap)
+    explicit IndexController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
+                             boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                         throw(ConcordiaException);
     /*! Destructor.
     */
@@ -38,9 +41,16 @@ public:
         const std::vector<std::string> & targetSentences,
         const int tmId);
 
+    void addAlignedLemmatizedSentences(
+        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
+        const std::vector<std::string> & sourceSentences,
+        const std::vector<std::string> & targetSentences,
+        const std::vector<std::string> & alignmentStrings,
+        const int tmId);
+
     void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                              const int tmId);
-
+
 private:
     void _getSourceSentencesAndAlignments(
         std::vector<std::string> & sourceSentences,
         std::vector<std::vector<std::vector<int> > > & allAlignments,
         const std::vector<std::string> & rawSourceSentences);
 
     boost::shared_ptr<boost::ptr_map<int, Concordia> > _concordiasMap;
-
+
+    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
+
     UnitDAO _unitDAO;
 };
 
diff --git a/concordia-server/lemmatizer_facade.cpp b/concordia-server/lemmatizer_facade.cpp
index f6adc31..43b0aae 100644
--- a/concordia-server/lemmatizer_facade.cpp
+++ b/concordia-server/lemmatizer_facade.cpp
@@ -1,5 +1,7 @@
 #include "lemmatizer_facade.hpp"
 
+#include <boost/foreach.hpp>
+
 LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
     _lemmatizersMap = boost::ptr_map<std::string, SocketLemmatizer>();
 
@@ -28,3 +30,26 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) {
     }
 
 }
+
+std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
+    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
+    if (tmInfo.first) {
+        return lemmatizeSentence(tmInfo.second, pattern);
+    } else {
+        return pattern;
+    }
+}
+
+std::vector<std::string> LemmatizerFacade::lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId) {
+    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
+    if (tmInfo.first) {
+        std::vector<std::string> result;
+        BOOST_FOREACH(std::string & pattern, patterns) {
+            result.push_back(lemmatizeSentence(tmInfo.second, pattern));
+        }
+        return result;
+    } else {
+        return patterns;
+    }
+}
diff --git a/concordia-server/lemmatizer_facade.hpp b/concordia-server/lemmatizer_facade.hpp
index 7eea156..e9f5c3e 100644
--- a/concordia-server/lemmatizer_facade.hpp
+++ b/concordia-server/lemmatizer_facade.hpp
@@ -2,6 +2,7 @@
 #define LEMMATIZER_FACADE_HDR
 
 #include "socket_lemmatizer.hpp"
+#include "tm_dao.hpp"
 
 #include
 #include
@@ -18,8 +19,15 @@ public:
     virtual ~LemmatizerFacade();
 
     std::string lemmatizeSentence(std::string languageCode, std::string sentence);
+
+    std::string lemmatizeIfNeeded(std::string pattern, int tmId);
+
+    std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);
+
 private:
     boost::ptr_map<std::string, SocketLemmatizer> _lemmatizersMap;
+
+    TmDAO _tmDAO;
 };
 
 #endif
diff --git a/concordia-server/searcher_controller.cpp b/concordia-server/searcher_controller.cpp
index 11d36ac..dd7eb03 100644
--- a/concordia-server/searcher_controller.cpp
+++ b/concordia-server/searcher_controller.cpp
@@ -8,9 +8,11 @@
 #include "logger.hpp"
 
-SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap)
+SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
+                                       boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                               throw(ConcordiaException):
-                                              _concordiasMap(concordiasMap) {
+                                              _concordiasMap(concordiasMap),
+                                              _lemmatizerFacade(lemmatizerFacade) {
 }
 
 SearcherController::~SearcherController() {
@@ -22,6 +24,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                       const int tmId) {
     boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
         std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
 
         jsonWriter.StartObject();
@@ -30,48 +33,49 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
         jsonWriter.String("results");
         jsonWriter.StartArray();
         BOOST_FOREACH(SimpleSearchResult & result, results) {
-            JsonGenerator::writeSearchResult(jsonWriter, result);
-        }
+            JsonGenerator::writeSearchResult(jsonWriter, result);
+        }
         jsonWriter.EndArray();
         jsonWriter.EndObject();
     } else {
-        JsonGenerator::signalError(jsonWriter, "no such tm!");
+        JsonGenerator::signalError(jsonWriter, "no such tm!");
     }
 }
 
 void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                                std::string & pattern,
                                                const std::vector<Interval> & intervals,
-                                               const int tmId) {
+                                               const int tmId) {
     boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
         if (intervals.size() > 0) {
             // std::string shortPattern = pattern.substr(intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
+            pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
             std::string shortPattern = _substrUTF8(pattern, intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
-
+
             Logger::log("concordiaPhraseSearch");
             Logger::logString("short pattern", shortPattern);
             std::vector<SimpleSearchResult> shortPatternResults = _unitDAO.getSearchResults(it->second->simpleSearch(shortPattern));
-
-
-
+
+
+
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
             jsonWriter.String("found");
             if (shortPatternResults.size() > 0) {
                 jsonWriter.Bool(true);
-
-
+
+
                 std::vector<SimpleSearchResult> bestOverlay;
-
+
                 int currStart = 0;
                 BOOST_FOREACH(const Interval & interval, intervals) {
                     CompleteConcordiaSearchResult restResult = _unitDAO.getConcordiaResult(
                              it->second->concordiaSearch(pattern.substr(currStart, interval.getStart()-currStart)));
                     restResult.offsetPattern(currStart);
                     bestOverlay.insert(bestOverlay.end(), restResult.getBestOverlay().begin(), restResult.getBestOverlay().end());
-
+
                     SimpleSearchResult shortPatternresult = shortPatternResults[0];
                     shortPatternresult.setMatchedPatternStart(interval.getStart());
                    shortPatternresult.setMatchedPatternEnd(interval.getEnd());
@@ -82,26 +86,26 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                CompleteConcordiaSearchResult lastRestResult = _unitDAO.getConcordiaResult(
                         it->second->concordiaSearch(_substrUTF8(pattern,currStart,INT_MAX)));
                lastRestResult.offsetPattern(currStart);
                bestOverlay.insert(bestOverlay.end(), lastRestResult.getBestOverlay().begin(), lastRestResult.getBestOverlay().end());
-
+
                jsonWriter.String("result");
                jsonWriter.StartObject();
                jsonWriter.String("bestOverlay");
                jsonWriter.StartArray();
                BOOST_FOREACH(SimpleSearchResult & simpleResult, bestOverlay) {
-                    JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
-                }
+                    JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
+                }
                jsonWriter.EndArray();
                jsonWriter.EndObject();
            } else {
-                jsonWriter.Bool(false);
+                jsonWriter.Bool(false);
            }
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no intervals for phrase search");
        }
    } else {
-        JsonGenerator::signalError(jsonWriter, "no such tm!");
-    }
+        JsonGenerator::signalError(jsonWriter, "no such tm!");
+    }
 }
 
@@ -111,8 +115,9 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
     boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
         CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
-
+
         jsonWriter.StartObject();
         jsonWriter.String("status");
         jsonWriter.String("success");
@@ -123,16 +128,16 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
diff --git a/concordia-server/searcher_controller.hpp b/concordia-server/searcher_controller.hpp
--- a/concordia-server/searcher_controller.hpp
+++ b/concordia-server/searcher_controller.hpp
@@ -20,8 +22,10 @@ public:
-    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap)
-                               throw(ConcordiaException);
+    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
+                                boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
+                               throw(ConcordiaException);
     /*! Destructor.
     */
     virtual ~SearcherController();
@@ -40,7 +42,9 @@ private:
     std::string _substrUTF8(std::string source, int start, int length);
 
     boost::shared_ptr<boost::ptr_map<int, Concordia> > _concordiasMap;
-
+
+    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
+
     UnitDAO _unitDAO;
 };
 
diff --git a/concordia-server/tm_dao.cpp b/concordia-server/tm_dao.cpp
index 1319907..4b2e2da 100644
--- a/concordia-server/tm_dao.cpp
+++ b/concordia-server/tm_dao.cpp
@@ -3,6 +3,7 @@
 #include "query_param.hpp"
 #include "string_param.hpp"
 #include "int_param.hpp"
+#include "bool_param.hpp"
 #include "int_array_param.hpp"
 #include "logger.hpp"
 
@@ -27,20 +28,25 @@ std::vector<int> TmDAO::getTmIds() {
     }
     connection.clearResult(dbResult);
     connection.endTransaction();
-
+
     return result;
 }
 
 int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
+    return addTm(sourceLangId, targetLangId, name, false);
+}
+
+int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
     DBconnection connection;
     connection.startTransaction();
-    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name) values($1::integer,$2::integer,$3::text) RETURNING id";
+    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name, lemmatized) values($1::integer,$2::integer,$3::text,$4::bool) RETURNING id";
     std::vector<QueryParam*> params;
     params.push_back(new IntParam(sourceLangId));
     params.push_back(new IntParam(targetLangId));
     params.push_back(new StringParam(name));
-
+    params.push_back(new BoolParam(lemmatized));
+
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
     connection.clearResult(result);
@@ -48,8 +54,23 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
     BOOST_FOREACH (QueryParam * param, params) {
         delete param;
     }
-
+
     return newId;
 }
 
+std::pair<bool, std::string> TmDAO::getTmInfo(int tmId) {
+    DBconnection connection;
+    connection.startTransaction();
+    std::string query = "select tm.id, tm.lemmatized, language.code from tm inner join language on language.id = tm.source_lang_id where tm.id = $1::integer;";
+    std::vector<QueryParam*> params;
+    params.push_back(new IntParam(tmId));
+    PGresult * dbResult = connection.execute(query, params);
+    bool lemmatized = connection.getBoolValue(dbResult, 0, 1);
+    std::string languageCode = connection.getStringValue(dbResult, 0, 2);
+    connection.clearResult(dbResult);
+    connection.endTransaction();
+
+    return std::pair<bool, std::string>(lemmatized, languageCode);
+}
diff --git a/concordia-server/tm_dao.hpp b/concordia-server/tm_dao.hpp
index e43822a..4db8097 100644
--- a/concordia-server/tm_dao.hpp
+++ b/concordia-server/tm_dao.hpp
@@ -3,6 +3,7 @@
 
 #include
 #include
+#include <utility>
 #include
 
 #include "db_connection.hpp"
@@ -18,8 +19,12 @@ public:
 
     int addTm(const int sourceLangId, const int targetLangId, const std::string name);
 
+    int addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized);
+
     std::vector<int> getTmIds();
 
+    std::pair<bool, std::string> getTmInfo(int tmId);
+
 private:
 
 };
diff --git a/concordia-server/unit_dao.cpp b/concordia-server/unit_dao.cpp
index 16a1e92..6a1a68c 100644
--- a/concordia-server/unit_dao.cpp
+++ b/concordia-server/unit_dao.cpp
@@ -22,7 +22,7 @@ int UnitDAO::addSentence(
     const TokenizedSentence & sourceSentence,
     const std::string & targetSentence,
     const int tmId) {
-
+
     DBconnection connection;
     connection.startTransaction();
     int newId = _addSingleSentence(connection, sourceSentence, targetSentence, tmId);
@@ -38,7 +38,7 @@ std::vector<int> UnitDAO::addSentences(
     std::vector<int> newIds;
     connection.startTransaction();
     int index = 0;
-    BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {
+    BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {
         newIds.push_back(_addSingleSentence(connection, sourceSentence, targetSentences.at(index), tmId));
         index++;
     }
@@ -50,7 +50,7 @@ std::vector<int> UnitDAO::addAlignedSentences(
     const std::vector<TokenizedSentence> & sourceSentences,
     const std::vector<TokenizedSentence> & targetSentences,
     const std::vector<std::vector<std::vector<int> > > & allAlignments,
-    const int tmId) {
+    const int tmId) throw (ConcordiaException) {
 
     DBconnection connection;
     std::vector<int> newIds;
@@ -59,9 +59,9 @@ std::vector<int> UnitDAO::addAlignedSentences(
     for (int i=0; i< sourceSentences.size(); i++) {
         newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
     }
-
+
     connection.endTransaction();
-    return newIds;
+    return newIds;
 }
 
 std::vector<SimpleSearchResult> UnitDAO::getSearchResults(const std::vector<MatchedPatternFragment> & fragments) {
@@ -83,7 +83,7 @@ void UnitDAO::_getResultsFromFragments(
     std::vector<SimpleSearchResult> & results,
     const std::vector<MatchedPatternFragment> & fragments,
     const TokenizedSentence & tokenizedPattern) {
-
+
     DBconnection connection;
     connection.startTransaction();
 
@@ -95,9 +95,9 @@ void UnitDAO::_getResultsFromFragments(
             matchedPatternStart = tokenizedPattern.getTokens().at(fragment.getStart()).getStart();
             matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
         }
-
-
-
+
+
+
         std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
         std::vector<QueryParam*> params;
         params.push_back(new IntParam(2*fragment.getExampleOffset()+1));
         PGresult * result = connection.execute(query, params);
         connection.clearResult(result);
         BOOST_FOREACH (QueryParam * param, params) {
             delete param;
         }
 
         // now add all target fragments matched with this fragment
         std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
         std::vector<QueryParam*> targetParams;
         targetParams.push_back(new IntParam(fragment.getExampleId()));
         int prevPos = -2;
         int currStart = -1;
         int currEnd = -1;
-
+
         for (int i=0;i<connection.getRowCount(targetResult);i++) {
             int targetPos = connection.getIntValue(targetResult,i,0);
             int targetStart = connection.getIntValue(targetResult,i,1);
             int targetEnd = connection.getIntValue(targetResult,i,2);
             if (targetPos != prevPos + 1 && currStart >= 0) {
                 ssResult.addTargetFragment(std::pair<int, int>(currStart, currEnd));
                 currStart = targetStart;
             }
 
-            currEnd = targetEnd;
+            currEnd = targetEnd;
             prevPos = targetPos;
         }
 
         connection.clearResult(targetResult);
         BOOST_FOREACH (QueryParam * param, targetParams) {
             delete param;
         }
-
+
         results.push_back(ssResult);
-    }
+    }
 
     connection.endTransaction();
 }
 
@@ -181,25 +185,29 @@ int UnitDAO::_addSingleSentence(
     params.push_back(new StringParam(targetSentence));
     params.push_back(new IntParam(tmId));
     params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
-
+
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
     connection.clearResult(result);
     BOOST_FOREACH (QueryParam * param, params) {
         delete param;
     }
-
+
     return newId;
 }
 
-int UnitDAO::_addAlignedUnit(
+int UnitDAO::_addAlignedUnit(
     DBconnection & connection,
     const TokenizedSentence & sourceSentence,
     const TokenizedSentence & targetSentence,
     const std::vector<std::vector<int> > & alignments,
-    const int tmId) {
-
+    const int tmId) throw(ConcordiaException) {
+
+    if (sourceSentence.getTokens().size() != alignments.size()) {
+        throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
+    }
+
     std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
     std::vector<QueryParam*> params;
     params.push_back(new StringParam(sourceSentence.getSentence()));
     params.push_back(new StringParam(targetSentence.getSentence()));
     params.push_back(new IntParam(tmId));
     params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
     params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
-
+
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
     connection.clearResult(result);
     BOOST_FOREACH (QueryParam * param, params) {
         delete param;
     }
-
+
     // add alignments
     bool nonEmpty = false;
     std::stringstream alignmentsQuery;
     alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
     for (int i=0;i<alignments.size();i++) {
         BOOST_FOREACH(int targetPos, alignments.at(i)) {
             nonEmpty = true;
             alignmentsQuery << "(" << newId << "," << i << "," << targetPos << "),";
         }
     }
     if (nonEmpty) {
         query = alignmentsQuery.str();
         query = query.substr(0, query.length()-1);
         PGresult * result = connection.execute(query);
-        connection.clearResult(result);
+        connection.clearResult(result);
     }
     return newId;
 }
-
-
diff --git a/concordia-server/unit_dao.hpp b/concordia-server/unit_dao.hpp
index 875fa0a..7159320 100644
--- a/concordia-server/unit_dao.hpp
+++ b/concordia-server/unit_dao.hpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "simple_search_result.hpp"
@@ -33,13 +34,13 @@ public:
         const std::vector<TokenizedSentence> & sourceSentences,
         const std::vector<std::string> & targetSentences,
         const int tmId);
-
+
     std::vector<int> addAlignedSentences(
         const std::vector<TokenizedSentence> & sourceSentences,
         const std::vector<TokenizedSentence> & targetSentences,
         const std::vector<std::vector<std::vector<int> > > & allAlignments,
-        const int tmId);
-
+        const int tmId) throw (ConcordiaException);
+
     std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
     CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);
@@ -50,7 +51,7 @@ private:
         const TokenizedSentence & tokenizedPattern);
 
     std::vector<int> _getTokenPositions(const TokenizedSentence & ts);
-
+
     int _addSingleSentence(
         DBconnection & connection,
         const TokenizedSentence & sourceSentence,
@@ -62,7 +63,7 @@ private:
         const TokenizedSentence & sourceSentence,
         const TokenizedSentence & targetSentence,
         const std::vector<std::vector<int> > & alignments,
-        const int tmId);
+        const int tmId) throw(ConcordiaException);
 };
 
 #endif
diff --git a/db/concordia_server.sql b/db/concordia_server.sql
index 125df22..c8a8a21 100644
--- a/db/concordia_server.sql
+++ b/db/concordia_server.sql
@@ -3,7 +3,8 @@ CREATE TABLE tm (
     id SERIAL PRIMARY KEY,
     source_lang_id integer,
     target_lang_id integer,
-    name varchar(40)
+    name varchar(40),
+    lemmatized bool DEFAULT false
 );
 
 DROP TABLE IF EXISTS language;
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/.vs/LemmaGenSentenceLemmatizer/v14/.suo b/mgiza-aligner/LemmaGenSentenceLemmatizer/.vs/LemmaGenSentenceLemmatizer/v14/.suo
index ef1ddee70546eae2c0b3c4e932eefd6e0df42a8c..b0abdc9f45cff9740c3e977f02883dee13639625 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/.vs/LemmaGenSentenceLemmatizer/v14/.suo and b/mgiza-aligner/LemmaGenSentenceLemmatizer/.vs/LemmaGenSentenceLemmatizer/v14/.suo differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/Program.cs b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/Program.cs
index 4c18358..f53b436 100644
--- a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/Program.cs
+++ b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/Program.cs
@@ -12,12 +12,20 @@ namespace LemmaGenSentenceLemmatizer
         {
             if (args.Length == 1)
             {
-                SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
-                string line = Console.ReadLine();
-                while (!string.IsNullOrEmpty(line))
+                try
                 {
-                    Console.WriteLine(lemmatizer.lemmatizeSentence(line));
-                    line = Console.ReadLine();
+                    SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
+                    string line = Console.ReadLine();
+                    while (line != null)
+                    {
+                        Console.WriteLine(lemmatizer.lemmatizeSentence(line));
+                        line = Console.ReadLine();
+                    }
+
+                }
+                catch (Exception ex)
+                {
+                    Console.WriteLine("Exception occurred: " + ex.Message);
                 }
 
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe
index 3f36045aafe7617bd1ab49dcfc4f76bdf187f72a..d33ee58421ed8bb8f2a14bfb809592da1464caca 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.pdb b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.pdb
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.pdb and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.pdb differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/DesignTimeResolveAssemblyReferences.cache b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/DesignTimeResolveAssemblyReferences.cache
new file mode 100644
Binary files /dev/null and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/DesignTimeResolveAssemblyReferences.cache differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache
index b26246f07e0c6578bef656ce6d663fcc7ab293e3..70c7d05cf3bd9ea20f3c2bfdd4ac6e817625f5fd 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.exe b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.exe
index 3f36045aafe7617bd1ab49dcfc4f76bdf187f72a..d33ee58421ed8bb8f2a14bfb809592da1464caca 100644
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.exe and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.exe differ
diff --git a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.pdb b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.pdb
Binary files a/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.pdb and b/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/obj/Debug/LemmaGenSentenceLemmatizer.pdb differ
diff --git a/mgiza-aligner/Makefile b/mgiza-aligner/Makefile
--- a/mgiza-aligner/Makefile
+++ b/mgiza-aligner/Makefile
-	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
+	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
+
+clean-intermediate-files:
+	rm -f corpora/$(CORPUS_NAME)/*.lem
+	rm -f corpora/$(CORPUS_NAME)/*.low
+	rm -f corpora/$(CORPUS_NAME)/*.classes
+	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
+	rm -f corpora/$(CORPUS_NAME)/*.vcb
+	rm -f corpora/$(CORPUS_NAME)/*.snt
+	rm -f corpora/$(CORPUS_NAME)/*.cooc
+	rm -f corpora/$(CORPUS_NAME)/aligned*part*
+	rm -f corpora/$(CORPUS_NAME)/giza.cfg
+
 clean:
 	rm -f corpora/$(CORPUS_NAME)/*.tok
diff --git a/mgiza-aligner/clean-corpus-n.perl b/mgiza-aligner/clean-corpus-n.perl
new file mode 100755
index 0000000..76a09e5
--- /dev/null
+++ b/mgiza-aligner/clean-corpus-n.perl
@@ -0,0 +1,168 @@
+#!/usr/bin/env perl
+#
+# This file is part of moses. Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
+use warnings;
+use strict;
+use Getopt::Long;
+my $help;
+my $lc = 0; # lowercase the corpus?
+my $ignore_ratio = 0;
+my $ignore_xml = 0;
+my $enc = "utf8"; # encoding of the input and output files
+                  # set to anything else you wish, but I have not tested it yet
+my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
+                            # is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000)
+                            # and crashes if it encounters a word that exceeds it
+my $ratio = 9;
+
+GetOptions(
+  "help" => \$help,
+  "lowercase|lc" => \$lc,
+  "encoding=s" => \$enc,
+  "ratio=f" => \$ratio,
+  "ignore-ratio" => \$ignore_ratio,
+  "ignore-xml" => \$ignore_xml,
+  "max-word-length|mwl=s" => \$max_word_length
+) or exit(1);
+
+if (scalar(@ARGV) < 6 || $help) {
+    print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n";
+    exit;
+}
+
+my $corpus = $ARGV[0];
+my $l1 = $ARGV[1];
+my $l2 = $ARGV[2];
+my $out = $ARGV[3];
+my $min = $ARGV[4];
+my $max = $ARGV[5];
+
+my $linesRetainedFile = "";
+if (scalar(@ARGV) > 6) {
+  $linesRetainedFile = $ARGV[6];
+  open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile";
+}
+
+print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n";
+
+my $opn = undef;
+my $l1input = "$corpus.$l1";
+if (-e $l1input) {
+  $opn = $l1input;
+} elsif (-e $l1input.".gz") {
+  $opn = "gunzip -c $l1input.gz |";
+} else {
+  die "Error: $l1input does not exist";
+}
+open(F,$opn) or die "Can't open '$opn'";
+$opn = undef;
+my $l2input = "$corpus.$l2";
+if (-e $l2input) {
+  $opn = $l2input;
+} elsif (-e $l2input.".gz") {
+  $opn = "gunzip -c $l2input.gz |";
+} else {
+  die "Error: $l2input does not exist";
+}
+
+open(E,$opn) or die "Can't open '$opn'";
+
+open(FO,">$out.$l1") or die "Can't write $out.$l1";
+open(EO,">$out.$l2") or die "Can't write $out.$l2";
+
+# necessary for proper lowercasing
+my $binmode;
+if ($enc eq "utf8") {
+  $binmode = ":utf8";
+} else {
+  $binmode = ":encoding($enc)";
+}
+binmode(F, $binmode);
+binmode(E, $binmode);
+binmode(FO, $binmode);
+binmode(EO, $binmode);
+
+my $innr = 0;
+my $outnr = 0;
+my $factored_flag;
+while(my $f = <F>) {
+  $innr++;
+  print STDERR "." if $innr % 10000 == 0;
+  print STDERR "($innr)" if $innr % 100000 == 0;
+  my $e = <E>;
+  die "$corpus.$l2 is too short!" if !defined $e;
+  chomp($e);
+  chomp($f);
+  if ($innr == 1) {
+    $factored_flag = ($e =~ /\|/ || $f =~ /\|/);
+  }
+
+  #if lowercasing, lowercase
+  if ($lc) {
+    $e = lc($e);
+    $f = lc($f);
+  }
+
+  $e =~ s/\|//g unless $factored_flag;
+  $e =~ s/\s+/ /g;
+  $e =~ s/^ //;
+  $e =~ s/ $//;
+  $f =~ s/\|//g unless $factored_flag;
+  $f =~ s/\s+/ /g;
+  $f =~ s/^ //;
+  $f =~ s/ $//;
+  next if $f eq '';
+  next if $e eq '';
+
+  my $ec = &word_count($e);
+  my $fc = &word_count($f);
+  next if $ec > $max;
+  next if $fc > $max;
+  next if $ec < $min;
+  next if $fc < $min;
+  next if !$ignore_ratio && $ec/$fc > $ratio;
+  next if !$ignore_ratio && $fc/$ec > $ratio;
+  # Skip this segment if any factor is longer than $max_word_length
+  my $max_word_length_plus_one = $max_word_length + 1;
+  next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
+  next if $f =~ /[^\s\|]{$max_word_length_plus_one}/;
+
+  # An extra check: none of the factors can be blank!
+ die "There is a blank factor in $corpus.$l1 on line $innr: $f" + if $f =~ /[ \|]\|/; + die "There is a blank factor in $corpus.$l2 on line $innr: $e" + if $e =~ /[ \|]\|/; + + $outnr++; + print FO $f."\n"; + print EO $e."\n"; + + if ($linesRetainedFile ne "") { + print LINES_RETAINED $innr."\n"; + } +} + +if ($linesRetainedFile ne "") { + close LINES_RETAINED; +} + +print STDERR "\n"; +my $e = ; +die "$corpus.$l2 is too long!" if defined $e; + +print STDERR "Input sentences: $innr Output sentences: $outnr\n"; + +sub word_count { + my ($line) = @_; + if ($ignore_xml) { + $line =~ s/<\S[^>]*\S>/ /g; + $line =~ s/\s+/ /g; + $line =~ s/^ //g; + $line =~ s/ $//g; + } + my @w = split(/ /,$line); + return scalar @w; +} diff --git a/mgiza-aligner/sortGizaAlignments.py b/mgiza-aligner/sortGizaAlignments.py new file mode 100755 index 0000000..e6762ca --- /dev/null +++ b/mgiza-aligner/sortGizaAlignments.py @@ -0,0 +1,26 @@ +#!/usr/bin/python3 + +import sys, re + +examples_dict = {} +p = re.compile("# Sentence pair \((\d+)\)") + +i = 0 +for line in sys.stdin: + line = line.strip() + if i % 3 == 0: + current_example = [line] + m = p.match(line) + if m: + current_key = int(m.group(1)) + else: + raise Exception("Wrong line: "+line) + elif i % 3 == 1: + current_example.append(line) + else: + current_example.append(line) + examples_dict[current_key] = current_example + i+=1 + +for key in sorted(examples_dict.keys()): + print ('\n'.join(examples_dict[key])) diff --git a/tests/addAlignedLemmatizedTM.py b/tests/addAlignedLemmatizedTM.py old mode 100644 new mode 100755 index ee8a246..aecc7dd --- a/tests/addAlignedLemmatizedTM.py +++ b/tests/addAlignedLemmatizedTM.py @@ -21,10 +21,15 @@ def file_len(fname): pass return i + 1 -def add_data(data): +def add_examples(examplesData): req = urllib2.Request(address) req.add_header('Content-Type', 'application/json') - json.loads(urllib2.urlopen(req, json.dumps(data)).read()) + response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read()) + if response['status'] == 'error': + raise Exception(response['message']) + +if len(sys.argv) != 7: + raise Exception("wrong number of arguments") name = sys.argv[1] sourceFile = sys.argv[2] @@ -40,13 +45,14 @@ if (file_len(alignmentsFile) != 3*file_len(sourceFile)): raise Exception("alignments file is not exactly 3 times longer than source and target") -totalLines = file_len(sourceFile) +totalExamples = file_len(sourceFile) data = { 'operation': 'addTm', 'sourceLangId':sourceLangId, 'targetLangId':targetLangId, - 'name':name + 'name':name, + 'tmLemmatized':True } req = urllib2.Request(address) @@ -60,35 +66,35 @@ data = { 'tmId':tmId } -sentences = [] +examples = [] start = time.time() -with open(sourceFile) as sourceLines, open(targetFile) as targetLines, open(alignmentsFile) as alignmentsLines: +with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af: + for lineNumber in range(totalExamples): + sourceSentence = sf.readline().strip() + targetSentence = tf.readline().strip() - lineNumber = 0 - for line in sourceLines: - line = line.strip() - if lineNumber % 3 == 1: - currSentence.append(line) - elif lineNumber % 3 == 2: - currSentence.append(line) - currSentence.reverse() - sentences.append(currSentence) - currSentence = [] - if len(sentences) >= BUFFER_SIZE: - data['sentences'] = sentences - add_data(data) - mark = time.time() - print "Added %d of %d sentences. 
-            sentences = []
-    lineNumber += 1
+        # Skip two lines of the alignments file: these contain the lemmatized
+        # sentences, and we need the raw sentences from the source and target files.
+        af.readline()
+        af.readline()
+
+        alignmentString = af.readline().strip()
+
+        examples.append([sourceSentence, targetSentence, alignmentString])
+
+        if len(examples) >= BUFFER_SIZE:
+            data['examples'] = examples
+            add_examples(data)
+            mark = time.time()
+            print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
+            examples = []
 
-if len(sentences) > 0:
-    data['sentences'] = sentences
-    add_data(data)
+if len(examples) > 0:
+    data['examples'] = examples
+    add_examples(data)
 
 end = time.time()
-print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
+print "Added all %d lemmatized examples. Time elapsed: %.4f s, overall speed: %.4f examples/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))
 
 print "Generating index..."
 start = time.time()
diff --git a/tests/addLemmatizedTM.sh b/tests/addLemmatizedTM.sh
new file mode 100755
index 0000000..ba53f7f
--- /dev/null
+++ b/tests/addLemmatizedTM.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+CORPUS_NAME="europarl_sample"
+SRC_LANG_ID=2
+TRG_LANG_ID=1
+
+./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
diff --git a/tests/addTm.py b/tests/addTm.py
index e3bfaa3..c36e791 100755
--- a/tests/addTm.py
+++ b/tests/addTm.py
@@ -16,7 +16,8 @@ data = {
     'operation': 'addTm',
     'sourceLangId':int(sys.argv[1]),
     'targetLangId':int(sys.argv[2]),
-    'name':sys.argv[3]
+    'name':sys.argv[3],
+    'tmLemmatized':bool(int(sys.argv[4]))
 }
 
 req = urllib2.Request(address)
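
For reference, a minimal client-side sketch of the new addAlignedLemmatizedSentences operation, written in the style of the tests/ scripts. The operation name, the "examples" parameter and the three-element [raw source, raw target, lemmatized source with GIZA-style alignments] layout come from concordia_server.cpp above; the server address, the tmId value and the sample sentences are assumptions for illustration only.

#!/usr/bin/python2
# -*- coding: utf-8 -*-

import json
import urllib2

# Assumed server address; adjust to the actual concordia-server instance.
address = 'http://localhost'

data = {
    'operation': 'addAlignedLemmatizedSentences',
    'tmId': 1,  # assumed: id returned earlier by an addTm call with 'tmLemmatized':True
    # Each example is an array of 3 elements; the third is the lemmatized
    # source sentence in GIZA alignment format, i.e. the third line of each
    # block in aligned.txt (sample values below are illustrative).
    'examples': [
        ['Ala ma kota', 'Alice has a cat',
         'NULL ({ }) ala ({ 1 }) miec ({ 2 }) kot ({ 3 4 })']
    ]
}

req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print response

Note that a database created before this patch needs the new column before addTm with 'tmLemmatized' is called, for example ALTER TABLE tm ADD COLUMN lemmatized bool DEFAULT false; (mirroring the db/concordia_server.sql change above).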