From 883aebe919ae8c158dace08b0e3f3b41413ae64f Mon Sep 17 00:00:00 2001
From: rjawor
Date: Fri, 1 Jan 2016 22:10:51 +0100
Subject: [PATCH] optimized adding aligned files

---
 concordia-server/CMakeLists.txt       |  1 -
 concordia-server/aligned_unit.cpp     | 15 -------
 concordia-server/aligned_unit.hpp     | 40 ------------------
 concordia-server/index_controller.cpp | 59 +++++++++++----------------
 concordia-server/index_controller.hpp | 12 +++---
 concordia-server/unit_dao.cpp         | 56 +++++++++++++------------
 concordia-server/unit_dao.hpp         | 11 +++--
 tests/addAlignedFile.py               |  3 +-
 8 files changed, 66 insertions(+), 131 deletions(-)
 delete mode 100644 concordia-server/aligned_unit.cpp
 delete mode 100644 concordia-server/aligned_unit.hpp

diff --git a/concordia-server/CMakeLists.txt b/concordia-server/CMakeLists.txt
index 53412a5..82c95d8 100644
--- a/concordia-server/CMakeLists.txt
+++ b/concordia-server/CMakeLists.txt
@@ -14,7 +14,6 @@ add_executable(concordia_server_process
     simple_search_result.cpp
     complete_concordia_search_result.cpp
     tm_dao.cpp
-    aligned_unit.cpp
 )
 
 target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
diff --git a/concordia-server/aligned_unit.cpp b/concordia-server/aligned_unit.cpp
deleted file mode 100644
index 2cb7eab..0000000
--- a/concordia-server/aligned_unit.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "aligned_unit.hpp"
-
-
-AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
-                         const TokenizedSentence & targetSentence,
-                         std::vector<std::vector<int> > alignments):
-    _sourceSentence(sourceSentence),
-    _targetSentence(targetSentence),
-    _alignments(alignments) {
-}
-
-
-AlignedUnit::~AlignedUnit() {
-}
-
diff --git a/concordia-server/aligned_unit.hpp b/concordia-server/aligned_unit.hpp
deleted file mode 100644
index e992386..0000000
--- a/concordia-server/aligned_unit.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef ALIGNED_UNIT_HDR
-#define ALIGNED_UNIT_HDR
-
-#include
-#include
-
-#include
-
-class AlignedUnit {
-public:
-    /*! Constructor.
-    */
-    AlignedUnit(const TokenizedSentence & sourceSentence,
-                const TokenizedSentence & targetSentence,
-                std::vector<std::vector<int> > alignments);
-    /*! Destructor.
-    */
-    virtual ~AlignedUnit();
-
-    TokenizedSentence getSourceSentence() const {
-        return _sourceSentence;
-    }
-
-    TokenizedSentence getTargetSentence() const {
-        return _targetSentence;
-    }
-
-    std::vector<std::vector<int> > getAlignments() const {
-        return _alignments;
-    }
-
-private:
-    TokenizedSentence _sourceSentence;
-
-    TokenizedSentence _targetSentence;
-
-    std::vector<std::vector<int> > _alignments;
-};
-
-#endif
diff --git a/concordia-server/index_controller.cpp b/concordia-server/index_controller.cpp
index c96c9d7..60d65f0 100644
--- a/concordia-server/index_controller.cpp
+++ b/concordia-server/index_controller.cpp
@@ -87,20 +87,23 @@ void IndexController::addSentences(
 
 
 void IndexController::addAlignedSentences(
                   rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
-                  const std::vector<std::string> & sourceSentences,
+                  const std::vector<std::string> & rawSourceSentences,
                   const std::vector<std::string> & targetSentences,
                   const int tmId) {
     try {
         boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
-            std::vector sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
-            int index = 0;
-            for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
-                it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
-                index++;
-            }
+            std::vector<std::string> sourceSentences;
+            std::vector<std::vector<std::vector<int> > > allAlignments;
+            _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
+
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
+            std::vector sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
+            for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
+                it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
+            }
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
@@ -137,19 +140,18 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer>
 
 
-std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                                        const std::vector<std::string> & targetSentences,
-                                        const int tmId) {
-    std::vector<AlignedUnit> result;
-    for (int i = 0; i<sourceSentences.size(); i++) {
+void IndexController::_getSourceSentencesAndAlignments(
+                      std::vector<std::string> & sourceSentences,
+                      std::vector<std::vector<std::vector<int> > > & allAlignments,
+                      const std::vector<std::string> & rawSourceSentences) {
+
+    for (int i = 0; i<rawSourceSentences.size(); i++) {
+        std::string rawSourceSentence = rawSourceSentences.at(i);
         std::vector<std::string> sourceTokens;
+        std::string sourceSentence = "";
         std::vector<std::vector<int> > alignments;
-        UnicodeString s(sourceSentence.c_str());
+        UnicodeString s(rawSourceSentence.c_str());
         boost::u32regex_iterator begin(
             boost::make_u32regex_iterator(
                 s,
@@ -177,27 +179,14 @@ std::vector IndexController::_getAlignedUnits(const std::vector
-        boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
-        if (it != _concordiasMap->end()) {
-            TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
-            TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
-
-            result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
-        }
+        sourceSentences.push_back(sourceSentence);
+        allAlignments.push_back(alignments);
     }
-    return result;
-}
-
-std::string IndexController::_trim(std::string & str) {
-    size_t first = str.find_first_not_of(' ');
-    size_t last = str.find_last_not_of(' ');
-    return str.substr(first, (last-first+1));
 }
diff --git a/concordia-server/index_controller.hpp b/concordia-server/index_controller.hpp
index 9994043..dea675d 100644
--- a/concordia-server/index_controller.hpp
+++ b/concordia-server/index_controller.hpp
@@ -10,7 +10,6 @@
 
 #include "unit_dao.hpp"
-#include "aligned_unit.hpp"
 
 #include "rapidjson/writer.h"
 
@@ -35,7 +34,7 @@ public:
                   const int tmId);
 
     void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
-                  const std::vector<std::string> & sourceSentences,
+                  const std::vector<std::string> & rawSourceSentences,
                   const std::vector<std::string> & targetSentences,
                   const int tmId);
 
@@ -43,12 +42,11 @@ public:
                   const int tmId);
 
 private:
-    std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                                        const std::vector<std::string> & targetSentences,
-                                        const int tmId);
+    void _getSourceSentencesAndAlignments(
+                  std::vector<std::string> & sourceSentences,
+                  std::vector<std::vector<std::vector<int> > > & allAlignments,
+                  const std::vector<std::string> & rawSourceSentences);
 
-    std::string _trim(std::string & str);
-
     boost::shared_ptr<boost::ptr_map<int, Concordia> > _concordiasMap;
 
     UnitDAO _unitDAO;
diff --git a/concordia-server/unit_dao.cpp b/concordia-server/unit_dao.cpp
index 2c19741..b7ed073 100644
--- a/concordia-server/unit_dao.cpp
+++ b/concordia-server/unit_dao.cpp
@@ -46,18 +46,18 @@ std::vector UnitDAO::addSentences(
 
     return newIds;
 }
 
-std::vector UnitDAO::addAlignedUnits(
-    const std::vector<AlignedUnit> & alignedUnits,
+std::vector UnitDAO::addAlignedSentences(
+    const std::vector<TokenizedSentence> & sourceSentences,
+    const std::vector<TokenizedSentence> & targetSentences,
+    const std::vector<std::vector<std::vector<int> > > & allAlignments,
     const int tmId) {
-    //TODO
-
     DBconnection connection;
     std::vector newIds;
     connection.startTransaction();
-    BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
-        newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
+    for (int i=0; i< sourceSentences.size(); i++) {
+        newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
     }
     connection.endTransaction();
 
@@ -194,17 +194,19 @@ int UnitDAO::_addSingleSentence(
 
 
 int UnitDAO::_addAlignedUnit(
-    DBconnection & connection,
-    const AlignedUnit & alignedUnit,
-    const int tmId) {
+        DBconnection & connection,
+        const TokenizedSentence & sourceSentence,
+        const TokenizedSentence & targetSentence,
+        const std::vector<std::vector<int> > & alignments,
+        const int tmId) {
     std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
     std::vector<QueryParam*> params;
-    params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
-    params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
+    params.push_back(new StringParam(sourceSentence.getSentence()));
+    params.push_back(new StringParam(targetSentence.getSentence()));
     params.push_back(new IntParam(tmId));
-    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
-    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
+    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
+    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
 
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
@@ -214,23 +216,23 @@ int UnitDAO::_addAlignedUnit(
     }
 
     // add alignments
-    for(int i=0;i<alignedUnit.getAlignments().size();i++) {
-        for(int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
-            std::vector<QueryParam*> params;
-            params.push_back(new IntParam(newId));
-            params.push_back(new IntParam(i));
-            params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
-
-            PGresult * result = connection.execute(query, params);
-            connection.clearResult(result);
-            BOOST_FOREACH (QueryParam * param, params) {
-                delete param;
-            }
+    bool nonEmpty = false;
+    std::stringstream alignmentsQuery;
+    alignmentsQuery << "INSERT INTO
alignment(unit_id, source_token_pos, target_token_pos) values "; + + for(int i=0;i #include -#include "aligned_unit.hpp" #include "simple_search_result.hpp" #include "complete_concordia_search_result.hpp" #include "db_connection.hpp" @@ -35,8 +34,10 @@ public: const std::vector & targetSentences, const int tmId); - std::vector addAlignedUnits( - const std::vector & alignedUnits, + std::vector addAlignedSentences( + const std::vector & sourceSentences, + const std::vector & targetSentences, + const std::vector > > & allAlignments, const int tmId); std::vector getSearchResults(const std::vector & fragments); @@ -58,7 +59,9 @@ private: int _addAlignedUnit( DBconnection & connection, - const AlignedUnit & alignedUnit, + const TokenizedSentence & sourceSentence, + const TokenizedSentence & targetSentence, + const std::vector > & alignments, const int tmId); }; diff --git a/tests/addAlignedFile.py b/tests/addAlignedFile.py index 57c6467..22846f2 100755 --- a/tests/addAlignedFile.py +++ b/tests/addAlignedFile.py @@ -43,7 +43,6 @@ data = { req = urllib2.Request(address) req.add_header('Content-Type', 'application/json') response = json.loads(urllib2.urlopen(req, json.dumps(data)).read()) -print response tmId = int(response['newTmId']) print "Added new tm: %d" % tmId @@ -80,7 +79,7 @@ if len(sentences) > 0: add_data(data) end = time.time() -print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start)) +print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start))) print "Generating index..." start = time.time()
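
Note on the unit_dao.cpp change: instead of issuing one INSERT per alignment pair, _addAlignedUnit now collects all pairs of a unit into a single multi-row INSERT built in a std::stringstream and, judging by the nonEmpty flag, executes it only when at least one pair exists. A minimal standalone sketch of that batching idea follows; buildAlignmentsQuery is a hypothetical helper written for this note, not code from the patch, and it leaves out the parameter handling and result clean-up done by the project's DBconnection class.

    // Illustrative sketch only: build one multi-row INSERT for all alignment
    // pairs of a unit, instead of executing a separate INSERT per pair.
    // Column layout follows the patch (unit_id, source_token_pos, target_token_pos);
    // the outer index is the source position, the stored value the target position.
    #include <sstream>
    #include <string>
    #include <vector>

    std::string buildAlignmentsQuery(int unitId,
                                     const std::vector<std::vector<int> > & alignments) {
        std::stringstream query;
        query << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
        bool first = true;
        for (size_t i = 0; i < alignments.size(); ++i) {
            for (size_t j = 0; j < alignments[i].size(); ++j) {
                if (!first) {
                    query << ",";
                }
                query << "(" << unitId << "," << i << "," << alignments[i][j] << ")";
                first = false;
            }
        }
        // An empty return value means there is nothing to insert for this unit.
        return first ? std::string() : query.str();
    }

Batching the rows this way means one statement per unit instead of one per alignment pair, which is presumably where most of the speed-up for large aligned files comes from.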