optimized adding aligned files

This commit is contained in:
parent 129f154d5e
commit 883aebe919
CMakeLists.txt

@@ -14,7 +14,6 @@ add_executable(concordia_server_process
   simple_search_result.cpp
   complete_concordia_search_result.cpp
   tm_dao.cpp
-  aligned_unit.cpp
 )
 target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
 
aligned_unit.cpp (deleted)

@@ -1,15 +0,0 @@
-#include "aligned_unit.hpp"
-
-
-AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
-                const TokenizedSentence & targetSentence,
-                std::vector<std::vector<int> > alignments):
-                    _sourceSentence(sourceSentence),
-                    _targetSentence(targetSentence),
-                    _alignments(alignments) {
-}
-
-
-AlignedUnit::~AlignedUnit() {
-}
-
aligned_unit.hpp (deleted)

@@ -1,40 +0,0 @@
-#ifndef ALIGNED_UNIT_HDR
-#define ALIGNED_UNIT_HDR
-
-#include <vector>
-#include <string>
-
-#include <concordia/tokenized_sentence.hpp>
-
-class AlignedUnit {
-public:
-    /*! Constructor.
-    */
-    AlignedUnit(const TokenizedSentence & sourceSentence,
-                const TokenizedSentence & targetSentence,
-                std::vector<std::vector<int> > alignments);
-    /*! Destructor.
-    */
-    virtual ~AlignedUnit();
-
-    TokenizedSentence getSourceSentence() const {
-        return _sourceSentence;
-    }
-
-    TokenizedSentence getTargetSentence() const {
-        return _targetSentence;
-    }
-
-    std::vector<std::vector<int> > getAlignments() const {
-        return _alignments;
-    }
-
-private:
-    TokenizedSentence _sourceSentence;
-
-    TokenizedSentence _targetSentence;
-
-    std::vector<std::vector<int> > _alignments;
-};
-
-#endif
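Note: this commit removes the AlignedUnit wrapper entirely. Its getters returned the TokenizedSentence objects and the alignment vector by value, so every access copied the underlying token data; the refactored code below passes the same information as parallel vectors (source sentences, target sentences, per-sentence alignments) instead.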
index_controller.cpp

@@ -87,20 +87,23 @@ void IndexController::addSentences(
 
 void IndexController::addAlignedSentences(
                  rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
-                 const std::vector<std::string> & sourceSentences,
+                 const std::vector<std::string> & rawSourceSentences,
                  const std::vector<std::string> & targetSentences,
                  const int tmId) {
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
-            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
-            int index = 0;
-            for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
-                it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
-                index++;
-            }
+            std::vector<std::string> sourceSentences;
+            std::vector<std::vector<std::vector<int> > > allAlignments;
+            _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
 
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
+
+            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
+            for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
+                it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
+            }
             jsonWriter.StartObject();
             jsonWriter.String("status");
             jsonWriter.String("success");
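Two things change in addAlignedSentences: the incoming source strings are now treated as raw, alignment-annotated input that is first unpacked by _getSourceSentencesAndAlignments, and tokenization happens in batch via tokenizeAll. Note that only the source side is added to the Concordia index (addTokenizedExample); the target sentences and alignments are persisted through _unitDAO.addAlignedSentences under the returned sentence ids.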
@@ -137,19 +140,18 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
 
 }
 
-std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                const std::vector<std::string> & targetSentences,
-                const int tmId) {
-    std::vector<AlignedUnit> result;
-    for (int i = 0; i<sourceSentences.size(); i++) {
-        std::string sourceSentence = sourceSentences[i];
-        std::string targetSentence = targetSentences[i];
-
-        std::string rawSourceSentence;
-        std::vector<TokenAnnotation> sourceTokens;
+void IndexController::_getSourceSentencesAndAlignments(
+                std::vector<std::string> & sourceSentences,
+                std::vector<std::vector<std::vector<int> > > & allAlignments,
+                const std::vector<std::string> & rawSourceSentences) {
+    for (int i = 0; i<rawSourceSentences.size(); i++) {
+        std::string rawSourceSentence = rawSourceSentences[i];
+
+        std::string sourceSentence = "";
         std::vector<std::vector<int> > alignments;
 
-        UnicodeString s(sourceSentence.c_str());
+        UnicodeString s(rawSourceSentence.c_str());
         boost::u32regex_iterator<const UChar*> begin(
             boost::make_u32regex_iterator(
                 s,
@@ -177,27 +179,14 @@ std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std
                     tokenAlignments.push_back(n);
                 }
                 alignments.push_back(tokenAlignments);
-                rawSourceSentence += token + " ";
+                sourceSentence += token + " ";
             }
         }
 
-        rawSourceSentence = _trim(rawSourceSentence);
+        sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
 
-        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
-        if (it != _concordiasMap->end()) {
-            TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
-            TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
-
-            result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
-        }
+        sourceSentences.push_back(sourceSentence);
+        allAlignments.push_back(alignments);
     }
-    return result;
 }
-
-std::string IndexController::_trim(std::string & str) {
-    size_t first = str.find_first_not_of(' ');
-    size_t last = str.find_last_not_of(' ');
-    return str.substr(first, (last-first+1));
-}
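For reference, a minimal standalone sketch of the three-level structure that _getSourceSentencesAndAlignments fills: allAlignments[i][s] lists the target token positions aligned to source token s of sentence i. The alignment values below are invented for illustration.

```cpp
#include <iostream>
#include <vector>

int main() {
    // One inner vector per source token; each holds the target token
    // positions that source token is aligned to (possibly none).
    std::vector<std::vector<int> > alignments;

    std::vector<int> token0; token0.push_back(0);                       // source token 0 -> target token 0
    std::vector<int> token1; token1.push_back(1); token1.push_back(2);  // source token 1 -> target tokens 1 and 2
    std::vector<int> token2;                                            // source token 2 is unaligned
    alignments.push_back(token0);
    alignments.push_back(token1);
    alignments.push_back(token2);

    // allAlignments[i] is the structure above for the i-th sentence of the batch.
    std::vector<std::vector<std::vector<int> > > allAlignments;
    allAlignments.push_back(alignments);

    std::cout << "sentence 0 has " << allAlignments[0].size() << " source tokens" << std::endl;
    return 0;
}
```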
index_controller.hpp

@@ -10,7 +10,6 @@
 
 
 #include "unit_dao.hpp"
-#include "aligned_unit.hpp"
 
 #include "rapidjson/writer.h"
 
@@ -35,7 +34,7 @@ public:
                 const int tmId);
 
     void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
-                const std::vector<std::string> & sourceSentences,
+                const std::vector<std::string> & rawSourceSentences,
                 const std::vector<std::string> & targetSentences,
                 const int tmId);
 
@@ -43,11 +42,10 @@ public:
                 const int tmId);
 
 private:
-    std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                const std::vector<std::string> & targetSentences,
-                const int tmId);
-    std::string _trim(std::string & str);
+    void _getSourceSentencesAndAlignments(
+                std::vector<std::string> & sourceSentences,
+                std::vector<std::vector<std::vector<int> > > & allAlignments,
+                const std::vector<std::string> & rawSourceSentences);
 
     boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
 
unit_dao.cpp

@@ -46,18 +46,18 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
     return newIds;
 }
 
-std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
-    const std::vector<AlignedUnit> & alignedUnits,
+std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
+    const std::vector<TokenizedSentence> & sourceSentences,
+    const std::vector<TokenizedSentence> & targetSentences,
+    const std::vector<std::vector<std::vector<int> > > & allAlignments,
     const int tmId) {
-    //TODO
-
     DBconnection connection;
     std::vector<SUFFIX_MARKER_TYPE> newIds;
     connection.startTransaction();
 
-    BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
-        newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
+    for (int i=0; i< sourceSentences.size(); i++) {
+        newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
     }
 
     connection.endTransaction();
@@ -195,16 +195,18 @@ int UnitDAO::_addSingleSentence(
 
 int UnitDAO::_addAlignedUnit(
     DBconnection & connection,
-    const AlignedUnit & alignedUnit,
+    const TokenizedSentence & sourceSentence,
+    const TokenizedSentence & targetSentence,
+    const std::vector<std::vector<int> > & alignments,
     const int tmId) {
 
     std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
     std::vector<QueryParam*> params;
-    params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
-    params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
+    params.push_back(new StringParam(sourceSentence.getSentence()));
+    params.push_back(new StringParam(targetSentence.getSentence()));
     params.push_back(new IntParam(tmId));
-    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
-    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
+    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
+    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
 
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
@@ -214,22 +216,22 @@ int UnitDAO::_addAlignedUnit(
     }
 
     // add alignments
-    for(int i=0;i<alignedUnit.getAlignments().size();i++) {
-        for (int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
-            std::string query = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
-            std::vector<QueryParam*> params;
-            params.push_back(new IntParam(newId));
-            params.push_back(new IntParam(i));
-            params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
-
-            PGresult * result = connection.execute(query, params);
-            connection.clearResult(result);
-            BOOST_FOREACH (QueryParam * param, params) {
-                delete param;
-            }
-        }
-    }
+    bool nonEmpty = false;
+    std::stringstream alignmentsQuery;
+    alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
+    for(int i=0;i<alignments.size();i++) {
+        for (int j=0;j<alignments[i].size();j++) {
+            nonEmpty = true;
+            alignmentsQuery << "(" << newId << "," << i << "," << alignments[i][j] << "),";
+        }
+    }
+    if (nonEmpty) {
+        query = alignmentsQuery.str();
+        query = query.substr(0, query.length()-1);
+        PGresult * result = connection.execute(query);
+        connection.clearResult(result);
+    }
 
     return newId;
 }
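This hunk is the core of the optimization: instead of executing one parameterized INSERT per alignment pair, _addAlignedUnit now accumulates a single multi-row INSERT per unit and sends it once. A self-contained sketch of the string it builds (the unit id and alignments are invented for illustration):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
    int newId = 7;  // hypothetical id returned by the INSERT INTO unit ... RETURNING id query
    std::vector<std::vector<int> > alignments;
    std::vector<int> a0; a0.push_back(0);
    std::vector<int> a1; a1.push_back(1); a1.push_back(2);
    alignments.push_back(a0);
    alignments.push_back(a1);

    // Mirror the loop from the diff: append one "(unit_id,source_pos,target_pos)"
    // tuple per alignment pair, then trim the trailing comma.
    bool nonEmpty = false;
    std::stringstream alignmentsQuery;
    alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
    for (size_t i = 0; i < alignments.size(); i++) {
        for (size_t j = 0; j < alignments[i].size(); j++) {
            nonEmpty = true;
            alignmentsQuery << "(" << newId << "," << i << "," << alignments[i][j] << "),";
        }
    }
    if (nonEmpty) {
        std::string query = alignmentsQuery.str();
        query = query.substr(0, query.length() - 1);
        std::cout << query << std::endl;
        // Prints:
        // INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values (7,0,0),(7,1,1),(7,1,2)
    }
    return 0;
}
```

Since every interpolated value is an integer produced in-process, building the statement as a string does not open an injection hole here, and N database round trips per unit collapse into one.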
unit_dao.hpp

@@ -11,7 +11,6 @@
 #include <concordia/concordia_search_result.hpp>
 #include <boost/shared_ptr.hpp>
 
-#include "aligned_unit.hpp"
 #include "simple_search_result.hpp"
 #include "complete_concordia_search_result.hpp"
 #include "db_connection.hpp"
@@ -35,8 +34,10 @@ public:
        const std::vector<std::string> & targetSentences,
        const int tmId);
 
-    std::vector<SUFFIX_MARKER_TYPE> addAlignedUnits(
-        const std::vector<AlignedUnit> & alignedUnits,
+    std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
+        const std::vector<TokenizedSentence> & sourceSentences,
+        const std::vector<TokenizedSentence> & targetSentences,
+        const std::vector<std::vector<std::vector<int> > > & allAlignments,
         const int tmId);
 
     std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
@@ -58,7 +59,9 @@ private:
 
     int _addAlignedUnit(
         DBconnection & connection,
-        const AlignedUnit & alignedUnit,
+        const TokenizedSentence & sourceSentence,
+        const TokenizedSentence & targetSentence,
+        const std::vector<std::vector<int> > & alignments,
         const int tmId);
 };
 
(Python test client; filename not shown in this view)

@@ -43,7 +43,6 @@ data = {
 req = urllib2.Request(address)
 req.add_header('Content-Type', 'application/json')
 response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
-print response
 tmId = int(response['newTmId'])
 print "Added new tm: %d" % tmId
 
@@ -80,7 +79,7 @@ if len(sentences) > 0:
     add_data(data)
 
 end = time.time()
-print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start))
+print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
 
 print "Generating index..."
 start = time.time()
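The final hunk fixes the throughput arithmetic in the test client: each sentence pair spans three input lines, so the sentence count is (lineNumber+1)/3, and sentences per second is that count divided by the elapsed time, i.e. (lineNumber+1)/(3*(end-start)). The old expression multiplied by the elapsed time instead of dividing by it.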