Optimized adding aligned files: sentences are tokenized in batches and each unit's alignments are inserted with a single query

2016-01-01 22:10:51 +01:00 · 2016-01-01 22:10:51 +01:00 · 883aebe919
commit 883aebe919
parent 129f154d5e
8 changed files with 66 additions and 131 deletions
--- a/concordia-server/CMakeLists.txt
+++ b/concordia-server/CMakeLists.txt
@ -14,7 +14,6 @@ add_executable(concordia_server_process
                  simple_search_result.cpp
                  complete_concordia_search_result.cpp
                  tm_dao.cpp
-                  aligned_unit.cpp
              )
 target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)

--- a/concordia-server/aligned_unit.cpp
+++ b/concordia-server/aligned_unit.cpp
@ -1,15 +0,0 @@
-#include "aligned_unit.hpp"
-
-
-AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
-                         const TokenizedSentence & targetSentence,
-                         std::vector<std::vector<int> > alignments):
-                         _sourceSentence(sourceSentence),
-                         _targetSentence(targetSentence),
-                         _alignments(alignments) {
-}
-
-
-AlignedUnit::~AlignedUnit() {
-}
-
--- a/concordia-server/aligned_unit.hpp
+++ b/concordia-server/aligned_unit.hpp
@ -1,40 +0,0 @@
-#ifndef ALIGNED_UNIT_HDR
-#define ALIGNED_UNIT_HDR
-
-#include <vector>
-#include <string>
-
-#include <concordia/tokenized_sentence.hpp>
-
-class AlignedUnit {
-public:
-    /*! Constructor.
-    */
-    AlignedUnit(const TokenizedSentence & sourceSentence,
-                const TokenizedSentence & targetSentence,
-                std::vector<std::vector<int> > alignments);
-    /*! Destructor.
-    */
-    virtual ~AlignedUnit();
-    
-    TokenizedSentence getSourceSentence() const {
-        return _sourceSentence;
-    }
-
-    TokenizedSentence getTargetSentence() const {
-        return _targetSentence;
-    }
-
-    std::vector<std::vector<int> > getAlignments() const {
-        return _alignments;
-    }
-    
-private:
-    TokenizedSentence _sourceSentence;
-        
-    TokenizedSentence _targetSentence;
-
-    std::vector<std::vector<int> > _alignments;    
-};
-
-#endif
--- a/concordia-server/index_controller.cpp
+++ b/concordia-server/index_controller.cpp
@ -87,20 +87,23 @@ void IndexController::addSentences(

 void IndexController::addAlignedSentences(
                 rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
-                 const std::vector<std::string> & sourceSentences,
+                 const std::vector<std::string> & rawSourceSentences,
                 const std::vector<std::string> & targetSentences,
                 const int tmId) {
    try {
        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
-            std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
-            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
-            int index = 0;
-            for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
-                it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
-                index++;
-            } 
+            std::vector<std::string> sourceSentences;
+            std::vector<std::vector<std::vector<int> > > allAlignments;
+            _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
+            
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);            
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);

+            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
+            for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
+                it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
+            } 
            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
@ -137,19 +140,18 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf

 }

-std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                                                           const std::vector<std::string> & targetSentences,
-                                                           const int tmId) {
-    std::vector<AlignedUnit> result;
-    for (int i = 0; i<sourceSentences.size(); i++) {
-        std::string sourceSentence = sourceSentences[i];
-        std::string targetSentence = targetSentences[i];
+void IndexController::_getSourceSentencesAndAlignments(
+                        std::vector<std::string> & sourceSentences,
+                        std::vector<std::vector<std::vector<int> > > & allAlignments,
+                        const std::vector<std::string> & rawSourceSentences) {
+
+    for (int i = 0; i<rawSourceSentences.size(); i++) {
+        std::string rawSourceSentence = rawSourceSentences[i];
        
-        std::string rawSourceSentence;
-        std::vector<TokenAnnotation> sourceTokens;
+        std::string sourceSentence = "";
        std::vector<std::vector<int> > alignments;
        
-        UnicodeString s(sourceSentence.c_str());
+        UnicodeString s(rawSourceSentence.c_str());
        boost::u32regex_iterator<const UChar*> begin(
                           boost::make_u32regex_iterator(
                               s,
@ -177,27 +179,14 @@ std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std
                    tokenAlignments.push_back(n);
                }
                alignments.push_back(tokenAlignments);
-                rawSourceSentence += token + " ";
+                sourceSentence += token + " ";
            }
        }
        
-        rawSourceSentence = _trim(rawSourceSentence);
+        sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
        
-        
-        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
-        if (it != _concordiasMap->end()) {
-            TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
-            TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
-                    
-            result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
-        }
+        sourceSentences.push_back(sourceSentence);
+        allAlignments.push_back(alignments);
    }
-    return result;
-}
-
-std::string IndexController::_trim(std::string & str) {
-    size_t first = str.find_first_not_of(' ');
-    size_t last = str.find_last_not_of(' ');
-    return str.substr(first, (last-first+1));
 }

--- a/concordia-server/index_controller.hpp
+++ b/concordia-server/index_controller.hpp
@ -10,7 +10,6 @@


 #include "unit_dao.hpp"
-#include "aligned_unit.hpp"

 #include "rapidjson/writer.h"

@ -35,7 +34,7 @@ public:
                     const int tmId);

    void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
-                             const std::vector<std::string> & sourceSentences,
+                             const std::vector<std::string> & rawSourceSentences,
                             const std::vector<std::string> & targetSentences,
                             const int tmId);

@ -43,12 +42,11 @@ public:
                             const int tmId);
    
 private:
-    std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                                              const std::vector<std::string> & targetSentences,
-                                              const int tmId);
+    void _getSourceSentencesAndAlignments(
+                            std::vector<std::string> & sourceSentences,
+                            std::vector<std::vector<std::vector<int> > > & allAlignments,
+                            const std::vector<std::string> & rawSourceSentences);

-    std::string _trim(std::string & str);
-    
    boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
    
    UnitDAO _unitDAO;
--- a/concordia-server/unit_dao.cpp
+++ b/concordia-server/unit_dao.cpp
@ -46,18 +46,18 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
    return newIds;
 }

-std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
-         const std::vector<AlignedUnit> & alignedUnits,
+std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
+         const std::vector<TokenizedSentence> & sourceSentences,
+         const std::vector<TokenizedSentence> & targetSentences,
+         const std::vector<std::vector<std::vector<int> > > & allAlignments,
         const int tmId) {
-    //TODO
-

    DBconnection connection;
    std::vector<SUFFIX_MARKER_TYPE> newIds;
    connection.startTransaction();

-    BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
-        newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
+    for (int i=0; i< sourceSentences.size(); i++) {
+        newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
    }
    
    connection.endTransaction();
@ -194,17 +194,19 @@ int UnitDAO::_addSingleSentence(


 int UnitDAO::_addAlignedUnit(
-         DBconnection & connection,
-         const AlignedUnit & alignedUnit,
-         const int tmId) {
+     DBconnection & connection,
+     const TokenizedSentence & sourceSentence,
+     const TokenizedSentence & targetSentence,
+     const std::vector<std::vector<int> > & alignments,
+     const int tmId) {
        
    std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
    std::vector<QueryParam*> params;
-    params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
-    params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
+    params.push_back(new StringParam(sourceSentence.getSentence()));
+    params.push_back(new StringParam(targetSentence.getSentence()));
    params.push_back(new IntParam(tmId));
-    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
-    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
+    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
+    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
    
    PGresult * result = connection.execute(query, params);
    int newId = connection.getIntValue(result, 0, 0);
@ -214,23 +216,23 @@ int UnitDAO::_addAlignedUnit(
    }
    
    // add alignments
-    for(int i=0;i<alignedUnit.getAlignments().size();i++) {
-        for (int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
-            std::string query = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
-            std::vector<QueryParam*> params;
-            params.push_back(new IntParam(newId));
-            params.push_back(new IntParam(i));
-            params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
-            
-            PGresult * result = connection.execute(query, params);
-            connection.clearResult(result);
-            BOOST_FOREACH (QueryParam * param, params) {
-                delete param;
-            }
+    bool nonEmpty = false;
+    std::stringstream alignmentsQuery;
+    alignmentsQuery << "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values ";
+
+    for(int i=0;i<alignments.size();i++) {
+        for (int j=0;j<alignments[i].size();j++) {
+            nonEmpty = true;
+            alignmentsQuery << "(" << newId << "," << i << "," << alignments[i][j] << "),";
        }
    }
+    if (nonEmpty) {
+        query = alignmentsQuery.str();
+        query = query.substr(0, query.length()-1);
+        PGresult * result = connection.execute(query);
+        connection.clearResult(result);    
+    }

-    
    return newId;
 }

--- a/concordia-server/unit_dao.hpp
+++ b/concordia-server/unit_dao.hpp
@ -11,7 +11,6 @@
 #include <concordia/concordia_search_result.hpp>
 #include <boost/shared_ptr.hpp>

-#include "aligned_unit.hpp"
 #include "simple_search_result.hpp"
 #include "complete_concordia_search_result.hpp"
 #include "db_connection.hpp"
@ -35,8 +34,10 @@ public:
             const std::vector<std::string> & targetSentences,
             const int tmId);
             
-    std::vector<SUFFIX_MARKER_TYPE> addAlignedUnits(
-             const std::vector<AlignedUnit> & alignedUnits,
+    std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
+             const std::vector<TokenizedSentence> & sourceSentences,
+             const std::vector<TokenizedSentence> & targetSentences,
+             const std::vector<std::vector<std::vector<int> > > & allAlignments,
             const int tmId);
    
    std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
@ -58,7 +59,9 @@ private:

    int _addAlignedUnit(
         DBconnection & connection,
-         const AlignedUnit & alignedUnit,
+         const TokenizedSentence & sourceSentence,
+         const TokenizedSentence & targetSentence,
+         const std::vector<std::vector<int> > & alignments,
         const int tmId);
 };

--- a/tests/addAlignedFile.py
+++ b/tests/addAlignedFile.py
@ -43,7 +43,6 @@ data = {
 req = urllib2.Request(address)
 req.add_header('Content-Type', 'application/json')
 response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
-print response
 tmId = int(response['newTmId'])
 print "Added new tm: %d" % tmId

@ -80,7 +79,7 @@ if len(sentences) > 0:
    add_data(data)
    
 end = time.time()
-print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start))
+print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))

 print "Generating index..."
 start = time.time()