storing and retrieving alignments

This commit is contained in:
rjawor 2015-12-29 22:13:21 +01:00
parent 67f485cbc2
commit 05d99168ed
17 changed files with 395 additions and 63 deletions

View File

@ -99,6 +99,20 @@ if(WITH_PCRE)
set(HAVE_PCRE 1)
endif(WITH_PCRE)
# ----------------------------------------------------
# ICU (I feeeeel youuuuu...)
# ----------------------------------------------------
find_library(ICU_LIB NAMES icui18n)
find_path(ICU_INCLUDE unicode)
if(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
message(STATUS "Found ICU: ${ICU_LIB}")
include_directories(${ICU_INCLUDE})
get_filename_component(ICU_LIB_DIR ${ICU_LIB} PATH) # ICU_LIB is the library file itself; -L needs its directory
link_directories(${ICU_LIB_DIR})
else()
message(FATAL_ERROR "ICU not found")
endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
# ----------------------------------------------------
# Boost
# ----------------------------------------------------

View File

@ -1,5 +1,5 @@
#!/bin/sh
rm -rf index/*
sudo rm -rf index/*
cd db
./recreateDb.sh
./recreateDb.sh

View File

@ -16,5 +16,5 @@ add_executable(concordia_server_process
tm_dao.cpp
aligned_unit.cpp
)
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case)
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
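The new icuuc dependency backs the UnicodeString conversions introduced in IndexController below. A minimal, self-contained sketch of the UTF-8 round trip; note that the UnicodeString(const char*) constructor used in the controller converts via the platform's default codepage, whereas fromUTF8 is the explicit UTF-8 route:

#include <unicode/unistr.h>
#include <iostream>
#include <string>

int main() {
    UnicodeString u = UnicodeString::fromUTF8("za\xC5\xBC\xC3\xB3\xC5\x82\xC4\x87"); // "zażółć" as raw UTF-8 bytes
    std::string back;
    u.toUTF8String(back); // UTF-16 -> UTF-8
    std::cout << back << std::endl;
    return 0;
}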

View File

@ -1,9 +1,15 @@
#include "aligned_unit.hpp"
AlignedUnit::AlignedUnit() {
AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
std::vector<std::vector<int> > alignments):
_sourceSentence(sourceSentence),
_targetSentence(targetSentence),
_alignments(alignments) {
}
AlignedUnit::~AlignedUnit() {
}

View File

@ -10,19 +10,29 @@ class AlignedUnit {
public:
/*! Constructor.
*/
AlignedUnit();
AlignedUnit(const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
std::vector<std::vector<int> > alignments);
/*! Destructor.
*/
virtual ~AlignedUnit();
boost::shared_ptr<TokenizedSentence> getSourceSentence() {
TokenizedSentence getSourceSentence() const {
return _sourceSentence;
}
TokenizedSentence getTargetSentence() const {
return _targetSentence;
}
std::vector<std::vector<int> > getAlignments() const {
return _alignments;
}
private:
boost::shared_ptr<TokenizedSentence> _sourceSentence;
TokenizedSentence _sourceSentence;
boost::shared_ptr<TokenizedSentence> _targetSentence;
TokenizedSentence _targetSentence;
std::vector<std::vector<int> > _alignments;
};
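AlignedUnit now stores both sentences by value instead of behind boost::shared_ptr, with const getters returning copies. A minimal construction sketch, where sourceTS and targetTS stand for TokenizedSentence values obtained from Concordia's tokenize() (as in IndexController below) and the alignment numbers are made up:

// source token 0 -> target token 0; source token 1 -> target tokens 1 and 2
std::vector<std::vector<int> > alignments(2);
alignments[0].push_back(0);
alignments[1].push_back(1);
alignments[1].push_back(2);
AlignedUnit unit(sourceTS, targetTS, alignments);
std::vector<std::vector<int> > a = unit.getAlignments(); // returns a copy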

View File

@ -58,37 +58,43 @@ int main(int argc, char** argv) {
std::streambuf * cout_streambuf = std::cout.rdbuf();
std::streambuf * cerr_streambuf = std::cerr.rdbuf();
ConcordiaServer concordiaServer(CONFIG_FILE_PATH);
Logger::log("Concordia server initiated successfully, waiting for requests");
try {
ConcordiaServer concordiaServer(CONFIG_FILE_PATH);
Logger::log("Concordia server initiated successfully, waiting for requests");
FCGX_Request request;
FCGX_Request request;
FCGX_Init();
FCGX_InitRequest(&request, 0, 0);
FCGX_Init();
FCGX_InitRequest(&request, 0, 0);
while (FCGX_Accept_r(&request) == 0) {
fcgi_streambuf cin_fcgi_streambuf(request.in);
fcgi_streambuf cout_fcgi_streambuf(request.out);
fcgi_streambuf cerr_fcgi_streambuf(request.err);
while (FCGX_Accept_r(&request) == 0) {
fcgi_streambuf cin_fcgi_streambuf(request.in);
fcgi_streambuf cout_fcgi_streambuf(request.out);
fcgi_streambuf cerr_fcgi_streambuf(request.err);
std::cin.rdbuf(&cin_fcgi_streambuf);
std::cout.rdbuf(&cout_fcgi_streambuf);
std::cerr.rdbuf(&cerr_fcgi_streambuf);
std::cin.rdbuf(&cin_fcgi_streambuf);
std::cout.rdbuf(&cout_fcgi_streambuf);
std::cerr.rdbuf(&cerr_fcgi_streambuf);
std::string content = get_request_content(request);
std::string content = get_request_content(request);
std::string requestString(content);
std::cout << concordiaServer.handleRequest(requestString);
// Note: the fcgi_streambuf destructor will auto flush
}
// restore stdio streambufs
std::cin.rdbuf(cin_streambuf);
std::cout.rdbuf(cout_streambuf);
std::cerr.rdbuf(cerr_streambuf);
Logger::log("Gracefully shutting down Concordia server process");
std::string requestString(content);
std::cout << concordiaServer.handleRequest(requestString);
// Note: the fcgi_streambuf destructor will auto flush
} catch (ConcordiaException & e) {
std::stringstream errorstream;
errorstream << "FATAL CONCORDIA ERROR: " << e.what()<< " - shutting down";
Logger::log(errorstream.str());
}
// restore stdio streambufs
std::cin.rdbuf(cin_streambuf);
std::cout.rdbuf(cout_streambuf);
std::cerr.rdbuf(cerr_streambuf);
Logger::log("Shutting down Concordia server process");
return 0;
}

View File

@ -2,6 +2,15 @@
#include <concordia/common/config.hpp>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>
#include <unicode/unistr.h>
#include "json_generator.hpp"
#include "logger.hpp"
@ -23,10 +32,10 @@ void IndexController::addSentence(
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
TokenizedSentence tokenizedSentence = (*_concordiasMap)[tmId].tokenize(sourceSentence);
TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
(*_concordiasMap)[tmId].addTokenizedExample(tokenizedSentence, sentenceId);
(*_concordiasMap)[tmId].refreshSAfromRAM();
it->second->addTokenizedExample(tokenizedSentence, sentenceId);
it->second->refreshSAfromRAM();
jsonWriter.StartObject();
jsonWriter.String("status");
@ -58,9 +67,9 @@ void IndexController::addSentences(
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<TokenizedSentence> tokenizedSentences = (*_concordiasMap)[tmId].tokenizeAll(sourceSentences);
std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
(*_concordiasMap)[tmId].addAllTokenizedExamples(tokenizedSentences, sentenceIds);
it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
jsonWriter.StartObject();
jsonWriter.String("status");
@ -84,13 +93,13 @@ void IndexController::addAlignedSentences(
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences);
std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
int index = 0;
for(std::vector<AlignedUnit>::iterator it = alignedUnits.begin(); it != alignedUnits.end(); ++it) {
(*_concordiasMap)[tmId].addTokenizedExample(*(it->getSourceSentence()), sentenceIds.at(index));
for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
index++;
}
}
jsonWriter.StartObject();
jsonWriter.String("status");
@ -111,7 +120,7 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
(*_concordiasMap)[tmId].refreshSAfromRAM();
it->second->refreshSAfromRAM();
jsonWriter.StartObject();
jsonWriter.String("status");
@ -129,13 +138,66 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
}
std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences) {
//TODO
const std::vector<std::string> & targetSentences,
const int tmId) {
std::vector<AlignedUnit> result;
for (int i = 0; i<sourceSentences.size(); i++) {
std::string sourceSentence = sourceSentences[i];
std::string targetSentence = targetSentences[i];
std::string rawSourceSentence;
std::vector<TokenAnnotation> sourceTokens;
std::vector<std::vector<int> > alignments;
UnicodeString s(sourceSentence.c_str());
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(
s,
boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase)
)
);
boost::u32regex_iterator<const UChar*> end;
for (; begin != end; ++begin) {
UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
std::string token;
tokenUTF8.toUTF8String(token);
if (token != "NULL") {
std::string numbers((*begin)[2].first, (*begin)[2].second);
std::istringstream iss(numbers);
std::vector<std::string> numberStrings;
std::copy(std::istream_iterator<std::string>(iss),
std::istream_iterator<std::string>(),
std::back_inserter(numberStrings));
std::vector<int> tokenAlignments;
for (int j=0;j<numberStrings.size();j++) {
int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
tokenAlignments.push_back(n);
}
alignments.push_back(tokenAlignments);
rawSourceSentence += token + " ";
}
}
rawSourceSentence = _trim(rawSourceSentence);
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
}
}
return result;
}
std::string IndexController::_trim(std::string & str) {
size_t first = str.find_first_not_of(' ');
if (first == std::string::npos) {
return ""; // empty or all-space input: substr(npos, ...) would throw
}
size_t last = str.find_last_not_of(' ');
return str.substr(first, (last-first+1));
}
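The regular expression consumes GIZA++-style alignment lines, where each source token is followed by the 1-based positions of the target tokens it aligns to, and the NULL pseudo-token collects unaligned target words. A self-contained sketch of the same extraction using std::regex (ASCII-only, for illustration; the commit uses boost::u32regex precisely to stay Unicode-safe):

#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

int main() {
    std::string line = "NULL ({ }) das ({ 1 }) Haus ({ 2 3 })"; // assumed input format
    std::regex re("(\\S+) \\(\\{(( \\d+)*) \\}\\)");
    std::vector<std::vector<int> > alignments;
    for (std::sregex_iterator it(line.begin(), line.end(), re), end; it != end; ++it) {
        std::string token = (*it)[1];
        if (token == "NULL") continue; // skip the unaligned-words bucket
        std::istringstream iss((*it)[2]);
        std::vector<int> tokenAlignments;
        int n;
        while (iss >> n) tokenAlignments.push_back(n - 1); // 1-based -> 0-based
        alignments.push_back(tokenAlignments);
    }
    std::cout << alignments.size() << " aligned source tokens" << std::endl; // prints 2: {0} and {1,2}
    return 0;
}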

View File

@ -6,6 +6,7 @@
#include <concordia/concordia.hpp>
#include <concordia/concordia_exception.hpp>
#include <boost/ptr_container/ptr_map.hpp>
#include <concordia/token_annotation.hpp>
#include "unit_dao.hpp"
@ -43,8 +44,11 @@ public:
private:
std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences);
const std::vector<std::string> & targetSentences,
const int tmId);
std::string _trim(std::string & str);
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
UnitDAO _unitDAO;

View File

@ -1,5 +1,6 @@
#include "json_generator.hpp"
#include <boost/foreach.hpp>
JsonGenerator::JsonGenerator() {
}
@ -34,7 +35,19 @@ void JsonGenerator::writeSearchResult(rapidjson::Writer<rapidjson::StringBuffer>
jsonWriter.String("sourceSegment");
jsonWriter.String(result.getSourceSegment().c_str());
jsonWriter.String("targetSegment");
jsonWriter.String(result.getTargetSegment().c_str());
jsonWriter.String(result.getTargetSegment().c_str());
jsonWriter.String("targetFragments");
jsonWriter.StartArray();
for (std::vector<std::pair<int,int> >::const_iterator it = result.getTargetFragments().begin();
it != result.getTargetFragments().end(); it++) {
jsonWriter.StartArray();
jsonWriter.Int(it->first);
jsonWriter.Int(it->second);
jsonWriter.EndArray();
}
jsonWriter.EndArray();
jsonWriter.EndObject();
}
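writeSearchResult now appends a targetFragments array to every search result: one [start, end] pair per contiguous matched stretch of the target segment, using the offsets stored in target_tokens. For a hypothetical hit with two detached fragments (offsets made up), the added portion of the JSON would be:

"targetFragments": [[10, 19], [25, 31]]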

View File

@ -19,7 +19,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
const int tmId) {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults((*_concordiasMap)[tmId].simpleSearch(pattern));
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
jsonWriter.StartObject();
jsonWriter.String("status");
@ -42,7 +42,7 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult((*_concordiasMap)[tmId].concordiaSearch(pattern));
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
jsonWriter.StartObject();
jsonWriter.String("status");

View File

@ -20,3 +20,8 @@ SimpleSearchResult::SimpleSearchResult(
SimpleSearchResult::~SimpleSearchResult() {
}
void SimpleSearchResult::addMatchedTargetFragment(const std::pair<int,int> & targetFragment) {
_targetFragments.push_back(targetFragment);
}

View File

@ -2,6 +2,7 @@
#define SIMPLE_SEARCH_RESULT_HDR
#include <string>
#include <vector>
class SimpleSearchResult {
public:
@ -47,6 +48,12 @@ public:
return _targetSegment;
}
const std::vector<std::pair<int,int> > & getTargetFragments() const {
return _targetFragments;
}
void addMatchedTargetFragment(const std::pair<int,int> & targetFragment);
private:
int _id;
@ -61,6 +68,8 @@ private:
std::string _sourceSegment;
std::string _targetSegment;
std::vector<std::pair<int,int> > _targetFragments;
};
#endif

View File

@ -1,5 +1,7 @@
#include "unit_dao.hpp"
#include<sstream>
#include "query_param.hpp"
#include "string_param.hpp"
#include "int_param.hpp"
@ -48,7 +50,17 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
const std::vector<AlignedUnit> & alignedUnits,
const int tmId) {
//TODO
DBconnection connection;
std::vector<SUFFIX_MARKER_TYPE> newIds;
connection.startTransaction();
BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
}
connection.endTransaction();
return newIds;
}
@ -84,23 +96,66 @@ void UnitDAO::_getResultsFromFragments(
matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
}
std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
std::vector<QueryParam*> params;
params.push_back(new IntParam(2*fragment.getExampleOffset()+1));
params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength())));
params.push_back(new IntParam(fragment.getExampleId()));
PGresult * result = connection.execute(query, params);
results.push_back(SimpleSearchResult(connection.getIntValue(result,0,0), // example id
matchedPatternStart,
matchedPatternEnd,
connection.getIntValue(result,0,3), // matched example start
connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment
connection.getStringValue(result,0,2))); // target segment
SimpleSearchResult ssResult(connection.getIntValue(result,0,0), // example id
matchedPatternStart,
matchedPatternEnd,
connection.getIntValue(result,0,3), // matched example start
connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment
connection.getStringValue(result,0,2)); // target segment
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
// now add all target fragments matched with this fragment
std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
std::vector<QueryParam*> targetParams;
targetParams.push_back(new IntParam(fragment.getExampleId()));
targetParams.push_back(new IntParam(fragment.getExampleOffset()));
targetParams.push_back(new IntParam(fragment.getExampleOffset() + fragment.getMatchedLength() - 1));
PGresult * targetResult = connection.execute(targetQuery, targetParams);
int prevPos = -2;
int currStart = -1;
int currEnd = -1;
for (int i=0;i<connection.getRowCount(targetResult);i++) {
int targetPos = connection.getIntValue(targetResult, i, 0);
int targetStart = connection.getIntValue(targetResult, i, 1);
int targetEnd = connection.getIntValue(targetResult, i, 2);
if (prevPos < targetPos - 1) { // beginning of detached fragment
// check if there is a fragment to end
if (currStart >= 0) {
ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
currStart = targetStart;
}
currEnd = targetEnd;
prevPos = targetPos;
}
// check if there are remaining fragments
if (currStart >= 0) {
ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
connection.clearResult(targetResult);
BOOST_FOREACH (QueryParam * param, targetParams) {
delete param;
}
results.push_back(ssResult);
}
connection.endTransaction();
}
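The loop over targetResult merges runs of consecutive target token positions into maximal contiguous fragments: a gap in target_token_pos closes the current fragment and opens a new one. A standalone sketch of that merge, with made-up positions and character offsets:

#include <iostream>
#include <utility>
#include <vector>

int main() {
    // (target_token_pos, start offset, end offset), ordered by position as the query returns them
    int rows[][3] = { {2, 10, 13}, {3, 14, 19}, {5, 25, 31} };
    std::vector<std::pair<int,int> > fragments;
    int prevPos = -2, currStart = -1, currEnd = -1;
    for (int i = 0; i < 3; i++) {
        int pos = rows[i][0];
        if (prevPos < pos - 1) { // gap: close the previous fragment, start a new one
            if (currStart >= 0)
                fragments.push_back(std::make_pair(currStart, currEnd));
            currStart = rows[i][1];
        }
        currEnd = rows[i][2];
        prevPos = pos;
    }
    if (currStart >= 0) // flush the last open fragment
        fragments.push_back(std::make_pair(currStart, currEnd));
    std::cout << fragments.size() << " fragments" << std::endl; // prints 2: (10,19) and (25,31)
    return 0;
}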
@ -138,5 +193,45 @@ int UnitDAO::_addSingleSentence(
}
int UnitDAO::_addAlignedUnit(
DBconnection & connection,
const AlignedUnit & alignedUnit,
const int tmId) {
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
std::vector<QueryParam*> params;
params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
params.push_back(new IntParam(tmId));
params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
PGresult * result = connection.execute(query, params);
int newId = connection.getIntValue(result, 0, 0);
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
// add alignments
for(int i=0;i<alignedUnit.getAlignments().size();i++) {
for (int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
std::string query = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
std::vector<QueryParam*> params;
params.push_back(new IntParam(newId));
params.push_back(new IntParam(i));
params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
PGresult * result = connection.execute(query, params);
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
}
}
return newId;
}
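With the two-dimensional alignments column gone from unit (see the schema change below), every source-to-target link becomes one row in the new alignment table. For a unit inserted with alignments {{0}, {1, 2}} and a returned id of 42 (both values made up), the inner loops write three rows:

unit_id | source_token_pos | target_token_pos
     42 |                0 |                0
     42 |                1 |                1
     42 |                1 |                2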

View File

@ -56,6 +56,10 @@ private:
const std::string & targetSentence,
const int tmId);
int _addAlignedUnit(
DBconnection & connection,
const AlignedUnit & alignedUnit,
const int tmId);
};
#endif

View File

@ -20,7 +20,14 @@ CREATE TABLE unit (
source_segment text,
target_segment text,
source_tokens integer[],
target_tokens integer[],
alignments integer[][]
target_tokens integer[]
);
DROP TABLE IF EXISTS alignment;
CREATE TABLE alignment (
id SERIAL PRIMARY KEY,
unit_id integer,
source_token_pos integer,
target_token_pos integer
);

tests/addAlignedFile.py Executable file
View File

@ -0,0 +1,99 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import host
import time
BUFFER_SIZE = 500
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
address += ':'+host.concordia_port
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
def add_data(data):
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
#print response
sourceFile = sys.argv[1]
sourceLangId = int(sys.argv[2])
targetLangId = int(sys.argv[3])
name = sys.argv[4]
totalLines = file_len(sourceFile)
data = {
'operation': 'addTm',
'sourceLangId':sourceLangId,
'targetLangId':targetLangId,
'name':name
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print response
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId
data = {
'operation': 'addAlignedSentences',
'tmId':tmId
}
sentences = []
currSentence = []
start = time.time()
with open(sourceFile) as sourceLines:
lineNumber = 0
for line in sourceLines:
line = line.strip()
if lineNumber % 3 == 1:
currSentence.append(line)
elif lineNumber % 3 == 2:
currSentence.append(line)
currSentence.reverse()
sentences.append(currSentence)
currSentence = []
if len(sentences) >= BUFFER_SIZE:
data['sentences'] = sentences
add_data(data)
mark = time.time()
print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/3*(mark-start))
lineNumber += 1
if len(sentences) > 0:
data['sentences'] = sentences
add_data(data)
end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start))
print "Generating index..."
start = time.time()
data = {
'operation': 'refreshIndex',
'tmId' : tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)

View File

@ -2,9 +2,7 @@
description "pgbouncer"
start on (net-device-up
and local-filesystems
and runlevel [2345])
start on started postgresql
stop on runlevel [016]