diff --git a/CMakeLists.txt b/CMakeLists.txt index 5038610..04235b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,20 @@ if(WITH_PCRE) set(HAVE_PCRE 1) endif(WITH_PCRE) +# ---------------------------------------------------- +# ICU (I feeeeel youuuuu...) +# ---------------------------------------------------- +find_library(ICU_LIB NAMES icui18n) +find_path(ICU_INCLUDE unicode) + +if(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE}) + message(STATUS "Found ICU: ${ICU_LIB}") + include_directories(${ICU_INCLUDE}) + link_directories(${ICU_LIB}) +else() + message(FATAL_ERROR "ICU not found") +endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE}) + # ---------------------------------------------------- # Boost # ---------------------------------------------------- diff --git a/clearIndex.sh b/clearIndex.sh index 52d52ca..e824b59 100755 --- a/clearIndex.sh +++ b/clearIndex.sh @@ -1,5 +1,5 @@ #!/bin/sh -rm -rf index/* +sudo rm -rf index/* cd db -./recreateDb.sh +./recreateDb.sh diff --git a/concordia-server/CMakeLists.txt b/concordia-server/CMakeLists.txt index 228d004..53412a5 100644 --- a/concordia-server/CMakeLists.txt +++ b/concordia-server/CMakeLists.txt @@ -16,5 +16,5 @@ add_executable(concordia_server_process tm_dao.cpp aligned_unit.cpp ) -target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case) +target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc) diff --git a/concordia-server/aligned_unit.cpp b/concordia-server/aligned_unit.cpp index f6e9aa9..2cb7eab 100644 --- a/concordia-server/aligned_unit.cpp +++ b/concordia-server/aligned_unit.cpp @@ -1,9 +1,15 @@ #include "aligned_unit.hpp" -AlignedUnit::AlignedUnit() { +AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence, + const TokenizedSentence & targetSentence, + std::vector > alignments): + _sourceSentence(sourceSentence), + 
_targetSentence(targetSentence), + _alignments(alignments) { } + AlignedUnit::~AlignedUnit() { } diff --git a/concordia-server/aligned_unit.hpp b/concordia-server/aligned_unit.hpp index 9f6ad6a..e992386 100644 --- a/concordia-server/aligned_unit.hpp +++ b/concordia-server/aligned_unit.hpp @@ -10,19 +10,29 @@ class AlignedUnit { public: /*! Constructor. */ - AlignedUnit(); + AlignedUnit(const TokenizedSentence & sourceSentence, + const TokenizedSentence & targetSentence, + std::vector > alignments); /*! Destructor. */ virtual ~AlignedUnit(); - boost::shared_ptr getSourceSentence() { + TokenizedSentence getSourceSentence() const { return _sourceSentence; } + + TokenizedSentence getTargetSentence() const { + return _targetSentence; + } + + std::vector > getAlignments() const { + return _alignments; + } private: - boost::shared_ptr _sourceSentence; + TokenizedSentence _sourceSentence; - boost::shared_ptr _targetSentence; + TokenizedSentence _targetSentence; std::vector > _alignments; }; diff --git a/concordia-server/concordia_server_process.cpp b/concordia-server/concordia_server_process.cpp index 6dc48f3..570a361 100644 --- a/concordia-server/concordia_server_process.cpp +++ b/concordia-server/concordia_server_process.cpp @@ -58,37 +58,43 @@ int main(int argc, char** argv) { std::streambuf * cout_streambuf = std::cout.rdbuf(); std::streambuf * cerr_streambuf = std::cerr.rdbuf(); - ConcordiaServer concordiaServer(CONFIG_FILE_PATH); - Logger::log("Concordia server initiated successfully, waiting for requests"); + try { + ConcordiaServer concordiaServer(CONFIG_FILE_PATH); + Logger::log("Concordia server initiated successfully, waiting for requests"); - FCGX_Request request; + FCGX_Request request; - FCGX_Init(); - FCGX_InitRequest(&request, 0, 0); + FCGX_Init(); + FCGX_InitRequest(&request, 0, 0); - while (FCGX_Accept_r(&request) == 0) { - fcgi_streambuf cin_fcgi_streambuf(request.in); - fcgi_streambuf cout_fcgi_streambuf(request.out); - fcgi_streambuf 
cerr_fcgi_streambuf(request.err); + while (FCGX_Accept_r(&request) == 0) { + fcgi_streambuf cin_fcgi_streambuf(request.in); + fcgi_streambuf cout_fcgi_streambuf(request.out); + fcgi_streambuf cerr_fcgi_streambuf(request.err); - std::cin.rdbuf(&cin_fcgi_streambuf); - std::cout.rdbuf(&cout_fcgi_streambuf); - std::cerr.rdbuf(&cerr_fcgi_streambuf); + std::cin.rdbuf(&cin_fcgi_streambuf); + std::cout.rdbuf(&cout_fcgi_streambuf); + std::cerr.rdbuf(&cerr_fcgi_streambuf); - std::string content = get_request_content(request); + std::string content = get_request_content(request); + + std::string requestString(content); + std::cout << concordiaServer.handleRequest(requestString); + + // Note: the fcgi_streambuf destructor will auto flush + } + + // restore stdio streambufs + std::cin.rdbuf(cin_streambuf); + std::cout.rdbuf(cout_streambuf); + std::cerr.rdbuf(cerr_streambuf); + + Logger::log("Gracefully shutting down Concordia server process"); - std::string requestString(content); - std::cout << concordiaServer.handleRequest(requestString); - - // Note: the fcgi_streambuf destructor will auto flush + } catch (ConcordiaException & e) { + std::stringstream errorstream; + errorstream << "FATAL CONCORDIA ERROR: " << e.what()<< " - shutting down"; + Logger::log(errorstream.str()); } - - // restore stdio streambufs - std::cin.rdbuf(cin_streambuf); - std::cout.rdbuf(cout_streambuf); - std::cerr.rdbuf(cerr_streambuf); - - Logger::log("Shutting down Concordia server process"); - return 0; } diff --git a/concordia-server/index_controller.cpp b/concordia-server/index_controller.cpp index 5079ca8..c96c9d7 100644 --- a/concordia-server/index_controller.cpp +++ b/concordia-server/index_controller.cpp @@ -2,6 +2,15 @@ #include +#include +#include +#include +#include + +#include +#include +#include + #include "json_generator.hpp" #include "logger.hpp" @@ -23,10 +32,10 @@ void IndexController::addSentence( try { boost::ptr_map::iterator it = _concordiasMap->find(tmId); if (it != 
_concordiasMap->end()) { - TokenizedSentence tokenizedSentence = (*_concordiasMap)[tmId].tokenize(sourceSentence); + TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence); int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId); - (*_concordiasMap)[tmId].addTokenizedExample(tokenizedSentence, sentenceId); - (*_concordiasMap)[tmId].refreshSAfromRAM(); + it->second->addTokenizedExample(tokenizedSentence, sentenceId); + it->second->refreshSAfromRAM(); jsonWriter.StartObject(); jsonWriter.String("status"); @@ -58,9 +67,9 @@ void IndexController::addSentences( try { boost::ptr_map::iterator it = _concordiasMap->find(tmId); if (it != _concordiasMap->end()) { - std::vector tokenizedSentences = (*_concordiasMap)[tmId].tokenizeAll(sourceSentences); + std::vector tokenizedSentences = it->second->tokenizeAll(sourceSentences); std::vector sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId); - (*_concordiasMap)[tmId].addAllTokenizedExamples(tokenizedSentences, sentenceIds); + it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds); jsonWriter.StartObject(); jsonWriter.String("status"); @@ -84,13 +93,13 @@ void IndexController::addAlignedSentences( try { boost::ptr_map::iterator it = _concordiasMap->find(tmId); if (it != _concordiasMap->end()) { - std::vector alignedUnits = _getAlignedUnits(sourceSentences, targetSentences); + std::vector alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId); std::vector sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId); int index = 0; - for(std::vector::iterator it = alignedUnits.begin(); it != alignedUnits.end(); ++it) { - (*_concordiasMap)[tmId].addTokenizedExample(*(it->getSourceSentence()), sentenceIds.at(index)); + for(std::vector::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) { + it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index)); index++; - } + } jsonWriter.StartObject(); 
jsonWriter.String("status"); @@ -111,7 +120,7 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer::iterator it = _concordiasMap->find(tmId); if (it != _concordiasMap->end()) { - (*_concordiasMap)[tmId].refreshSAfromRAM(); + it->second->refreshSAfromRAM(); jsonWriter.StartObject(); jsonWriter.String("status"); @@ -129,13 +138,66 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer IndexController::_getAlignedUnits(const std::vector & sourceSentences, - const std::vector & targetSentences) { - //TODO + const std::vector & targetSentences, + const int tmId) { std::vector result; + for (int i = 0; i sourceTokens; + std::vector > alignments; + + UnicodeString s(sourceSentence.c_str()); + boost::u32regex_iterator begin( + boost::make_u32regex_iterator( + s, + boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase) + ) + ); + boost::u32regex_iterator end; + + for (; begin != end; ++begin) { + UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1)); + std::string token; + tokenUTF8.toUTF8String(token); + if (token != "NULL") { + std::string numbers((*begin)[2].first, (*begin)[2].second); + std::istringstream iss(numbers); + std::vector numberStrings; + std::copy(std::istream_iterator(iss), + std::istream_iterator(), + std::back_inserter(numberStrings)); + + std::vector tokenAlignments; + for (int j=0;j::iterator it = _concordiasMap->find(tmId); + if (it != _concordiasMap->end()) { + TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true); + TokenizedSentence targetTS = it->second->tokenize(targetSentence, true); + + result.push_back(AlignedUnit(sourceTS, targetTS, alignments)); + } + } return result; } - - +std::string IndexController::_trim(std::string & str) { + size_t first = str.find_first_not_of(' '); + size_t last = str.find_last_not_of(' '); + return (first == std::string::npos) ? std::string() : str.substr(first, (last-first+1)); +} diff --git a/concordia-server/index_controller.hpp b/concordia-server/index_controller.hpp index 
d760466..9994043 100644 --- a/concordia-server/index_controller.hpp +++ b/concordia-server/index_controller.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "unit_dao.hpp" @@ -43,8 +44,11 @@ public: private: std::vector _getAlignedUnits(const std::vector & sourceSentences, - const std::vector & targetSentences); + const std::vector & targetSentences, + const int tmId); + std::string _trim(std::string & str); + boost::shared_ptr > _concordiasMap; UnitDAO _unitDAO; diff --git a/concordia-server/json_generator.cpp b/concordia-server/json_generator.cpp index 0492c5d..b2ca255 100644 --- a/concordia-server/json_generator.cpp +++ b/concordia-server/json_generator.cpp @@ -1,5 +1,6 @@ #include "json_generator.hpp" +#include JsonGenerator::JsonGenerator() { } @@ -34,7 +35,19 @@ void JsonGenerator::writeSearchResult(rapidjson::Writer jsonWriter.String("sourceSegment"); jsonWriter.String(result.getSourceSegment().c_str()); jsonWriter.String("targetSegment"); - jsonWriter.String(result.getTargetSegment().c_str()); + jsonWriter.String(result.getTargetSegment().c_str()); + jsonWriter.String("targetFragments"); + jsonWriter.StartArray(); + + for (std::vector >::const_iterator it = result.getTargetFragments().begin(); + it != result.getTargetFragments().end(); it++) { + jsonWriter.StartArray(); + jsonWriter.Int(it->first); + jsonWriter.Int(it->second); + jsonWriter.EndArray(); + } + jsonWriter.EndArray(); + jsonWriter.EndObject(); } diff --git a/concordia-server/searcher_controller.cpp b/concordia-server/searcher_controller.cpp index bf5b0a3..a9ac8ba 100644 --- a/concordia-server/searcher_controller.cpp +++ b/concordia-server/searcher_controller.cpp @@ -19,7 +19,7 @@ void SearcherController::simpleSearch(rapidjson::Writer const int tmId) { boost::ptr_map::iterator it = _concordiasMap->find(tmId); if (it != _concordiasMap->end()) { - std::vector results = _unitDAO.getSearchResults((*_concordiasMap)[tmId].simpleSearch(pattern)); + std::vector results = 
_unitDAO.getSearchResults(it->second->simpleSearch(pattern)); jsonWriter.StartObject(); jsonWriter.String("status"); @@ -42,7 +42,7 @@ void SearcherController::concordiaSearch(rapidjson::Writer::iterator it = _concordiasMap->find(tmId); if (it != _concordiasMap->end()) { - CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult((*_concordiasMap)[tmId].concordiaSearch(pattern)); + CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern)); jsonWriter.StartObject(); jsonWriter.String("status"); diff --git a/concordia-server/simple_search_result.cpp b/concordia-server/simple_search_result.cpp index dc3ff64..37281cd 100644 --- a/concordia-server/simple_search_result.cpp +++ b/concordia-server/simple_search_result.cpp @@ -20,3 +20,8 @@ SimpleSearchResult::SimpleSearchResult( SimpleSearchResult::~SimpleSearchResult() { } +void SimpleSearchResult::addMatchedTargetFragment(const std::pair & targetFragment) { + _targetFragments.push_back(targetFragment); +} + + diff --git a/concordia-server/simple_search_result.hpp b/concordia-server/simple_search_result.hpp index 3499485..c060c26 100644 --- a/concordia-server/simple_search_result.hpp +++ b/concordia-server/simple_search_result.hpp @@ -2,6 +2,7 @@ #define SIMPLE_SEARCH_RESULT_HDR #include +#include class SimpleSearchResult { public: @@ -47,6 +48,12 @@ public: return _targetSegment; } + const std::vector > & getTargetFragments() const { + return _targetFragments; + } + + void addMatchedTargetFragment(const std::pair & targetFragment); + private: int _id; @@ -61,6 +68,8 @@ private: std::string _sourceSegment; std::string _targetSegment; + + std::vector > _targetFragments; }; #endif diff --git a/concordia-server/unit_dao.cpp b/concordia-server/unit_dao.cpp index c070812..2c19741 100644 --- a/concordia-server/unit_dao.cpp +++ b/concordia-server/unit_dao.cpp @@ -1,5 +1,7 @@ #include "unit_dao.hpp" +#include + #include "query_param.hpp" #include "string_param.hpp" #include 
"int_param.hpp" @@ -48,7 +50,17 @@ std::vector UnitDAO::addAlignedUnits( const std::vector & alignedUnits, const int tmId) { //TODO + + + DBconnection connection; std::vector newIds; + connection.startTransaction(); + + BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) { + newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId)); + } + + connection.endTransaction(); return newIds; } @@ -84,23 +96,66 @@ void UnitDAO::_getResultsFromFragments( matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd(); } + + std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;"; std::vector params; params.push_back(new IntParam(2*fragment.getExampleOffset()+1)); params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength()))); params.push_back(new IntParam(fragment.getExampleId())); PGresult * result = connection.execute(query, params); - results.push_back(SimpleSearchResult(connection.getIntValue(result,0,0), // example id - matchedPatternStart, - matchedPatternEnd, - connection.getIntValue(result,0,3), // matched example start - connection.getIntValue(result,0,4), // matched example end - connection.getStringValue(result,0,1), // source segment - connection.getStringValue(result,0,2))); // target segment + SimpleSearchResult ssResult(connection.getIntValue(result,0,0), // example id + matchedPatternStart, + matchedPatternEnd, + connection.getIntValue(result,0,3), // matched example start + connection.getIntValue(result,0,4), // matched example end + connection.getStringValue(result,0,1), // source segment + connection.getStringValue(result,0,2)); // target segment connection.clearResult(result); BOOST_FOREACH (QueryParam * param, params) { delete param; } + + //TODO now add all target fragments matched with this fragment + std::string targetQuery = "SELECT target_token_pos, 
target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos"; + std::vector targetParams; + targetParams.push_back(new IntParam(fragment.getExampleId())); + targetParams.push_back(new IntParam(fragment.getExampleOffset())); + targetParams.push_back(new IntParam(fragment.getExampleOffset() + fragment.getMatchedLength() - 1)); + PGresult * targetResult = connection.execute(targetQuery, targetParams); + + int prevPos = -2; + int currStart = -1; + int currEnd = -1; + + for (int i=0;i= 0) { + ssResult.addMatchedTargetFragment(std::pair(currStart,currEnd)); + } + currStart = targetStart; + } + + currEnd = targetEnd; + prevPos = targetPos; + } + + // check if there are remaining fragments + if (currStart >= 0) { + ssResult.addMatchedTargetFragment(std::pair(currStart,currEnd)); + } + + connection.clearResult(targetResult); + BOOST_FOREACH (QueryParam * param, targetParams) { + delete param; + } + + results.push_back(ssResult); } connection.endTransaction(); } @@ -138,5 +193,45 @@ int UnitDAO::_addSingleSentence( } +int UnitDAO::_addAlignedUnit( + DBconnection & connection, + const AlignedUnit & alignedUnit, + const int tmId) { + + std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id"; + std::vector params; + params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence())); + params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence())); + params.push_back(new IntParam(tmId)); + params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence()))); + params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence()))); + + PGresult * result = connection.execute(query, params); + int newId = 
connection.getIntValue(result, 0, 0); + connection.clearResult(result); + BOOST_FOREACH (QueryParam * param, params) { + delete param; + } + + // add alignments + for(int i=0;i params; + params.push_back(new IntParam(newId)); + params.push_back(new IntParam(i)); + params.push_back(new IntParam(alignedUnit.getAlignments()[i][j])); + + PGresult * result = connection.execute(query, params); + connection.clearResult(result); + BOOST_FOREACH (QueryParam * param, params) { + delete param; + } + } + } + + + return newId; +} diff --git a/concordia-server/unit_dao.hpp b/concordia-server/unit_dao.hpp index 273d6f2..4cd9fff 100644 --- a/concordia-server/unit_dao.hpp +++ b/concordia-server/unit_dao.hpp @@ -56,6 +56,10 @@ private: const std::string & targetSentence, const int tmId); + int _addAlignedUnit( + DBconnection & connection, + const AlignedUnit & alignedUnit, + const int tmId); }; #endif diff --git a/db/concordia_server.sql b/db/concordia_server.sql index e4b7b49..4982244 100644 --- a/db/concordia_server.sql +++ b/db/concordia_server.sql @@ -20,7 +20,14 @@ CREATE TABLE unit ( source_segment text, target_segment text, source_tokens integer[], - target_tokens integer[], - alignments integer[][] + target_tokens integer[] +); + +DROP TABLE IF EXISTS alignment; +CREATE TABLE alignment ( + id SERIAL PRIMARY KEY, + unit_id integer, + source_token_pos integer, + target_token_pos integer ); diff --git a/tests/addAlignedFile.py b/tests/addAlignedFile.py new file mode 100755 index 0000000..8549f4b --- /dev/null +++ b/tests/addAlignedFile.py @@ -0,0 +1,99 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import unittest +import json +import urllib2 +import sys +import host +import time + +BUFFER_SIZE = 500 + +address = 'http://'+host.concordia_host +if len(host.concordia_port) > 0: + address += ':'+host.concordia_port + + +def file_len(fname): + with open(fname) as f: + for i, l in enumerate(f): + pass + return i + 1 + +def add_data(data): + req = urllib2.Request(address) + 
req.add_header('Content-Type', 'application/json') + response = json.loads(urllib2.urlopen(req, json.dumps(data)).read()) + #print response + +sourceFile = sys.argv[1] +sourceLangId = int(sys.argv[2]) +targetLangId = int(sys.argv[3]) +name = sys.argv[4] + +totalLines = file_len(sourceFile) + +data = { + 'operation': 'addTm', + 'sourceLangId':sourceLangId, + 'targetLangId':targetLangId, + 'name':name +} + +req = urllib2.Request(address) +req.add_header('Content-Type', 'application/json') +response = json.loads(urllib2.urlopen(req, json.dumps(data)).read()) +print response +tmId = int(response['newTmId']) +print "Added new tm: %d" % tmId + +data = { + 'operation': 'addAlignedSentences', + 'tmId':tmId +} + +sentences = [] +currSentence = [] +start = time.time() +with open(sourceFile) as sourceLines: + lineNumber = 0 + for line in sourceLines: + line = line.strip() + if lineNumber % 3 == 1: + currSentence.append(line) + elif lineNumber % 3 == 2: + currSentence.append(line) + currSentence.reverse() + sentences.append(currSentence) + currSentence = [] + if len(sentences) >= BUFFER_SIZE: + data['sentences'] = sentences + add_data(data); sentences = [] + mark = time.time() + print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/3/(mark-start)) + lineNumber += 1 + + +if len(sentences) > 0: + data['sentences'] = sentences + add_data(data) + +end = time.time() +print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3/(end-start)) + +print "Generating index..." +start = time.time() +data = { + 'operation': 'refreshIndex', + 'tmId' : tmId +} +req = urllib2.Request(address) +req.add_header('Content-Type', 'application/json') +urllib2.urlopen(req, json.dumps(data)).read() + +end = time.time() +print "Index regeneration complete. 
The operation took %.4f s" % (end - start) + + + diff --git a/upstart/cmake_stubs/pgbouncer.conf.in b/upstart/cmake_stubs/pgbouncer.conf.in index db80af8..8ecdd27 100644 --- a/upstart/cmake_stubs/pgbouncer.conf.in +++ b/upstart/cmake_stubs/pgbouncer.conf.in @@ -2,9 +2,7 @@ description "pgbouncer" -start on (net-device-up - and local-filesystems - and runlevel [2345]) +start on started postgresql stop on runlevel [016]