storing and retrieving alignments

This commit is contained in:
rjawor 2015-12-29 22:13:21 +01:00
parent 67f485cbc2
commit 05d99168ed
17 changed files with 395 additions and 63 deletions

View File

@ -99,6 +99,20 @@ if(WITH_PCRE)
set(HAVE_PCRE 1)
endif(WITH_PCRE)
# ----------------------------------------------------
# ICU (I feeeeel youuuuu...)
# ----------------------------------------------------
find_library(ICU_LIB NAMES icui18n)
find_path(ICU_INCLUDE unicode)
if(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
message(STATUS "Found ICU: ${ICU_LIB}")
include_directories(${ICU_INCLUDE})
get_filename_component(ICU_LIB_DIR ${ICU_LIB} PATH) # ICU_LIB is the library file itself; -L needs its directory
link_directories(${ICU_LIB_DIR})
else()
message(FATAL_ERROR "ICU not found")
endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
# ----------------------------------------------------
# Boost
# ----------------------------------------------------

View File

@ -1,5 +1,5 @@
#!/bin/sh
rm -rf index/*
sudo rm -rf index/*
cd db
./recreateDb.sh
./recreateDb.sh

View File

@ -16,5 +16,5 @@ add_executable(concordia_server_process
tm_dao.cpp
aligned_unit.cpp
)
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case)
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
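The new icuuc dependency backs the UnicodeString conversions introduced in IndexController below. A minimal, self-contained sketch of the UTF-8 round trip; note that the UnicodeString(const char*) constructor used in the controller converts via the platform's default codepage, whereas fromUTF8 is the explicit UTF-8 route:

#include <unicode/unistr.h>
#include <iostream>
#include <string>

int main() {
    UnicodeString u = UnicodeString::fromUTF8("za\xC5\xBC\xC3\xB3\xC5\x82\xC4\x87"); // "zażółć" as raw UTF-8 bytes
    std::string back;
    u.toUTF8String(back); // UTF-16 -> UTF-8
    std::cout << back << std::endl;
    return 0;
}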

View File

@ -1,9 +1,15 @@
#include "aligned_unit.hpp"
AlignedUnit::AlignedUnit() {
AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
std::vector<std::vector<int> > alignments):
_sourceSentence(sourceSentence),
_targetSentence(targetSentence),
_alignments(alignments) {
}
AlignedUnit::~AlignedUnit() {
}

View File

@ -10,19 +10,29 @@ class AlignedUnit {
public:
/*! Constructor.
*/
AlignedUnit();
AlignedUnit(const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
std::vector<std::vector<int> > alignments);
/*! Destructor.
*/
virtual ~AlignedUnit();
boost::shared_ptr<TokenizedSentence> getSourceSentence() {
TokenizedSentence getSourceSentence() const {
return _sourceSentence;
}
TokenizedSentence getTargetSentence() const {
return _targetSentence;
}
std::vector<std::vector<int> > getAlignments() const {
return _alignments;
}
private:
boost::shared_ptr<TokenizedSentence> _sourceSentence;
TokenizedSentence _sourceSentence;
boost::shared_ptr<TokenizedSentence> _targetSentence;
TokenizedSentence _targetSentence;
std::vector<std::vector<int> > _alignments;
};
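AlignedUnit now stores both sentences by value instead of behind boost::shared_ptr, with const getters returning copies. A minimal construction sketch, where sourceTS and targetTS stand for TokenizedSentence values obtained from Concordia's tokenize() (as in IndexController below) and the alignment numbers are made up:

// source token 0 -> target token 0; source token 1 -> target tokens 1 and 2
std::vector<std::vector<int> > alignments(2);
alignments[0].push_back(0);
alignments[1].push_back(1);
alignments[1].push_back(2);
AlignedUnit unit(sourceTS, targetTS, alignments);
std::vector<std::vector<int> > a = unit.getAlignments(); // returns a copy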

View File

@ -58,37 +58,43 @@ int main(int argc, char** argv) {
std::streambuf * cout_streambuf = std::cout.rdbuf();
std::streambuf * cerr_streambuf = std::cerr.rdbuf();
ConcordiaServer concordiaServer(CONFIG_FILE_PATH);
Logger::log("Concordia server initiated successfully, waiting for requests");
try {
ConcordiaServer concordiaServer(CONFIG_FILE_PATH);
Logger::log("Concordia server initiated successfully, waiting for requests");
FCGX_Request request;
FCGX_Request request;
FCGX_Init();
FCGX_InitRequest(&request, 0, 0);
FCGX_Init();
FCGX_InitRequest(&request, 0, 0);
while (FCGX_Accept_r(&request) == 0) {
fcgi_streambuf cin_fcgi_streambuf(request.in);
fcgi_streambuf cout_fcgi_streambuf(request.out);
fcgi_streambuf cerr_fcgi_streambuf(request.err);
while (FCGX_Accept_r(&request) == 0) {
fcgi_streambuf cin_fcgi_streambuf(request.in);
fcgi_streambuf cout_fcgi_streambuf(request.out);
fcgi_streambuf cerr_fcgi_streambuf(request.err);
std::cin.rdbuf(&cin_fcgi_streambuf);
std::cout.rdbuf(&cout_fcgi_streambuf);
std::cerr.rdbuf(&cerr_fcgi_streambuf);
std::cin.rdbuf(&cin_fcgi_streambuf);
std::cout.rdbuf(&cout_fcgi_streambuf);
std::cerr.rdbuf(&cerr_fcgi_streambuf);
std::string content = get_request_content(request);
std::string content = get_request_content(request);
std::string requestString(content);
std::cout << concordiaServer.handleRequest(requestString);
// Note: the fcgi_streambuf destructor will auto flush
}
// restore stdio streambufs
std::cin.rdbuf(cin_streambuf);
std::cout.rdbuf(cout_streambuf);
std::cerr.rdbuf(cerr_streambuf);
Logger::log("Gracefully shutting down Concordia server process");
std::string requestString(content);
std::cout << concordiaServer.handleRequest(requestString);
// Note: the fcgi_streambuf destructor will auto flush
} catch (ConcordiaException & e) {
std::stringstream errorstream;
errorstream << "FATAL CONCORDIA ERROR: " << e.what()<< " - shutting down";
Logger::log(errorstream.str());
}
// restore stdio streambufs
std::cin.rdbuf(cin_streambuf);
std::cout.rdbuf(cout_streambuf);
std::cerr.rdbuf(cerr_streambuf);
Logger::log("Shutting down Concordia server process");
return 0;
}

View File

@ -2,6 +2,15 @@
#include <concordia/common/config.hpp>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>
#include <unicode/unistr.h>
#include "json_generator.hpp"
#include "logger.hpp"
@ -23,10 +32,10 @@ void IndexController::addSentence(
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
TokenizedSentence tokenizedSentence = (*_concordiasMap)[tmId].tokenize(sourceSentence);
TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
(*_concordiasMap)[tmId].addTokenizedExample(tokenizedSentence, sentenceId);
(*_concordiasMap)[tmId].refreshSAfromRAM();
it->second->addTokenizedExample(tokenizedSentence, sentenceId);
it->second->refreshSAfromRAM();
jsonWriter.StartObject();
jsonWriter.String("status");
@ -58,9 +67,9 @@ void IndexController::addSentences(
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<TokenizedSentence> tokenizedSentences = (*_concordiasMap)[tmId].tokenizeAll(sourceSentences);
std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
(*_concordiasMap)[tmId].addAllTokenizedExamples(tokenizedSentences, sentenceIds);
it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
jsonWriter.StartObject();
jsonWriter.String("status");
@ -84,13 +93,13 @@ void IndexController::addAlignedSentences(
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences);
std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
int index = 0;
for(std::vector<AlignedUnit>::iterator it = alignedUnits.begin(); it != alignedUnits.end(); ++it) {
(*_concordiasMap)[tmId].addTokenizedExample(*(it->getSourceSentence()), sentenceIds.at(index));
for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
index++;
}
}
jsonWriter.StartObject();
jsonWriter.String("status");
@ -111,7 +120,7 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
(*_concordiasMap)[tmId].refreshSAfromRAM();
it->second->refreshSAfromRAM();
jsonWriter.StartObject();
jsonWriter.String("status");
@ -129,13 +138,66 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
}
std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences) {
//TODO
const std::vector<std::string> & targetSentences,
const int tmId) {
std::vector<AlignedUnit> result;
for (int i = 0; i<sourceSentences.size(); i++) {
std::string sourceSentence = sourceSentences[i];
std::string targetSentence = targetSentences[i];
std::string rawSourceSentence;
std::vector<TokenAnnotation> sourceTokens;
std::vector<std::vector<int> > alignments;
UnicodeString s(sourceSentence.c_str());
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(
s,
boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase)
)
);
boost::u32regex_iterator<const UChar*> end;
for (; begin != end; ++begin) {
UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
std::string token;
tokenUTF8.toUTF8String(token);
if (token != "NULL") {
std::string numbers((*begin)[2].first, (*begin)[2].second);
std::istringstream iss(numbers);
std::vector<std::string> numberStrings;
std::copy(std::istream_iterator<std::string>(iss),
std::istream_iterator<std::string>(),
std::back_inserter(numberStrings));
std::vector<int> tokenAlignments;
for (int j=0;j<numberStrings.size();j++) {
int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
tokenAlignments.push_back(n);
}
alignments.push_back(tokenAlignments);
rawSourceSentence += token + " ";
}
}
rawSourceSentence = _trim(rawSourceSentence);
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
}
}
return result;
}
std::string IndexController::_trim(std::string & str) {
size_t first = str.find_first_not_of(' ');
if (first == std::string::npos) {
return ""; // empty or all-space input: substr(npos, ...) would throw
}
size_t last = str.find_last_not_of(' ');
return str.substr(first, (last-first+1));
}
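The regular expression consumes GIZA++-style alignment lines, where each source token is followed by the 1-based positions of the target tokens it aligns to, and the NULL pseudo-token collects unaligned target words. A self-contained sketch of the same extraction using std::regex (ASCII-only, for illustration; the commit uses boost::u32regex precisely to stay Unicode-safe):

#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

int main() {
    std::string line = "NULL ({ }) das ({ 1 }) Haus ({ 2 3 })"; // assumed input format
    std::regex re("(\\S+) \\(\\{(( \\d+)*) \\}\\)");
    std::vector<std::vector<int> > alignments;
    for (std::sregex_iterator it(line.begin(), line.end(), re), end; it != end; ++it) {
        std::string token = (*it)[1];
        if (token == "NULL") continue; // skip the unaligned-words bucket
        std::istringstream iss((*it)[2]);
        std::vector<int> tokenAlignments;
        int n;
        while (iss >> n) tokenAlignments.push_back(n - 1); // 1-based -> 0-based
        alignments.push_back(tokenAlignments);
    }
    std::cout << alignments.size() << " aligned source tokens" << std::endl; // prints 2: {0} and {1,2}
    return 0;
}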

View File

@ -6,6 +6,7 @@
#include <concordia/concordia.hpp>
#include <concordia/concordia_exception.hpp>
#include <boost/ptr_container/ptr_map.hpp>
#include <concordia/token_annotation.hpp>
#include "unit_dao.hpp"
@ -43,8 +44,11 @@ public:
private:
std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences);
const std::vector<std::string> & targetSentences,
const int tmId);
std::string _trim(std::string & str);
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
UnitDAO _unitDAO;

View File

@ -1,5 +1,6 @@
#include "json_generator.hpp"
#include <boost/foreach.hpp>
JsonGenerator::JsonGenerator() {
}
@ -34,7 +35,19 @@ void JsonGenerator::writeSearchResult(rapidjson::Writer<rapidjson::StringBuffer>
jsonWriter.String("sourceSegment");
jsonWriter.String(result.getSourceSegment().c_str());
jsonWriter.String("targetSegment");
jsonWriter.String(result.getTargetSegment().c_str());
jsonWriter.String(result.getTargetSegment().c_str());
jsonWriter.String("targetFragments");
jsonWriter.StartArray();
for (std::vector<std::pair<int,int> >::const_iterator it = result.getTargetFragments().begin();
it != result.getTargetFragments().end(); it++) {
jsonWriter.StartArray();
jsonWriter.Int(it->first);
jsonWriter.Int(it->second);
jsonWriter.EndArray();
}
jsonWriter.EndArray();
jsonWriter.EndObject();
}
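writeSearchResult now appends a targetFragments array to every search result: one [start, end] pair per contiguous matched stretch of the target segment, using the offsets stored in target_tokens. For a hypothetical hit with two detached fragments (offsets made up), the added portion of the JSON would be:

"targetFragments": [[10, 19], [25, 31]]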

View File

@ -19,7 +19,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
const int tmId) {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults((*_concordiasMap)[tmId].simpleSearch(pattern));
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
jsonWriter.StartObject();
jsonWriter.String("status");
@ -42,7 +42,7 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult((*_concordiasMap)[tmId].concordiaSearch(pattern));
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
jsonWriter.StartObject();
jsonWriter.String("status");

View File

@ -20,3 +20,8 @@ SimpleSearchResult::SimpleSearchResult(
SimpleSearchResult::~SimpleSearchResult() {
}
void SimpleSearchResult::addMatchedTargetFragment(const std::pair<int,int> & targetFragment) {
_targetFragments.push_back(targetFragment);
}

View File

@ -2,6 +2,7 @@
#define SIMPLE_SEARCH_RESULT_HDR
#include <string>
#include <vector>
class SimpleSearchResult {
public:
@ -47,6 +48,12 @@ public:
return _targetSegment;
}
const std::vector<std::pair<int,int> > & getTargetFragments() const {
return _targetFragments;
}
void addMatchedTargetFragment(const std::pair<int,int> & targetFragment);
private:
int _id;
@ -61,6 +68,8 @@ private:
std::string _sourceSegment;
std::string _targetSegment;
std::vector<std::pair<int,int> > _targetFragments;
};
#endif

View File

@ -1,5 +1,7 @@
#include "unit_dao.hpp"
#include<sstream>
#include "query_param.hpp"
#include "string_param.hpp"
#include "int_param.hpp"
@ -48,7 +50,17 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
const std::vector<AlignedUnit> & alignedUnits,
const int tmId) {
//TODO
DBconnection connection;
std::vector<SUFFIX_MARKER_TYPE> newIds;
connection.startTransaction();
BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
}
connection.endTransaction();
return newIds;
}
@ -84,23 +96,66 @@ void UnitDAO::_getResultsFromFragments(
matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
}
std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
std::vector<QueryParam*> params;
params.push_back(new IntParam(2*fragment.getExampleOffset()+1));
params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength())));
params.push_back(new IntParam(fragment.getExampleId()));
PGresult * result = connection.execute(query, params);
results.push_back(SimpleSearchResult(connection.getIntValue(result,0,0), // example id
matchedPatternStart,
matchedPatternEnd,
connection.getIntValue(result,0,3), // matched example start
connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment
connection.getStringValue(result,0,2))); // target segment
SimpleSearchResult ssResult(connection.getIntValue(result,0,0), // example id
matchedPatternStart,
matchedPatternEnd,
connection.getIntValue(result,0,3), // matched example start
connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment
connection.getStringValue(result,0,2)); // target segment
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
// now add all target fragments matched with this fragment
std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
std::vector<QueryParam*> targetParams;
targetParams.push_back(new IntParam(fragment.getExampleId()));
targetParams.push_back(new IntParam(fragment.getExampleOffset()));
targetParams.push_back(new IntParam(fragment.getExampleOffset() + fragment.getMatchedLength() - 1));
PGresult * targetResult = connection.execute(targetQuery, targetParams);
int prevPos = -2;
int currStart = -1;
int currEnd = -1;
for (int i=0;i<connection.getRowCount(targetResult);i++) {
int targetPos = connection.getIntValue(targetResult, i, 0);
int targetStart = connection.getIntValue(targetResult, i, 1);
int targetEnd = connection.getIntValue(targetResult, i, 2);
if (prevPos < targetPos - 1) { // beginning of detached fragment
// check if there is a fragment to end
if (currStart >= 0) {
ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
currStart = targetStart;
}
currEnd = targetEnd;
prevPos = targetPos;
}
// check if there are remaining fragments
if (currStart >= 0) {
ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
connection.clearResult(targetResult);
BOOST_FOREACH (QueryParam * param, targetParams) {
delete param;
}
results.push_back(ssResult);
}
connection.endTransaction();
}
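The loop over targetResult merges runs of consecutive target token positions into maximal contiguous fragments: a gap in target_token_pos closes the current fragment and opens a new one. A standalone sketch of that merge, with made-up positions and character offsets:

#include <iostream>
#include <utility>
#include <vector>

int main() {
    // (target_token_pos, start offset, end offset), ordered by position as the query returns them
    int rows[][3] = { {2, 10, 13}, {3, 14, 19}, {5, 25, 31} };
    std::vector<std::pair<int,int> > fragments;
    int prevPos = -2, currStart = -1, currEnd = -1;
    for (int i = 0; i < 3; i++) {
        int pos = rows[i][0];
        if (prevPos < pos - 1) { // gap: close the previous fragment, start a new one
            if (currStart >= 0)
                fragments.push_back(std::make_pair(currStart, currEnd));
            currStart = rows[i][1];
        }
        currEnd = rows[i][2];
        prevPos = pos;
    }
    if (currStart >= 0) // flush the last open fragment
        fragments.push_back(std::make_pair(currStart, currEnd));
    std::cout << fragments.size() << " fragments" << std::endl; // prints 2: (10,19) and (25,31)
    return 0;
}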
@ -138,5 +193,45 @@ int UnitDAO::_addSingleSentence(
}
int UnitDAO::_addAlignedUnit(
DBconnection & connection,
const AlignedUnit & alignedUnit,
const int tmId) {
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
std::vector<QueryParam*> params;
params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
params.push_back(new IntParam(tmId));
params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
PGresult * result = connection.execute(query, params);
int newId = connection.getIntValue(result, 0, 0);
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
// add alignments
for(int i=0;i<alignedUnit.getAlignments().size();i++) {
for (int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
std::string query = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
std::vector<QueryParam*> params;
params.push_back(new IntParam(newId));
params.push_back(new IntParam(i));
params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
PGresult * result = connection.execute(query, params);
connection.clearResult(result);
BOOST_FOREACH (QueryParam * param, params) {
delete param;
}
}
}
return newId;
}
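With the two-dimensional alignments column gone from unit (see the schema change below), every source-to-target link becomes one row in the new alignment table. For a unit inserted with alignments {{0}, {1, 2}} and a returned id of 42 (both values made up), the inner loops write three rows:

unit_id | source_token_pos | target_token_pos
     42 |                0 |                0
     42 |                1 |                1
     42 |                1 |                2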

View File

@ -56,6 +56,10 @@ private:
const std::string & targetSentence,
const int tmId);
int _addAlignedUnit(
DBconnection & connection,
const AlignedUnit & alignedUnit,
const int tmId);
};
#endif

View File

@ -20,7 +20,14 @@ CREATE TABLE unit (
source_segment text,
target_segment text,
source_tokens integer[],
target_tokens integer[],
alignments integer[][]
target_tokens integer[]
);
DROP TABLE IF EXISTS alignment;
CREATE TABLE alignment (
id SERIAL PRIMARY KEY,
unit_id integer,
source_token_pos integer,
target_token_pos integer
);

tests/addAlignedFile.py Executable file
View File

@ -0,0 +1,99 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import host
import time
BUFFER_SIZE = 500
address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
address += ':'+host.concordia_port
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
def add_data(data):
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
#print response
sourceFile = sys.argv[1]
sourceLangId = int(sys.argv[2])
targetLangId = int(sys.argv[3])
name = sys.argv[4]
totalLines = file_len(sourceFile)
data = {
'operation': 'addTm',
'sourceLangId':sourceLangId,
'targetLangId':targetLangId,
'name':name
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
print response
tmId = int(response['newTmId'])
print "Added new tm: %d" % tmId
data = {
'operation': 'addAlignedSentences',
'tmId':tmId
}
sentences = []
currSentence = []
start = time.time()
with open(sourceFile) as sourceLines:
lineNumber = 0
for line in sourceLines:
line = line.strip()
if lineNumber % 3 == 1:
currSentence.append(line)
elif lineNumber % 3 == 2:
currSentence.append(line)
currSentence.reverse()
sentences.append(currSentence)
currSentence = []
if len(sentences) >= BUFFER_SIZE:
data['sentences'] = sentences
add_data(data)
mark = time.time()
print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/3*(mark-start))
lineNumber += 1
if len(sentences) > 0:
data['sentences'] = sentences
add_data(data)
end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3*(end-start))
print "Generating index..."
start = time.time()
data = {
'operation': 'refreshIndex',
'tmId' : tmId
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()
print "Index regeneration complete. The operation took %.4f s" % (end - start)

View File

@ -2,9 +2,7 @@
description "pgbouncer"
start on (net-device-up
and local-filesystems
and runlevel [2345])
start on started postgresql
stop on runlevel [016]