storing and retrieving alignments
commit 05d99168ed (parent 67f485cbc2)
@@ -99,6 +99,20 @@ if(WITH_PCRE)
     set(HAVE_PCRE 1)
 endif(WITH_PCRE)
 
+# ----------------------------------------------------
+# ICU (I feeeeel youuuuu...)
+# ----------------------------------------------------
+find_library(ICU_LIB NAMES icui18n)
+find_path(ICU_INCLUDE unicode)
+
+if(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
+    message(STATUS "Found ICU: ${ICU_LIB}")
+    include_directories(${ICU_INCLUDE})
+    link_directories(${ICU_LIB})
+else()
+    message(FATAL_ERROR "ICU not found")
+endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
+
 # ----------------------------------------------------
 # Boost
 # ----------------------------------------------------
@@ -1,5 +1,5 @@
 #!/bin/sh
 
-rm -rf index/*
+sudo rm -rf index/*
 cd db
 ./recreateDb.sh
@@ -16,5 +16,5 @@ add_executable(concordia_server_process
     tm_dao.cpp
     aligned_unit.cpp
 )
-target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case)
+target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)
@@ -1,9 +1,15 @@
 #include "aligned_unit.hpp"
 
 
-AlignedUnit::AlignedUnit() {
+AlignedUnit::AlignedUnit(const TokenizedSentence & sourceSentence,
+                         const TokenizedSentence & targetSentence,
+                         std::vector<std::vector<int> > alignments):
+                         _sourceSentence(sourceSentence),
+                         _targetSentence(targetSentence),
+                         _alignments(alignments) {
 }
 
 
 AlignedUnit::~AlignedUnit() {
 }
@@ -10,19 +10,29 @@ class AlignedUnit {
 public:
     /*! Constructor.
     */
-    AlignedUnit();
+    AlignedUnit(const TokenizedSentence & sourceSentence,
+                const TokenizedSentence & targetSentence,
+                std::vector<std::vector<int> > alignments);
     /*! Destructor.
     */
     virtual ~AlignedUnit();
 
-    boost::shared_ptr<TokenizedSentence> getSourceSentence() {
+    TokenizedSentence getSourceSentence() const {
         return _sourceSentence;
     }
 
-private:
-    boost::shared_ptr<TokenizedSentence> _sourceSentence;
+    TokenizedSentence getTargetSentence() const {
+        return _targetSentence;
+    }
 
-    boost::shared_ptr<TokenizedSentence> _targetSentence;
+    std::vector<std::vector<int> > getAlignments() const {
+        return _alignments;
+    }
 
+private:
+    TokenizedSentence _sourceSentence;
+
+    TokenizedSentence _targetSentence;
+
     std::vector<std::vector<int> > _alignments;
 };
@@ -58,37 +58,43 @@ int main(int argc, char** argv) {
     std::streambuf * cout_streambuf = std::cout.rdbuf();
     std::streambuf * cerr_streambuf = std::cerr.rdbuf();
 
-    ConcordiaServer concordiaServer(CONFIG_FILE_PATH);
-    Logger::log("Concordia server initiated successfully, waiting for requests");
+    try {
+        ConcordiaServer concordiaServer(CONFIG_FILE_PATH);
+        Logger::log("Concordia server initiated successfully, waiting for requests");
 
-    FCGX_Request request;
+        FCGX_Request request;
 
-    FCGX_Init();
-    FCGX_InitRequest(&request, 0, 0);
+        FCGX_Init();
+        FCGX_InitRequest(&request, 0, 0);
 
-    while (FCGX_Accept_r(&request) == 0) {
-        fcgi_streambuf cin_fcgi_streambuf(request.in);
-        fcgi_streambuf cout_fcgi_streambuf(request.out);
-        fcgi_streambuf cerr_fcgi_streambuf(request.err);
+        while (FCGX_Accept_r(&request) == 0) {
+            fcgi_streambuf cin_fcgi_streambuf(request.in);
+            fcgi_streambuf cout_fcgi_streambuf(request.out);
+            fcgi_streambuf cerr_fcgi_streambuf(request.err);
 
-        std::cin.rdbuf(&cin_fcgi_streambuf);
-        std::cout.rdbuf(&cout_fcgi_streambuf);
-        std::cerr.rdbuf(&cerr_fcgi_streambuf);
+            std::cin.rdbuf(&cin_fcgi_streambuf);
+            std::cout.rdbuf(&cout_fcgi_streambuf);
+            std::cerr.rdbuf(&cerr_fcgi_streambuf);
 
-        std::string content = get_request_content(request);
+            std::string content = get_request_content(request);
 
-        std::string requestString(content);
-        std::cout << concordiaServer.handleRequest(requestString);
+            std::string requestString(content);
+            std::cout << concordiaServer.handleRequest(requestString);
 
-        // Note: the fcgi_streambuf destructor will auto flush
-    }
+            // Note: the fcgi_streambuf destructor will auto flush
+        }
 
-    // restore stdio streambufs
-    std::cin.rdbuf(cin_streambuf);
-    std::cout.rdbuf(cout_streambuf);
-    std::cerr.rdbuf(cerr_streambuf);
+        Logger::log("Gracefully shutting down Concordia server process");
+    } catch (ConcordiaException & e) {
+        std::stringstream errorstream;
+        errorstream << "FATAL CONCORDIA ERROR: " << e.what()<< " - shutting down";
+        Logger::log(errorstream.str());
+    }
 
-    Logger::log("Shutting down Concordia server process");
+    // restore stdio streambufs
+    std::cin.rdbuf(cin_streambuf);
+    std::cout.rdbuf(cout_streambuf);
+    std::cerr.rdbuf(cerr_streambuf);
 
     return 0;
 }
@@ -2,6 +2,15 @@
 #include <concordia/common/config.hpp>
 
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+#include <iterator>
+
+#include <boost/regex.hpp>
+#include <boost/regex/icu.hpp>
+#include <unicode/unistr.h>
+
 #include "json_generator.hpp"
 #include "logger.hpp"
 
@@ -23,10 +32,10 @@ void IndexController::addSentence(
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            TokenizedSentence tokenizedSentence = (*_concordiasMap)[tmId].tokenize(sourceSentence);
+            TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
             int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
-            (*_concordiasMap)[tmId].addTokenizedExample(tokenizedSentence, sentenceId);
-            (*_concordiasMap)[tmId].refreshSAfromRAM();
+            it->second->addTokenizedExample(tokenizedSentence, sentenceId);
+            it->second->refreshSAfromRAM();
 
             jsonWriter.StartObject();
             jsonWriter.String("status");
@@ -58,9 +67,9 @@ void IndexController::addSentences(
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            std::vector<TokenizedSentence> tokenizedSentences = (*_concordiasMap)[tmId].tokenizeAll(sourceSentences);
+            std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
             std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
-            (*_concordiasMap)[tmId].addAllTokenizedExamples(tokenizedSentences, sentenceIds);
+            it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
 
             jsonWriter.StartObject();
             jsonWriter.String("status");
@@ -84,11 +93,11 @@ void IndexController::addAlignedSentences(
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences);
+            std::vector<AlignedUnit> alignedUnits = _getAlignedUnits(sourceSentences, targetSentences, tmId);
             std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedUnits(alignedUnits, tmId);
             int index = 0;
-            for(std::vector<AlignedUnit>::iterator it = alignedUnits.begin(); it != alignedUnits.end(); ++it) {
-                (*_concordiasMap)[tmId].addTokenizedExample(*(it->getSourceSentence()), sentenceIds.at(index));
+            for(std::vector<AlignedUnit>::iterator ait = alignedUnits.begin(); ait != alignedUnits.end(); ++ait) {
+                it->second->addTokenizedExample(ait->getSourceSentence(), sentenceIds.at(index));
                 index++;
             }
 
@@ -111,7 +120,7 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
-            (*_concordiasMap)[tmId].refreshSAfromRAM();
+            it->second->refreshSAfromRAM();
 
             jsonWriter.StartObject();
             jsonWriter.String("status");
@@ -129,13 +138,66 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
 }
 
 std::vector<AlignedUnit> IndexController::_getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                                                           const std::vector<std::string> & targetSentences) {
-    //TODO
+                                                           const std::vector<std::string> & targetSentences,
+                                                           const int tmId) {
     std::vector<AlignedUnit> result;
+    for (int i = 0; i<sourceSentences.size(); i++) {
+        std::string sourceSentence = sourceSentences[i];
+        std::string targetSentence = targetSentences[i];
+
+        std::string rawSourceSentence;
+        std::vector<TokenAnnotation> sourceTokens;
+        std::vector<std::vector<int> > alignments;
+
+        UnicodeString s(sourceSentence.c_str());
+        boost::u32regex_iterator<const UChar*> begin(
+            boost::make_u32regex_iterator(
+                s,
+                boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase)
+            )
+        );
+        boost::u32regex_iterator<const UChar*> end;
+
+        for (; begin != end; ++begin) {
+            UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
+            std::string token;
+            tokenUTF8.toUTF8String(token);
+
+            if (token != "NULL") {
+                std::string numbers((*begin)[2].first, (*begin)[2].second);
+                std::istringstream iss(numbers);
+                std::vector<std::string> numberStrings;
+                std::copy(std::istream_iterator<std::string>(iss),
+                          std::istream_iterator<std::string>(),
+                          std::back_inserter(numberStrings));
+
+                std::vector<int> tokenAlignments;
+                for (int j=0;j<numberStrings.size();j++) {
+                    int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
+                    tokenAlignments.push_back(n);
+                }
+                alignments.push_back(tokenAlignments);
+                rawSourceSentence += token + " ";
+            }
+        }
+
+        rawSourceSentence = _trim(rawSourceSentence);
+
+        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
+        if (it != _concordiasMap->end()) {
+            TokenizedSentence sourceTS = it->second->tokenize(rawSourceSentence, true);
+            TokenizedSentence targetTS = it->second->tokenize(targetSentence, true);
+
+            result.push_back(AlignedUnit(sourceTS, targetTS, alignments));
+        }
+    }
     return result;
 }
 
+
+std::string IndexController::_trim(std::string & str) {
+    size_t first = str.find_first_not_of(' ');
+    size_t last = str.find_last_not_of(' ');
+    return str.substr(first, (last-first+1));
+}
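Note on the pattern above: "(\\S+) \\(\\{(( \\d+)*) \\}\\)" matches GIZA++-style alignment notation, in which each source token is followed by the 1-based positions of the target tokens it aligns to. A minimal Python sketch of the same extraction (the input line is illustrative):

import re

line = 'NULL ({ }) i ({ 1 }) like ({ 2 3 }) cats ({ 4 })'  # hypothetical GIZA++-style input

raw_tokens = []
alignments = []
for match in re.finditer(r'(\S+) \(\{(( \d+)*) \}\)', line):
    token = match.group(1)
    if token != 'NULL':
        raw_tokens.append(token)
        # 1-based positions become 0-based, as in _getAlignedUnits
        alignments.append([int(n) - 1 for n in match.group(2).split()])

print ' '.join(raw_tokens)  # i like cats
print alignments            # [[0], [1, 2], [3]]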
@@ -6,6 +6,7 @@
 #include <concordia/concordia.hpp>
 #include <concordia/concordia_exception.hpp>
 #include <boost/ptr_container/ptr_map.hpp>
+#include <concordia/token_annotation.hpp>
 
 
 #include "unit_dao.hpp"
@@ -43,7 +44,10 @@ public:
 
 private:
     std::vector<AlignedUnit> _getAlignedUnits(const std::vector<std::string> & sourceSentences,
-                                              const std::vector<std::string> & targetSentences);
+                                              const std::vector<std::string> & targetSentences,
+                                              const int tmId);
+
+    std::string _trim(std::string & str);
 
     boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
 
@@ -1,5 +1,6 @@
 #include "json_generator.hpp"
 
+#include <boost/foreach.hpp>
 
 JsonGenerator::JsonGenerator() {
 }
@@ -35,6 +36,18 @@ void JsonGenerator::writeSearchResult(rapidjson::Writer<rapidjson::StringBuffer>
     jsonWriter.String(result.getSourceSegment().c_str());
     jsonWriter.String("targetSegment");
     jsonWriter.String(result.getTargetSegment().c_str());
+    jsonWriter.String("targetFragments");
+    jsonWriter.StartArray();
+
+    for (std::vector<std::pair<int,int> >::const_iterator it = result.getTargetFragments().begin();
+         it != result.getTargetFragments().end(); it++) {
+        jsonWriter.StartArray();
+        jsonWriter.Int(it->first);
+        jsonWriter.Int(it->second);
+        jsonWriter.EndArray();
+    }
+    jsonWriter.EndArray();
+
     jsonWriter.EndObject();
 }
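For reference, one search result serialized by the writer above takes this shape (any fields written earlier in the function are omitted; all values here are illustrative):

result = {
    "sourceSegment": "the cat sits on the mat",
    "targetSegment": "kot siedzi na macie",
    "targetFragments": [[0, 10], [14, 19]],  # (start, end) character offsets in the target segment
}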
@@ -19,7 +19,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
         const int tmId) {
     boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
-        std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults((*_concordiasMap)[tmId].simpleSearch(pattern));
+        std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
 
         jsonWriter.StartObject();
         jsonWriter.String("status");
@@ -42,7 +42,7 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
 
     boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
-        CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult((*_concordiasMap)[tmId].concordiaSearch(pattern));
+        CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
 
         jsonWriter.StartObject();
         jsonWriter.String("status");
|
@ -20,3 +20,8 @@ SimpleSearchResult::SimpleSearchResult(
|
||||
SimpleSearchResult::~SimpleSearchResult() {
|
||||
}
|
||||
|
||||
void SimpleSearchResult::addMatchedTargetFragment(const std::pair<int,int> & targetFragment) {
|
||||
_targetFragments.push_back(targetFragment);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -2,6 +2,7 @@
 #define SIMPLE_SEARCH_RESULT_HDR
 
 #include <string>
+#include <vector>
 
 class SimpleSearchResult {
 public:
@@ -47,6 +48,12 @@ public:
         return _targetSegment;
     }
 
+    const std::vector<std::pair<int,int> > & getTargetFragments() const {
+        return _targetFragments;
+    }
+
+    void addMatchedTargetFragment(const std::pair<int,int> & targetFragment);
+
 private:
     int _id;
 
@@ -61,6 +68,8 @@ private:
     std::string _sourceSegment;
 
     std::string _targetSegment;
 
+    std::vector<std::pair<int,int> > _targetFragments;
+
 };
 
 #endif
@@ -1,5 +1,7 @@
 #include "unit_dao.hpp"
 
+#include<sstream>
+
 #include "query_param.hpp"
 #include "string_param.hpp"
 #include "int_param.hpp"
@@ -48,7 +50,17 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedUnits(
         const std::vector<AlignedUnit> & alignedUnits,
         const int tmId) {
-    //TODO
+    DBconnection connection;
+    std::vector<SUFFIX_MARKER_TYPE> newIds;
+    connection.startTransaction();
+
+    BOOST_FOREACH(const AlignedUnit & alignedUnit, alignedUnits) {
+        newIds.push_back(_addAlignedUnit(connection, alignedUnit, tmId));
+    }
+
+    connection.endTransaction();
+    return newIds;
 }
 
@@ -84,23 +96,66 @@ void UnitDAO::_getResultsFromFragments(
         matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
     }
 
     std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
     std::vector<QueryParam*> params;
     params.push_back(new IntParam(2*fragment.getExampleOffset()+1));
     params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength())));
     params.push_back(new IntParam(fragment.getExampleId()));
     PGresult * result = connection.execute(query, params);
-    results.push_back(SimpleSearchResult(connection.getIntValue(result,0,0), // example id
-                                         matchedPatternStart,
-                                         matchedPatternEnd,
-                                         connection.getIntValue(result,0,3), // matched example start
-                                         connection.getIntValue(result,0,4), // matched example end
-                                         connection.getStringValue(result,0,1), // source segment
-                                         connection.getStringValue(result,0,2))); // target segment
+    SimpleSearchResult ssResult(connection.getIntValue(result,0,0), // example id
+                                matchedPatternStart,
+                                matchedPatternEnd,
+                                connection.getIntValue(result,0,3), // matched example start
+                                connection.getIntValue(result,0,4), // matched example end
+                                connection.getStringValue(result,0,1), // source segment
+                                connection.getStringValue(result,0,2)); // target segment
     connection.clearResult(result);
     BOOST_FOREACH (QueryParam * param, params) {
         delete param;
     }
 
+    //TODO now add all target fragments matched with this fragment
+    std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
+    std::vector<QueryParam*> targetParams;
+    targetParams.push_back(new IntParam(fragment.getExampleId()));
+    targetParams.push_back(new IntParam(fragment.getExampleOffset()));
+    targetParams.push_back(new IntParam(fragment.getExampleOffset() + fragment.getMatchedLength() - 1));
+    PGresult * targetResult = connection.execute(targetQuery, targetParams);
+
+    int prevPos = -2;
+    int currStart = -1;
+    int currEnd = -1;
+
+    for (int i=0;i<connection.getRowCount(targetResult);i++) {
+        int targetPos = connection.getIntValue(targetResult, i, 0);
+        int targetStart = connection.getIntValue(targetResult, i, 1);
+        int targetEnd = connection.getIntValue(targetResult, i, 2);
+
+        if (prevPos < targetPos - 1) { // beginning of detached fragment
+            // check if there is a fragment to end
+            if (currStart >= 0) {
+                ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
+            }
+            currStart = targetStart;
+        }
+
+        currEnd = targetEnd;
+        prevPos = targetPos;
+    }
+
+    // check if there are remaining fragments
+    if (currStart >= 0) {
+        ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
+    }
+
+    connection.clearResult(targetResult);
+    BOOST_FOREACH (QueryParam * param, targetParams) {
+        delete param;
+    }
+
+    results.push_back(ssResult);
 }
 connection.endTransaction();
 }
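The loop above merges rows for consecutive target token positions into contiguous character spans. A small Python sketch of the same merging logic (the rows are illustrative (target_token_pos, token_start, token_end) tuples, ordered by position):

def merge_fragments(rows):
    fragments = []
    prevPos, currStart, currEnd = -2, -1, -1
    for pos, start, end in rows:
        if prevPos < pos - 1:    # beginning of detached fragment
            if currStart >= 0:   # close the previous fragment, if any
                fragments.append((currStart, currEnd))
            currStart = start
        currEnd = end
        prevPos = pos
    if currStart >= 0:           # close the last open fragment
        fragments.append((currStart, currEnd))
    return fragments

# tokens 2 and 3 are adjacent, token 5 is detached:
print merge_fragments([(2, 10, 14), (3, 15, 19), (5, 27, 30)])
# [(10, 19), (27, 30)]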
@@ -138,5 +193,45 @@ int UnitDAO::_addSingleSentence(
 }
 
+int UnitDAO::_addAlignedUnit(
+        DBconnection & connection,
+        const AlignedUnit & alignedUnit,
+        const int tmId) {
+
+    std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
+    std::vector<QueryParam*> params;
+    params.push_back(new StringParam(alignedUnit.getSourceSentence().getSentence()));
+    params.push_back(new StringParam(alignedUnit.getTargetSentence().getSentence()));
+    params.push_back(new IntParam(tmId));
+    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getSourceSentence())));
+    params.push_back(new IntArrayParam(_getTokenPositions(alignedUnit.getTargetSentence())));
+
+    PGresult * result = connection.execute(query, params);
+    int newId = connection.getIntValue(result, 0, 0);
+    connection.clearResult(result);
+    BOOST_FOREACH (QueryParam * param, params) {
+        delete param;
+    }
+
+    // add alignments
+    for(int i=0;i<alignedUnit.getAlignments().size();i++) {
+        for (int j=0;j<alignedUnit.getAlignments()[i].size();j++) {
+            std::string query = "INSERT INTO alignment(unit_id, source_token_pos, target_token_pos) values($1::integer,$2::integer,$3::integer)";
+            std::vector<QueryParam*> params;
+            params.push_back(new IntParam(newId));
+            params.push_back(new IntParam(i));
+            params.push_back(new IntParam(alignedUnit.getAlignments()[i][j]));
+
+            PGresult * result = connection.execute(query, params);
+            connection.clearResult(result);
+            BOOST_FOREACH (QueryParam * param, params) {
+                delete param;
+            }
+        }
+    }
+
+    return newId;
+}
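The IntArrayParam columns above rely on the token-array layout that _getTokenPositions presumably produces: for each sentence, source_tokens/target_tokens hold the flattened (start, end) character offsets of its tokens, so under PostgreSQL's 1-based array indexing the token at 0-based position p occupies slots 2p+1 and 2p+2 (this is the indexing used by the SELECT queries earlier). A sketch of that layout (offsets are illustrative):

tokens = [(0, 3), (4, 8), (9, 13)]  # hypothetical (start, end) offsets of three tokens
flat = [offset for pair in tokens for offset in pair]

print flat              # [0, 3, 4, 8, 9, 13]
# token at 0-based position 1 spans flat[2] and flat[3] (slots 3 and 4 in 1-based SQL):
print flat[2], flat[3]  # 4 8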
@@ -56,6 +56,10 @@ private:
         const std::string & targetSentence,
         const int tmId);
 
+    int _addAlignedUnit(
+        DBconnection & connection,
+        const AlignedUnit & alignedUnit,
+        const int tmId);
 };
 
 #endif
@@ -20,7 +20,14 @@ CREATE TABLE unit (
     source_segment text,
     target_segment text,
     source_tokens integer[],
-    target_tokens integer[],
-    alignments integer[][]
+    target_tokens integer[]
 );
 
+DROP TABLE IF EXISTS alignment;
+CREATE TABLE alignment (
+    id SERIAL PRIMARY KEY,
+    unit_id integer,
+    source_token_pos integer,
+    target_token_pos integer
+);
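Under this schema each aligned token pair becomes one row, which is what the nested loop in _addAlignedUnit above inserts. Illustrative rows for a single unit (unit_id 42 is hypothetical) where source token 0 aligns to target tokens 0 and 1, and source token 2 aligns to target token 3:

alignment_rows = [
    # (unit_id, source_token_pos, target_token_pos)
    (42, 0, 0),
    (42, 0, 1),
    (42, 2, 3),
]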
tests/addAlignedFile.py (new executable file, 99 lines)
@@ -0,0 +1,99 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import unittest
+import json
+import urllib2
+import sys
+import host
+import time
+
+BUFFER_SIZE = 500
+
+address = 'http://'+host.concordia_host
+if len(host.concordia_port) > 0:
+    address += ':'+host.concordia_port
+
+
+def file_len(fname):
+    with open(fname) as f:
+        for i, l in enumerate(f):
+            pass
+    return i + 1
+
+
+def add_data(data):
+    req = urllib2.Request(address)
+    req.add_header('Content-Type', 'application/json')
+    response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
+    #print response
+
+
+sourceFile = sys.argv[1]
+sourceLangId = int(sys.argv[2])
+targetLangId = int(sys.argv[3])
+name = sys.argv[4]
+
+totalLines = file_len(sourceFile)
+
+data = {
+    'operation': 'addTm',
+    'sourceLangId':sourceLangId,
+    'targetLangId':targetLangId,
+    'name':name
+}
+
+req = urllib2.Request(address)
+req.add_header('Content-Type', 'application/json')
+response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
+print response
+tmId = int(response['newTmId'])
+print "Added new tm: %d" % tmId
+
+data = {
+    'operation': 'addAlignedSentences',
+    'tmId':tmId
+}
+
+sentences = []
+currSentence = []
+start = time.time()
+with open(sourceFile) as sourceLines:
+    lineNumber = 0
+    for line in sourceLines:
+        line = line.strip()
+        if lineNumber % 3 == 1:
+            currSentence.append(line)
+        elif lineNumber % 3 == 2:
+            currSentence.append(line)
+            currSentence.reverse()
+            sentences.append(currSentence)
+            currSentence = []
+            if len(sentences) >= BUFFER_SIZE:
+                data['sentences'] = sentences
+                add_data(data)
+                sentences = []  # reset the buffer so batches are not resent
+                mark = time.time()
+                print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/3/(mark-start))
+        lineNumber += 1
+
+
+if len(sentences) > 0:
+    data['sentences'] = sentences
+    add_data(data)
+
+end = time.time()
+print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/3/(end-start))
+
+print "Generating index..."
+start = time.time()
+data = {
+    'operation': 'refreshIndex',
+    'tmId' : tmId
+}
+req = urllib2.Request(address)
+req.add_header('Content-Type', 'application/json')
+urllib2.urlopen(req, json.dumps(data)).read()
+
+end = time.time()
+print "Index regeneration complete. The operation took %.4f s" % (end - start)
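The lineNumber % 3 logic suggests the input file is expected in GIZA++ A3-style layout, three lines per sentence pair; the two kept lines are reversed so that the alignment-annotated line becomes the source sentence. A hypothetical record and invocation (all values are examples):

# Assumed record layout, three lines per sentence pair:
#   # Sentence pair (1) ...                                  <- header line, skipped
#   kot siedzi na macie                                      <- plain (target) sentence
#   NULL ({ }) the ({ }) cat ({ 1 }) sits ({ 2 }) on ({ 3 }) the ({ }) mat ({ 4 })
#
# Example invocation: source file, source language id, target language id, TM name:
#   ./addAlignedFile.py aligned.txt 1 2 my_aligned_tm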
@@ -2,9 +2,7 @@
 
 description "pgbouncer"
 
-start on (net-device-up
-          and local-filesystems
-          and runlevel [2345])
+start on started postgresql
 
 stop on runlevel [016]
 