multiple fragment occurrences

This commit is contained in:
Rafał Jaworski 2017-04-23 00:43:15 +02:00
parent f43dabb09e
commit a617a6b8ae
30 changed files with 328 additions and 231 deletions

View File

@ -1,47 +1,28 @@
- sudo apt-get install postgresql libfcgi-dev libpq-dev
- sudo apt-get install postgresql libfcgi-dev libpq-dev mono-complete
- clone github repo, mkdir build, cd build, ../cmake.sh, make
- sudo -u postgres psql postgres
- create user concordia with encrypted password 'concordia';
- CREATE DATABASE concordia_server ENCODING 'UTF8' OWNER concordia TEMPLATE template0;
- grant all privileges on database concordia_server to concordia;
- sudo vim /etc/postgresql/9.3/main/pg_hba.conf: change "local all all peer" to "local all all md5"
- pgbouncer: wget https://pgbouncer.github.io/downloads/files/1.6/pgbouncer-1.6.tar.gz
- sudo apt-get install libevent-dev
- pgbouncer:
- sudo apt-get install autoconf automake m4 libtool pkg-config libevent-dev autogen
$ git clone https://github.com/pgbouncer/pgbouncer.git
$ cd pgbouncer
$ git submodule init
$ git submodule update
$ ./autogen.sh
$ ./configure ...
$ make
$ make install
- ./db/startPGbouncer.sh
- ./db/recreateDb.sh
- nginx:
sudo -s
nginx=stable # use nginx=development for latest development version
add-apt-repository ppa:nginx/$nginx
apt-get update
- nginx:
apt-get install nginx
sites-available:
cat_html:
# Default server configuration
#
server {
listen 80 default_server;
listen [::]:80 default_server;
# SSL configuration
#
# listen 443 ssl default_server;
# listen [::]:443 ssl default_server;
#
# Note: You should disable gzip for SSL traffic.
# See: https://bugs.debian.org/773332
#
# Read up on ssl_ciphers to ensure a secure configuration.
# See: https://bugs.debian.org/765782
#
# Self signed certs generated by the ssl-cert package
# Don't use them in a production server!
#
# include snippets/snakeoil.conf;
root /var/www/html;
rename default to fcgi_concordia
fcgi_concordia:
server {
@ -95,8 +76,15 @@
}
- add links in sites-enabled, sudo service nginx restart
- sudo apt-get install php apache2 libapache2-mod-php
- install cat html to /var/www/html (adjust ajax requests)
- sudo apt-get install spawn-fcgi
- mkdir index
- ./db/startPGbouncer.sh
- ./scripts/restart.sh
- install upstart scripts
mgiza-aligner:
- cd mgiza, mgizapp
- sudo apt-get install libboost-thread-dev
- follow instructions in INSTALL

View File

@ -1,5 +1,4 @@
1. Prepare host.cfg file with the address and port number of Concordia. See host.cfg_example. WARNING there should not be any empty lines in the .cfg files.
2. Prepare version file for each tm in Concordia in teh "versions: directory.
2. Prepare version file for each tm in Concordia in the "versions-" directory.
3. Clean a directory on your webserver (that supports PHP).
4. sudo ./publish.py PATH_ON_SERVER.

View File

@ -122,7 +122,7 @@
cursor:pointer;
}
.fragmentDetails {
.example {
border-style: solid;
border-width: 5px;
border-color:#19424F;

View File

@ -47,6 +47,7 @@ function phraseSearchHandle(tmid, intervals) {
function renderResult(data) {
var res = '';
var disablePhraseSearch = true;
if (typeof(data['result']['bestOverlayScore']) === 'undefined') {
// ignore
@ -89,29 +90,35 @@ function renderResult(data) {
}
function renderFragment(fragment, number) {
var result = '<div style="display:none" id="fragment'+number+'" class="fragmentDetails"><table><tr><td>';
var result = '<div style="display:none" id="fragment'+number+'" class="fragmentDetails">';
// source segment
var sourceSegment = fragment['sourceSegment'];
result += sourceSegment.slice(0, fragment['matchedExampleStart']);
result += '<span class="matchedFragment">';
result += sourceSegment.slice(fragment['matchedExampleStart'], fragment['matchedExampleEnd']);
result += '</span>';
result += sourceSegment.slice(fragment['matchedExampleEnd']);
for (j=0;j<fragment['occurences'].length;j++) {
var occurence = fragment['occurences'][j];
result += '<table class="example"><tr><td>';
// target segment
result += '</td></tr><tr><td>';
var targetSegment = fragment['targetSegment'];
var currStart = 0;
for (i=0;i<fragment['targetFragments'].length;i++) {
result += targetSegment.slice(currStart, fragment['targetFragments'][i][0]);
// source segment
var sourceSegment = occurence['sourceSegment'];
result += sourceSegment.slice(0, ['matchedExampleStart']);
result += '<span class="matchedFragment">';
result += targetSegment.slice(fragment['targetFragments'][i][0], fragment['targetFragments'][i][1]);
result += sourceSegment.slice(occurence['matchedExampleStart'], occurence['matchedExampleEnd']);
result += '</span>';
currStart = fragment['targetFragments'][i][1];
result += sourceSegment.slice(occurence['matchedExampleEnd']);
// target segment
result += '</td></tr><tr><td>';
var targetSegment = occurence['targetSegment'];
var currStart = 0;
for (i=0;i<occurence['targetFragments'].length;i++) {
result += targetSegment.slice(currStart, occurence['targetFragments'][i][0]);
result += '<span class="matchedFragment">';
result += targetSegment.slice(occurence['targetFragments'][i][0], occurence['targetFragments'][i][1]);
result += '</span>';
currStart = occurence['targetFragments'][i][1];
}
result += targetSegment.slice(currStart);
result += '</td></tr></table>';
}
result += targetSegment.slice(currStart);
result += '</td></tr></table></div>';
result += '</div>';
return result;
}

View File

@ -0,0 +1,8 @@
dir@#@stocznia_enpl
concordia_host@#@localhost
concordia_port@#@8800
tmid@#@5
desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. Please enter an English sentence in the field below and press Enter (or use the search button). You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context.
enjoy@#@Enjoy your work with the system!
prompt@#@Enter search pattern (English sentence):
suggestion@#@This is a large ballast tank

View File

@ -0,0 +1,8 @@
dir@#@stocznia_plen
concordia_host@#@localhost
concordia_port@#@8800
tmid@#@2
desc@#@Witamy w interaktywnym demo systemu Concordia. System znajduje najdłuższe fragmenty zdania wejściowego w pamięci tłumaczeń. Proszę wpisać polskie zdanie w poniższe pole i nacisnąć Enter (albo użyć przycisku "search"). Aby zapoznać się z systemem możesz użyć wcześniej przygotowanych przykładów - po prostu kliknij link "apply" przy wybranym przykładzie. Po wyszukaniu, kliknij na wybrany podświetlony fragment, aby zobaczyć jego kontekst.
enjoy@#@Życzymy udanej pracy z systemem!
prompt@#@Wprowadź zdanie (po polsku):
suggestion@#@To jest bardzo duży zbiornik balastowy

View File

@ -1 +0,0 @@
../versions_available/jrc_enpl.cfg

View File

@ -1 +0,0 @@
../versions_available/jrc_plen.cfg

View File

@ -1 +0,0 @@
../versions_available/setimes_enhr.cfg

View File

@ -0,0 +1 @@
../versions_available/stocznia_enpl.cfg

View File

@ -0,0 +1 @@
../versions_available/stocznia_plen.cfg

View File

@ -10,9 +10,14 @@ CompleteConcordiaSearchResult::CompleteConcordiaSearchResult(
CompleteConcordiaSearchResult::~CompleteConcordiaSearchResult() {
}
void CompleteConcordiaSearchResult::addToBestOverlay(const SimpleSearchResult & result) {
_bestOverlay.push_back(result);
}
void CompleteConcordiaSearchResult::offsetPattern(int offset) {
BOOST_FOREACH(SimpleSearchResult & simpleResult, _bestOverlay) {
simpleResult.offsetPattern(offset);
}
}
}

View File

@ -14,20 +14,22 @@ public:
/*! Destructor.
*/
virtual ~CompleteConcordiaSearchResult();
const double getBestOverlayScore() {
return _bestOverlayScore;
}
std::vector<SimpleSearchResult> & getBestOverlay() {
std::vector<SimpleSearchResult> getBestOverlay() const {
return _bestOverlay;
}
void addToBestOverlay(const SimpleSearchResult & result);
void offsetPattern(int offset);
private:
double _bestOverlayScore;
std::vector<SimpleSearchResult> _bestOverlay;
};

View File

@ -66,9 +66,11 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
int tmId = _getIntParameter(d, TM_ID_PARAM);
// loading data from json
const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
/*
Logger::log("addSentences");
Logger::logInt("sentences to add", sentencesArray.Size());
Logger::logInt("tm id", tmId);
*/
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
if (sentencesArray[i].Size() != 2) {
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 2 elements");
@ -85,9 +87,11 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
int tmId = d[TM_ID_PARAM].GetInt();
// loading data from json
const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
/*
Logger::log("addAlignedSentences");
Logger::logInt("sentences to add", sentencesArray.Size());
Logger::logInt("tm id", tmId);
*/
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
if (sentencesArray[i].Size() != 2) {
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 2 elements");
@ -105,9 +109,11 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
int tmId = d[TM_ID_PARAM].GetInt();
// loading data from json
const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
/*
Logger::log("addAlignedLemmatizedSentences");
Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
Logger::logInt("tm id", tmId);
*/
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
if (sentencesArray[i].Size() != 3) {
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");

View File

@ -0,0 +1,21 @@
#include "example_occurence.hpp"

/* Builds one occurence record of a matched example.
 *
 * exampleId    - database id of the matched unit
 * exampleStart - offset where the match begins inside the source segment
 * exampleEnd   - offset where the match ends inside the source segment
 * source       - full source-language segment text
 * target       - full target-language segment text
 */
ExampleOccurence::ExampleOccurence(
                  const int exampleId,
                  const int exampleStart,
                  const int exampleEnd,
                  const std::string & source,
                  const std::string & target):
                  _id(exampleId),
                  _matchedExampleStart(exampleStart),
                  _matchedExampleEnd(exampleEnd),
                  _sourceSegment(source),
                  _targetSegment(target) {
}

ExampleOccurence::~ExampleOccurence() {
}

/* Appends one (start, end) interval of the target segment that is
 * aligned with the matched source fragment.
 */
void ExampleOccurence::addMatchedTargetFragment(const std::pair<int,int> & targetFragment) {
    _targetFragments.insert(_targetFragments.end(), targetFragment);
}

View File

@ -0,0 +1,61 @@
#ifndef EXAMPLE_OCCURENCE_HDR
#define EXAMPLE_OCCURENCE_HDR
#include <string>
#include <vector>
class ExampleOccurence {
public:
/*! Constructor.
*/
ExampleOccurence (const int id,
const int matchedExampleStart,
const int matchedExampleEnd,
const std::string & sourceSegment,
const std::string & targetSegment
);
/*! Destructor.
*/
virtual ~ExampleOccurence();
int getId() const {
return _id;
}
int getMatchedExampleStart() const {
return _matchedExampleStart;
}
int getMatchedExampleEnd() const {
return _matchedExampleEnd;
}
const std::string & getSourceSegment() const {
return _sourceSegment;
}
const std::string & getTargetSegment() const {
return _targetSegment;
}
const std::vector<std::pair<int,int> > & getTargetFragments() const {
return _targetFragments;
}
void addMatchedTargetFragment(const std::pair<int,int> & targetFragment);
private:
int _id;
int _matchedExampleStart;
int _matchedExampleEnd;
std::string _sourceSegment;
std::string _targetSegment;
std::vector<std::pair<int,int> > _targetFragments;
};
#endif

View File

@ -135,13 +135,13 @@ void IndexController::addAlignedLemmatizedSentences(
std::vector<std::vector<std::vector<int> > > allAlignments;
_getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);
std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences =
it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
_unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
_unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
}

View File

@ -1,6 +1,7 @@
#include "json_generator.hpp"
#include <boost/foreach.hpp>
#include "example_occurence.hpp"
JsonGenerator::JsonGenerator() {
}
@ -19,37 +20,42 @@ void JsonGenerator::signalError(rapidjson::Writer<rapidjson::StringBuffer> & jso
jsonWriter.EndObject();
}
void JsonGenerator::writeSearchResult(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
void JsonGenerator::writeSimpleSearchResult(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const SimpleSearchResult & result) {
jsonWriter.StartObject();
jsonWriter.String("id");
jsonWriter.Int(result.getId());
jsonWriter.String("matchedPatternStart");
jsonWriter.Int(result.getMatchedPatternStart());
jsonWriter.String("matchedPatternEnd");
jsonWriter.Int(result.getMatchedPatternEnd());
jsonWriter.String("matchedExampleStart");
jsonWriter.Int(result.getMatchedExampleStart());
jsonWriter.String("matchedExampleEnd");
jsonWriter.Int(result.getMatchedExampleEnd());
jsonWriter.String("sourceSegment");
jsonWriter.String(result.getSourceSegment().c_str());
jsonWriter.String("targetSegment");
jsonWriter.String(result.getTargetSegment().c_str());
jsonWriter.String("targetFragments");
jsonWriter.String("occurences");
jsonWriter.StartArray();
for (std::vector<std::pair<int,int> >::const_iterator it = result.getTargetFragments().begin();
it != result.getTargetFragments().end(); it++) {
jsonWriter.StartArray();
jsonWriter.Int(it->first);
jsonWriter.Int(it->second);
jsonWriter.EndArray();
}
jsonWriter.EndArray();
BOOST_FOREACH(ExampleOccurence occurence, result.getOccurences()) {
jsonWriter.StartObject();
jsonWriter.String("id");
jsonWriter.Int(occurence.getId());
jsonWriter.String("matchedExampleStart");
jsonWriter.Int(occurence.getMatchedExampleStart());
jsonWriter.String("matchedExampleEnd");
jsonWriter.Int(occurence.getMatchedExampleEnd());
jsonWriter.String("sourceSegment");
jsonWriter.String(occurence.getSourceSegment().c_str());
jsonWriter.String("targetSegment");
jsonWriter.String(occurence.getTargetSegment().c_str());
jsonWriter.String("targetFragments");
jsonWriter.StartArray(); // all target fragments
for (std::vector<std::pair<int,int> >::const_iterator it = occurence.getTargetFragments().begin();
it != occurence.getTargetFragments().end(); it++) {
jsonWriter.StartArray(); // single target fragment
jsonWriter.Int(it->first);
jsonWriter.Int(it->second);
jsonWriter.EndArray(); // single target fragment
}
jsonWriter.EndArray(); // all target fragments
jsonWriter.EndObject(); // occurence
}
jsonWriter.EndObject();
jsonWriter.EndArray(); //occurences
jsonWriter.EndObject(); //simple search result
}

View File

@ -19,8 +19,8 @@ public:
static void signalError(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::string & message);
static void writeSearchResult(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const SimpleSearchResult & result);
static void writeSimpleSearchResult(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const SimpleSearchResult & result);
private:

View File

@ -44,6 +44,26 @@ void Logger::logString(std::string name, std::string value) {
root.info(ss.str());
}
// Logs a matched pattern fragment at INFO level, rendering it via its
// stream-insertion operator. Lazily initializes the log4cpp root
// category on first use (mirrors the other Logger::log* methods).
void Logger::logFragment(const MatchedPatternFragment & fragment) {
log4cpp::Category & root = log4cpp::Category::getRoot();
if (_initialized == 0) {
_initialize(root);
}
std::stringstream ss;
ss << fragment;
root.info(ss.str());
}
// Logs a whole Concordia search result at INFO level, rendering it via
// its stream-insertion operator. Lazily initializes the log4cpp root
// category on first use (mirrors the other Logger::log* methods).
void Logger::logConcordiaSearchResult(const ConcordiaSearchResult & result) {
log4cpp::Category & root = log4cpp::Category::getRoot();
if (_initialized == 0) {
_initialize(root);
}
std::stringstream ss;
ss << result;
root.info(ss.str());
}
void Logger::_initialize(log4cpp::Category & root) {
log4cpp::Appender *appender = new log4cpp::FileAppender("default", LOG_FILE_PATH);
log4cpp::PatternLayout *layout = new log4cpp::PatternLayout();
@ -52,8 +72,6 @@ void Logger::_initialize(log4cpp::Category & root) {
root.setPriority(log4cpp::Priority::INFO);
root.addAppender(appender);
_initialized = 1;
}

View File

@ -3,6 +3,8 @@
#include <string>
#include <sstream>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/concordia_search_result.hpp>
#include "log4cpp/Category.hh"
@ -15,12 +17,17 @@ public:
/*! Destructor.
*/
virtual ~Logger();
static void log(std::string message);
static void logInt(std::string name, int value);
static void logString(std::string name, std::string value);
static void logFragment(const MatchedPatternFragment & fragment);
static void logConcordiaSearchResult(const ConcordiaSearchResult & result);
private:
static void _initialize(log4cpp::Category & root);

View File

@ -25,17 +25,12 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
SimpleSearchResult result = _unitDAO.getSimpleSearchResult(it->second->simpleSearch(pattern, true));
jsonWriter.StartObject();
jsonWriter.String("status");
jsonWriter.String("success");
jsonWriter.String("results");
jsonWriter.StartArray();
BOOST_FOREACH(SimpleSearchResult & result, results) {
JsonGenerator::writeSearchResult(jsonWriter, result);
}
jsonWriter.EndArray();
jsonWriter.String("result");
JsonGenerator::writeSimpleSearchResult(jsonWriter, result);
jsonWriter.EndObject();
} else {
JsonGenerator::signalError(jsonWriter, "no such tm!");
@ -55,7 +50,7 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
Logger::log("concordiaPhraseSearch");
Logger::logString("short pattern", shortPattern);
std::vector<SimpleSearchResult> shortPatternResults = _unitDAO.getSearchResults(it->second->simpleSearch(shortPattern));
SimpleSearchResult shortPatternResult = _unitDAO.getSimpleSearchResult(it->second->simpleSearch(shortPattern));
@ -63,7 +58,7 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
jsonWriter.String("status");
jsonWriter.String("success");
jsonWriter.String("found");
if (shortPatternResults.size() > 0) {
if (shortPatternResult.getOccurences().size() > 0) {
jsonWriter.Bool(true);
@ -76,10 +71,9 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
restResult.offsetPattern(currStart);
bestOverlay.insert(bestOverlay.end(), restResult.getBestOverlay().begin(), restResult.getBestOverlay().end());
SimpleSearchResult shortPatternresult = shortPatternResults[0];
shortPatternresult.setMatchedPatternStart(interval.getStart());
shortPatternresult.setMatchedPatternEnd(interval.getEnd());
bestOverlay.push_back(shortPatternresult);
shortPatternResult.setMatchedPatternStart(interval.getStart());
shortPatternResult.setMatchedPatternEnd(interval.getEnd());
bestOverlay.push_back(shortPatternResult);
currStart = interval.getEnd();
}
CompleteConcordiaSearchResult lastRestResult = _unitDAO.getConcordiaResult(
@ -92,7 +86,7 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
jsonWriter.String("bestOverlay");
jsonWriter.StartArray();
BOOST_FOREACH(SimpleSearchResult & simpleResult, bestOverlay) {
JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
JsonGenerator::writeSimpleSearchResult(jsonWriter, simpleResult);
}
jsonWriter.EndArray();
jsonWriter.EndObject();
@ -112,12 +106,18 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
std::string & pattern,
const int tmId) {
Logger::log("concordiaSearch");
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::string lemmatizedPattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
Logger::logString("pattern lemmatized", lemmatizedPattern);
TokenizedSentence originalPattern = it->second->tokenize(pattern, true, false);
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(lemmatizedPattern), originalPattern);
Logger::logInt("original pattern tokenized, token count", originalPattern.getTokens().size());
boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult = it->second->concordiaSearch(lemmatizedPattern, true);
Logger::log("concordia searched, result:");
Logger::logConcordiaSearchResult(*rawConcordiaResult);
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(rawConcordiaResult, originalPattern);
Logger::log("result got");
jsonWriter.StartObject();
jsonWriter.String("status");
@ -128,8 +128,8 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
jsonWriter.Double(result.getBestOverlayScore());
jsonWriter.String("bestOverlay");
jsonWriter.StartArray();
BOOST_FOREACH(SimpleSearchResult & simpleResult, result.getBestOverlay()) {
JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
BOOST_FOREACH(const SimpleSearchResult & simpleResult, result.getBestOverlay()) {
JsonGenerator::writeSimpleSearchResult(jsonWriter, simpleResult);
}
jsonWriter.EndArray();
jsonWriter.EndObject();

View File

@ -1,31 +1,20 @@
#include "simple_search_result.hpp"
SimpleSearchResult::SimpleSearchResult(
const int id,
const int matchedPatternStart,
const int matchedPatternEnd,
const int matchedExampleStart,
const int matchedExampleEnd,
const std::string & sourceSegment,
const std::string & targetSegment):
_id(id),
const int matchedPatternEnd):
_matchedPatternStart(matchedPatternStart),
_matchedPatternEnd(matchedPatternEnd),
_matchedExampleStart(matchedExampleStart),
_matchedExampleEnd(matchedExampleEnd),
_sourceSegment(sourceSegment),
_targetSegment(targetSegment) {
_matchedPatternEnd(matchedPatternEnd) {
}
SimpleSearchResult::~SimpleSearchResult() {
}
void SimpleSearchResult::addMatchedTargetFragment(const std::pair<int,int> & targetFragment) {
_targetFragments.push_back(targetFragment);
void SimpleSearchResult::addOccurence(const ExampleOccurence & occurence) {
_occurences.push_back(occurence);
}
void SimpleSearchResult::offsetPattern(int offset) {
_matchedPatternStart += offset;
_matchedPatternEnd += offset;
}

View File

@ -1,6 +1,7 @@
#ifndef SIMPLE_SEARCH_RESULT_HDR
#define SIMPLE_SEARCH_RESULT_HDR
#include "example_occurence.hpp"
#include <string>
#include <vector>
@ -8,21 +9,11 @@ class SimpleSearchResult {
public:
/*! Constructor.
*/
SimpleSearchResult(const int id,
const int matchedPatternStart,
const int matchedPatternEnd,
const int matchedExampleStart,
const int matchedExampleEnd,
const std::string & sourceSegment,
const std::string & targetSegment
);
SimpleSearchResult(const int matchedPatternStart,
const int matchedPatternEnd);
/*! Destructor.
*/
virtual ~SimpleSearchResult();
int getId() const {
return _id;
}
int getMatchedPatternStart() const {
return _matchedPatternStart;
@ -40,46 +31,20 @@ public:
_matchedPatternEnd = newEnd;
}
int getMatchedExampleStart() const {
return _matchedExampleStart;
std::vector<ExampleOccurence> getOccurences() const {
return _occurences;
}
int getMatchedExampleEnd() const {
return _matchedExampleEnd;
}
void addOccurence(const ExampleOccurence & occurence);
const std::string & getSourceSegment() const {
return _sourceSegment;
}
const std::string & getTargetSegment() const {
return _targetSegment;
}
const std::vector<std::pair<int,int> > & getTargetFragments() const {
return _targetFragments;
}
void addMatchedTargetFragment(const std::pair<int,int> & targetFragment);
void offsetPattern(int offset);
private:
int _id;
std::vector<ExampleOccurence> _occurences;
int _matchedPatternStart;
int _matchedPatternEnd;
int _matchedExampleStart;
int _matchedExampleEnd;
std::string _sourceSegment;
std::string _targetSegment;
std::vector<std::pair<int,int> > _targetFragments;
};
#endif

View File

@ -8,6 +8,7 @@
#include "int_param.hpp"
#include "int_array_param.hpp"
#include "logger.hpp"
#include "example_occurence.hpp"
#include <libpq-fe.h>
#include <boost/foreach.hpp>
@ -65,58 +66,61 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
return newIds;
}
std::vector<SimpleSearchResult> UnitDAO::getSearchResults(const std::vector<MatchedPatternFragment> & fragments) {
std::vector<SimpleSearchResult> results;
SimpleSearchResult UnitDAO::getSimpleSearchResult(const MatchedPatternFragment & fragment) {
SimpleSearchResult result(fragment.getStart(), fragment.getEnd());
TokenizedSentence ts("");
_getResultsFromFragments(results, fragments, ts);
return results;
return _getResultFromFragment(fragment, ts);
}
CompleteConcordiaSearchResult UnitDAO::getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult) {
CompleteConcordiaSearchResult result(rawConcordiaResult->getBestOverlayScore());
_getResultsFromFragments(result.getBestOverlay(),
rawConcordiaResult->getBestOverlay(),
rawConcordiaResult->getTokenizedPattern());
BOOST_FOREACH(MatchedPatternFragment fragment, rawConcordiaResult->getBestOverlay()) {
result.addToBestOverlay(_getResultFromFragment(fragment, rawConcordiaResult->getTokenizedPattern()));
}
return result;
}
CompleteConcordiaSearchResult UnitDAO::getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult, TokenizedSentence originalPattern) {
Logger::log("getConcordiaResult with original pattern");
CompleteConcordiaSearchResult result(rawConcordiaResult->getBestOverlayScore());
_getResultsFromFragments(result.getBestOverlay(),
rawConcordiaResult->getBestOverlay(),
originalPattern);
BOOST_FOREACH(MatchedPatternFragment fragment, rawConcordiaResult->getBestOverlay()) {
Logger::log("Working on fragment:");
Logger::logFragment(fragment);
result.addToBestOverlay(_getResultFromFragment(fragment, originalPattern));
}
return result;
}
SimpleSearchResult UnitDAO::_getResultFromFragment(
const MatchedPatternFragment & fragment,
const TokenizedSentence & tokenizedPattern) {
void UnitDAO::_getResultsFromFragments(
std::vector<SimpleSearchResult> & results,
const std::vector<MatchedPatternFragment> & fragments,
const TokenizedSentence & tokenizedPattern) {
Logger::log("getResultFromFragment");
DBconnection connection;
connection.startTransaction();
BOOST_FOREACH(const MatchedPatternFragment & fragment, fragments) {
int matchedPatternStart = 0;
int matchedPatternEnd = 0;
if (tokenizedPattern.getTokens().size() > 0) {
// if it is concordia searching
matchedPatternStart = tokenizedPattern.getTokens().at(fragment.getStart()).getStart();
matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
}
int matchedPatternStart = 0;
int matchedPatternEnd = 0;
if (tokenizedPattern.getTokens().size() > 0) {
// if it is concordia searching
Logger::logInt("tokenizedPattern size",tokenizedPattern.getTokens().size());
Logger::logInt("fragment start",fragment.getStart());
Logger::logInt("fragment matched length",fragment.getMatchedLength());
matchedPatternStart = tokenizedPattern.getTokens().at(fragment.getStart()).getStart();
matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
}
SimpleSearchResult ssResult(matchedPatternStart, matchedPatternEnd);
Logger::log("simple search result created");
BOOST_FOREACH(SubstringOccurence sOccurence, fragment.getOccurences()) {
std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
std::vector<QueryParam*> params;
params.push_back(new IntParam(2*fragment.getExampleOffset()+1));
params.push_back(new IntParam(2*(fragment.getExampleOffset()+fragment.getMatchedLength())));
params.push_back(new IntParam(fragment.getExampleId()));
params.push_back(new IntParam(2*sOccurence.getOffset()+1));
params.push_back(new IntParam(2*(sOccurence.getOffset()+fragment.getMatchedLength())));
params.push_back(new IntParam(sOccurence.getId()));
PGresult * result = connection.execute(query, params);
SimpleSearchResult ssResult(connection.getIntValue(result,0,0), // example id
matchedPatternStart,
matchedPatternEnd,
ExampleOccurence occurence(connection.getIntValue(result,0,0), // example id
connection.getIntValue(result,0,3), // matched example start
connection.getIntValue(result,0,4), // matched example end
connection.getStringValue(result,0,1), // source segment
@ -129,9 +133,9 @@ void UnitDAO::_getResultsFromFragments(
// now add all target fragments matched with this fragment
std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
std::vector<QueryParam*> targetParams;
targetParams.push_back(new IntParam(fragment.getExampleId()));
targetParams.push_back(new IntParam(fragment.getExampleOffset()));
targetParams.push_back(new IntParam(fragment.getExampleOffset() + fragment.getMatchedLength() - 1));
targetParams.push_back(new IntParam(sOccurence.getId()));
targetParams.push_back(new IntParam(sOccurence.getOffset()));
targetParams.push_back(new IntParam(sOccurence.getOffset() + fragment.getMatchedLength() - 1));
PGresult * targetResult = connection.execute(targetQuery, targetParams);
int prevPos = -2;
@ -146,7 +150,7 @@ void UnitDAO::_getResultsFromFragments(
if (prevPos < targetPos - 1) { // beginning of detached fragment
// check if there is a fragment to end
if (currStart >= 0) {
ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
occurence.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
currStart = targetStart;
}
@ -157,7 +161,7 @@ void UnitDAO::_getResultsFromFragments(
// check if there are remaining fragments
if (currStart >= 0) {
ssResult.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
occurence.addMatchedTargetFragment(std::pair<int,int>(currStart,currEnd));
}
connection.clearResult(targetResult);
@ -165,9 +169,13 @@ void UnitDAO::_getResultsFromFragments(
delete param;
}
results.push_back(ssResult);
ssResult.addOccurence(occurence);
}
connection.endTransaction();
return ssResult;
}

View File

@ -41,16 +41,16 @@ public:
const std::vector<std::vector<std::vector<int> > > & allAlignments,
const int tmId) throw (ConcordiaException);
std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment);
CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);
CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult, TokenizedSentence originalPattern);
private:
void _getResultsFromFragments(std::vector<SimpleSearchResult> & results,
const std::vector<MatchedPatternFragment> & fragments,
const TokenizedSentence & tokenizedPattern);
SimpleSearchResult _getResultFromFragment(
const MatchedPatternFragment & fragment,
const TokenizedSentence & tokenizedPattern);
std::vector<int> _getTokenPositions(const TokenizedSentence & ts);

View File

@ -1,10 +1,7 @@
#!/bin/sh
make CORPUS_NAME=setimes_enhr SRC_LANG=en TRG_LANG=hr
make CORPUS_NAME=setimes_enhr SRC_LANG=en TRG_LANG=hr clean-intermediate-files
make CORPUS_NAME=stocznia_plen SRC_LANG=pl TRG_LANG=en
make CORPUS_NAME=stocznia_plen SRC_LANG=pl TRG_LANG=en clean-intermediate-files
make CORPUS_NAME=europarljrc_enpl SRC_LANG=en TRG_LANG=pl
make CORPUS_NAME=europarljrc_enpl SRC_LANG=en TRG_LANG=pl clean-intermediate-files
make CORPUS_NAME=jrc200k_plen SRC_LANG=pl TRG_LANG=en
make CORPUS_NAME=jrc200k_plen SRC_LANG=pl TRG_LANG=en clean-intermediate-files
make CORPUS_NAME=stocznia_enpl SRC_LANG=en TRG_LANG=pl
make CORPUS_NAME=stocznia_enpl SRC_LANG=en TRG_LANG=pl clean-intermediate-files

View File

@ -22,11 +22,9 @@ data = {
start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
response = urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()
print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response

View File

@ -21,11 +21,9 @@ if len(host.concordia_port) > 0:
start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
response = urllib2.urlopen(req, json.dumps(data)).read()
end = time.time()
print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response

View File

@ -1,9 +1,16 @@
In order to configure Concordia as upstart job, copy the 3 .conf files into your /etc/init and run:
sudo apt-get install upstart upstart-sysv
sudo initctl reload-configuration
Also, add to the file /etc/init.d/postgresql the line:
initctl emit -n started JOB=postgresql
at the end of postgresql start sequence.
at the end of postgresql start sequence, after the loop:
for v in $versions; do
$1 $v || EXIT=$?
done