concordia-server/concordia-server/index_controller.cpp
2017-04-28 13:49:30 +02:00

234 lines
10 KiB
C++

#include "index_controller.hpp"
#include <concordia/common/config.hpp>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <boost/regex.hpp>
#include <boost/regex/icu.hpp>
#include <unicode/unistr.h>
#include "json_generator.hpp"
#include "logger.hpp"
/**
 * Creates a controller operating on the given map of Concordia
 * instances (looked up by translation memory id in the methods of this
 * class) and on the given lemmatizer facade. Both shared pointers are
 * stored as passed in; nothing else is initialised here.
 */
IndexController::IndexController(
        boost::shared_ptr<boost::ptr_map<int, Concordia> > concordiasMap,
        boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
        throw(ConcordiaException)
    : _concordiasMap(concordiasMap),
      _lemmatizerFacade(lemmatizerFacade) {
}
/** Destructor; performs no explicit cleanup. */
IndexController::~IndexController() {}
/**
 * Adds a single source/target sentence pair to the translation memory
 * identified by tmId and reports the outcome through jsonWriter.
 *
 * On success a {"status": "success"} object is written. If tmId is not
 * present in the Concordia map, or any exception is raised on the way,
 * the error is reported via JsonGenerator::signalError instead; no
 * exception leaves this method.
 *
 * @param jsonWriter     writer receiving the JSON response
 * @param sourceSentence source sentence; stored as tokenized, indexed
 *                       in the form returned by lemmatizeIfNeeded
 * @param targetSentence target sentence handed to the unit DAO
 * @param tmId           id of the translation memory to extend
 */
void IndexController::addSentence(
        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
        const std::string & sourceSentence,
        const std::string & targetSentence,
        const int tmId) {
    try {
        boost::ptr_map<int, Concordia>::iterator tmEntry = _concordiasMap->find(tmId);
        if (tmEntry == _concordiasMap->end()) {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            return;
        }
        Concordia & concordia = *tmEntry->second;

        // The lemmatized variant goes into the index, the plain one into the DAO.
        TokenizedSentence indexedForm =
            concordia.tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
        TokenizedSentence storedForm = concordia.tokenize(sourceSentence);

        int newSentenceId = _unitDAO.addSentence(storedForm, targetSentence, tmId);
        concordia.addTokenizedExample(indexedForm, newSentenceId);
        concordia.refreshSAfromRAM();

        jsonWriter.StartObject();
        jsonWriter.String("status");
        jsonWriter.String("success");
        jsonWriter.EndObject();
    } catch (ConcordiaException & e) {
        JsonGenerator::signalError(jsonWriter, std::string("concordia error: ") + e.what());
    } catch (std::exception & e) {
        JsonGenerator::signalError(jsonWriter, std::string("general error: ") + e.what());
    } catch (...) {
        JsonGenerator::signalError(jsonWriter, std::string("unexpected error occurred"));
    }
}
/**
 * Adds a batch of source/target sentence pairs to the translation
 * memory identified by tmId and reports the outcome through jsonWriter.
 *
 * Unlike addSentence, this method does not call refreshSAfromRAM
 * itself; refreshIndexFromRAM is available for that.
 *
 * NOTE(review): only ConcordiaException is handled here, whereas
 * addSentence also handles std::exception and unknown exceptions -
 * confirm whether other exception types are meant to propagate.
 *
 * @param jsonWriter      writer receiving the JSON response
 * @param sourceSentences source sentences; stored as tokenized, indexed
 *                        in the form returned by lemmatizeSentencesIfNeeded
 * @param targetSentences target sentences handed to the unit DAO
 * @param tmId            id of the translation memory to extend
 */
void IndexController::addSentences(
        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
        const std::vector<std::string> & sourceSentences,
        const std::vector<std::string> & targetSentences,
        const int tmId) {
    try {
        boost::ptr_map<int, Concordia>::iterator tmEntry = _concordiasMap->find(tmId);
        if (tmEntry == _concordiasMap->end()) {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            return;
        }
        Concordia & concordia = *tmEntry->second;

        // The lemmatized variants go into the index, the plain ones into the DAO.
        std::vector<TokenizedSentence> indexedForms = concordia.tokenizeAll(
            _lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
        std::vector<TokenizedSentence> storedForms = concordia.tokenizeAll(sourceSentences);

        std::vector<SUFFIX_MARKER_TYPE> newSentenceIds =
            _unitDAO.addSentences(storedForms, targetSentences, tmId);
        concordia.addAllTokenizedExamples(indexedForms, newSentenceIds);

        jsonWriter.StartObject();
        jsonWriter.String("status");
        jsonWriter.String("success");
        jsonWriter.EndObject();
    } catch (ConcordiaException & e) {
        JsonGenerator::signalError(jsonWriter, std::string("concordia error: ") + e.what());
    }
}
/**
 * Adds a batch of word-aligned sentence pairs to the translation memory
 * identified by tmId and reports the outcome through jsonWriter.
 *
 * The plain source sentences and the alignments are both extracted from
 * rawSourceSentences by _getSourceSentencesAndAlignments. The pairs are
 * stored through the unit DAO and every tokenized source sentence is
 * then added to the index under the id returned for it.
 *
 * On success a {"status": "success"} object is written. If tmId is not
 * present in the Concordia map, or any exception is raised on the way,
 * the error is reported via JsonGenerator::signalError instead; no
 * exception leaves this method (same policy as addSentence).
 *
 * @param jsonWriter         writer receiving the JSON response
 * @param rawSourceSentences alignment-annotated source sentences
 * @param targetSentences    target sentences
 * @param tmId               id of the translation memory to extend
 */
void IndexController::addAlignedSentences(
        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
        const std::vector<std::string> & rawSourceSentences,
        const std::vector<std::string> & targetSentences,
        const int tmId) {
    try {
        boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
            std::vector<std::string> sourceSentences;
            std::vector<std::vector<std::vector<int> > > allAlignments;
            _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);

            // NOTE(review): the meaning of the two boolean tokenizeAll flags
            // is defined by Concordia and not visible here - confirm there.
            std::vector<TokenizedSentence> tokenizedSourceSentences =
                it->second->tokenizeAll(sourceSentences, true, true);
            std::vector<TokenizedSentence> tokenizedTargetSentences =
                it->second->tokenizeAll(targetSentences, true, false);

            std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
                _unitDAO.addAlignedSentences(tokenizedSourceSentences,
                                             tokenizedTargetSentences,
                                             allAlignments,
                                             tmId);

            // at() is bounds-checked: if fewer ids than sentences came back,
            // std::out_of_range is thrown and reported by the handlers below.
            for (std::vector<TokenizedSentence>::size_type index = 0;
                 index < tokenizedSourceSentences.size(); ++index) {
                it->second->addTokenizedExample(tokenizedSourceSentences.at(index),
                                                sentenceIds.at(index));
            }

            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (std::exception & e) {
        // e.g. std::out_of_range from at(); mirrors the handling in addSentence
        std::stringstream errorstream;
        errorstream << "general error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (...) {
        std::stringstream errorstream;
        errorstream << "unexpected error occurred";
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}
/**
 * Adds a batch of word-aligned sentence pairs, whose lemmatized source
 * side is supplied by the caller, to the translation memory identified
 * by tmId and reports the outcome through jsonWriter.
 *
 * The lemmatized source sentences and the alignments are both extracted
 * from alignmentStrings by _getSourceSentencesAndAlignments. The plain
 * source and target sentences are stored through the unit DAO, while
 * the lemmatized source sentences are what gets added to the index.
 *
 * On success a {"status": "success"} object is written. If tmId is not
 * present in the Concordia map, or any exception is raised on the way,
 * the error is reported via JsonGenerator::signalError instead; no
 * exception leaves this method (same policy as addSentence).
 *
 * @param jsonWriter       writer receiving the JSON response
 * @param sourceSentences  plain source sentences
 * @param targetSentences  target sentences
 * @param alignmentStrings alignment-annotated lemmatized source sentences
 * @param tmId             id of the translation memory to extend
 */
void IndexController::addAlignedLemmatizedSentences(
        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
        const std::vector<std::string> & sourceSentences,
        const std::vector<std::string> & targetSentences,
        const std::vector<std::string> & alignmentStrings,
        const int tmId) {
    try {
        boost::ptr_map<int, Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
            std::vector<std::string> lemmatizedSourceSentences;
            std::vector<std::vector<std::vector<int> > > allAlignments;
            _getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);

            // NOTE(review): the meaning of the two boolean tokenizeAll flags
            // is defined by Concordia and not visible here - confirm there.
            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences =
                it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
            std::vector<TokenizedSentence> tokenizedSourceSentences =
                it->second->tokenizeAll(sourceSentences, false, false);
            std::vector<TokenizedSentence> tokenizedTargetSentences =
                it->second->tokenizeAll(targetSentences, false, false);

            std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
                _unitDAO.addAlignedSentences(tokenizedSourceSentences,
                                             tokenizedTargetSentences,
                                             allAlignments,
                                             tmId);

            // The loop bound comes from alignmentStrings while the ids come
            // from the DAO call on sourceSentences. at() is bounds-checked,
            // so a shortfall of ids raises std::out_of_range, which the
            // handlers below turn into an error response.
            for (std::vector<TokenizedSentence>::size_type index = 0;
                 index < tokenizedLemmatizedSourceSentences.size(); ++index) {
                it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index),
                                                sentenceIds.at(index));
            }

            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (std::exception & e) {
        // e.g. std::out_of_range from at(); mirrors the handling in addSentence
        std::stringstream errorstream;
        errorstream << "general error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (...) {
        std::stringstream errorstream;
        errorstream << "unexpected error occurred";
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}
/**
 * Calls refreshSAfromRAM on the Concordia instance registered under
 * tmId and reports the outcome through jsonWriter.
 *
 * Writes {"status": "success"} when the call returns normally. An
 * unknown tmId or a ConcordiaException is reported via
 * JsonGenerator::signalError.
 *
 * @param jsonWriter writer receiving the JSON response
 * @param tmId       id of the translation memory to refresh
 */
void IndexController::refreshIndexFromRAM(
        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
        const int tmId) {
    try {
        boost::ptr_map<int, Concordia>::iterator tmEntry = _concordiasMap->find(tmId);
        if (tmEntry == _concordiasMap->end()) {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            return;
        }

        tmEntry->second->refreshSAfromRAM();

        jsonWriter.StartObject();
        jsonWriter.String("status");
        jsonWriter.String("success");
        jsonWriter.EndObject();
    } catch (ConcordiaException & e) {
        JsonGenerator::signalError(jsonWriter, std::string("concordia error: ") + e.what());
    }
}
void IndexController::_getSourceSentencesAndAlignments(
std::vector<std::string> & sourceSentences,
std::vector<std::vector<std::vector<int> > > & allAlignments,
const std::vector<std::string> & rawSourceSentences) {
for (int i = 0; i<rawSourceSentences.size(); i++) {
std::string rawSourceSentence = rawSourceSentences[i];
std::string sourceSentence = "";
std::vector<std::vector<int> > alignments;
UnicodeString s(rawSourceSentence.c_str());
boost::u32regex_iterator<const UChar*> begin(
boost::make_u32regex_iterator(
s,
boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase)
)
);
boost::u32regex_iterator<const UChar*> end;
for (; begin != end; ++begin) {
UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
std::string token;
tokenUTF8.toUTF8String(token);
if (token != "NULL") {
std::string numbers((*begin)[2].first, (*begin)[2].second);
std::istringstream iss(numbers);
std::vector<std::string> numberStrings;
std::copy(std::istream_iterator<std::string>(iss),
std::istream_iterator<std::string>(),
std::back_inserter(numberStrings));
std::vector<int> tokenAlignments;
for (int j=0;j<numberStrings.size();j++) {
int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
tokenAlignments.push_back(n);
}
alignments.push_back(tokenAlignments);
sourceSentence += token + " ";
}
}
sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
sourceSentences.push_back(sourceSentence);
allAlignments.push_back(alignments);
}
}