refactoring, lemmatizers
This commit is contained in:
parent
84d8102f58
commit
2690b15958
@ -20,8 +20,7 @@
|
||||
#include <boost/ptr_container/ptr_map.hpp>
|
||||
#include <boost/filesystem/path.hpp>
|
||||
|
||||
ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
|
||||
throw(ConcordiaException) :
|
||||
ConcordiaServer::ConcordiaServer(const std::string & configFilePath):
|
||||
_configFilePath(configFilePath) {
|
||||
|
||||
std::vector<int> tmIds = _tmDAO.getTmIds();
|
||||
@ -57,12 +56,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||
} else { // json parsed
|
||||
std::string operation = _getStringParameter(d, OPERATION_PARAM);
|
||||
if (operation == ADD_SENTENCE_OP) {
|
||||
std::string sourceSentence = _getStringParameter(d, SOURCE_SENTENCE_PARAM);
|
||||
std::string targetSentence = _getStringParameter(d, TARGET_SENTENCE_PARAM);
|
||||
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
||||
_indexController->addSentence(jsonWriter, sourceSentence, targetSentence, tmId);
|
||||
} else if (operation == ADD_SENTENCES_OP) {
|
||||
if (operation == ADD_SENTENCES_OP) {
|
||||
std::vector<std::string> sourceSentences;
|
||||
std::vector<std::string> concordiaSourceSentences;
|
||||
std::vector<std::string> targetSentences;
|
||||
@ -86,50 +80,6 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
||||
}
|
||||
}
|
||||
_indexController->addSentences(jsonWriter, sourceSentences, concordiaSourceSentences, targetSentences, alignments, sourceIds, tmId);
|
||||
} else if (operation == ADD_ALIGNED_SENTENCES_OP) {
|
||||
std::vector<std::string> sourceSentences;
|
||||
std::vector<std::string> targetSentences;
|
||||
int tmId = d[TM_ID_PARAM].GetInt();
|
||||
// loading data from json
|
||||
const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
|
||||
/*
|
||||
Logger::log("addAlignedSentences");
|
||||
Logger::logInt("sentences to add", sentencesArray.Size());
|
||||
Logger::logInt("tm id", tmId);
|
||||
*/
|
||||
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
|
||||
if (sentencesArray[i].Size() != 2) {
|
||||
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 2 elements");
|
||||
break;
|
||||
} else {
|
||||
sourceSentences.push_back(sentencesArray[i][0].GetString());
|
||||
targetSentences.push_back(sentencesArray[i][1].GetString());
|
||||
}
|
||||
}
|
||||
_indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
|
||||
} else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
|
||||
std::vector<std::string> sourceSentences;
|
||||
std::vector<std::string> targetSentences;
|
||||
std::vector<std::string> alignmentStrings;
|
||||
int tmId = d[TM_ID_PARAM].GetInt();
|
||||
// loading data from json
|
||||
const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
|
||||
/*
|
||||
Logger::log("addAlignedLemmatizedSentences");
|
||||
Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
|
||||
Logger::logInt("tm id", tmId);
|
||||
*/
|
||||
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
|
||||
if (sentencesArray[i].Size() != 3) {
|
||||
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
|
||||
break;
|
||||
} else {
|
||||
sourceSentences.push_back(sentencesArray[i][0].GetString());
|
||||
targetSentences.push_back(sentencesArray[i][1].GetString());
|
||||
alignmentStrings.push_back(sentencesArray[i][2].GetString());
|
||||
}
|
||||
}
|
||||
_indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
|
||||
} else if (operation == GET_TMS_INFO_OP) {
|
||||
std::vector<Tm> tms = _tmDAO.getTms();
|
||||
|
||||
@ -348,8 +298,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
||||
|
||||
}
|
||||
|
||||
std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const char * name)
|
||||
throw (ConcordiaException) {
|
||||
std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const char * name) {
|
||||
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
||||
if (itr != d.MemberEnd()) {
|
||||
std::string value = itr->value.GetString();
|
||||
@ -359,8 +308,7 @@ std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const
|
||||
}
|
||||
}
|
||||
|
||||
int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name)
|
||||
throw (ConcordiaException) {
|
||||
int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name) {
|
||||
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
||||
if (itr != d.MemberEnd()) {
|
||||
int value = itr->value.GetInt();
|
||||
@ -370,8 +318,7 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name
|
||||
}
|
||||
}
|
||||
|
||||
int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
|
||||
throw (ConcordiaException) {
|
||||
int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name) {
|
||||
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
||||
if (itr != d.MemberEnd()) {
|
||||
bool value = itr->value.GetBool();
|
||||
@ -381,8 +328,7 @@ int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * nam
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<int> > ConcordiaServer::_getInt2DArray(const rapidjson::Value & v)
|
||||
throw (ConcordiaException) {
|
||||
std::vector<std::vector<int> > ConcordiaServer::_getInt2DArray(const rapidjson::Value & v) {
|
||||
std::vector<std::vector<int> > result;
|
||||
for (rapidjson::SizeType i = 0; i < v.Size(); i++) {
|
||||
std::vector<int> innerArray;
|
||||
|
@ -26,8 +26,7 @@ public:
|
||||
\param configFilePath path to the Concordia configuration file
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
explicit ConcordiaServer(const std::string & configFilePath)
|
||||
throw(ConcordiaException);
|
||||
explicit ConcordiaServer(const std::string & configFilePath);
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~ConcordiaServer();
|
||||
@ -37,13 +36,13 @@ public:
|
||||
private:
|
||||
void _logPhrase(std::string phraseString);
|
||||
|
||||
std::string _getStringParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
|
||||
std::string _getStringParameter(rapidjson::Document & d, const char * name);
|
||||
|
||||
int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
|
||||
int _getIntParameter(rapidjson::Document & d, const char * name);
|
||||
|
||||
int _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
|
||||
int _getBoolParameter(rapidjson::Document & d, const char * name);
|
||||
|
||||
std::vector<std::vector<int> > _getInt2DArray(const rapidjson::Value & v) throw (ConcordiaException);
|
||||
std::vector<std::vector<int> > _getInt2DArray(const rapidjson::Value & v);
|
||||
|
||||
void _addTm(int tmId);
|
||||
|
||||
|
@ -32,12 +32,9 @@
|
||||
#define LINK_PARAM "link"
|
||||
#define SOURCES_PARAM "sources"
|
||||
|
||||
#define ADD_SENTENCE_OP "addSentence"
|
||||
#define ADD_SENTENCES_OP "addSentences"
|
||||
#define LEMMATIZE_OP "lemmatize"
|
||||
#define LEMMATIZE_ALL_OP "lemmatizeAll"
|
||||
#define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
|
||||
#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
|
||||
#define ADD_REQUEST_OP "addRequest"
|
||||
#define GET_TMS_INFO_OP "getTmsInfo"
|
||||
#define GET_REQUESTS_INFO_OP "getRequestsInfo"
|
||||
|
@ -7,8 +7,8 @@
|
||||
#include "config.hpp"
|
||||
#include "logger.hpp"
|
||||
|
||||
DBconnection::DBconnection() throw(ConcordiaException) {
|
||||
std::string connectionInfo = "dbname="DB_NAME" user="DB_USER" password="DB_PASSWORD" host="DB_HOST" port="DB_PORT;
|
||||
DBconnection::DBconnection() {
|
||||
std::string connectionInfo = "dbname=" DB_NAME " user=" DB_USER " password=" DB_PASSWORD " host=" DB_HOST " port=" DB_PORT;
|
||||
_connection = PQconnectdb(connectionInfo.c_str());
|
||||
if (PQstatus(_connection) != CONNECTION_OK) {
|
||||
close();
|
||||
@ -31,7 +31,7 @@ void DBconnection::close() {
|
||||
}
|
||||
}
|
||||
|
||||
void DBconnection::startTransaction() throw(ConcordiaException) {
|
||||
void DBconnection::startTransaction() {
|
||||
if (_connection != NULL) {
|
||||
PGresult * result = PQexec(_connection, "BEGIN");
|
||||
if (PQresultStatus(result) != PGRES_COMMAND_OK) {
|
||||
@ -44,7 +44,7 @@ void DBconnection::startTransaction() throw(ConcordiaException) {
|
||||
}
|
||||
}
|
||||
|
||||
void DBconnection::endTransaction() throw(ConcordiaException) {
|
||||
void DBconnection::endTransaction() {
|
||||
if (_connection != NULL) {
|
||||
PGresult * result = PQexec(_connection, "END");
|
||||
if (PQresultStatus(result) != PGRES_COMMAND_OK) {
|
||||
@ -57,7 +57,7 @@ void DBconnection::endTransaction() throw(ConcordiaException) {
|
||||
}
|
||||
}
|
||||
|
||||
PGresult * DBconnection::execute(std::string query) throw(ConcordiaException) {
|
||||
PGresult * DBconnection::execute(std::string query) {
|
||||
if (_connection != NULL) {
|
||||
PGresult * result = PQexec(_connection, query.c_str());
|
||||
if (PQresultStatus(result) != PGRES_COMMAND_OK &&
|
||||
@ -78,7 +78,7 @@ PGresult * DBconnection::execute(std::string query) throw(ConcordiaException) {
|
||||
}
|
||||
|
||||
PGresult * DBconnection::execute(std::string query,
|
||||
std::vector<QueryParam*> params) throw(ConcordiaException) {
|
||||
std::vector<QueryParam*> params) {
|
||||
if (_connection != NULL) {
|
||||
const char * paramValues[params.size()];
|
||||
int paramLengths[params.size()];
|
||||
@ -122,7 +122,7 @@ void DBconnection::clearResult(PGresult * result) {
|
||||
PQclear(result);
|
||||
}
|
||||
|
||||
int DBconnection::getIntValue(PGresult * result, int row, int col) throw (ConcordiaException) {
|
||||
int DBconnection::getIntValue(PGresult * result, int row, int col) {
|
||||
try {
|
||||
char * valueStr = PQgetvalue(result,row,col);
|
||||
return strtol(valueStr, NULL, 10);
|
||||
@ -133,7 +133,7 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (Concor
|
||||
}
|
||||
}
|
||||
|
||||
bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
|
||||
bool DBconnection::getBoolValue(PGresult * result, int row, int col) {
|
||||
try {
|
||||
char * valueStr = PQgetvalue(result,row,col);
|
||||
return std::string(valueStr) == "t";
|
||||
@ -144,7 +144,7 @@ bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (Conc
|
||||
}
|
||||
}
|
||||
|
||||
std::string DBconnection::getStringValue(PGresult * result, int row, int col) throw (ConcordiaException) {
|
||||
std::string DBconnection::getStringValue(PGresult * result, int row, int col) {
|
||||
try {
|
||||
char * valueStr = PQgetvalue(result,row,col);
|
||||
return std::string(valueStr);
|
||||
@ -155,7 +155,7 @@ std::string DBconnection::getStringValue(PGresult * result, int row, int col) t
|
||||
}
|
||||
}
|
||||
|
||||
int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
|
||||
int DBconnection::getRowCount(PGresult * result) {
|
||||
try {
|
||||
return PQntuples(result);
|
||||
} catch (std::exception & e) {
|
||||
|
@ -13,29 +13,29 @@ class DBconnection {
|
||||
public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
DBconnection() throw(ConcordiaException);
|
||||
DBconnection();
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~DBconnection();
|
||||
|
||||
void startTransaction() throw(ConcordiaException);
|
||||
void startTransaction();
|
||||
|
||||
void endTransaction() throw(ConcordiaException);
|
||||
void endTransaction();
|
||||
|
||||
PGresult * execute(std::string query) throw(ConcordiaException);
|
||||
PGresult * execute(std::string query);
|
||||
|
||||
PGresult * execute(std::string query,
|
||||
std::vector<QueryParam*> params) throw(ConcordiaException);
|
||||
std::vector<QueryParam*> params);
|
||||
|
||||
void clearResult(PGresult * result);
|
||||
|
||||
int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
||||
int getIntValue(PGresult * result, int row, int col);
|
||||
|
||||
bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
||||
bool getBoolValue(PGresult * result, int row, int col);
|
||||
|
||||
std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
||||
std::string getStringValue(PGresult * result, int row, int col);
|
||||
|
||||
int getRowCount(PGresult * result) throw (ConcordiaException);
|
||||
int getRowCount(PGresult * result);
|
||||
|
||||
private:
|
||||
void close();
|
||||
|
@ -15,8 +15,7 @@
|
||||
#include "logger.hpp"
|
||||
|
||||
IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
|
||||
throw(ConcordiaException):
|
||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade):
|
||||
_concordiasMap(concordiasMap),
|
||||
_lemmatizerFacade(lemmatizerFacade) {
|
||||
}
|
||||
@ -24,44 +23,6 @@ IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia>
|
||||
IndexController::~IndexController() {
|
||||
}
|
||||
|
||||
|
||||
void IndexController::addSentence(
|
||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::string & sourceSentence,
|
||||
const std::string & targetSentence,
|
||||
const int tmId) {
|
||||
|
||||
try {
|
||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
||||
if (it != _concordiasMap->end()) {
|
||||
TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
|
||||
TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
|
||||
int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
|
||||
it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
|
||||
it->second->refreshSAfromRAM();
|
||||
|
||||
jsonWriter.StartObject();
|
||||
jsonWriter.String("status");
|
||||
jsonWriter.String("success");
|
||||
jsonWriter.EndObject();
|
||||
} else {
|
||||
JsonGenerator::signalError(jsonWriter, "no such tm!");
|
||||
}
|
||||
} catch (ConcordiaException & e) {
|
||||
std::stringstream errorstream;
|
||||
errorstream << "concordia error: " << e.what();
|
||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||
} catch (std::exception & e) {
|
||||
std::stringstream errorstream;
|
||||
errorstream << "general error: " << e.what();
|
||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||
} catch (...) {
|
||||
std::stringstream errorstream;
|
||||
errorstream << "unexpected error occurred";
|
||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||
}
|
||||
}
|
||||
|
||||
void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::vector<std::string> & sourceSentences,
|
||||
const std::vector<std::string> & concordiaSourceSentences,
|
||||
@ -93,76 +54,6 @@ void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> &
|
||||
}
|
||||
}
|
||||
|
||||
void IndexController::addAlignedSentences(
|
||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::vector<std::string> & rawSourceSentences,
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const int tmId) {
|
||||
try {
|
||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
||||
if (it != _concordiasMap->end()) {
|
||||
std::vector<std::string> sourceSentences;
|
||||
std::vector<std::vector<std::vector<int> > > allAlignments;
|
||||
_getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
|
||||
|
||||
std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
|
||||
std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
|
||||
|
||||
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
|
||||
for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
|
||||
it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
|
||||
}
|
||||
jsonWriter.StartObject();
|
||||
jsonWriter.String("status");
|
||||
jsonWriter.String("success");
|
||||
jsonWriter.EndObject();
|
||||
} else {
|
||||
JsonGenerator::signalError(jsonWriter, "no such tm!");
|
||||
}
|
||||
} catch (ConcordiaException & e) {
|
||||
std::stringstream errorstream;
|
||||
errorstream << "concordia error: " << e.what();
|
||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||
}
|
||||
}
|
||||
|
||||
void IndexController::addAlignedLemmatizedSentences(
|
||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::vector<std::string> & sourceSentences,
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const std::vector<std::string> & alignmentStrings,
|
||||
const int tmId) {
|
||||
try {
|
||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
||||
if (it != _concordiasMap->end()) {
|
||||
std::vector<std::string> lemmatizedSourceSentences;
|
||||
std::vector<std::vector<std::vector<int> > > allAlignments;
|
||||
_getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);
|
||||
|
||||
std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences =
|
||||
it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
|
||||
std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, false, false);
|
||||
std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, false, false);
|
||||
|
||||
std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
|
||||
_unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
|
||||
for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
|
||||
it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
|
||||
}
|
||||
jsonWriter.StartObject();
|
||||
jsonWriter.String("status");
|
||||
jsonWriter.String("success");
|
||||
jsonWriter.EndObject();
|
||||
} else {
|
||||
JsonGenerator::signalError(jsonWriter, "no such tm!");
|
||||
}
|
||||
} catch (ConcordiaException & e) {
|
||||
std::stringstream errorstream;
|
||||
errorstream << "concordia error: " << e.what();
|
||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const int tmId) {
|
||||
@ -185,53 +76,3 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void IndexController::_getSourceSentencesAndAlignments(
|
||||
std::vector<std::string> & sourceSentences,
|
||||
std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const std::vector<std::string> & rawSourceSentences) {
|
||||
|
||||
for (int i = 0; i<rawSourceSentences.size(); i++) {
|
||||
std::string rawSourceSentence = rawSourceSentences[i];
|
||||
|
||||
std::string sourceSentence = "";
|
||||
std::vector<std::vector<int> > alignments;
|
||||
|
||||
UnicodeString s(rawSourceSentence.c_str());
|
||||
boost::u32regex_iterator<const UChar*> begin(
|
||||
boost::make_u32regex_iterator(
|
||||
s,
|
||||
boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase)
|
||||
)
|
||||
);
|
||||
boost::u32regex_iterator<const UChar*> end;
|
||||
|
||||
for (; begin != end; ++begin) {
|
||||
UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
|
||||
std::string token;
|
||||
tokenUTF8.toUTF8String(token);
|
||||
|
||||
if (token != "NULL") {
|
||||
std::string numbers((*begin)[2].first, (*begin)[2].second);
|
||||
std::istringstream iss(numbers);
|
||||
std::vector<std::string> numberStrings;
|
||||
std::copy(std::istream_iterator<std::string>(iss),
|
||||
std::istream_iterator<std::string>(),
|
||||
std::back_inserter(numberStrings));
|
||||
|
||||
std::vector<int> tokenAlignments;
|
||||
for (int j=0;j<numberStrings.size();j++) {
|
||||
int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
|
||||
tokenAlignments.push_back(n);
|
||||
}
|
||||
alignments.push_back(tokenAlignments);
|
||||
sourceSentence += token + " ";
|
||||
}
|
||||
}
|
||||
|
||||
sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
|
||||
|
||||
sourceSentences.push_back(sourceSentence);
|
||||
allAlignments.push_back(alignments);
|
||||
}
|
||||
}
|
||||
|
@ -20,17 +20,11 @@ public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
|
||||
throw(ConcordiaException);
|
||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade);
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~IndexController();
|
||||
|
||||
void addSentence(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::string & sourceSentence,
|
||||
const std::string & targetSentence,
|
||||
const int tmId);
|
||||
|
||||
void addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::vector<std::string> & sourceSentences,
|
||||
const std::vector<std::string> & concordiaSourceSentences,
|
||||
@ -39,27 +33,10 @@ public:
|
||||
const std::vector<int> & sourceIds,
|
||||
const int tmId);
|
||||
|
||||
void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::vector<std::string> & rawSourceSentences,
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const int tmId);
|
||||
|
||||
void addAlignedLemmatizedSentences(
|
||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const std::vector<std::string> & sourceSentences,
|
||||
const std::vector<std::string> & targetSentences,
|
||||
const std::vector<std::string> & alignmentStrings,
|
||||
const int tmId);
|
||||
|
||||
void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||
const int tmId);
|
||||
|
||||
private:
|
||||
void _getSourceSentencesAndAlignments(
|
||||
std::vector<std::string> & sourceSentences,
|
||||
std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const std::vector<std::string> & rawSourceSentences);
|
||||
|
||||
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
|
||||
|
||||
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
|
||||
|
@ -8,14 +8,14 @@
|
||||
#include "rapidjson/error/en.h"
|
||||
#include <string>
|
||||
|
||||
JsonLemmatizer::JsonLemmatizer() throw(ConcordiaException) {
|
||||
JsonLemmatizer::JsonLemmatizer() {
|
||||
}
|
||||
|
||||
JsonLemmatizer::~JsonLemmatizer() {
|
||||
}
|
||||
|
||||
|
||||
std::string JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||
LemmatizerResult JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||
rapidjson::StringBuffer paramsJson;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> jsonWriter(paramsJson);
|
||||
jsonWriter.StartObject();
|
||||
@ -31,6 +31,7 @@ std::string JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::str
|
||||
RestClient::Response r = RestClient::post("http://concordia-preprocessor:9001/lemmatize", "application/json", paramsJson.GetString());
|
||||
rapidjson::Document d;
|
||||
d.Parse(r.body.c_str());
|
||||
std::string lemmatized = d["processed_sentences"][0]["tokens"].GetString();
|
||||
return lemmatized;
|
||||
std::string lemmatizedSentence = d["processed_sentences"][0]["tokens"].GetString();
|
||||
bool isFirstLemmatized = d["processed_sentences"][0]["isFirstLemmatized"].GetBool();
|
||||
return LemmatizerResult(lemmatizedSentence, isFirstLemmatized);
|
||||
}
|
||||
|
@ -5,18 +5,19 @@
|
||||
|
||||
#include <concordia/concordia_exception.hpp>
|
||||
|
||||
#include "lemmatizer_result.hpp"
|
||||
#include "logger.hpp"
|
||||
|
||||
class JsonLemmatizer {
|
||||
public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
explicit JsonLemmatizer() throw(ConcordiaException);
|
||||
explicit JsonLemmatizer();
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~JsonLemmatizer();
|
||||
|
||||
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
|
||||
LemmatizerResult lemmatizeSentence(std::string languageCode, std::string sentence);
|
||||
private:
|
||||
Logger _logger;
|
||||
};
|
||||
|
@ -3,7 +3,7 @@
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
|
||||
LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
|
||||
LemmatizerFacade::LemmatizerFacade() {
|
||||
_lemmatizersMap = boost::ptr_map<std::string,JsonLemmatizer>();
|
||||
|
||||
// todo: extract this to configuration, especially when new lemmatizers are added
|
||||
@ -18,7 +18,7 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
|
||||
LemmatizerFacade::~LemmatizerFacade() {
|
||||
}
|
||||
|
||||
std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||
LemmatizerResult LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||
|
||||
boost::ptr_map<std::string,JsonLemmatizer>::iterator it = _lemmatizersMap.find(languageCode);
|
||||
if (it != _lemmatizersMap.end()) {
|
||||
@ -29,7 +29,7 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
|
||||
|
||||
}
|
||||
|
||||
std::vector<std::string> LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) {
|
||||
LemmatizerResult LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) {
|
||||
std::vector<std::string> result;
|
||||
BOOST_FOREACH(std::string & sentence, sentences) {
|
||||
result.push_back(lemmatizeSentence(languageCode, sentence));
|
||||
|
@ -1,8 +1,8 @@
|
||||
#ifndef LEMMATIZER_FACADE_HDR
|
||||
#define LEMMATIZER_FACADE_HDR
|
||||
|
||||
#include "socket_lemmatizer.hpp"
|
||||
#include "json_lemmatizer.hpp"
|
||||
#include "lemmatizer_result.hpp"
|
||||
#include "tm_dao.hpp"
|
||||
|
||||
#include <string>
|
||||
@ -14,14 +14,14 @@ class LemmatizerFacade {
|
||||
public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
LemmatizerFacade() throw(ConcordiaException);
|
||||
LemmatizerFacade();
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~LemmatizerFacade();
|
||||
|
||||
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
|
||||
LemmatizerResult lemmatizeSentence(std::string languageCode, std::string sentence);
|
||||
|
||||
std::vector<std::string> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences);
|
||||
std::vector<LemmatizerResult> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences);
|
||||
|
||||
private:
|
||||
boost::ptr_map<std::string,JsonLemmatizer> _lemmatizersMap;
|
||||
|
10
concordia-server/lemmatizer_result.cpp
Normal file
10
concordia-server/lemmatizer_result.cpp
Normal file
@ -0,0 +1,10 @@
|
||||
#include "lemmatizer_result.hpp"
|
||||
|
||||
LemmatizerResutl::LemmatizerResult(const std::string & lemmatizedSentence,
|
||||
const bool isFirstLemmatized):
|
||||
_lemmatizedSentence(lemmatizedSentence),
|
||||
_isFirstLemmatized(isFirstLemmatized) {
|
||||
|
||||
}
|
||||
|
||||
LemmatizerResutl::~LemmatizerResult();
|
31
concordia-server/lemmatizer_result.hpp
Normal file
31
concordia-server/lemmatizer_result.hpp
Normal file
@ -0,0 +1,31 @@
|
||||
#ifndef LEMMATIZER_RESULT_HDR
#define LEMMATIZER_RESULT_HDR

#include <string>

/*! Value object pairing a lemmatized sentence with a boolean flag.
    The meaning of the flag is decided by whoever constructs the object;
    this class only stores and returns it.
*/
class LemmatizerResult {
public:
    /*! Constructor.
      \param lemmatizedSentence text of the sentence after lemmatization
      \param isFirstLemmatized flag later returned by isFirstLemmatized()
    */
    LemmatizerResult(const std::string & lemmatizedSentence,
                     const bool isFirstLemmatized);

    /*! Destructor.
    */
    virtual ~LemmatizerResult();

    /*! Getter for the lemmatized sentence.
      \returns reference to the stored sentence, valid as long as this object lives
    */
    const std::string & getLemmatizedSentence() const {
        return _lemmatizedSentence;
    }

    /*! Getter for the flag passed to the constructor.
      \returns the stored flag (declared bool to match the member it exposes)
    */
    bool isFirstLemmatized() const {
        return _isFirstLemmatized;
    }

private:
    std::string _lemmatizedSentence;

    bool _isFirstLemmatized;
};

#endif
|
@ -12,8 +12,7 @@
|
||||
|
||||
|
||||
SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
|
||||
throw(ConcordiaException):
|
||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade):
|
||||
_concordiasMap(concordiasMap),
|
||||
_lemmatizerFacade(lemmatizerFacade) {
|
||||
}
|
||||
@ -26,6 +25,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
|
||||
std::string & pattern,
|
||||
const int tmId) {
|
||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
||||
_tmDAO.getTm
|
||||
if (it != _concordiasMap->end()) {
|
||||
TokenizedSentence tokenizedPattern = it->second->tokenize(pattern, false, false);
|
||||
pattern = _lemmatizerFacade->lemmatizeIfNeeded(tokenizedPattern.getTokenizedSentence(), tmId);
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <concordia/interval.hpp>
|
||||
|
||||
#include "unit_dao.hpp"
|
||||
#include "tm_dao.hpp"
|
||||
#include "simple_search_result.hpp"
|
||||
#include "lemmatizer_facade.hpp"
|
||||
#include "rapidjson/writer.h"
|
||||
@ -19,8 +20,7 @@ public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
|
||||
boost::shared_ptr<LemmatizerFacade> LemmatizerFacade)
|
||||
throw(ConcordiaException);
|
||||
boost::shared_ptr<LemmatizerFacade> LemmatizerFacade);
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~SearcherController();
|
||||
@ -56,6 +56,8 @@ private:
|
||||
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
|
||||
|
||||
UnitDAO _unitDAO;
|
||||
|
||||
TmDAO _tmDAO;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,119 +0,0 @@
|
||||
#include "socket_lemmatizer.hpp"
|
||||
|
||||
#include <time.h>
|
||||
|
||||
#include "config.hpp"
|
||||
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
||||
SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) :
|
||||
_port(port) {
|
||||
}
|
||||
|
||||
SocketLemmatizer::~SocketLemmatizer() {
|
||||
}
|
||||
|
||||
/**
|
||||
Connect to a host on a certain port number
|
||||
*/
|
||||
bool SocketLemmatizer::_connect() {
|
||||
|
||||
//Create socket
|
||||
_sock = socket(AF_INET , SOCK_STREAM , 0);
|
||||
if (_sock == -1) {
|
||||
throw ConcordiaException("Could not create socket for the lemmatizer.");
|
||||
}
|
||||
|
||||
std::string address = "127.0.0.1";
|
||||
|
||||
//setup address structure
|
||||
if(inet_addr(address.c_str()) == -1) {
|
||||
struct hostent *he;
|
||||
struct in_addr **addr_list;
|
||||
|
||||
//resolve the hostname, its not an ip address
|
||||
if ( (he = gethostbyname( address.c_str() ) ) == NULL) {
|
||||
//gethostbyname failed
|
||||
throw ConcordiaException("gethostbyname: Failed to resolve hostname");
|
||||
}
|
||||
|
||||
//Cast the h_addr_list to in_addr , since h_addr_list also has the ip address in long format only
|
||||
addr_list = (struct in_addr **) he->h_addr_list;
|
||||
|
||||
for(int i = 0; addr_list[i] != NULL; i++) {
|
||||
_server.sin_addr = *addr_list[i];
|
||||
break;
|
||||
}
|
||||
} else { //plain ip address
|
||||
_server.sin_addr.s_addr = inet_addr(address.c_str());
|
||||
}
|
||||
|
||||
_server.sin_family = AF_INET;
|
||||
_server.sin_port = htons(_port);
|
||||
|
||||
//Connect to remote server
|
||||
if (connect(_sock , (struct sockaddr *) & _server , sizeof(_server)) < 0) {
|
||||
throw ConcordiaException("Connect failed. Error on address: "+address+":"+boost::lexical_cast<std::string>(_port));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Closes the lemmatizer socket, if one is open, and marks it as closed.
 *
 * The function is declared to return bool, so it must return a value on
 * every path; flowing off the end of a non-void function is undefined
 * behaviour.
 *
 * \returns true always
 */
bool SocketLemmatizer::_disconnect() {
    if (_sock != -1) {
        close(_sock);
        _sock = -1;
    }
    return true;
}
|
||||
|
||||
/**
|
||||
Send data to the connected host
|
||||
*/
|
||||
bool SocketLemmatizer::_send_data(std::string data)
|
||||
{
|
||||
//Send some data
|
||||
if(send(_sock , data.c_str() , strlen(data.c_str() ) , 0) < 0) {
|
||||
throw ConcordiaException("Send failed");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
Receive data from the connected host
|
||||
*/
|
||||
std::string SocketLemmatizer::_receive(int size=512)
|
||||
{
|
||||
char buffer[size];
|
||||
std::string reply = "";
|
||||
|
||||
//Receive a reply from the server
|
||||
bool dataAvailable = true;
|
||||
while (dataAvailable) {
|
||||
int amountReceived = recv(_sock , buffer , sizeof(buffer) , 0);
|
||||
if (amountReceived < 0) {
|
||||
throw ConcordiaException("Lemmatizer: recv failed");
|
||||
} else if (amountReceived == 0) {
|
||||
dataAvailable = false;
|
||||
} else {
|
||||
buffer[amountReceived] = '\0';
|
||||
reply += buffer;
|
||||
}
|
||||
}
|
||||
return reply;
|
||||
}
|
||||
|
||||
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||
for (int i=0;i<5;i++) {
|
||||
try {
|
||||
_connect();
|
||||
_send_data(languageCode+sentence+LEMMATIZER_DELIMITER);
|
||||
std::string reply = _receive(512);
|
||||
_disconnect();
|
||||
return reply.substr(0,reply.find(LEMMATIZER_DELIMITER));
|
||||
} catch (std::exception & e) {
|
||||
_logger.logString("Problem with lemmatization of the sentence", sentence);
|
||||
_logger.log("Waiting 2 seconds and retrying...");
|
||||
sleep(2);
|
||||
}
|
||||
}
|
||||
|
||||
throw ConcordiaException("Can not lemmatize sentence: "+sentence);
|
||||
}
|
@ -1,42 +0,0 @@
|
||||
#ifndef SOCKET_LEMMATIZER_HDR
|
||||
#define SOCKET_LEMMATIZER_HDR
|
||||
|
||||
#include <string>
|
||||
#include <sys/socket.h> //socket
|
||||
#include <arpa/inet.h> //inet_addr
|
||||
#include <netdb.h> //hostent
|
||||
#include <unistd.h>
|
||||
|
||||
#include <concordia/concordia_exception.hpp>
|
||||
|
||||
#include "logger.hpp"
|
||||
|
||||
class SocketLemmatizer {
|
||||
public:
|
||||
/*! Constructor.
|
||||
*/
|
||||
explicit SocketLemmatizer(int port) throw(ConcordiaException);
|
||||
/*! Destructor.
|
||||
*/
|
||||
virtual ~SocketLemmatizer();
|
||||
|
||||
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
|
||||
private:
|
||||
bool _connect();
|
||||
|
||||
bool _disconnect();
|
||||
|
||||
bool _send_data(std::string data);
|
||||
|
||||
std::string _receive(int size);
|
||||
|
||||
int _port;
|
||||
|
||||
int _sock;
|
||||
|
||||
struct sockaddr_in _server;
|
||||
|
||||
Logger _logger;
|
||||
};
|
||||
|
||||
#endif
|
@ -88,7 +88,7 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::stri
|
||||
|
||||
}
|
||||
|
||||
Tm TmDAO::getTmInfo(int tmId) {
|
||||
Tm TmDAO::getTm(int tmId) {
|
||||
DBconnection connection;
|
||||
connection.startTransaction();
|
||||
std::string query = "select tm.id, tm.name, tm.lemmatized, tm.paired_tm_id, source_language.code, target_language.code from tm inner join language as source_language on source_language.id = tm.source_lang_id inner join language as target_language on target_language.id = tm.target_lang_id where tm.id = $1::integer;";
|
||||
|
@ -28,7 +28,7 @@ public:
|
||||
|
||||
std::vector<Tm> getTms();
|
||||
|
||||
std::pair<bool, std::string> getTmInfo(int tmId);
|
||||
std::pair<bool, std::string> getTm(int tmId);
|
||||
|
||||
private:
|
||||
|
||||
|
@ -59,7 +59,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
|
||||
const std::vector<TokenizedSentence> & sourceSentences,
|
||||
const std::vector<TokenizedSentence> & targetSentences,
|
||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const int tmId) throw (ConcordiaException) {
|
||||
const int tmId) {
|
||||
|
||||
DBconnection connection;
|
||||
std::vector<SUFFIX_MARKER_TYPE> newIds;
|
||||
@ -78,7 +78,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
|
||||
const std::vector<TokenizedSentence> & targetSentences,
|
||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const std::vector<int> & sourceIds,
|
||||
const int tmId) throw (ConcordiaException) {
|
||||
const int tmId) {
|
||||
|
||||
DBconnection connection;
|
||||
std::vector<SUFFIX_MARKER_TYPE> newIds;
|
||||
@ -270,7 +270,7 @@ int UnitDAO::_addAlignedUnit (
|
||||
const TokenizedSentence & sourceSentence,
|
||||
const TokenizedSentence & targetSentence,
|
||||
const std::vector<std::vector<int> > & alignments,
|
||||
const int tmId) throw(ConcordiaException) {
|
||||
const int tmId) {
|
||||
|
||||
if (sourceSentence.getTokens().size() != alignments.size()) {
|
||||
// Here we check if the source sentence, taken from src.tok,
|
||||
@ -305,7 +305,7 @@ int UnitDAO::_addAlignedUnit (
|
||||
const TokenizedSentence & targetSentence,
|
||||
const std::vector<std::vector<int> > & alignments,
|
||||
const int sourceId,
|
||||
const int tmId) throw(ConcordiaException) {
|
||||
const int tmId) {
|
||||
|
||||
if (sourceSentence.getTokens().size() != alignments.size()) {
|
||||
// Here we check if the source sentence, taken from src.tok,
|
||||
|
@ -41,14 +41,14 @@ public:
|
||||
const std::vector<TokenizedSentence> & sourceSentences,
|
||||
const std::vector<TokenizedSentence> & targetSentences,
|
||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const int tmId) throw (ConcordiaException);
|
||||
const int tmId);
|
||||
|
||||
std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
|
||||
const std::vector<TokenizedSentence> & sourceSentences,
|
||||
const std::vector<TokenizedSentence> & targetSentences,
|
||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||
const std::vector<int> & sourceIds,
|
||||
const int tmId) throw (ConcordiaException);
|
||||
const int tmId);
|
||||
|
||||
SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment);
|
||||
|
||||
@ -83,7 +83,7 @@ private:
|
||||
const TokenizedSentence & sourceSentence,
|
||||
const TokenizedSentence & targetSentence,
|
||||
const std::vector<std::vector<int> > & alignments,
|
||||
const int tmId) throw(ConcordiaException);
|
||||
const int tmId);
|
||||
|
||||
int _addAlignedUnit(
|
||||
DBconnection & connection,
|
||||
@ -91,7 +91,7 @@ private:
|
||||
const TokenizedSentence & targetSentence,
|
||||
const std::vector<std::vector<int> > & alignments,
|
||||
const int sourceId,
|
||||
const int tmId) throw(ConcordiaException);
|
||||
const int tmId);
|
||||
|
||||
std::vector<int> _getArray(std::string arrayString);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user