refactoring, lemmatizers

This commit is contained in:
rjawor 2019-07-23 21:57:40 +02:00
parent 84d8102f58
commit 2690b15958
21 changed files with 105 additions and 461 deletions

View File

@ -20,8 +20,7 @@
#include <boost/ptr_container/ptr_map.hpp> #include <boost/ptr_container/ptr_map.hpp>
#include <boost/filesystem/path.hpp> #include <boost/filesystem/path.hpp>
ConcordiaServer::ConcordiaServer(const std::string & configFilePath) ConcordiaServer::ConcordiaServer(const std::string & configFilePath):
throw(ConcordiaException) :
_configFilePath(configFilePath) { _configFilePath(configFilePath) {
std::vector<int> tmIds = _tmDAO.getTmIds(); std::vector<int> tmIds = _tmDAO.getTmIds();
@ -57,12 +56,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
JsonGenerator::signalError(jsonWriter, errorstream.str()); JsonGenerator::signalError(jsonWriter, errorstream.str());
} else { // json parsed } else { // json parsed
std::string operation = _getStringParameter(d, OPERATION_PARAM); std::string operation = _getStringParameter(d, OPERATION_PARAM);
if (operation == ADD_SENTENCE_OP) { if (operation == ADD_SENTENCES_OP) {
std::string sourceSentence = _getStringParameter(d, SOURCE_SENTENCE_PARAM);
std::string targetSentence = _getStringParameter(d, TARGET_SENTENCE_PARAM);
int tmId = _getIntParameter(d, TM_ID_PARAM);
_indexController->addSentence(jsonWriter, sourceSentence, targetSentence, tmId);
} else if (operation == ADD_SENTENCES_OP) {
std::vector<std::string> sourceSentences; std::vector<std::string> sourceSentences;
std::vector<std::string> concordiaSourceSentences; std::vector<std::string> concordiaSourceSentences;
std::vector<std::string> targetSentences; std::vector<std::string> targetSentences;
@ -86,50 +80,6 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
} }
} }
_indexController->addSentences(jsonWriter, sourceSentences, concordiaSourceSentences, targetSentences, alignments, sourceIds, tmId); _indexController->addSentences(jsonWriter, sourceSentences, concordiaSourceSentences, targetSentences, alignments, sourceIds, tmId);
} else if (operation == ADD_ALIGNED_SENTENCES_OP) {
std::vector<std::string> sourceSentences;
std::vector<std::string> targetSentences;
int tmId = d[TM_ID_PARAM].GetInt();
// loading data from json
const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
/*
Logger::log("addAlignedSentences");
Logger::logInt("sentences to add", sentencesArray.Size());
Logger::logInt("tm id", tmId);
*/
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
if (sentencesArray[i].Size() != 2) {
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 2 elements");
break;
} else {
sourceSentences.push_back(sentencesArray[i][0].GetString());
targetSentences.push_back(sentencesArray[i][1].GetString());
}
}
_indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
} else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
std::vector<std::string> sourceSentences;
std::vector<std::string> targetSentences;
std::vector<std::string> alignmentStrings;
int tmId = d[TM_ID_PARAM].GetInt();
// loading data from json
const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
/*
Logger::log("addAlignedLemmatizedSentences");
Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
Logger::logInt("tm id", tmId);
*/
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
if (sentencesArray[i].Size() != 3) {
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
break;
} else {
sourceSentences.push_back(sentencesArray[i][0].GetString());
targetSentences.push_back(sentencesArray[i][1].GetString());
alignmentStrings.push_back(sentencesArray[i][2].GetString());
}
}
_indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
} else if (operation == GET_TMS_INFO_OP) { } else if (operation == GET_TMS_INFO_OP) {
std::vector<Tm> tms = _tmDAO.getTms(); std::vector<Tm> tms = _tmDAO.getTms();
@ -348,8 +298,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
} }
std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const char * name) std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const char * name) {
throw (ConcordiaException) {
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name); rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
if (itr != d.MemberEnd()) { if (itr != d.MemberEnd()) {
std::string value = itr->value.GetString(); std::string value = itr->value.GetString();
@ -359,8 +308,7 @@ std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const
} }
} }
int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name) int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name) {
throw (ConcordiaException) {
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name); rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
if (itr != d.MemberEnd()) { if (itr != d.MemberEnd()) {
int value = itr->value.GetInt(); int value = itr->value.GetInt();
@ -370,8 +318,7 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name
} }
} }
int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name) int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name) {
throw (ConcordiaException) {
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name); rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
if (itr != d.MemberEnd()) { if (itr != d.MemberEnd()) {
bool value = itr->value.GetBool(); bool value = itr->value.GetBool();
@ -381,8 +328,7 @@ int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * nam
} }
} }
std::vector<std::vector<int> > ConcordiaServer::_getInt2DArray(const rapidjson::Value & v) std::vector<std::vector<int> > ConcordiaServer::_getInt2DArray(const rapidjson::Value & v) {
throw (ConcordiaException) {
std::vector<std::vector<int> > result; std::vector<std::vector<int> > result;
for (rapidjson::SizeType i = 0; i < v.Size(); i++) { for (rapidjson::SizeType i = 0; i < v.Size(); i++) {
std::vector<int> innerArray; std::vector<int> innerArray;

View File

@ -26,8 +26,7 @@ public:
\param configFilePath path to the Concordia configuration file \param configFilePath path to the Concordia configuration file
\throws ConcordiaException \throws ConcordiaException
*/ */
explicit ConcordiaServer(const std::string & configFilePath) explicit ConcordiaServer(const std::string & configFilePath);
throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
virtual ~ConcordiaServer(); virtual ~ConcordiaServer();
@ -37,13 +36,13 @@ public:
private: private:
void _logPhrase(std::string phraseString); void _logPhrase(std::string phraseString);
std::string _getStringParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException); std::string _getStringParameter(rapidjson::Document & d, const char * name);
int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException); int _getIntParameter(rapidjson::Document & d, const char * name);
int _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException); int _getBoolParameter(rapidjson::Document & d, const char * name);
std::vector<std::vector<int> > _getInt2DArray(const rapidjson::Value & v) throw (ConcordiaException); std::vector<std::vector<int> > _getInt2DArray(const rapidjson::Value & v);
void _addTm(int tmId); void _addTm(int tmId);

View File

@ -32,12 +32,9 @@
#define LINK_PARAM "link" #define LINK_PARAM "link"
#define SOURCES_PARAM "sources" #define SOURCES_PARAM "sources"
#define ADD_SENTENCE_OP "addSentence"
#define ADD_SENTENCES_OP "addSentences" #define ADD_SENTENCES_OP "addSentences"
#define LEMMATIZE_OP "lemmatize" #define LEMMATIZE_OP "lemmatize"
#define LEMMATIZE_ALL_OP "lemmatizeAll" #define LEMMATIZE_ALL_OP "lemmatizeAll"
#define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
#define ADD_REQUEST_OP "addRequest" #define ADD_REQUEST_OP "addRequest"
#define GET_TMS_INFO_OP "getTmsInfo" #define GET_TMS_INFO_OP "getTmsInfo"
#define GET_REQUESTS_INFO_OP "getRequestsInfo" #define GET_REQUESTS_INFO_OP "getRequestsInfo"

View File

@ -7,8 +7,8 @@
#include "config.hpp" #include "config.hpp"
#include "logger.hpp" #include "logger.hpp"
DBconnection::DBconnection() throw(ConcordiaException) { DBconnection::DBconnection() {
std::string connectionInfo = "dbname="DB_NAME" user="DB_USER" password="DB_PASSWORD" host="DB_HOST" port="DB_PORT; std::string connectionInfo = "dbname=" DB_NAME " user=" DB_USER " password=" DB_PASSWORD " host=" DB_HOST " port=" DB_PORT;
_connection = PQconnectdb(connectionInfo.c_str()); _connection = PQconnectdb(connectionInfo.c_str());
if (PQstatus(_connection) != CONNECTION_OK) { if (PQstatus(_connection) != CONNECTION_OK) {
close(); close();
@ -31,7 +31,7 @@ void DBconnection::close() {
} }
} }
void DBconnection::startTransaction() throw(ConcordiaException) { void DBconnection::startTransaction() {
if (_connection != NULL) { if (_connection != NULL) {
PGresult * result = PQexec(_connection, "BEGIN"); PGresult * result = PQexec(_connection, "BEGIN");
if (PQresultStatus(result) != PGRES_COMMAND_OK) { if (PQresultStatus(result) != PGRES_COMMAND_OK) {
@ -44,7 +44,7 @@ void DBconnection::startTransaction() throw(ConcordiaException) {
} }
} }
void DBconnection::endTransaction() throw(ConcordiaException) { void DBconnection::endTransaction() {
if (_connection != NULL) { if (_connection != NULL) {
PGresult * result = PQexec(_connection, "END"); PGresult * result = PQexec(_connection, "END");
if (PQresultStatus(result) != PGRES_COMMAND_OK) { if (PQresultStatus(result) != PGRES_COMMAND_OK) {
@ -57,7 +57,7 @@ void DBconnection::endTransaction() throw(ConcordiaException) {
} }
} }
PGresult * DBconnection::execute(std::string query) throw(ConcordiaException) { PGresult * DBconnection::execute(std::string query) {
if (_connection != NULL) { if (_connection != NULL) {
PGresult * result = PQexec(_connection, query.c_str()); PGresult * result = PQexec(_connection, query.c_str());
if (PQresultStatus(result) != PGRES_COMMAND_OK && if (PQresultStatus(result) != PGRES_COMMAND_OK &&
@ -78,7 +78,7 @@ PGresult * DBconnection::execute(std::string query) throw(ConcordiaException) {
} }
PGresult * DBconnection::execute(std::string query, PGresult * DBconnection::execute(std::string query,
std::vector<QueryParam*> params) throw(ConcordiaException) { std::vector<QueryParam*> params) {
if (_connection != NULL) { if (_connection != NULL) {
const char * paramValues[params.size()]; const char * paramValues[params.size()];
int paramLengths[params.size()]; int paramLengths[params.size()];
@ -122,7 +122,7 @@ void DBconnection::clearResult(PGresult * result) {
PQclear(result); PQclear(result);
} }
int DBconnection::getIntValue(PGresult * result, int row, int col) throw (ConcordiaException) { int DBconnection::getIntValue(PGresult * result, int row, int col) {
try { try {
char * valueStr = PQgetvalue(result,row,col); char * valueStr = PQgetvalue(result,row,col);
return strtol(valueStr, NULL, 10); return strtol(valueStr, NULL, 10);
@ -133,7 +133,7 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (Concor
} }
} }
bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) { bool DBconnection::getBoolValue(PGresult * result, int row, int col) {
try { try {
char * valueStr = PQgetvalue(result,row,col); char * valueStr = PQgetvalue(result,row,col);
return std::string(valueStr) == "t"; return std::string(valueStr) == "t";
@ -144,7 +144,7 @@ bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (Conc
} }
} }
std::string DBconnection::getStringValue(PGresult * result, int row, int col) throw (ConcordiaException) { std::string DBconnection::getStringValue(PGresult * result, int row, int col) {
try { try {
char * valueStr = PQgetvalue(result,row,col); char * valueStr = PQgetvalue(result,row,col);
return std::string(valueStr); return std::string(valueStr);
@ -155,7 +155,7 @@ std::string DBconnection::getStringValue(PGresult * result, int row, int col) t
} }
} }
int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) { int DBconnection::getRowCount(PGresult * result) {
try { try {
return PQntuples(result); return PQntuples(result);
} catch (std::exception & e) { } catch (std::exception & e) {

View File

@ -13,29 +13,29 @@ class DBconnection {
public: public:
/*! Constructor. /*! Constructor.
*/ */
DBconnection() throw(ConcordiaException); DBconnection();
/*! Destructor. /*! Destructor.
*/ */
virtual ~DBconnection(); virtual ~DBconnection();
void startTransaction() throw(ConcordiaException); void startTransaction();
void endTransaction() throw(ConcordiaException); void endTransaction();
PGresult * execute(std::string query) throw(ConcordiaException); PGresult * execute(std::string query);
PGresult * execute(std::string query, PGresult * execute(std::string query,
std::vector<QueryParam*> params) throw(ConcordiaException); std::vector<QueryParam*> params);
void clearResult(PGresult * result); void clearResult(PGresult * result);
int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException); int getIntValue(PGresult * result, int row, int col);
bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException); bool getBoolValue(PGresult * result, int row, int col);
std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException); std::string getStringValue(PGresult * result, int row, int col);
int getRowCount(PGresult * result) throw (ConcordiaException); int getRowCount(PGresult * result);
private: private:
void close(); void close();

View File

@ -15,8 +15,7 @@
#include "logger.hpp" #include "logger.hpp"
IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap, IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade) boost::shared_ptr<LemmatizerFacade> lemmatizerFacade):
throw(ConcordiaException):
_concordiasMap(concordiasMap), _concordiasMap(concordiasMap),
_lemmatizerFacade(lemmatizerFacade) { _lemmatizerFacade(lemmatizerFacade) {
} }
@ -24,44 +23,6 @@ IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia>
IndexController::~IndexController() { IndexController::~IndexController() {
} }
/*! Adds one source/target sentence pair to the translation memory tmId.
  The raw source sentence is tokenized and stored through the unit DAO,
  while its lemmatized form (as returned by lemmatizeIfNeeded) is what
  gets added to the Concordia index. The index is refreshed from RAM
  afterwards. The outcome is reported through jsonWriter: either
  {"status":"success"} or an error object produced by JsonGenerator.
  \param jsonWriter writer receiving the JSON response
  \param sourceSentence raw source sentence
  \param targetSentence target sentence stored alongside the source
  \param tmId id of the translation memory to add to
*/
void IndexController::addSentence(
        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
        const std::string & sourceSentence,
        const std::string & targetSentence,
        const int tmId) {
    try {
        boost::ptr_map<int,Concordia>::iterator tmEntry = _concordiasMap->find(tmId);
        if (tmEntry == _concordiasMap->end()) {
            // unknown translation memory - nothing to index
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            return;
        }

        // the lemmatized form is indexed, the raw form is persisted
        TokenizedSentence lemmatizedTokens =
            tmEntry->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
        TokenizedSentence rawTokens = tmEntry->second->tokenize(sourceSentence);

        int newSentenceId = _unitDAO.addSentence(rawTokens, targetSentence, tmId);
        tmEntry->second->addTokenizedExample(lemmatizedTokens, newSentenceId);
        tmEntry->second->refreshSAfromRAM();

        jsonWriter.StartObject();
        jsonWriter.String("status");
        jsonWriter.String("success");
        jsonWriter.EndObject();
    } catch (ConcordiaException & e) {
        JsonGenerator::signalError(jsonWriter, std::string("concordia error: ") + e.what());
    } catch (std::exception & e) {
        JsonGenerator::signalError(jsonWriter, std::string("general error: ") + e.what());
    } catch (...) {
        JsonGenerator::signalError(jsonWriter, std::string("unexpected error occurred"));
    }
}
void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences, const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & concordiaSourceSentences, const std::vector<std::string> & concordiaSourceSentences,
@ -93,76 +54,6 @@ void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> &
} }
} }
/*! Adds a batch of aligned sentence pairs to the translation memory tmId.
  Each raw source sentence is first split into a plain sentence and its
  alignments by _getSourceSentencesAndAlignments. Sources and targets are
  then tokenized, stored through the unit DAO and every tokenized source
  sentence is added to the Concordia index under the id the DAO returned.
  Only ConcordiaException is translated into a JSON error here.
  \param jsonWriter writer receiving the JSON response
  \param rawSourceSentences source sentences with embedded alignment data
  \param targetSentences target sentences, parallel to rawSourceSentences
  \param tmId id of the translation memory to add to
*/
void IndexController::addAlignedSentences(
        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
        const std::vector<std::string> & rawSourceSentences,
        const std::vector<std::string> & targetSentences,
        const int tmId) {
    try {
        boost::ptr_map<int,Concordia>::iterator tmEntry = _concordiasMap->find(tmId);
        if (tmEntry == _concordiasMap->end()) {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            return;
        }

        std::vector<std::string> plainSources;
        std::vector<std::vector<std::vector<int> > > alignmentsPerSentence;
        _getSourceSentencesAndAlignments(plainSources, alignmentsPerSentence, rawSourceSentences);

        std::vector<TokenizedSentence> sourceTokens =
            tmEntry->second->tokenizeAll(plainSources, true, true);
        std::vector<TokenizedSentence> targetTokens =
            tmEntry->second->tokenizeAll(targetSentences, true, false);

        std::vector<SUFFIX_MARKER_TYPE> newIds =
            _unitDAO.addAlignedSentences(sourceTokens, targetTokens, alignmentsPerSentence, tmId);

        // index every source sentence under the id assigned by the DAO
        for (std::size_t pos = 0; pos < sourceTokens.size(); ++pos) {
            tmEntry->second->addTokenizedExample(sourceTokens.at(pos), newIds.at(pos));
        }

        jsonWriter.StartObject();
        jsonWriter.String("status");
        jsonWriter.String("success");
        jsonWriter.EndObject();
    } catch (ConcordiaException & e) {
        JsonGenerator::signalError(jsonWriter, std::string("concordia error: ") + e.what());
    }
}
/*! Adds a batch of aligned, pre-lemmatized sentence pairs to the
  translation memory tmId.
  The lemmatized source sentences and the alignments are extracted from
  alignmentStrings by _getSourceSentencesAndAlignments. The original
  source and target sentences are tokenized and stored through the unit
  DAO, while the lemmatized source sentences are what gets added to the
  Concordia index, under the ids returned by the DAO.
  Only ConcordiaException is translated into a JSON error here.
  \param jsonWriter writer receiving the JSON response
  \param sourceSentences original source sentences
  \param targetSentences target sentences
  \param alignmentStrings lemmatized sources with embedded alignment data
  \param tmId id of the translation memory to add to
*/
void IndexController::addAlignedLemmatizedSentences(
        rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
        const std::vector<std::string> & sourceSentences,
        const std::vector<std::string> & targetSentences,
        const std::vector<std::string> & alignmentStrings,
        const int tmId) {
    try {
        boost::ptr_map<int,Concordia>::iterator tmEntry = _concordiasMap->find(tmId);
        if (tmEntry == _concordiasMap->end()) {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            return;
        }

        std::vector<std::string> lemmatizedSources;
        std::vector<std::vector<std::vector<int> > > alignmentsPerSentence;
        _getSourceSentencesAndAlignments(lemmatizedSources, alignmentsPerSentence, alignmentStrings);

        std::vector<TokenizedSentence> lemmatizedTokens =
            tmEntry->second->tokenizeAll(lemmatizedSources, true, true);
        std::vector<TokenizedSentence> sourceTokens =
            tmEntry->second->tokenizeAll(sourceSentences, false, false);
        std::vector<TokenizedSentence> targetTokens =
            tmEntry->second->tokenizeAll(targetSentences, false, false);

        std::vector<SUFFIX_MARKER_TYPE> newIds =
            _unitDAO.addAlignedSentences(sourceTokens, targetTokens, alignmentsPerSentence, tmId);

        // the lemmatized form is indexed under the id of the stored sentence
        for (std::size_t pos = 0; pos < lemmatizedTokens.size(); ++pos) {
            tmEntry->second->addTokenizedExample(lemmatizedTokens.at(pos), newIds.at(pos));
        }

        jsonWriter.StartObject();
        jsonWriter.String("status");
        jsonWriter.String("success");
        jsonWriter.EndObject();
    } catch (ConcordiaException & e) {
        JsonGenerator::signalError(jsonWriter, std::string("concordia error: ") + e.what());
    }
}
void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const int tmId) { const int tmId) {
@ -185,53 +76,3 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
} }
} }
/*! Splits raw "token ({ n n ... })" sentences into plain sentences and
  per-token alignment lists.
  For every entry of rawSourceSentences one sentence is appended to
  sourceSentences and one alignment table is appended to allAlignments,
  so the two outputs stay parallel to the input.
  NOTE(review): the pattern looks like word-aligner output
  (e.g. "word ({ 1 2 })") - confirm against the data producer.
  \param sourceSentences output: tokens joined with single spaces
  \param allAlignments output: for each sentence, for each kept token,
         the 0-based positions it is aligned to
  \param rawSourceSentences input sentences with embedded alignments
*/
void IndexController::_getSourceSentencesAndAlignments(
        std::vector<std::string> & sourceSentences,
        std::vector<std::vector<std::vector<int> > > & allAlignments,
        const std::vector<std::string> & rawSourceSentences) {
    for (int i = 0; i<rawSourceSentences.size(); i++) {
        std::string rawSourceSentence = rawSourceSentences[i];
        std::string sourceSentence = "";
        std::vector<std::vector<int> > alignments;
        UnicodeString s(rawSourceSentence.c_str());
        // group 1: the token, group 2: the space-separated list of numbers
        boost::u32regex_iterator<const UChar*> begin(
            boost::make_u32regex_iterator(
                s,
                boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase)
            )
        );
        // default-constructed iterator marks the end of the matches
        boost::u32regex_iterator<const UChar*> end;
        for (; begin != end; ++begin) {
            UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
            std::string token;
            tokenUTF8.toUTF8String(token);
            // tokens spelled "NULL" are dropped together with their alignments
            if (token != "NULL") {
                std::string numbers((*begin)[2].first, (*begin)[2].second);
                // split the number list on whitespace
                std::istringstream iss(numbers);
                std::vector<std::string> numberStrings;
                std::copy(std::istream_iterator<std::string>(iss),
                          std::istream_iterator<std::string>(),
                          std::back_inserter(numberStrings));
                std::vector<int> tokenAlignments;
                for (int j=0;j<numberStrings.size();j++) {
                    int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
                    tokenAlignments.push_back(n);
                }
                alignments.push_back(tokenAlignments);
                sourceSentence += token + " ";
            }
        }
        // strip the trailing space appended after the last token
        sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
        sourceSentences.push_back(sourceSentence);
        allAlignments.push_back(alignments);
    }
}

View File

@ -20,17 +20,11 @@ public:
/*! Constructor. /*! Constructor.
*/ */
explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap, explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade) boost::shared_ptr<LemmatizerFacade> lemmatizerFacade);
throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
virtual ~IndexController(); virtual ~IndexController();
void addSentence(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::string & sourceSentence,
const std::string & targetSentence,
const int tmId);
void addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, void addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences, const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & concordiaSourceSentences, const std::vector<std::string> & concordiaSourceSentences,
@ -39,27 +33,10 @@ public:
const std::vector<int> & sourceIds, const std::vector<int> & sourceIds,
const int tmId); const int tmId);
void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & rawSourceSentences,
const std::vector<std::string> & targetSentences,
const int tmId);
void addAlignedLemmatizedSentences(
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences,
const std::vector<std::string> & alignmentStrings,
const int tmId);
void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter, void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const int tmId); const int tmId);
private: private:
void _getSourceSentencesAndAlignments(
std::vector<std::string> & sourceSentences,
std::vector<std::vector<std::vector<int> > > & allAlignments,
const std::vector<std::string> & rawSourceSentences);
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap; boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade; boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;

View File

@ -8,14 +8,14 @@
#include "rapidjson/error/en.h" #include "rapidjson/error/en.h"
#include <string> #include <string>
JsonLemmatizer::JsonLemmatizer() throw(ConcordiaException) { JsonLemmatizer::JsonLemmatizer() {
} }
JsonLemmatizer::~JsonLemmatizer() { JsonLemmatizer::~JsonLemmatizer() {
} }
std::string JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) { LemmatizerResult JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
rapidjson::StringBuffer paramsJson; rapidjson::StringBuffer paramsJson;
rapidjson::Writer<rapidjson::StringBuffer> jsonWriter(paramsJson); rapidjson::Writer<rapidjson::StringBuffer> jsonWriter(paramsJson);
jsonWriter.StartObject(); jsonWriter.StartObject();
@ -31,6 +31,7 @@ std::string JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::str
RestClient::Response r = RestClient::post("http://concordia-preprocessor:9001/lemmatize", "application/json", paramsJson.GetString()); RestClient::Response r = RestClient::post("http://concordia-preprocessor:9001/lemmatize", "application/json", paramsJson.GetString());
rapidjson::Document d; rapidjson::Document d;
d.Parse(r.body.c_str()); d.Parse(r.body.c_str());
std::string lemmatized = d["processed_sentences"][0]["tokens"].GetString(); std::string lemmatizedSentence = d["processed_sentences"][0]["tokens"].GetString();
return lemmatized; bool isFirstLemmatized = d["processed_sentences"][0]["isFirstLemmatized"].GetBool();
return LemmatizerResult(lemmatizedSentence, isFirstLemmatized);
} }

View File

@ -5,18 +5,19 @@
#include <concordia/concordia_exception.hpp> #include <concordia/concordia_exception.hpp>
#include "lemmatizer_result.hpp"
#include "logger.hpp" #include "logger.hpp"
class JsonLemmatizer { class JsonLemmatizer {
public: public:
/*! Constructor. /*! Constructor.
*/ */
explicit JsonLemmatizer() throw(ConcordiaException); explicit JsonLemmatizer();
/*! Destructor. /*! Destructor.
*/ */
virtual ~JsonLemmatizer(); virtual ~JsonLemmatizer();
std::string lemmatizeSentence(std::string languageCode, std::string sentence); LemmatizerResult lemmatizeSentence(std::string languageCode, std::string sentence);
private: private:
Logger _logger; Logger _logger;
}; };

View File

@ -3,7 +3,7 @@
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) { LemmatizerFacade::LemmatizerFacade() {
_lemmatizersMap = boost::ptr_map<std::string,JsonLemmatizer>(); _lemmatizersMap = boost::ptr_map<std::string,JsonLemmatizer>();
// todo: extract this to configuration, especially when new lemmatizers ConstMemberIterator // todo: extract this to configuration, especially when new lemmatizers ConstMemberIterator
@ -18,7 +18,7 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
LemmatizerFacade::~LemmatizerFacade() { LemmatizerFacade::~LemmatizerFacade() {
} }
std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) { LemmatizerResult LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) {
boost::ptr_map<std::string,JsonLemmatizer>::iterator it = _lemmatizersMap.find(languageCode); boost::ptr_map<std::string,JsonLemmatizer>::iterator it = _lemmatizersMap.find(languageCode);
if (it != _lemmatizersMap.end()) { if (it != _lemmatizersMap.end()) {
@ -29,7 +29,7 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
} }
std::vector<std::string> LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) { LemmatizerResult LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) {
std::vector<std::string> result; std::vector<std::string> result;
BOOST_FOREACH(std::string & sentence, sentences) { BOOST_FOREACH(std::string & sentence, sentences) {
result.push_back(lemmatizeSentence(languageCode, sentence)); result.push_back(lemmatizeSentence(languageCode, sentence));

View File

@ -1,8 +1,8 @@
#ifndef LEMMATIZER_FACADE_HDR #ifndef LEMMATIZER_FACADE_HDR
#define LEMMATIZER_FACADE_HDR #define LEMMATIZER_FACADE_HDR
#include "socket_lemmatizer.hpp"
#include "json_lemmatizer.hpp" #include "json_lemmatizer.hpp"
#include "lemmatizer_result.hpp"
#include "tm_dao.hpp" #include "tm_dao.hpp"
#include <string> #include <string>
@ -14,14 +14,14 @@ class LemmatizerFacade {
public: public:
/*! Constructor. /*! Constructor.
*/ */
LemmatizerFacade() throw(ConcordiaException); LemmatizerFacade();
/*! Destructor. /*! Destructor.
*/ */
virtual ~LemmatizerFacade(); virtual ~LemmatizerFacade();
std::string lemmatizeSentence(std::string languageCode, std::string sentence); LemmatizerResult lemmatizeSentence(std::string languageCode, std::string sentence);
std::vector<std::string> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences); std::vector<LemmatizerResult> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences);
private: private:
boost::ptr_map<std::string,JsonLemmatizer> _lemmatizersMap; boost::ptr_map<std::string,JsonLemmatizer> _lemmatizersMap;

View File

@ -0,0 +1,10 @@
#include "lemmatizer_result.hpp"
LemmatizerResutl::LemmatizerResult(const std::string & lemmatizedSentence,
const bool isFirstLemmatized):
_lemmatizedSentence(lemmatizedSentence),
_isFirstLemmatized(isFirstLemmatized) {
}
LemmatizerResutl::~LemmatizerResult();

View File

@ -0,0 +1,31 @@
#ifndef LEMMATIZER_RESULT_HDR
#define LEMMATIZER_RESULT_HDR

#include <string>

/*! Value object returned by the lemmatizers: the lemmatized sentence
  together with a flag telling whether its first token was lemmatized.
*/
class LemmatizerResult {
public:
    /*! Constructor.
      \param lemmatizedSentence sentence returned by the lemmatizer
      \param isFirstLemmatized value of the lemmatizer's isFirstLemmatized flag
    */
    LemmatizerResult(const std::string & lemmatizedSentence,
                     const bool isFirstLemmatized);

    /*! Destructor.
    */
    virtual ~LemmatizerResult();

    /*! Getter for the lemmatized sentence.
      \returns reference to the stored sentence, valid as long as this object
    */
    const std::string & getLemmatizedSentence() const {
        return _lemmatizedSentence;
    }

    /*! Getter for the isFirstLemmatized flag.
      \returns the stored flag (bool converts implicitly for callers
               that still assign the result to an int)
    */
    bool isFirstLemmatized() const {
        return _isFirstLemmatized;
    }

private:
    std::string _lemmatizedSentence;

    bool _isFirstLemmatized;
};

#endif

View File

@ -12,8 +12,7 @@
SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap, SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade) boost::shared_ptr<LemmatizerFacade> lemmatizerFacade):
throw(ConcordiaException):
_concordiasMap(concordiasMap), _concordiasMap(concordiasMap),
_lemmatizerFacade(lemmatizerFacade) { _lemmatizerFacade(lemmatizerFacade) {
} }
@ -26,6 +25,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
std::string & pattern, std::string & pattern,
const int tmId) { const int tmId) {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId); boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
_tmDAO.getTm
if (it != _concordiasMap->end()) { if (it != _concordiasMap->end()) {
TokenizedSentence tokenizedPattern = it->second->tokenize(pattern, false, false); TokenizedSentence tokenizedPattern = it->second->tokenize(pattern, false, false);
pattern = _lemmatizerFacade->lemmatizeIfNeeded(tokenizedPattern.getTokenizedSentence(), tmId); pattern = _lemmatizerFacade->lemmatizeIfNeeded(tokenizedPattern.getTokenizedSentence(), tmId);

View File

@ -9,6 +9,7 @@
#include <concordia/interval.hpp> #include <concordia/interval.hpp>
#include "unit_dao.hpp" #include "unit_dao.hpp"
#include "tm_dao.hpp"
#include "simple_search_result.hpp" #include "simple_search_result.hpp"
#include "lemmatizer_facade.hpp" #include "lemmatizer_facade.hpp"
#include "rapidjson/writer.h" #include "rapidjson/writer.h"
@ -19,8 +20,7 @@ public:
/*! Constructor. /*! Constructor.
*/ */
explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap, explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
boost::shared_ptr<LemmatizerFacade> LemmatizerFacade) boost::shared_ptr<LemmatizerFacade> LemmatizerFacade);
throw(ConcordiaException);
/*! Destructor. /*! Destructor.
*/ */
virtual ~SearcherController(); virtual ~SearcherController();
@ -56,6 +56,8 @@ private:
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade; boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
UnitDAO _unitDAO; UnitDAO _unitDAO;
TmDAO _tmDAO;
}; };
#endif #endif

View File

@ -1,119 +0,0 @@
#include "socket_lemmatizer.hpp"
#include <time.h>
#include "config.hpp"
#include <boost/lexical_cast.hpp>
/**
 * Creates a lemmatizer client bound to the given TCP port.
 *
 * No connection is opened here. The socket descriptor starts out as -1
 * ("not connected") and the address structure is zero-initialized, so
 * neither member holds an indeterminate value before the first connection
 * attempt.
 *
 * The exception specification is kept so that this definition still matches
 * the declaration in socket_lemmatizer.hpp.
 *
 * @param port TCP port the lemmatizer listens on.
 */
SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) :
                                         _port(port),
                                         _sock(-1),
                                         _server() {
}
/**
 * Destructor. Intentionally empty: it performs no cleanup of its own.
 */
SocketLemmatizer::~SocketLemmatizer()
{
}
/**
 * Opens a TCP connection to the lemmatizer at 127.0.0.1:<_port>.
 *
 * On any failure after the socket has been created, the descriptor is closed
 * and _sock is reset to -1 before the exception is thrown, so that repeated
 * connection attempts do not leak file descriptors.
 *
 * @return true once the connection is established; failures are reported
 *         exclusively through exceptions.
 * @throws ConcordiaException if the socket cannot be created, the address
 *         cannot be resolved, or connect() fails.
 */
bool SocketLemmatizer::_connect() {
    // Create socket
    _sock = socket(AF_INET, SOCK_STREAM, 0);
    if (_sock == -1) {
        throw ConcordiaException("Could not create socket for the lemmatizer.");
    }

    const std::string address = "127.0.0.1";

    // Start from a zeroed address structure so that no stale or
    // indeterminate bytes are handed to connect().
    _server = sockaddr_in();

    // inet_addr() signals "not a dotted-quad address" with INADDR_NONE.
    const in_addr_t numericAddress = inet_addr(address.c_str());
    if (numericAddress == INADDR_NONE) {
        // Not a plain IP address: resolve it as a host name.
        struct hostent * he = gethostbyname(address.c_str());
        if (he == NULL || he->h_addr_list[0] == NULL) {
            close(_sock);
            _sock = -1;
            throw ConcordiaException("gethostbyname: Failed to resolve hostname");
        }
        // h_addr_list holds the addresses in network byte order; the first
        // entry is used.
        _server.sin_addr = *reinterpret_cast<struct in_addr *>(he->h_addr_list[0]);
    } else {
        // Plain IP address
        _server.sin_addr.s_addr = numericAddress;
    }

    _server.sin_family = AF_INET;
    _server.sin_port = htons(_port);

    // Connect to remote server
    if (connect(_sock, reinterpret_cast<struct sockaddr *>(&_server), sizeof(_server)) < 0) {
        close(_sock);
        _sock = -1;
        throw ConcordiaException("Connect failed. Error on address: "+address+":"+boost::lexical_cast<std::string>(_port));
    }

    return true;
}
/**
 * Closes the current socket and marks it as closed (-1).
 *
 * The function is declared to return bool, so it must return a value on
 * every path; flowing off the end of a non-void function is undefined
 * behavior.
 *
 * @return true if close() reported success, false otherwise.
 */
bool SocketLemmatizer::_disconnect() {
    const bool closed = (close(_sock) == 0);
    _sock = -1;
    return closed;
}
/**
 * Sends the whole of `data` to the connected host.
 *
 * send() on a stream socket may accept fewer bytes than requested, so the
 * call is repeated until every byte has been handed over. The length is
 * taken from the string itself rather than strlen(), so data containing an
 * embedded NUL character is not silently truncated.
 *
 * @param data bytes to transmit.
 * @return true when all bytes have been sent.
 * @throws ConcordiaException if send() fails or makes no progress.
 */
bool SocketLemmatizer::_send_data(std::string data)
{
    const char * cursor = data.data();
    std::string::size_type remaining = data.size();

    while (remaining > 0) {
        const ssize_t sent = send(_sock, cursor, remaining, 0);
        // A zero return with bytes still pending would make this loop spin
        // forever, so it is treated as a failure as well.
        if (sent <= 0) {
            throw ConcordiaException("Send failed");
        }
        cursor += sent;
        remaining -= static_cast<std::string::size_type>(sent);
    }

    return true;
}
/**
 * Receives data from the connected host until the peer closes the
 * connection (recv() returns 0).
 *
 * Each chunk is appended using the byte count reported by recv(). No
 * terminating NUL is written into the buffer, so a completely filled buffer
 * cannot cause a write past its end, and bytes following an embedded NUL are
 * preserved.
 *
 * @param size capacity, in bytes, of the buffer used for a single recv()
 *             call; must be positive.
 * @return everything received, concatenated in arrival order.
 * @throws ConcordiaException if `size` is not positive or recv() fails.
 */
std::string SocketLemmatizer::_receive(int size=512)
{
    if (size <= 0) {
        throw ConcordiaException("Lemmatizer: invalid receive buffer size");
    }

    // A std::string serves as the receive buffer; this avoids the
    // non-standard variable-length array.
    std::string chunk(static_cast<std::string::size_type>(size), '\0');
    std::string reply;

    // Receive a reply from the server
    for (;;) {
        const ssize_t amountReceived = recv(_sock, &chunk[0], chunk.size(), 0);
        if (amountReceived < 0) {
            throw ConcordiaException("Lemmatizer: recv failed");
        }
        if (amountReceived == 0) {
            // Orderly shutdown by the peer: nothing more to read.
            break;
        }
        reply.append(chunk.data(), static_cast<std::string::size_type>(amountReceived));
    }

    return reply;
}
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
for (int i=0;i<5;i++) {
try {
_connect();
_send_data(languageCode+sentence+LEMMATIZER_DELIMITER);
std::string reply = _receive(512);
_disconnect();
return reply.substr(0,reply.find(LEMMATIZER_DELIMITER));
} catch (std::exception & e) {
_logger.logString("Problem with lemmatization of the sentence", sentence);
_logger.log("Waiting 2 seconds and retrying...");
sleep(2);
}
}
throw ConcordiaException("Can not lemmatize sentence: "+sentence);
}

View File

@ -1,42 +0,0 @@
#ifndef SOCKET_LEMMATIZER_HDR
#define SOCKET_LEMMATIZER_HDR
#include <string>
#include <sys/socket.h> //socket
#include <arpa/inet.h> //inet_addr
#include <netdb.h> //hostent
#include <unistd.h>
#include <concordia/concordia_exception.hpp>
#include "logger.hpp"
/*! Client for a lemmatizer reached over a TCP socket on a given port.
    Each call to lemmatizeSentence() opens its own connection.
*/
class SocketLemmatizer {
public:
/*! Constructor.
  \param port TCP port of the lemmatizer
*/
explicit SocketLemmatizer(int port) throw(ConcordiaException);
/*! Destructor.
*/
virtual ~SocketLemmatizer();
/*! Sends the sentence, prefixed with the language code, to the
    lemmatizer and returns its reply.
  \param languageCode code prepended to the sentence in the request
  \param sentence sentence to lemmatize
  \returns the reply received from the lemmatizer
  \throws ConcordiaException when all retry attempts fail
*/
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
private:
// Creates the socket and connects it; failures are reported by exception.
bool _connect();
// Closes the socket and resets _sock to -1.
bool _disconnect();
// Sends data over the connected socket; throws ConcordiaException on failure.
bool _send_data(std::string data);
// Reads from the socket, using a buffer of `size` bytes, until the peer
// closes the connection; throws ConcordiaException on failure.
std::string _receive(int size);
// TCP port of the lemmatizer.
int _port;
// Socket descriptor; -1 after _disconnect().
int _sock;
// Address of the lemmatizer, filled in by _connect().
struct sockaddr_in _server;
// Logger used to report failed lemmatization attempts.
Logger _logger;
};
#endif

View File

@ -88,7 +88,7 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::stri
} }
Tm TmDAO::getTmInfo(int tmId) { Tm TmDAO::getTm(int tmId) {
DBconnection connection; DBconnection connection;
connection.startTransaction(); connection.startTransaction();
std::string query = "select tm.id, tm.name, tm.lemmatized, tm.paired_tm_id, source_language.code, target_language.code from tm inner join language as source_language on source_language.id = tm.source_lang_id inner join language as target_language on target_language.id = tm.target_lang_id where tm.id = $1::integer;"; std::string query = "select tm.id, tm.name, tm.lemmatized, tm.paired_tm_id, source_language.code, target_language.code from tm inner join language as source_language on source_language.id = tm.source_lang_id inner join language as target_language on target_language.id = tm.target_lang_id where tm.id = $1::integer;";

View File

@ -28,7 +28,7 @@ public:
std::vector<Tm> getTms(); std::vector<Tm> getTms();
std::pair<bool, std::string> getTmInfo(int tmId); std::pair<bool, std::string> getTm(int tmId);
private: private:

View File

@ -59,7 +59,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
const std::vector<TokenizedSentence> & sourceSentences, const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<TokenizedSentence> & targetSentences, const std::vector<TokenizedSentence> & targetSentences,
const std::vector<std::vector<std::vector<int> > > & allAlignments, const std::vector<std::vector<std::vector<int> > > & allAlignments,
const int tmId) throw (ConcordiaException) { const int tmId) {
DBconnection connection; DBconnection connection;
std::vector<SUFFIX_MARKER_TYPE> newIds; std::vector<SUFFIX_MARKER_TYPE> newIds;
@ -78,7 +78,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
const std::vector<TokenizedSentence> & targetSentences, const std::vector<TokenizedSentence> & targetSentences,
const std::vector<std::vector<std::vector<int> > > & allAlignments, const std::vector<std::vector<std::vector<int> > > & allAlignments,
const std::vector<int> & sourceIds, const std::vector<int> & sourceIds,
const int tmId) throw (ConcordiaException) { const int tmId) {
DBconnection connection; DBconnection connection;
std::vector<SUFFIX_MARKER_TYPE> newIds; std::vector<SUFFIX_MARKER_TYPE> newIds;
@ -270,7 +270,7 @@ int UnitDAO::_addAlignedUnit (
const TokenizedSentence & sourceSentence, const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence, const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments, const std::vector<std::vector<int> > & alignments,
const int tmId) throw(ConcordiaException) { const int tmId) {
if (sourceSentence.getTokens().size() != alignments.size()) { if (sourceSentence.getTokens().size() != alignments.size()) {
// Here we check if the source sentence, taken from src.tok, // Here we check if the source sentence, taken from src.tok,
@ -305,7 +305,7 @@ int UnitDAO::_addAlignedUnit (
const TokenizedSentence & targetSentence, const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments, const std::vector<std::vector<int> > & alignments,
const int sourceId, const int sourceId,
const int tmId) throw(ConcordiaException) { const int tmId) {
if (sourceSentence.getTokens().size() != alignments.size()) { if (sourceSentence.getTokens().size() != alignments.size()) {
// Here we check if the source sentence, taken from src.tok, // Here we check if the source sentence, taken from src.tok,

View File

@ -41,14 +41,14 @@ public:
const std::vector<TokenizedSentence> & sourceSentences, const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<TokenizedSentence> & targetSentences, const std::vector<TokenizedSentence> & targetSentences,
const std::vector<std::vector<std::vector<int> > > & allAlignments, const std::vector<std::vector<std::vector<int> > > & allAlignments,
const int tmId) throw (ConcordiaException); const int tmId);
std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences( std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
const std::vector<TokenizedSentence> & sourceSentences, const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<TokenizedSentence> & targetSentences, const std::vector<TokenizedSentence> & targetSentences,
const std::vector<std::vector<std::vector<int> > > & allAlignments, const std::vector<std::vector<std::vector<int> > > & allAlignments,
const std::vector<int> & sourceIds, const std::vector<int> & sourceIds,
const int tmId) throw (ConcordiaException); const int tmId);
SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment); SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment);
@ -83,7 +83,7 @@ private:
const TokenizedSentence & sourceSentence, const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence, const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments, const std::vector<std::vector<int> > & alignments,
const int tmId) throw(ConcordiaException); const int tmId);
int _addAlignedUnit( int _addAlignedUnit(
DBconnection & connection, DBconnection & connection,
@ -91,7 +91,7 @@ private:
const TokenizedSentence & targetSentence, const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments, const std::vector<std::vector<int> > & alignments,
const int sourceId, const int sourceId,
const int tmId) throw(ConcordiaException); const int tmId);
std::vector<int> _getArray(std::string arrayString); std::vector<int> _getArray(std::string arrayString);