refactoring, lemmatizers
This commit is contained in:
parent
84d8102f58
commit
2690b15958
@ -20,8 +20,7 @@
|
|||||||
#include <boost/ptr_container/ptr_map.hpp>
|
#include <boost/ptr_container/ptr_map.hpp>
|
||||||
#include <boost/filesystem/path.hpp>
|
#include <boost/filesystem/path.hpp>
|
||||||
|
|
||||||
ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
|
ConcordiaServer::ConcordiaServer(const std::string & configFilePath):
|
||||||
throw(ConcordiaException) :
|
|
||||||
_configFilePath(configFilePath) {
|
_configFilePath(configFilePath) {
|
||||||
|
|
||||||
std::vector<int> tmIds = _tmDAO.getTmIds();
|
std::vector<int> tmIds = _tmDAO.getTmIds();
|
||||||
@ -57,12 +56,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
|||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||||
} else { // json parsed
|
} else { // json parsed
|
||||||
std::string operation = _getStringParameter(d, OPERATION_PARAM);
|
std::string operation = _getStringParameter(d, OPERATION_PARAM);
|
||||||
if (operation == ADD_SENTENCE_OP) {
|
if (operation == ADD_SENTENCES_OP) {
|
||||||
std::string sourceSentence = _getStringParameter(d, SOURCE_SENTENCE_PARAM);
|
|
||||||
std::string targetSentence = _getStringParameter(d, TARGET_SENTENCE_PARAM);
|
|
||||||
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
|
||||||
_indexController->addSentence(jsonWriter, sourceSentence, targetSentence, tmId);
|
|
||||||
} else if (operation == ADD_SENTENCES_OP) {
|
|
||||||
std::vector<std::string> sourceSentences;
|
std::vector<std::string> sourceSentences;
|
||||||
std::vector<std::string> concordiaSourceSentences;
|
std::vector<std::string> concordiaSourceSentences;
|
||||||
std::vector<std::string> targetSentences;
|
std::vector<std::string> targetSentences;
|
||||||
@ -86,50 +80,6 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
_indexController->addSentences(jsonWriter, sourceSentences, concordiaSourceSentences, targetSentences, alignments, sourceIds, tmId);
|
_indexController->addSentences(jsonWriter, sourceSentences, concordiaSourceSentences, targetSentences, alignments, sourceIds, tmId);
|
||||||
} else if (operation == ADD_ALIGNED_SENTENCES_OP) {
|
|
||||||
std::vector<std::string> sourceSentences;
|
|
||||||
std::vector<std::string> targetSentences;
|
|
||||||
int tmId = d[TM_ID_PARAM].GetInt();
|
|
||||||
// loading data from json
|
|
||||||
const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
|
|
||||||
/*
|
|
||||||
Logger::log("addAlignedSentences");
|
|
||||||
Logger::logInt("sentences to add", sentencesArray.Size());
|
|
||||||
Logger::logInt("tm id", tmId);
|
|
||||||
*/
|
|
||||||
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
|
|
||||||
if (sentencesArray[i].Size() != 2) {
|
|
||||||
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 2 elements");
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
sourceSentences.push_back(sentencesArray[i][0].GetString());
|
|
||||||
targetSentences.push_back(sentencesArray[i][1].GetString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
|
|
||||||
} else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
|
|
||||||
std::vector<std::string> sourceSentences;
|
|
||||||
std::vector<std::string> targetSentences;
|
|
||||||
std::vector<std::string> alignmentStrings;
|
|
||||||
int tmId = d[TM_ID_PARAM].GetInt();
|
|
||||||
// loading data from json
|
|
||||||
const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
|
|
||||||
/*
|
|
||||||
Logger::log("addAlignedLemmatizedSentences");
|
|
||||||
Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
|
|
||||||
Logger::logInt("tm id", tmId);
|
|
||||||
*/
|
|
||||||
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
|
|
||||||
if (sentencesArray[i].Size() != 3) {
|
|
||||||
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
sourceSentences.push_back(sentencesArray[i][0].GetString());
|
|
||||||
targetSentences.push_back(sentencesArray[i][1].GetString());
|
|
||||||
alignmentStrings.push_back(sentencesArray[i][2].GetString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
|
|
||||||
} else if (operation == GET_TMS_INFO_OP) {
|
} else if (operation == GET_TMS_INFO_OP) {
|
||||||
std::vector<Tm> tms = _tmDAO.getTms();
|
std::vector<Tm> tms = _tmDAO.getTms();
|
||||||
|
|
||||||
@ -348,8 +298,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const char * name)
|
std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const char * name) {
|
||||||
throw (ConcordiaException) {
|
|
||||||
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
||||||
if (itr != d.MemberEnd()) {
|
if (itr != d.MemberEnd()) {
|
||||||
std::string value = itr->value.GetString();
|
std::string value = itr->value.GetString();
|
||||||
@ -359,8 +308,7 @@ std::string ConcordiaServer::_getStringParameter(rapidjson::Document & d, const
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name)
|
int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name) {
|
||||||
throw (ConcordiaException) {
|
|
||||||
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
||||||
if (itr != d.MemberEnd()) {
|
if (itr != d.MemberEnd()) {
|
||||||
int value = itr->value.GetInt();
|
int value = itr->value.GetInt();
|
||||||
@ -370,8 +318,7 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
|
int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name) {
|
||||||
throw (ConcordiaException) {
|
|
||||||
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
|
||||||
if (itr != d.MemberEnd()) {
|
if (itr != d.MemberEnd()) {
|
||||||
bool value = itr->value.GetBool();
|
bool value = itr->value.GetBool();
|
||||||
@ -381,8 +328,7 @@ int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * nam
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::vector<int> > ConcordiaServer::_getInt2DArray(const rapidjson::Value & v)
|
std::vector<std::vector<int> > ConcordiaServer::_getInt2DArray(const rapidjson::Value & v) {
|
||||||
throw (ConcordiaException) {
|
|
||||||
std::vector<std::vector<int> > result;
|
std::vector<std::vector<int> > result;
|
||||||
for (rapidjson::SizeType i = 0; i < v.Size(); i++) {
|
for (rapidjson::SizeType i = 0; i < v.Size(); i++) {
|
||||||
std::vector<int> innerArray;
|
std::vector<int> innerArray;
|
||||||
|
@ -26,8 +26,7 @@ public:
|
|||||||
\param configFilePath path to the Concordia configuration file
|
\param configFilePath path to the Concordia configuration file
|
||||||
\throws ConcordiaException
|
\throws ConcordiaException
|
||||||
*/
|
*/
|
||||||
explicit ConcordiaServer(const std::string & configFilePath)
|
explicit ConcordiaServer(const std::string & configFilePath);
|
||||||
throw(ConcordiaException);
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~ConcordiaServer();
|
virtual ~ConcordiaServer();
|
||||||
@ -37,13 +36,13 @@ public:
|
|||||||
private:
|
private:
|
||||||
void _logPhrase(std::string phraseString);
|
void _logPhrase(std::string phraseString);
|
||||||
|
|
||||||
std::string _getStringParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
|
std::string _getStringParameter(rapidjson::Document & d, const char * name);
|
||||||
|
|
||||||
int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
|
int _getIntParameter(rapidjson::Document & d, const char * name);
|
||||||
|
|
||||||
int _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
|
int _getBoolParameter(rapidjson::Document & d, const char * name);
|
||||||
|
|
||||||
std::vector<std::vector<int> > _getInt2DArray(const rapidjson::Value & v) throw (ConcordiaException);
|
std::vector<std::vector<int> > _getInt2DArray(const rapidjson::Value & v);
|
||||||
|
|
||||||
void _addTm(int tmId);
|
void _addTm(int tmId);
|
||||||
|
|
||||||
|
@ -32,12 +32,9 @@
|
|||||||
#define LINK_PARAM "link"
|
#define LINK_PARAM "link"
|
||||||
#define SOURCES_PARAM "sources"
|
#define SOURCES_PARAM "sources"
|
||||||
|
|
||||||
#define ADD_SENTENCE_OP "addSentence"
|
|
||||||
#define ADD_SENTENCES_OP "addSentences"
|
#define ADD_SENTENCES_OP "addSentences"
|
||||||
#define LEMMATIZE_OP "lemmatize"
|
#define LEMMATIZE_OP "lemmatize"
|
||||||
#define LEMMATIZE_ALL_OP "lemmatizeAll"
|
#define LEMMATIZE_ALL_OP "lemmatizeAll"
|
||||||
#define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
|
|
||||||
#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
|
|
||||||
#define ADD_REQUEST_OP "addRequest"
|
#define ADD_REQUEST_OP "addRequest"
|
||||||
#define GET_TMS_INFO_OP "getTmsInfo"
|
#define GET_TMS_INFO_OP "getTmsInfo"
|
||||||
#define GET_REQUESTS_INFO_OP "getRequestsInfo"
|
#define GET_REQUESTS_INFO_OP "getRequestsInfo"
|
||||||
|
@ -7,8 +7,8 @@
|
|||||||
#include "config.hpp"
|
#include "config.hpp"
|
||||||
#include "logger.hpp"
|
#include "logger.hpp"
|
||||||
|
|
||||||
DBconnection::DBconnection() throw(ConcordiaException) {
|
DBconnection::DBconnection() {
|
||||||
std::string connectionInfo = "dbname="DB_NAME" user="DB_USER" password="DB_PASSWORD" host="DB_HOST" port="DB_PORT;
|
std::string connectionInfo = "dbname=" DB_NAME " user=" DB_USER " password=" DB_PASSWORD " host=" DB_HOST " port=" DB_PORT;
|
||||||
_connection = PQconnectdb(connectionInfo.c_str());
|
_connection = PQconnectdb(connectionInfo.c_str());
|
||||||
if (PQstatus(_connection) != CONNECTION_OK) {
|
if (PQstatus(_connection) != CONNECTION_OK) {
|
||||||
close();
|
close();
|
||||||
@ -31,7 +31,7 @@ void DBconnection::close() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void DBconnection::startTransaction() throw(ConcordiaException) {
|
void DBconnection::startTransaction() {
|
||||||
if (_connection != NULL) {
|
if (_connection != NULL) {
|
||||||
PGresult * result = PQexec(_connection, "BEGIN");
|
PGresult * result = PQexec(_connection, "BEGIN");
|
||||||
if (PQresultStatus(result) != PGRES_COMMAND_OK) {
|
if (PQresultStatus(result) != PGRES_COMMAND_OK) {
|
||||||
@ -44,7 +44,7 @@ void DBconnection::startTransaction() throw(ConcordiaException) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void DBconnection::endTransaction() throw(ConcordiaException) {
|
void DBconnection::endTransaction() {
|
||||||
if (_connection != NULL) {
|
if (_connection != NULL) {
|
||||||
PGresult * result = PQexec(_connection, "END");
|
PGresult * result = PQexec(_connection, "END");
|
||||||
if (PQresultStatus(result) != PGRES_COMMAND_OK) {
|
if (PQresultStatus(result) != PGRES_COMMAND_OK) {
|
||||||
@ -57,7 +57,7 @@ void DBconnection::endTransaction() throw(ConcordiaException) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PGresult * DBconnection::execute(std::string query) throw(ConcordiaException) {
|
PGresult * DBconnection::execute(std::string query) {
|
||||||
if (_connection != NULL) {
|
if (_connection != NULL) {
|
||||||
PGresult * result = PQexec(_connection, query.c_str());
|
PGresult * result = PQexec(_connection, query.c_str());
|
||||||
if (PQresultStatus(result) != PGRES_COMMAND_OK &&
|
if (PQresultStatus(result) != PGRES_COMMAND_OK &&
|
||||||
@ -78,7 +78,7 @@ PGresult * DBconnection::execute(std::string query) throw(ConcordiaException) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
PGresult * DBconnection::execute(std::string query,
|
PGresult * DBconnection::execute(std::string query,
|
||||||
std::vector<QueryParam*> params) throw(ConcordiaException) {
|
std::vector<QueryParam*> params) {
|
||||||
if (_connection != NULL) {
|
if (_connection != NULL) {
|
||||||
const char * paramValues[params.size()];
|
const char * paramValues[params.size()];
|
||||||
int paramLengths[params.size()];
|
int paramLengths[params.size()];
|
||||||
@ -122,7 +122,7 @@ void DBconnection::clearResult(PGresult * result) {
|
|||||||
PQclear(result);
|
PQclear(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
int DBconnection::getIntValue(PGresult * result, int row, int col) throw (ConcordiaException) {
|
int DBconnection::getIntValue(PGresult * result, int row, int col) {
|
||||||
try {
|
try {
|
||||||
char * valueStr = PQgetvalue(result,row,col);
|
char * valueStr = PQgetvalue(result,row,col);
|
||||||
return strtol(valueStr, NULL, 10);
|
return strtol(valueStr, NULL, 10);
|
||||||
@ -133,7 +133,7 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (Concor
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
|
bool DBconnection::getBoolValue(PGresult * result, int row, int col) {
|
||||||
try {
|
try {
|
||||||
char * valueStr = PQgetvalue(result,row,col);
|
char * valueStr = PQgetvalue(result,row,col);
|
||||||
return std::string(valueStr) == "t";
|
return std::string(valueStr) == "t";
|
||||||
@ -144,7 +144,7 @@ bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (Conc
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string DBconnection::getStringValue(PGresult * result, int row, int col) throw (ConcordiaException) {
|
std::string DBconnection::getStringValue(PGresult * result, int row, int col) {
|
||||||
try {
|
try {
|
||||||
char * valueStr = PQgetvalue(result,row,col);
|
char * valueStr = PQgetvalue(result,row,col);
|
||||||
return std::string(valueStr);
|
return std::string(valueStr);
|
||||||
@ -155,7 +155,7 @@ std::string DBconnection::getStringValue(PGresult * result, int row, int col) t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
|
int DBconnection::getRowCount(PGresult * result) {
|
||||||
try {
|
try {
|
||||||
return PQntuples(result);
|
return PQntuples(result);
|
||||||
} catch (std::exception & e) {
|
} catch (std::exception & e) {
|
||||||
|
@ -13,29 +13,29 @@ class DBconnection {
|
|||||||
public:
|
public:
|
||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
*/
|
*/
|
||||||
DBconnection() throw(ConcordiaException);
|
DBconnection();
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~DBconnection();
|
virtual ~DBconnection();
|
||||||
|
|
||||||
void startTransaction() throw(ConcordiaException);
|
void startTransaction();
|
||||||
|
|
||||||
void endTransaction() throw(ConcordiaException);
|
void endTransaction();
|
||||||
|
|
||||||
PGresult * execute(std::string query) throw(ConcordiaException);
|
PGresult * execute(std::string query);
|
||||||
|
|
||||||
PGresult * execute(std::string query,
|
PGresult * execute(std::string query,
|
||||||
std::vector<QueryParam*> params) throw(ConcordiaException);
|
std::vector<QueryParam*> params);
|
||||||
|
|
||||||
void clearResult(PGresult * result);
|
void clearResult(PGresult * result);
|
||||||
|
|
||||||
int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
int getIntValue(PGresult * result, int row, int col);
|
||||||
|
|
||||||
bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
bool getBoolValue(PGresult * result, int row, int col);
|
||||||
|
|
||||||
std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
std::string getStringValue(PGresult * result, int row, int col);
|
||||||
|
|
||||||
int getRowCount(PGresult * result) throw (ConcordiaException);
|
int getRowCount(PGresult * result);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void close();
|
void close();
|
||||||
|
@ -15,8 +15,7 @@
|
|||||||
#include "logger.hpp"
|
#include "logger.hpp"
|
||||||
|
|
||||||
IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
||||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
|
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade):
|
||||||
throw(ConcordiaException):
|
|
||||||
_concordiasMap(concordiasMap),
|
_concordiasMap(concordiasMap),
|
||||||
_lemmatizerFacade(lemmatizerFacade) {
|
_lemmatizerFacade(lemmatizerFacade) {
|
||||||
}
|
}
|
||||||
@ -24,44 +23,6 @@ IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia>
|
|||||||
IndexController::~IndexController() {
|
IndexController::~IndexController() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void IndexController::addSentence(
|
|
||||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
|
||||||
const std::string & sourceSentence,
|
|
||||||
const std::string & targetSentence,
|
|
||||||
const int tmId) {
|
|
||||||
|
|
||||||
try {
|
|
||||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
|
||||||
if (it != _concordiasMap->end()) {
|
|
||||||
TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
|
|
||||||
TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
|
|
||||||
int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
|
|
||||||
it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
|
|
||||||
it->second->refreshSAfromRAM();
|
|
||||||
|
|
||||||
jsonWriter.StartObject();
|
|
||||||
jsonWriter.String("status");
|
|
||||||
jsonWriter.String("success");
|
|
||||||
jsonWriter.EndObject();
|
|
||||||
} else {
|
|
||||||
JsonGenerator::signalError(jsonWriter, "no such tm!");
|
|
||||||
}
|
|
||||||
} catch (ConcordiaException & e) {
|
|
||||||
std::stringstream errorstream;
|
|
||||||
errorstream << "concordia error: " << e.what();
|
|
||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
|
||||||
} catch (std::exception & e) {
|
|
||||||
std::stringstream errorstream;
|
|
||||||
errorstream << "general error: " << e.what();
|
|
||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
|
||||||
} catch (...) {
|
|
||||||
std::stringstream errorstream;
|
|
||||||
errorstream << "unexpected error occurred";
|
|
||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||||
const std::vector<std::string> & sourceSentences,
|
const std::vector<std::string> & sourceSentences,
|
||||||
const std::vector<std::string> & concordiaSourceSentences,
|
const std::vector<std::string> & concordiaSourceSentences,
|
||||||
@ -93,76 +54,6 @@ void IndexController::addSentences(rapidjson::Writer<rapidjson::StringBuffer> &
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void IndexController::addAlignedSentences(
|
|
||||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
|
||||||
const std::vector<std::string> & rawSourceSentences,
|
|
||||||
const std::vector<std::string> & targetSentences,
|
|
||||||
const int tmId) {
|
|
||||||
try {
|
|
||||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
|
||||||
if (it != _concordiasMap->end()) {
|
|
||||||
std::vector<std::string> sourceSentences;
|
|
||||||
std::vector<std::vector<std::vector<int> > > allAlignments;
|
|
||||||
_getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);
|
|
||||||
|
|
||||||
std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
|
|
||||||
std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
|
|
||||||
|
|
||||||
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
|
|
||||||
for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
|
|
||||||
it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
|
|
||||||
}
|
|
||||||
jsonWriter.StartObject();
|
|
||||||
jsonWriter.String("status");
|
|
||||||
jsonWriter.String("success");
|
|
||||||
jsonWriter.EndObject();
|
|
||||||
} else {
|
|
||||||
JsonGenerator::signalError(jsonWriter, "no such tm!");
|
|
||||||
}
|
|
||||||
} catch (ConcordiaException & e) {
|
|
||||||
std::stringstream errorstream;
|
|
||||||
errorstream << "concordia error: " << e.what();
|
|
||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void IndexController::addAlignedLemmatizedSentences(
|
|
||||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
|
||||||
const std::vector<std::string> & sourceSentences,
|
|
||||||
const std::vector<std::string> & targetSentences,
|
|
||||||
const std::vector<std::string> & alignmentStrings,
|
|
||||||
const int tmId) {
|
|
||||||
try {
|
|
||||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
|
||||||
if (it != _concordiasMap->end()) {
|
|
||||||
std::vector<std::string> lemmatizedSourceSentences;
|
|
||||||
std::vector<std::vector<std::vector<int> > > allAlignments;
|
|
||||||
_getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);
|
|
||||||
|
|
||||||
std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences =
|
|
||||||
it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
|
|
||||||
std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, false, false);
|
|
||||||
std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, false, false);
|
|
||||||
|
|
||||||
std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
|
|
||||||
_unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
|
|
||||||
for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
|
|
||||||
it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
|
|
||||||
}
|
|
||||||
jsonWriter.StartObject();
|
|
||||||
jsonWriter.String("status");
|
|
||||||
jsonWriter.String("success");
|
|
||||||
jsonWriter.EndObject();
|
|
||||||
} else {
|
|
||||||
JsonGenerator::signalError(jsonWriter, "no such tm!");
|
|
||||||
}
|
|
||||||
} catch (ConcordiaException & e) {
|
|
||||||
std::stringstream errorstream;
|
|
||||||
errorstream << "concordia error: " << e.what();
|
|
||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||||
const int tmId) {
|
const int tmId) {
|
||||||
@ -184,54 +75,4 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
|
|||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void IndexController::_getSourceSentencesAndAlignments(
|
|
||||||
std::vector<std::string> & sourceSentences,
|
|
||||||
std::vector<std::vector<std::vector<int> > > & allAlignments,
|
|
||||||
const std::vector<std::string> & rawSourceSentences) {
|
|
||||||
|
|
||||||
for (int i = 0; i<rawSourceSentences.size(); i++) {
|
|
||||||
std::string rawSourceSentence = rawSourceSentences[i];
|
|
||||||
|
|
||||||
std::string sourceSentence = "";
|
|
||||||
std::vector<std::vector<int> > alignments;
|
|
||||||
|
|
||||||
UnicodeString s(rawSourceSentence.c_str());
|
|
||||||
boost::u32regex_iterator<const UChar*> begin(
|
|
||||||
boost::make_u32regex_iterator(
|
|
||||||
s,
|
|
||||||
boost::make_u32regex(UnicodeString("(\\S+) \\(\\{(( \\d+)*) \\}\\)"), boost::regex::icase)
|
|
||||||
)
|
|
||||||
);
|
|
||||||
boost::u32regex_iterator<const UChar*> end;
|
|
||||||
|
|
||||||
for (; begin != end; ++begin) {
|
|
||||||
UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
|
|
||||||
std::string token;
|
|
||||||
tokenUTF8.toUTF8String(token);
|
|
||||||
|
|
||||||
if (token != "NULL") {
|
|
||||||
std::string numbers((*begin)[2].first, (*begin)[2].second);
|
|
||||||
std::istringstream iss(numbers);
|
|
||||||
std::vector<std::string> numberStrings;
|
|
||||||
std::copy(std::istream_iterator<std::string>(iss),
|
|
||||||
std::istream_iterator<std::string>(),
|
|
||||||
std::back_inserter(numberStrings));
|
|
||||||
|
|
||||||
std::vector<int> tokenAlignments;
|
|
||||||
for (int j=0;j<numberStrings.size();j++) {
|
|
||||||
int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
|
|
||||||
tokenAlignments.push_back(n);
|
|
||||||
}
|
|
||||||
alignments.push_back(tokenAlignments);
|
|
||||||
sourceSentence += token + " ";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);
|
|
||||||
|
|
||||||
sourceSentences.push_back(sourceSentence);
|
|
||||||
allAlignments.push_back(alignments);
|
|
||||||
}
|
|
||||||
}
|
|
@ -20,17 +20,11 @@ public:
|
|||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
*/
|
*/
|
||||||
explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
||||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
|
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade);
|
||||||
throw(ConcordiaException);
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~IndexController();
|
virtual ~IndexController();
|
||||||
|
|
||||||
void addSentence(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
|
||||||
const std::string & sourceSentence,
|
|
||||||
const std::string & targetSentence,
|
|
||||||
const int tmId);
|
|
||||||
|
|
||||||
void addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
void addSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||||
const std::vector<std::string> & sourceSentences,
|
const std::vector<std::string> & sourceSentences,
|
||||||
const std::vector<std::string> & concordiaSourceSentences,
|
const std::vector<std::string> & concordiaSourceSentences,
|
||||||
@ -39,27 +33,10 @@ public:
|
|||||||
const std::vector<int> & sourceIds,
|
const std::vector<int> & sourceIds,
|
||||||
const int tmId);
|
const int tmId);
|
||||||
|
|
||||||
void addAlignedSentences(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
|
||||||
const std::vector<std::string> & rawSourceSentences,
|
|
||||||
const std::vector<std::string> & targetSentences,
|
|
||||||
const int tmId);
|
|
||||||
|
|
||||||
void addAlignedLemmatizedSentences(
|
|
||||||
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
|
||||||
const std::vector<std::string> & sourceSentences,
|
|
||||||
const std::vector<std::string> & targetSentences,
|
|
||||||
const std::vector<std::string> & alignmentStrings,
|
|
||||||
const int tmId);
|
|
||||||
|
|
||||||
void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
|
||||||
const int tmId);
|
const int tmId);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void _getSourceSentencesAndAlignments(
|
|
||||||
std::vector<std::string> & sourceSentences,
|
|
||||||
std::vector<std::vector<std::vector<int> > > & allAlignments,
|
|
||||||
const std::vector<std::string> & rawSourceSentences);
|
|
||||||
|
|
||||||
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
|
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
|
||||||
|
|
||||||
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
|
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
|
||||||
|
@ -8,14 +8,14 @@
|
|||||||
#include "rapidjson/error/en.h"
|
#include "rapidjson/error/en.h"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
JsonLemmatizer::JsonLemmatizer() throw(ConcordiaException) {
|
JsonLemmatizer::JsonLemmatizer() {
|
||||||
}
|
}
|
||||||
|
|
||||||
JsonLemmatizer::~JsonLemmatizer() {
|
JsonLemmatizer::~JsonLemmatizer() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::string JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
LemmatizerResult JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||||
rapidjson::StringBuffer paramsJson;
|
rapidjson::StringBuffer paramsJson;
|
||||||
rapidjson::Writer<rapidjson::StringBuffer> jsonWriter(paramsJson);
|
rapidjson::Writer<rapidjson::StringBuffer> jsonWriter(paramsJson);
|
||||||
jsonWriter.StartObject();
|
jsonWriter.StartObject();
|
||||||
@ -31,6 +31,7 @@ std::string JsonLemmatizer::lemmatizeSentence(std::string languageCode, std::str
|
|||||||
RestClient::Response r = RestClient::post("http://concordia-preprocessor:9001/lemmatize", "application/json", paramsJson.GetString());
|
RestClient::Response r = RestClient::post("http://concordia-preprocessor:9001/lemmatize", "application/json", paramsJson.GetString());
|
||||||
rapidjson::Document d;
|
rapidjson::Document d;
|
||||||
d.Parse(r.body.c_str());
|
d.Parse(r.body.c_str());
|
||||||
std::string lemmatized = d["processed_sentences"][0]["tokens"].GetString();
|
std::string lemmatizedSentence = d["processed_sentences"][0]["tokens"].GetString();
|
||||||
return lemmatized;
|
bool isFirstLemmatized = d["processed_sentences"][0]["isFirstLemmatized"].GetBool();
|
||||||
|
return LemmatizerResult(lemmatizedSentence, isFirstLemmatized);
|
||||||
}
|
}
|
||||||
|
@ -5,18 +5,19 @@
|
|||||||
|
|
||||||
#include <concordia/concordia_exception.hpp>
|
#include <concordia/concordia_exception.hpp>
|
||||||
|
|
||||||
|
#include "lemmatizer_result.hpp"
|
||||||
#include "logger.hpp"
|
#include "logger.hpp"
|
||||||
|
|
||||||
class JsonLemmatizer {
|
class JsonLemmatizer {
|
||||||
public:
|
public:
|
||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
*/
|
*/
|
||||||
explicit JsonLemmatizer() throw(ConcordiaException);
|
explicit JsonLemmatizer();
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~JsonLemmatizer();
|
virtual ~JsonLemmatizer();
|
||||||
|
|
||||||
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
|
LemmatizerResult lemmatizeSentence(std::string languageCode, std::string sentence);
|
||||||
private:
|
private:
|
||||||
Logger _logger;
|
Logger _logger;
|
||||||
};
|
};
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
|
|
||||||
|
|
||||||
LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
|
LemmatizerFacade::LemmatizerFacade() {
|
||||||
_lemmatizersMap = boost::ptr_map<std::string,JsonLemmatizer>();
|
_lemmatizersMap = boost::ptr_map<std::string,JsonLemmatizer>();
|
||||||
|
|
||||||
// todo: extract this to configuration, especially when new lemmatizers ConstMemberIterator
|
// todo: extract this to configuration, especially when new lemmatizers ConstMemberIterator
|
||||||
@ -18,7 +18,7 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
|
|||||||
LemmatizerFacade::~LemmatizerFacade() {
|
LemmatizerFacade::~LemmatizerFacade() {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
LemmatizerResult LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||||
|
|
||||||
boost::ptr_map<std::string,JsonLemmatizer>::iterator it = _lemmatizersMap.find(languageCode);
|
boost::ptr_map<std::string,JsonLemmatizer>::iterator it = _lemmatizersMap.find(languageCode);
|
||||||
if (it != _lemmatizersMap.end()) {
|
if (it != _lemmatizersMap.end()) {
|
||||||
@ -29,7 +29,7 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) {
|
LemmatizerResult LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) {
|
||||||
std::vector<std::string> result;
|
std::vector<std::string> result;
|
||||||
BOOST_FOREACH(std::string & sentence, sentences) {
|
BOOST_FOREACH(std::string & sentence, sentences) {
|
||||||
result.push_back(lemmatizeSentence(languageCode, sentence));
|
result.push_back(lemmatizeSentence(languageCode, sentence));
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
#ifndef LEMMATIZER_FACADE_HDR
|
#ifndef LEMMATIZER_FACADE_HDR
|
||||||
#define LEMMATIZER_FACADE_HDR
|
#define LEMMATIZER_FACADE_HDR
|
||||||
|
|
||||||
#include "socket_lemmatizer.hpp"
|
|
||||||
#include "json_lemmatizer.hpp"
|
#include "json_lemmatizer.hpp"
|
||||||
|
#include "lemmatizer_result.hpp"
|
||||||
#include "tm_dao.hpp"
|
#include "tm_dao.hpp"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -14,14 +14,14 @@ class LemmatizerFacade {
|
|||||||
public:
|
public:
|
||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
*/
|
*/
|
||||||
LemmatizerFacade() throw(ConcordiaException);
|
LemmatizerFacade();
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~LemmatizerFacade();
|
virtual ~LemmatizerFacade();
|
||||||
|
|
||||||
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
|
LemmatizerResult lemmatizeSentence(std::string languageCode, std::string sentence);
|
||||||
|
|
||||||
std::vector<std::string> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences);
|
std::vector<LemmatizerResult> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
boost::ptr_map<std::string,JsonLemmatizer> _lemmatizersMap;
|
boost::ptr_map<std::string,JsonLemmatizer> _lemmatizersMap;
|
||||||
|
10
concordia-server/lemmatizer_result.cpp
Normal file
10
concordia-server/lemmatizer_result.cpp
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#include "lemmatizer_result.hpp"
|
||||||
|
|
||||||
|
LemmatizerResutl::LemmatizerResult(const std::string & lemmatizedSentence,
|
||||||
|
const bool isFirstLemmatized):
|
||||||
|
_lemmatizedSentence(lemmatizedSentence),
|
||||||
|
_isFirstLemmatized(isFirstLemmatized) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
LemmatizerResutl::~LemmatizerResult();
|
31
concordia-server/lemmatizer_result.hpp
Normal file
31
concordia-server/lemmatizer_result.hpp
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
#ifndef LEMMATIZER_RESULT_HDR
// The macro defined here must be the one tested above, otherwise the guard
// never takes effect and a second inclusion redefines the class.
#define LEMMATIZER_RESULT_HDR

#include <string>

/*! Result of lemmatizing a single sentence: the lemmatized text plus
    the isFirstLemmatized flag that was produced along with it.
*/
class LemmatizerResult {
public:
    /*! Constructor.
      \param lemmatizedSentence the sentence text after lemmatization
      \param isFirstLemmatized flag returned by isFirstLemmatized()
    */
    LemmatizerResult(const std::string & lemmatizedSentence,
                     const bool isFirstLemmatized);

    /*! Destructor.
    */
    virtual ~LemmatizerResult();

    /*! Getter for the lemmatized sentence.
      \returns reference to the stored text, valid as long as this object lives
    */
    const std::string & getLemmatizedSentence() const {
        return _lemmatizedSentence;
    }

    /*! Getter for the isFirstLemmatized flag.
      \returns the flag passed to the constructor
    */
    bool isFirstLemmatized() const {
        return _isFirstLemmatized;
    }

private:
    std::string _lemmatizedSentence;

    bool _isFirstLemmatized;
};

#endif
|
@ -12,8 +12,7 @@
|
|||||||
|
|
||||||
|
|
||||||
SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
|
||||||
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
|
boost::shared_ptr<LemmatizerFacade> lemmatizerFacade):
|
||||||
throw(ConcordiaException):
|
|
||||||
_concordiasMap(concordiasMap),
|
_concordiasMap(concordiasMap),
|
||||||
_lemmatizerFacade(lemmatizerFacade) {
|
_lemmatizerFacade(lemmatizerFacade) {
|
||||||
}
|
}
|
||||||
@ -26,6 +25,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
|
|||||||
std::string & pattern,
|
std::string & pattern,
|
||||||
const int tmId) {
|
const int tmId) {
|
||||||
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
|
||||||
|
_tmDAO.getTm
|
||||||
if (it != _concordiasMap->end()) {
|
if (it != _concordiasMap->end()) {
|
||||||
TokenizedSentence tokenizedPattern = it->second->tokenize(pattern, false, false);
|
TokenizedSentence tokenizedPattern = it->second->tokenize(pattern, false, false);
|
||||||
pattern = _lemmatizerFacade->lemmatizeIfNeeded(tokenizedPattern.getTokenizedSentence(), tmId);
|
pattern = _lemmatizerFacade->lemmatizeIfNeeded(tokenizedPattern.getTokenizedSentence(), tmId);
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include <concordia/interval.hpp>
|
#include <concordia/interval.hpp>
|
||||||
|
|
||||||
#include "unit_dao.hpp"
|
#include "unit_dao.hpp"
|
||||||
|
#include "tm_dao.hpp"
|
||||||
#include "simple_search_result.hpp"
|
#include "simple_search_result.hpp"
|
||||||
#include "lemmatizer_facade.hpp"
|
#include "lemmatizer_facade.hpp"
|
||||||
#include "rapidjson/writer.h"
|
#include "rapidjson/writer.h"
|
||||||
@ -19,8 +20,7 @@ public:
|
|||||||
/*! Constructor.
|
/*! Constructor.
|
||||||
*/
|
*/
|
||||||
explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
|
explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
|
||||||
boost::shared_ptr<LemmatizerFacade> LemmatizerFacade)
|
boost::shared_ptr<LemmatizerFacade> LemmatizerFacade);
|
||||||
throw(ConcordiaException);
|
|
||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~SearcherController();
|
virtual ~SearcherController();
|
||||||
@ -56,6 +56,8 @@ private:
|
|||||||
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
|
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
|
||||||
|
|
||||||
UnitDAO _unitDAO;
|
UnitDAO _unitDAO;
|
||||||
|
|
||||||
|
TmDAO _tmDAO;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,119 +0,0 @@
|
|||||||
#include "socket_lemmatizer.hpp"
|
|
||||||
|
|
||||||
#include <time.h>
|
|
||||||
|
|
||||||
#include "config.hpp"
|
|
||||||
|
|
||||||
#include <boost/lexical_cast.hpp>
|
|
||||||
|
|
||||||
/*! Constructor.
  Only records the port; no connection is opened here. The socket
  descriptor starts out as -1, the same "not connected" value that
  _disconnect() assigns, and the address structure is zero-initialised
  so that no indeterminate bytes are handed to connect() later.
  \param port TCP port of the lemmatizer
*/
SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) :
                                   _port(port),
                                   _sock(-1),
                                   _server() {
}
|
|
||||||
|
|
||||||
/*! Destructor.
  Intentionally empty: it does not touch the socket descriptor.
*/
SocketLemmatizer::~SocketLemmatizer() {}
|
|
||||||
|
|
||||||
/**
|
|
||||||
Connect to a host on a certain port number
|
|
||||||
*/
|
|
||||||
bool SocketLemmatizer::_connect() {
|
|
||||||
|
|
||||||
//Create socket
|
|
||||||
_sock = socket(AF_INET , SOCK_STREAM , 0);
|
|
||||||
if (_sock == -1) {
|
|
||||||
throw ConcordiaException("Could not create socket for the lemmatizer.");
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string address = "127.0.0.1";
|
|
||||||
|
|
||||||
//setup address structure
|
|
||||||
if(inet_addr(address.c_str()) == -1) {
|
|
||||||
struct hostent *he;
|
|
||||||
struct in_addr **addr_list;
|
|
||||||
|
|
||||||
//resolve the hostname, its not an ip address
|
|
||||||
if ( (he = gethostbyname( address.c_str() ) ) == NULL) {
|
|
||||||
//gethostbyname failed
|
|
||||||
throw ConcordiaException("gethostbyname: Failed to resolve hostname");
|
|
||||||
}
|
|
||||||
|
|
||||||
//Cast the h_addr_list to in_addr , since h_addr_list also has the ip address in long format only
|
|
||||||
addr_list = (struct in_addr **) he->h_addr_list;
|
|
||||||
|
|
||||||
for(int i = 0; addr_list[i] != NULL; i++) {
|
|
||||||
_server.sin_addr = *addr_list[i];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else { //plain ip address
|
|
||||||
_server.sin_addr.s_addr = inet_addr(address.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
_server.sin_family = AF_INET;
|
|
||||||
_server.sin_port = htons(_port);
|
|
||||||
|
|
||||||
//Connect to remote server
|
|
||||||
if (connect(_sock , (struct sockaddr *) & _server , sizeof(_server)) < 0) {
|
|
||||||
throw ConcordiaException("Connect failed. Error on address: "+address+":"+boost::lexical_cast<std::string>(_port));
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
    Closes the lemmatizer socket if one is open and marks it as closed
    by setting _sock to -1.

    \returns always true
*/
bool SocketLemmatizer::_disconnect() {
    if (_sock != -1) {
        close(_sock);
        _sock = -1;
    }
    // The function is declared to return bool, so it must return a value
    // on every path.
    return true;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
Send data to the connected host
|
|
||||||
*/
|
|
||||||
bool SocketLemmatizer::_send_data(std::string data)
|
|
||||||
{
|
|
||||||
//Send some data
|
|
||||||
if(send(_sock , data.c_str() , strlen(data.c_str() ) , 0) < 0) {
|
|
||||||
throw ConcordiaException("Send failed");
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
Receive data from the connected host
|
|
||||||
*/
|
|
||||||
std::string SocketLemmatizer::_receive(int size=512)
|
|
||||||
{
|
|
||||||
char buffer[size];
|
|
||||||
std::string reply = "";
|
|
||||||
|
|
||||||
//Receive a reply from the server
|
|
||||||
bool dataAvailable = true;
|
|
||||||
while (dataAvailable) {
|
|
||||||
int amountReceived = recv(_sock , buffer , sizeof(buffer) , 0);
|
|
||||||
if (amountReceived < 0) {
|
|
||||||
throw ConcordiaException("Lemmatizer: recv failed");
|
|
||||||
} else if (amountReceived == 0) {
|
|
||||||
dataAvailable = false;
|
|
||||||
} else {
|
|
||||||
buffer[amountReceived] = '\0';
|
|
||||||
reply += buffer;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return reply;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
|
||||||
for (int i=0;i<5;i++) {
|
|
||||||
try {
|
|
||||||
_connect();
|
|
||||||
_send_data(languageCode+sentence+LEMMATIZER_DELIMITER);
|
|
||||||
std::string reply = _receive(512);
|
|
||||||
_disconnect();
|
|
||||||
return reply.substr(0,reply.find(LEMMATIZER_DELIMITER));
|
|
||||||
} catch (std::exception & e) {
|
|
||||||
_logger.logString("Problem with lemmatization of the sentence", sentence);
|
|
||||||
_logger.log("Waiting 2 seconds and retrying...");
|
|
||||||
sleep(2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
throw ConcordiaException("Can not lemmatize sentence: "+sentence);
|
|
||||||
}
|
|
@ -1,42 +0,0 @@
|
|||||||
#ifndef SOCKET_LEMMATIZER_HDR
|
|
||||||
#define SOCKET_LEMMATIZER_HDR
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <sys/socket.h> //socket
|
|
||||||
#include <arpa/inet.h> //inet_addr
|
|
||||||
#include <netdb.h> //hostent
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#include <concordia/concordia_exception.hpp>
|
|
||||||
|
|
||||||
#include "logger.hpp"
|
|
||||||
|
|
||||||
class SocketLemmatizer {
|
|
||||||
public:
|
|
||||||
/*! Constructor.
|
|
||||||
*/
|
|
||||||
explicit SocketLemmatizer(int port) throw(ConcordiaException);
|
|
||||||
/*! Destructor.
|
|
||||||
*/
|
|
||||||
virtual ~SocketLemmatizer();
|
|
||||||
|
|
||||||
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
|
|
||||||
private:
|
|
||||||
bool _connect();
|
|
||||||
|
|
||||||
bool _disconnect();
|
|
||||||
|
|
||||||
bool _send_data(std::string data);
|
|
||||||
|
|
||||||
std::string _receive(int size);
|
|
||||||
|
|
||||||
int _port;
|
|
||||||
|
|
||||||
int _sock;
|
|
||||||
|
|
||||||
struct sockaddr_in _server;
|
|
||||||
|
|
||||||
Logger _logger;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
@ -88,7 +88,7 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::stri
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Tm TmDAO::getTmInfo(int tmId) {
|
Tm TmDAO::getTm(int tmId) {
|
||||||
DBconnection connection;
|
DBconnection connection;
|
||||||
connection.startTransaction();
|
connection.startTransaction();
|
||||||
std::string query = "select tm.id, tm.name, tm.lemmatized, tm.paired_tm_id, source_language.code, target_language.code from tm inner join language as source_language on source_language.id = tm.source_lang_id inner join language as target_language on target_language.id = tm.target_lang_id where tm.id = $1::integer;";
|
std::string query = "select tm.id, tm.name, tm.lemmatized, tm.paired_tm_id, source_language.code, target_language.code from tm inner join language as source_language on source_language.id = tm.source_lang_id inner join language as target_language on target_language.id = tm.target_lang_id where tm.id = $1::integer;";
|
||||||
|
@ -28,7 +28,7 @@ public:
|
|||||||
|
|
||||||
std::vector<Tm> getTms();
|
std::vector<Tm> getTms();
|
||||||
|
|
||||||
std::pair<bool, std::string> getTmInfo(int tmId);
|
std::pair<bool, std::string> getTm(int tmId);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
@ -59,7 +59,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
|
|||||||
const std::vector<TokenizedSentence> & sourceSentences,
|
const std::vector<TokenizedSentence> & sourceSentences,
|
||||||
const std::vector<TokenizedSentence> & targetSentences,
|
const std::vector<TokenizedSentence> & targetSentences,
|
||||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||||
const int tmId) throw (ConcordiaException) {
|
const int tmId) {
|
||||||
|
|
||||||
DBconnection connection;
|
DBconnection connection;
|
||||||
std::vector<SUFFIX_MARKER_TYPE> newIds;
|
std::vector<SUFFIX_MARKER_TYPE> newIds;
|
||||||
@ -78,7 +78,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
|
|||||||
const std::vector<TokenizedSentence> & targetSentences,
|
const std::vector<TokenizedSentence> & targetSentences,
|
||||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||||
const std::vector<int> & sourceIds,
|
const std::vector<int> & sourceIds,
|
||||||
const int tmId) throw (ConcordiaException) {
|
const int tmId) {
|
||||||
|
|
||||||
DBconnection connection;
|
DBconnection connection;
|
||||||
std::vector<SUFFIX_MARKER_TYPE> newIds;
|
std::vector<SUFFIX_MARKER_TYPE> newIds;
|
||||||
@ -270,7 +270,7 @@ int UnitDAO::_addAlignedUnit (
|
|||||||
const TokenizedSentence & sourceSentence,
|
const TokenizedSentence & sourceSentence,
|
||||||
const TokenizedSentence & targetSentence,
|
const TokenizedSentence & targetSentence,
|
||||||
const std::vector<std::vector<int> > & alignments,
|
const std::vector<std::vector<int> > & alignments,
|
||||||
const int tmId) throw(ConcordiaException) {
|
const int tmId) {
|
||||||
|
|
||||||
if (sourceSentence.getTokens().size() != alignments.size()) {
|
if (sourceSentence.getTokens().size() != alignments.size()) {
|
||||||
// Here we check if the source sentence, taken from src.tok,
|
// Here we check if the source sentence, taken from src.tok,
|
||||||
@ -305,7 +305,7 @@ int UnitDAO::_addAlignedUnit (
|
|||||||
const TokenizedSentence & targetSentence,
|
const TokenizedSentence & targetSentence,
|
||||||
const std::vector<std::vector<int> > & alignments,
|
const std::vector<std::vector<int> > & alignments,
|
||||||
const int sourceId,
|
const int sourceId,
|
||||||
const int tmId) throw(ConcordiaException) {
|
const int tmId) {
|
||||||
|
|
||||||
if (sourceSentence.getTokens().size() != alignments.size()) {
|
if (sourceSentence.getTokens().size() != alignments.size()) {
|
||||||
// Here we check if the source sentence, taken from src.tok,
|
// Here we check if the source sentence, taken from src.tok,
|
||||||
|
@ -41,14 +41,14 @@ public:
|
|||||||
const std::vector<TokenizedSentence> & sourceSentences,
|
const std::vector<TokenizedSentence> & sourceSentences,
|
||||||
const std::vector<TokenizedSentence> & targetSentences,
|
const std::vector<TokenizedSentence> & targetSentences,
|
||||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||||
const int tmId) throw (ConcordiaException);
|
const int tmId);
|
||||||
|
|
||||||
std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
|
std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
|
||||||
const std::vector<TokenizedSentence> & sourceSentences,
|
const std::vector<TokenizedSentence> & sourceSentences,
|
||||||
const std::vector<TokenizedSentence> & targetSentences,
|
const std::vector<TokenizedSentence> & targetSentences,
|
||||||
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
const std::vector<std::vector<std::vector<int> > > & allAlignments,
|
||||||
const std::vector<int> & sourceIds,
|
const std::vector<int> & sourceIds,
|
||||||
const int tmId) throw (ConcordiaException);
|
const int tmId);
|
||||||
|
|
||||||
SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment);
|
SimpleSearchResult getSimpleSearchResult(const MatchedPatternFragment & fragment);
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ private:
|
|||||||
const TokenizedSentence & sourceSentence,
|
const TokenizedSentence & sourceSentence,
|
||||||
const TokenizedSentence & targetSentence,
|
const TokenizedSentence & targetSentence,
|
||||||
const std::vector<std::vector<int> > & alignments,
|
const std::vector<std::vector<int> > & alignments,
|
||||||
const int tmId) throw(ConcordiaException);
|
const int tmId);
|
||||||
|
|
||||||
int _addAlignedUnit(
|
int _addAlignedUnit(
|
||||||
DBconnection & connection,
|
DBconnection & connection,
|
||||||
@ -91,7 +91,7 @@ private:
|
|||||||
const TokenizedSentence & targetSentence,
|
const TokenizedSentence & targetSentence,
|
||||||
const std::vector<std::vector<int> > & alignments,
|
const std::vector<std::vector<int> > & alignments,
|
||||||
const int sourceId,
|
const int sourceId,
|
||||||
const int tmId) throw(ConcordiaException);
|
const int tmId);
|
||||||
|
|
||||||
std::vector<int> _getArray(std::string arrayString);
|
std::vector<int> _getArray(std::string arrayString);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user