working lemmatization
parent 8b0666c34d
commit 89fb77bf58
concordia-server/bool_param.cpp (new file, 24 lines)
@@ -0,0 +1,24 @@
#include "bool_param.hpp"


BoolParam::BoolParam(bool value):_value(value) {
}

BoolParam::~BoolParam() {
}

const char * BoolParam::getValue() {
    if (_value) {
        return "t";
    } else {
        return "f";
    }
}

const int BoolParam::getLength() {
    return 1;
}

const int BoolParam::isBinary() {
    return 0;
}
concordia-server/bool_param.hpp (new file, 24 lines)
@@ -0,0 +1,24 @@
#ifndef BOOL_PARAM_HDR
#define BOOL_PARAM_HDR

#include "query_param.hpp"

class BoolParam : public QueryParam {
public:
    /*! Constructor.
    */
    BoolParam(bool value);
    /*! Destructor.
    */
    virtual ~BoolParam();

    const char * getValue();

    const int getLength();

    const int isBinary();
private:
    bool _value;
};

#endif
@@ -5,6 +5,7 @@
#include <iostream>
#include <fstream>
#include <ctime>
#include <utility>

#include <concordia/interval.hpp>


@@ -19,16 +20,17 @@
ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
                                  throw(ConcordiaException) :
                                  _configFilePath(configFilePath) {

    std::vector<int> tmIds = _tmDAO.getTmIds();
    _concordiasMap = boost::shared_ptr<boost::ptr_map<int,Concordia> >(new boost::ptr_map<int,Concordia>());

    BOOST_FOREACH(int & tmId, tmIds) {
        _addTm(tmId);
    }
    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));

    _lemmatizerFacade = boost::shared_ptr<LemmatizerFacade> (new LemmatizerFacade());

    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap, _lemmatizerFacade));
    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap, _lemmatizerFacade));
}

ConcordiaServer::~ConcordiaServer() {

@@ -95,6 +97,27 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
                }
            }
            _indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
        } else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
            std::vector<std::string> sourceSentences;
            std::vector<std::string> targetSentences;
            std::vector<std::string> alignmentStrings;
            int tmId = d[TM_ID_PARAM].GetInt();
            // loading data from json
            const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
            Logger::log("addAlignedLemmatizedSentences");
            Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
            Logger::logInt("tm id", tmId);
            for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
                if (sentencesArray[i].Size() != 3) {
                    JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
                    break;
                } else {
                    sourceSentences.push_back(sentencesArray[i][0].GetString());
                    targetSentences.push_back(sentencesArray[i][1].GetString());
                    alignmentStrings.push_back(sentencesArray[i][2].GetString());
                }
            }
            _indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
        } else if (operation == "lemmatize") {
            std::string sentence = _getStringParameter(d, "sentence");
            std::string languageCode = _getStringParameter(d, "languageCode");

@@ -130,7 +153,8 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
            int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
            int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
            std::string name = _getStringParameter(d, NAME_PARAM);
            int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
            bool lemmatized = _getBoolParameter(d, TM_LEMMATIZED_PARAM);
            int newId = _tmDAO.addTm(sourceLangId, targetLangId, name, lemmatized);
            _addTm(newId);

            jsonWriter.StartObject();

@@ -179,6 +203,17 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name
    }
}

int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
                                              throw (ConcordiaException) {
    rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
    if (itr != d.MemberEnd()) {
        bool value = itr->value.GetBool();
        return value;
    } else {
        throw ConcordiaException("missing parameter: " + std::string(name));
    }
}

void ConcordiaServer::_addTm(int tmId) {
    std::stringstream indexPath;
    indexPath << INDEX_DIRECTORY << "/tm_" << tmId;
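With the new TM_LEMMATIZED_PARAM flag, a client can mark a translation memory as lemmatized when it creates it, and the server will then route that memory's sentences through the lemmatizer. A minimal request sketch follows; the host and port are assumptions (use whatever address your concordia-server instance listens on), while the field names mirror the addTm branch of handleRequest above.

# Sketch only (Python 3): create a lemmatized TM over the JSON API.
import json
import urllib.request

ADDRESS = "http://localhost:8800"  # assumed address of a running concordia-server

def post(payload):
    # POST a JSON document and decode the JSON reply.
    req = urllib.request.Request(
        ADDRESS,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))

# 'tmLemmatized' is the new boolean read by _getBoolParameter().
print(post({
    "operation": "addTm",
    "sourceLangId": 2,
    "targetLangId": 1,
    "name": "europarl_sample",
    "tmLemmatized": True,
}))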
@@ -38,6 +38,8 @@ private:

    int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);

    int _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);

    void _addTm(int tmId);

    std::string _configFilePath;
@@ -16,7 +16,9 @@
#define SOURCE_SENTENCE_PARAM "sourceSentence"
#define TARGET_SENTENCE_PARAM "targetSentence"
#define TM_ID_PARAM "tmId"
#define TM_LEMMATIZED_PARAM "tmLemmatized"
#define SENTENCES_PARAM "sentences"
#define EXAMPLES_PARAM "examples"
#define SOURCE_LANG_PARAM "sourceLangId"
#define TARGET_LANG_PARAM "targetLangId"
#define NAME_PARAM "name"

@@ -25,6 +27,7 @@
#define ADD_SENTENCE_OP "addSentence"
#define ADD_SENTENCES_OP "addSentences"
#define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
#define REFRESH_INDEX_OP "refreshIndex"
#define SIMPLE_SEARCH_OP "simpleSearch"
#define CONCORDIA_SEARCH_OP "concordiaSearch"
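The new ADD_ALIGNED_LEMMATIZED_SENTENCES_OP expects EXAMPLES_PARAM to hold an array of three-element arrays: raw source sentence, raw target sentence, and the lemmatized source with alignment markup. A hedged payload sketch (the sentences and the GIZA-style "({ ... })" markup are invented for illustration; the keys come from the defines above):

# Sketch of an "addAlignedLemmatizedSentences" payload; each example is
# [sourceSentence, targetSentence, alignmentString], matching the
# three-element check in ConcordiaServer::handleRequest.
payload = {
    "operation": "addAlignedLemmatizedSentences",
    "tmId": 1,  # id returned by an earlier addTm call
    "examples": [
        [
            "This is a test.",   # raw source, stored in the database
            "To jest test.",     # raw target, stored in the database
            "NULL ({ }) this ({ 1 }) be ({ 2 }) a ({ }) test ({ 3 }) . ({ 4 })",  # lemmatized aligned source, used for indexing
        ],
    ],
}
# POSTed as JSON (for example with the post() helper sketched above), this
# makes the server store the raw pair while indexing the lemmatized source.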
@@ -17,7 +17,7 @@ DBconnection::DBconnection() throw(ConcordiaException) {
        ss << "Connection string: " << connectionInfo;
        throw ConcordiaException(ss.str());
    }

}

DBconnection::~DBconnection() {

@@ -90,8 +90,8 @@ PGresult * DBconnection::execute(std::string query,
        paramFormats[index] = param->isBinary();
        index++;
    }


    PGresult * result = PQexecParams(_connection,
                                     query.c_str(),
                                     params.size(),

@@ -129,7 +129,18 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (Concor
    } catch (std::exception & e) {
        std::stringstream ss;
        ss << "Error getting int value. Message: " << e.what();
        throw ConcordiaException(ss.str());
        throw ConcordiaException(ss.str());
    }
}

bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
    try {
        char * valueStr = PQgetvalue(result,row,col);
        return std::string(valueStr) == "t";
    } catch (std::exception & e) {
        std::stringstream ss;
        ss << "Error getting bool value. Message: " << e.what();
        throw ConcordiaException(ss.str());
    }
}

@@ -150,7 +161,6 @@ int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
    } catch (std::exception & e) {
        std::stringstream ss;
        ss << "Error getting int value. Message: " << e.what();
        throw ConcordiaException(ss.str());
        throw ConcordiaException(ss.str());
    }
}

@@ -31,6 +31,8 @@ public:

    int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);

    bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);

    std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);

    int getRowCount(PGresult * result) throw (ConcordiaException);
@@ -14,9 +14,11 @@
#include "json_generator.hpp"
#include "logger.hpp"

IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
                                 boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                             throw(ConcordiaException):
                                             _concordiasMap(concordiasMap) {
                                             _concordiasMap(concordiasMap),
                                             _lemmatizerFacade(lemmatizerFacade) {
}

IndexController::~IndexController() {

@@ -32,9 +34,10 @@ void IndexController::addSentence(
    try {
        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
            TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
            TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
            int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
            it->second->addTokenizedExample(tokenizedSentence, sentenceId);
            int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
            it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
            it->second->refreshSAfromRAM();

            jsonWriter.StartObject();

@@ -42,20 +45,20 @@ void IndexController::addSentence(
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (std::exception & e) {
        std::stringstream errorstream;
        errorstream << "general error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    } catch (...) {
        std::stringstream errorstream;
        errorstream << "unexpected error occurred";
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}

@@ -67,21 +70,22 @@ void IndexController::addSentences(
    try {
        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
            std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
            std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
            it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
            it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);

            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}

@@ -96,28 +100,66 @@ void IndexController::addAlignedSentences(
            std::vector<std::string> sourceSentences;
            std::vector<std::vector<std::vector<int> > > allAlignments;
            _getSourceSentencesAndAlignments(sourceSentences, allAlignments, rawSourceSentences);

            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);

            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, true);
            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);

            std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
            for(int index = 0; index < tokenizedSourceSentences.size(); index++) {
                it->second->addTokenizedExample(tokenizedSourceSentences.at(index), sentenceIds.at(index));
            }
            }
            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}

void IndexController::addAlignedLemmatizedSentences(
                             rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                             const std::vector<std::string> & sourceSentences,
                             const std::vector<std::string> & targetSentences,
                             const std::vector<std::string> & alignmentStrings,
                             const int tmId) {
    try {
        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
            std::vector<std::string> lemmatizedSourceSentences;
            std::vector<std::vector<std::vector<int> > > allAlignments;
            _getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);

            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);

            std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
                    _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
            for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
                it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
            }
            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }
}


void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                          const int tmId) {
    try {

@@ -130,12 +172,12 @@ void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuf
            jsonWriter.String("success");
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no such tm!");
            JsonGenerator::signalError(jsonWriter, "no such tm!");
        }
    } catch (ConcordiaException & e) {
        std::stringstream errorstream;
        errorstream << "concordia error: " << e.what();
        JsonGenerator::signalError(jsonWriter, errorstream.str());
        JsonGenerator::signalError(jsonWriter, errorstream.str());
    }

}

@@ -147,10 +189,10 @@ void IndexController::_getSourceSentencesAndAlignments(

    for (int i = 0; i<rawSourceSentences.size(); i++) {
        std::string rawSourceSentence = rawSourceSentences[i];

        std::string sourceSentence = "";
        std::vector<std::vector<int> > alignments;

        UnicodeString s(rawSourceSentence.c_str());
        boost::u32regex_iterator<const UChar*> begin(
            boost::make_u32regex_iterator(

@@ -159,21 +201,21 @@ void IndexController::_getSourceSentencesAndAlignments(
            )
        );
        boost::u32regex_iterator<const UChar*> end;

        for (; begin != end; ++begin) {
            UnicodeString tokenUTF8((*begin)[1].first, (*begin).length(1));
            std::string token;
            tokenUTF8.toUTF8String(token);

            if (token != "NULL") {
                std::string numbers((*begin)[2].first, (*begin)[2].second);
                std::string numbers((*begin)[2].first, (*begin)[2].second);
                std::istringstream iss(numbers);
                std::vector<std::string> numberStrings;
                std::copy(std::istream_iterator<std::string>(iss),
                          std::istream_iterator<std::string>(),
                          std::back_inserter(numberStrings));

                std::vector<int> tokenAlignments;
                std::vector<int> tokenAlignments;
                for (int j=0;j<numberStrings.size();j++) {
                    int n = atoi(numberStrings[j].c_str()) - 1; //subtracting 1 as we want alignments to be 0-based
                    tokenAlignments.push_back(n);

@@ -182,11 +224,10 @@ void IndexController::_getSourceSentencesAndAlignments(
                sourceSentence += token + " ";
            }
        }

        sourceSentence = sourceSentence.substr(0, sourceSentence.length()-1);

        sourceSentences.push_back(sourceSentence);
        allAlignments.push_back(alignments);
    }
}

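_getSourceSentencesAndAlignments drops the special NULL token and turns the 1-based numbers inside each "({ ... })" group into 0-based target positions. The exact boost::u32regex is not visible in this hunk, so the following Python re-implementation is only a sketch of the same logic, handy for sanity-checking alignment strings before sending them to the server.

import re

# Assumed token pattern: "token ({ 1 2 })"; the real code uses a boost::u32regex
# over UChar*, this only mirrors the data flow visible above.
TOKEN_RE = re.compile(r"(\S+) \(\{([\d ]*)\}\)")

def parse_aligned_line(raw):
    source_tokens, alignments = [], []
    for token, numbers in TOKEN_RE.findall(raw):
        if token == "NULL":   # NULL alignments are skipped, as in the C++ loop
            continue
        source_tokens.append(token)
        # 1-based GIZA positions become 0-based, mirroring "atoi(...) - 1"
        alignments.append([int(n) - 1 for n in numbers.split()])
    return " ".join(source_tokens), alignments

print(parse_aligned_line("NULL ({ }) this ({ 1 }) be ({ 2 }) test ({ 3 4 })"))
# -> ('this be test', [[0], [1], [2, 3]])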
@@ -10,6 +10,8 @@

#include "unit_dao.hpp"
#include "lemmatizer_facade.hpp"

#include "rapidjson/writer.h"

@@ -17,7 +19,8 @@ class IndexController {
public:
    /*! Constructor.
    */
    explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
    explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
                             boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                    throw(ConcordiaException);
    /*! Destructor.
    */

@@ -38,9 +41,16 @@ public:
                     const std::vector<std::string> & targetSentences,
                     const int tmId);

    void addAlignedLemmatizedSentences(
                     rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                     const std::vector<std::string> & sourceSentences,
                     const std::vector<std::string> & targetSentences,
                     const std::vector<std::string> & alignmentStrings,
                     const int tmId);

    void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                     const int tmId);

private:
    void _getSourceSentencesAndAlignments(
        std::vector<std::string> & sourceSentences,

@@ -48,7 +58,9 @@ private:
        const std::vector<std::string> & rawSourceSentences);

    boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;

    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;

    UnitDAO _unitDAO;
};

@@ -1,5 +1,7 @@
#include "lemmatizer_facade.hpp"

#include <boost/foreach.hpp>


LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
    _lemmatizersMap = boost::ptr_map<std::string,SocketLemmatizer>();

@@ -28,3 +30,26 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
    }

}

std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
    if (tmInfo.first) {
        return lemmatizeSentence(tmInfo.second, pattern);
    } else {
        return pattern;
    }
}

std::vector<std::string> LemmatizerFacade::lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId) {
    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
    if (tmInfo.first) {
        std::vector<std::string> result;
        BOOST_FOREACH(std::string & pattern, patterns) {
            result.push_back(lemmatizeSentence(tmInfo.second, pattern));
        }
        return result;
    } else {
        return patterns;
    }

}

@@ -2,6 +2,7 @@
#define LEMMATIZER_FACADE_HDR

#include "socket_lemmatizer.hpp"
#include "tm_dao.hpp"

#include <string>
#include <concordia/concordia_exception.hpp>

@@ -18,8 +19,15 @@ public:
    virtual ~LemmatizerFacade();

    std::string lemmatizeSentence(std::string languageCode, std::string sentence);

    std::string lemmatizeIfNeeded(std::string pattern, int tmId);

    std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);

private:
    boost::ptr_map<std::string,SocketLemmatizer> _lemmatizersMap;

    TmDAO _tmDAO;
};

#endif

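The facade also serves the standalone "lemmatize" operation visible in handleRequest, which takes a raw sentence and a language code. A minimal payload sketch (the parameter names are the ones read in ConcordiaServer::handleRequest; the values are invented):

# Hypothetical "lemmatize" request body; it would be POSTed like the addTm
# sketch earlier in this diff.
lemmatize_request = {
    "operation": "lemmatize",
    "languageCode": "pl",        # assumed language code value
    "sentence": "Ala ma kota",
}
print(lemmatize_request)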
@@ -8,9 +8,11 @@
#include "logger.hpp"


SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
                                       boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                                throw(ConcordiaException):
                                                _concordiasMap(concordiasMap) {
                                                _concordiasMap(concordiasMap),
                                                _lemmatizerFacade(lemmatizerFacade) {
}

SearcherController::~SearcherController() {

@@ -22,6 +24,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
                                      const int tmId) {
    boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
    if (it != _concordiasMap->end()) {
        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
        std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));

        jsonWriter.StartObject();

@@ -30,48 +33,49 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
        jsonWriter.String("results");
        jsonWriter.StartArray();
        BOOST_FOREACH(SimpleSearchResult & result, results) {
            JsonGenerator::writeSearchResult(jsonWriter, result);
        }
            JsonGenerator::writeSearchResult(jsonWriter, result);
        }
        jsonWriter.EndArray();
        jsonWriter.EndObject();
    } else {
        JsonGenerator::signalError(jsonWriter, "no such tm!");
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
}

void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                               std::string & pattern,
                                               const std::vector<Interval> & intervals,
                                               const int tmId) {
                                               const int tmId) {
    boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
    if (it != _concordiasMap->end()) {
        if (intervals.size() > 0) {
            // std::string shortPattern = pattern.substr(intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
            pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
            std::string shortPattern = _substrUTF8(pattern, intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());

            Logger::log("concordiaPhraseSearch");
            Logger::logString("short pattern", shortPattern);
            std::vector<SimpleSearchResult> shortPatternResults = _unitDAO.getSearchResults(it->second->simpleSearch(shortPattern));

            jsonWriter.StartObject();
            jsonWriter.String("status");
            jsonWriter.String("success");
            jsonWriter.String("found");
            if (shortPatternResults.size() > 0) {
                jsonWriter.Bool(true);

                std::vector<SimpleSearchResult> bestOverlay;

                int currStart = 0;
                BOOST_FOREACH(const Interval & interval, intervals) {
                    CompleteConcordiaSearchResult restResult = _unitDAO.getConcordiaResult(
                        it->second->concordiaSearch(pattern.substr(currStart, interval.getStart()-currStart)));
                    restResult.offsetPattern(currStart);
                    bestOverlay.insert(bestOverlay.end(), restResult.getBestOverlay().begin(), restResult.getBestOverlay().end());

                    SimpleSearchResult shortPatternresult = shortPatternResults[0];
                    shortPatternresult.setMatchedPatternStart(interval.getStart());
                    shortPatternresult.setMatchedPatternEnd(interval.getEnd());

@@ -82,26 +86,26 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
                    it->second->concordiaSearch(_substrUTF8(pattern,currStart,INT_MAX)));
                lastRestResult.offsetPattern(currStart);
                bestOverlay.insert(bestOverlay.end(), lastRestResult.getBestOverlay().begin(), lastRestResult.getBestOverlay().end());

                jsonWriter.String("result");
                jsonWriter.StartObject();
                jsonWriter.String("bestOverlay");
                jsonWriter.StartArray();
                BOOST_FOREACH(SimpleSearchResult & simpleResult, bestOverlay) {
                    JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
                }
                    JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
                }
                jsonWriter.EndArray();
                jsonWriter.EndObject();
            } else {
                jsonWriter.Bool(false);
                jsonWriter.Bool(false);
            }
            jsonWriter.EndObject();
        } else {
            JsonGenerator::signalError(jsonWriter, "no intervals for phrase search");
        }
    } else {
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
}


@@ -111,8 +115,9 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff

    boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
    if (it != _concordiasMap->end()) {
        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
        CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));

        jsonWriter.StartObject();
        jsonWriter.String("status");
        jsonWriter.String("success");

@@ -123,16 +128,16 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
        jsonWriter.String("bestOverlay");
        jsonWriter.StartArray();
        BOOST_FOREACH(SimpleSearchResult & simpleResult, result.getBestOverlay()) {
            JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
        }
            JsonGenerator::writeSearchResult(jsonWriter, simpleResult);
        }
        jsonWriter.EndArray();
        jsonWriter.EndObject();

        jsonWriter.EndObject();
    } else {
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
        JsonGenerator::signalError(jsonWriter, "no such tm!");
    }
}

std::string SearcherController::_substrUTF8(std::string source, int start, int length) {

@@ -146,6 +151,3 @@ std::string SearcherController::_substrUTF8(std::string source, int start, int l

    return result;
}

@@ -10,6 +10,7 @@

#include "unit_dao.hpp"
#include "simple_search_result.hpp"
#include "lemmatizer_facade.hpp"
#include "rapidjson/writer.h"


@@ -17,8 +18,9 @@ class SearcherController {
public:
    /*! Constructor.
    */
    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
                    throw(ConcordiaException);
    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
                                boost::shared_ptr<LemmatizerFacade> LemmatizerFacade)
                    throw(ConcordiaException);
    /*! Destructor.
    */
    virtual ~SearcherController();

@@ -40,7 +42,9 @@ private:
    std::string _substrUTF8(std::string source, int start, int length);

    boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;

    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;

    UnitDAO _unitDAO;
};

@@ -3,6 +3,7 @@
#include "query_param.hpp"
#include "string_param.hpp"
#include "int_param.hpp"
#include "bool_param.hpp"
#include "int_array_param.hpp"
#include "logger.hpp"

@@ -27,20 +28,25 @@ std::vector<int> TmDAO::getTmIds() {
    }
    connection.clearResult(dbResult);
    connection.endTransaction();

    return result;
}

int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
    addTm(sourceLangId, targetLangId, name, false);
}

int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
    DBconnection connection;
    connection.startTransaction();

    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name) values($1::integer,$2::integer,$3::text) RETURNING id";
    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name, lemmatized) values($1::integer,$2::integer,$3::text,$4::bool) RETURNING id";
    std::vector<QueryParam*> params;
    params.push_back(new IntParam(sourceLangId));
    params.push_back(new IntParam(targetLangId));
    params.push_back(new StringParam(name));

    params.push_back(new BoolParam(lemmatized));

    PGresult * result = connection.execute(query, params);
    int newId = connection.getIntValue(result, 0, 0);
    connection.clearResult(result);

@@ -48,8 +54,23 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::stri
    BOOST_FOREACH (QueryParam * param, params) {
        delete param;
    }

    return newId;

}

std::pair<bool, std::string> TmDAO::getTmInfo(int tmId) {
    DBconnection connection;
    connection.startTransaction();
    std::string query = "select tm.id, tm.lemmatized, language.code from tm inner join language on language.id = tm.source_lang_id where tm.id = $1::integer;";
    std::vector<QueryParam*> params;
    params.push_back(new IntParam(tmId));
    PGresult * dbResult = connection.execute(query, params);
    bool lemmatized = connection.getBoolValue(dbResult, 0, 1);
    std::string languageCode = connection.getStringValue(dbResult, 0, 2);
    connection.clearResult(dbResult);
    connection.endTransaction();

    return std::pair<bool, std::string>(lemmatized, languageCode);

}

@@ -3,6 +3,7 @@

#include <string>
#include <vector>
#include <utility>

#include <concordia/common/config.hpp>
#include "db_connection.hpp"

@@ -18,8 +19,12 @@ public:

    int addTm(const int sourceLangId, const int targetLangId, const std::string name);

    int addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized);

    std::vector<int> getTmIds();

    std::pair<bool, std::string> getTmInfo(int tmId);

private:

};

@@ -22,7 +22,7 @@ int UnitDAO::addSentence(
        const TokenizedSentence & sourceSentence,
        const std::string & targetSentence,
        const int tmId) {

    DBconnection connection;
    connection.startTransaction();
    int newId = _addSingleSentence(connection, sourceSentence, targetSentence, tmId);

@@ -38,7 +38,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addSentences(
    std::vector<SUFFIX_MARKER_TYPE> newIds;
    connection.startTransaction();
    int index = 0;
    BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {
    BOOST_FOREACH(const TokenizedSentence & sourceSentence, sourceSentences) {
        newIds.push_back(_addSingleSentence(connection, sourceSentence, targetSentences.at(index), tmId));
        index++;
    }

@@ -50,7 +50,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
    const std::vector<TokenizedSentence> & sourceSentences,
    const std::vector<TokenizedSentence> & targetSentences,
    const std::vector<std::vector<std::vector<int> > > & allAlignments,
    const int tmId) {
    const int tmId) throw (ConcordiaException) {

    DBconnection connection;
    std::vector<SUFFIX_MARKER_TYPE> newIds;

@@ -59,9 +59,9 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
    for (int i=0; i< sourceSentences.size(); i++) {
        newIds.push_back(_addAlignedUnit(connection, sourceSentences.at(i), targetSentences.at(i), allAlignments.at(i), tmId));
    }

    connection.endTransaction();
    return newIds;
    return newIds;
}

std::vector<SimpleSearchResult> UnitDAO::getSearchResults(const std::vector<MatchedPatternFragment> & fragments) {

@@ -83,7 +83,7 @@ void UnitDAO::_getResultsFromFragments(
    std::vector<SimpleSearchResult> & results,
    const std::vector<MatchedPatternFragment> & fragments,
    const TokenizedSentence & tokenizedPattern) {

    DBconnection connection;
    connection.startTransaction();

@@ -95,9 +95,9 @@ void UnitDAO::_getResultsFromFragments(
        matchedPatternStart = tokenizedPattern.getTokens().at(fragment.getStart()).getStart();
        matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
    }

    std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
    std::vector<QueryParam*> params;
    params.push_back(new IntParam(2*fragment.getExampleOffset()+1));

@@ -116,7 +116,7 @@ void UnitDAO::_getResultsFromFragments(
        delete param;
    }

    // now add all target fragments matched with this fragment
    // now add all target fragments matched with this fragment
    std::string targetQuery = "SELECT target_token_pos, target_tokens[2*target_token_pos+1], target_tokens[2*target_token_pos+2] FROM unit INNER JOIN alignment ON alignment.unit_id = unit.id AND unit.id = $1::integer AND source_token_pos between $2::integer and $3::integer ORDER BY target_token_pos";
    std::vector<QueryParam*> targetParams;
    targetParams.push_back(new IntParam(fragment.getExampleId()));

@@ -127,12 +127,12 @@ void UnitDAO::_getResultsFromFragments(
    int prevPos = -2;
    int currStart = -1;
    int currEnd = -1;

    for (int i=0;i<connection.getRowCount(targetResult);i++) {
        int targetPos = connection.getIntValue(targetResult, i, 0);
        int targetStart = connection.getIntValue(targetResult, i, 1);
        int targetEnd = connection.getIntValue(targetResult, i, 2);

        if (prevPos < targetPos - 1) { // beginning of detached fragment
            // check if there is a fragment to end
            if (currStart >= 0) {

@@ -141,7 +141,7 @@ void UnitDAO::_getResultsFromFragments(
            currStart = targetStart;
        }

        currEnd = targetEnd;
        currEnd = targetEnd;
        prevPos = targetPos;
    }

@@ -154,9 +154,9 @@ void UnitDAO::_getResultsFromFragments(
    BOOST_FOREACH (QueryParam * param, targetParams) {
        delete param;
    }

        results.push_back(ssResult);
    }
    }
    connection.endTransaction();
}

@@ -181,25 +181,29 @@ int UnitDAO::_addSingleSentence(
    params.push_back(new StringParam(targetSentence));
    params.push_back(new IntParam(tmId));
    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));

    PGresult * result = connection.execute(query, params);
    int newId = connection.getIntValue(result, 0, 0);
    connection.clearResult(result);
    BOOST_FOREACH (QueryParam * param, params) {
        delete param;
    }

    return newId;
}


int UnitDAO::_addAlignedUnit(
int UnitDAO::_addAlignedUnit (
            DBconnection & connection,
            const TokenizedSentence & sourceSentence,
            const TokenizedSentence & targetSentence,
            const std::vector<std::vector<int> > & alignments,
            const int tmId) {

            const int tmId) throw(ConcordiaException) {

    if (sourceSentence.getTokens().size() != alignments.size()) {
        throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
    }

    std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
    std::vector<QueryParam*> params;
    params.push_back(new StringParam(sourceSentence.getSentence()));

@@ -207,14 +211,14 @@ int UnitDAO::_addAlignedUnit(
    params.push_back(new IntParam(tmId));
    params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
    params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));

    PGresult * result = connection.execute(query, params);
    int newId = connection.getIntValue(result, 0, 0);
    connection.clearResult(result);
    BOOST_FOREACH (QueryParam * param, params) {
        delete param;
    }

    // add alignments
    bool nonEmpty = false;
    std::stringstream alignmentsQuery;

@@ -230,10 +234,8 @@ int UnitDAO::_addAlignedUnit(
    query = alignmentsQuery.str();
    query = query.substr(0, query.length()-1);
    PGresult * result = connection.execute(query);
    connection.clearResult(result);
    connection.clearResult(result);
    }

    return newId;
}

@@ -9,6 +9,7 @@
#include <concordia/substring_occurence.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/concordia_search_result.hpp>
#include <concordia/concordia_exception.hpp>
#include <boost/shared_ptr.hpp>

#include "simple_search_result.hpp"

@@ -33,13 +34,13 @@ public:
        const std::vector<TokenizedSentence> & sourceSentences,
        const std::vector<std::string> & targetSentences,
        const int tmId);

    std::vector<SUFFIX_MARKER_TYPE> addAlignedSentences(
        const std::vector<TokenizedSentence> & sourceSentences,
        const std::vector<TokenizedSentence> & targetSentences,
        const std::vector<std::vector<std::vector<int> > > & allAlignments,
        const int tmId);

        const int tmId) throw (ConcordiaException);

    std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);

    CompleteConcordiaSearchResult getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult);

@@ -50,7 +51,7 @@ private:
        const TokenizedSentence & tokenizedPattern);

    std::vector<int> _getTokenPositions(const TokenizedSentence & ts);

    int _addSingleSentence(
        DBconnection & connection,
        const TokenizedSentence & sourceSentence,

@@ -62,7 +63,7 @@ private:
        const TokenizedSentence & sourceSentence,
        const TokenizedSentence & targetSentence,
        const std::vector<std::vector<int> > & alignments,
        const int tmId);
        const int tmId) throw(ConcordiaException);
};

#endif

@@ -3,7 +3,8 @@ CREATE TABLE tm (
    id SERIAL PRIMARY KEY,
    source_lang_id integer,
    target_lang_id integer,
    name varchar(40)
    name varchar(40),
    lemmatized bool DEFAULT false
);

DROP TABLE IF EXISTS language;

Binary file not shown.
@@ -12,12 +12,20 @@ namespace LemmaGenSentenceLemmatizer
        {
            if (args.Length == 1)
            {
                SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
                string line = Console.ReadLine();
                while (!string.IsNullOrEmpty(line))
                try
                {
                    Console.WriteLine(lemmatizer.lemmatizeSentence(line));
                    line = Console.ReadLine();
                    SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
                    string line = Console.ReadLine();
                    while (line != null)
                    {
                        Console.WriteLine(lemmatizer.lemmatizeSentence(line));
                        line = Console.ReadLine();
                    }

                }
                catch (Exception ex)
                {
                    Console.WriteLine("Exception occurred: " + ex.Message);
                }

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,10 +1,22 @@
SRC_LANG=en
TRG_LANG=pl
CORPUS_NAME=europarl
CORPUS_NAME=europarljrc

all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt

clean-intermediate-files:
	rm -f corpora/$(CORPUS_NAME)/*.lem
	rm -f corpora/$(CORPUS_NAME)/*.low
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
	rm -f corpora/$(CORPUS_NAME)/*.vcb
	rm -f corpora/$(CORPUS_NAME)/*.snt
	rm -f corpora/$(CORPUS_NAME)/*.cooc
	rm -f corpora/$(CORPUS_NAME)/aligned*part*
	rm -f corpora/$(CORPUS_NAME)/giza.cfg


clean:
	rm -f corpora/$(CORPUS_NAME)/*.tok

mgiza-aligner/clean-corpus-n.perl (new executable file, 168 lines)
@@ -0,0 +1,168 @@
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
use warnings;
use strict;
use Getopt::Long;
my $help;
my $lc = 0; # lowercase the corpus?
my $ignore_ratio = 0;
my $ignore_xml = 0;
my $enc = "utf8"; # encoding of the input and output files
                  # set to anything else you wish, but I have not tested it yet
my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
                            # is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000)
                            # and crashes if it encounters a word that exceeds it
my $ratio = 9;

GetOptions(
  "help" => \$help,
  "lowercase|lc" => \$lc,
  "encoding=s" => \$enc,
  "ratio=f" => \$ratio,
  "ignore-ratio" => \$ignore_ratio,
  "ignore-xml" => \$ignore_xml,
  "max-word-length|mwl=s" => \$max_word_length
) or exit(1);

if (scalar(@ARGV) < 6 || $help) {
  print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n";
  exit;
}

my $corpus = $ARGV[0];
my $l1 = $ARGV[1];
my $l2 = $ARGV[2];
my $out = $ARGV[3];
my $min = $ARGV[4];
my $max = $ARGV[5];

my $linesRetainedFile = "";
if (scalar(@ARGV) > 6) {
  $linesRetainedFile = $ARGV[6];
  open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile";
}

print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n";

my $opn = undef;
my $l1input = "$corpus.$l1";
if (-e $l1input) {
  $opn = $l1input;
} elsif (-e $l1input.".gz") {
  $opn = "gunzip -c $l1input.gz |";
} else {
  die "Error: $l1input does not exist";
}
open(F,$opn) or die "Can't open '$opn'";
$opn = undef;
my $l2input = "$corpus.$l2";
if (-e $l2input) {
  $opn = $l2input;
} elsif (-e $l2input.".gz") {
  $opn = "gunzip -c $l2input.gz |";
} else {
  die "Error: $l2input does not exist";
}

open(E,$opn) or die "Can't open '$opn'";

open(FO,">$out.$l1") or die "Can't write $out.$l1";
open(EO,">$out.$l2") or die "Can't write $out.$l2";

# necessary for proper lowercasing
my $binmode;
if ($enc eq "utf8") {
  $binmode = ":utf8";
} else {
  $binmode = ":encoding($enc)";
}
binmode(F, $binmode);
binmode(E, $binmode);
binmode(FO, $binmode);
binmode(EO, $binmode);

my $innr = 0;
my $outnr = 0;
my $factored_flag;
while(my $f = <F>) {
  $innr++;
  print STDERR "." if $innr % 10000 == 0;
  print STDERR "($innr)" if $innr % 100000 == 0;
  my $e = <E>;
  die "$corpus.$l2 is too short!" if !defined $e;
  chomp($e);
  chomp($f);
  if ($innr == 1) {
    $factored_flag = ($e =~ /\|/ || $f =~ /\|/);
  }

  #if lowercasing, lowercase
  if ($lc) {
    $e = lc($e);
    $f = lc($f);
  }

  $e =~ s/\|//g unless $factored_flag;
  $e =~ s/\s+/ /g;
  $e =~ s/^ //;
  $e =~ s/ $//;
  $f =~ s/\|//g unless $factored_flag;
  $f =~ s/\s+/ /g;
  $f =~ s/^ //;
  $f =~ s/ $//;
  next if $f eq '';
  next if $e eq '';

  my $ec = &word_count($e);
  my $fc = &word_count($f);
  next if $ec > $max;
  next if $fc > $max;
  next if $ec < $min;
  next if $fc < $min;
  next if !$ignore_ratio && $ec/$fc > $ratio;
  next if !$ignore_ratio && $fc/$ec > $ratio;
  # Skip this segment if any factor is longer than $max_word_length
  my $max_word_length_plus_one = $max_word_length + 1;
  next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
  next if $f =~ /[^\s\|]{$max_word_length_plus_one}/;

  # An extra check: none of the factors can be blank!
  die "There is a blank factor in $corpus.$l1 on line $innr: $f"
    if $f =~ /[ \|]\|/;
  die "There is a blank factor in $corpus.$l2 on line $innr: $e"
    if $e =~ /[ \|]\|/;

  $outnr++;
  print FO $f."\n";
  print EO $e."\n";

  if ($linesRetainedFile ne "") {
    print LINES_RETAINED $innr."\n";
  }
}

if ($linesRetainedFile ne "") {
  close LINES_RETAINED;
}

print STDERR "\n";
my $e = <E>;
die "$corpus.$l2 is too long!" if defined $e;

print STDERR "Input sentences: $innr Output sentences: $outnr\n";

sub word_count {
  my ($line) = @_;
  if ($ignore_xml) {
    $line =~ s/<\S[^>]*\S>/ /g;
    $line =~ s/\s+/ /g;
    $line =~ s/^ //g;
    $line =~ s/ $//g;
  }
  my @w = split(/ /,$line);
  return scalar @w;
}
mgiza-aligner/sortGizaAlignments.py (new executable file, 26 lines)
@@ -0,0 +1,26 @@
#!/usr/bin/python3

import sys, re

examples_dict = {}
p = re.compile("# Sentence pair \((\d+)\)")

i = 0
for line in sys.stdin:
    line = line.strip()
    if i % 3 == 0:
        current_example = [line]
        m = p.match(line)
        if m:
            current_key = int(m.group(1))
        else:
            raise Exception("Wrong line: "+line)
    elif i % 3 == 1:
        current_example.append(line)
    else:
        current_example.append(line)
        examples_dict[current_key] = current_example
    i+=1

for key in sorted(examples_dict.keys()):
    print ('\n'.join(examples_dict[key]))
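sortGizaAlignments.py assumes the concatenated mgiza output is made of three-line blocks and reorders them by sentence-pair number, since chunks produced by parallel mgiza workers are concatenated out of order. A hedged illustration of one block (standard GIZA++/mgiza formatting is assumed here; the script itself only requires the "# Sentence pair (N)" prefix on the first line):

# Illustration only: one three-line block in the shape the script expects.
block = "\n".join([
    "# Sentence pair (2) source length 3 target length 3 alignment score : 0.01",  # header; fields after the pair number are assumed
    "to jest test",                                                                # target sentence
    "NULL ({ }) this ({ 1 }) be ({ 2 }) test ({ 3 })",                             # source tokens with 1-based target positions
])
print(block)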
tests/addAlignedLemmatizedTM.py (60 changed lines, mode changed from normal to executable)
@@ -21,10 +21,15 @@ def file_len(fname):
        pass
    return i + 1

def add_data(data):
def add_examples(examplesData):
    req = urllib2.Request(address)
    req.add_header('Content-Type', 'application/json')
    json.loads(urllib2.urlopen(req, json.dumps(data)).read())
    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read())
    if response['status'] == 'error':
        raise Exception(response['message'])

if len(sys.argv) != 7:
    raise Exception("wrong number of arguments")

name = sys.argv[1]
sourceFile = sys.argv[2]

@@ -40,13 +45,14 @@ if (file_len(alignmentsFile) != 3*file_len(sourceFile)):
    raise Exception("alignments file is not exactly 3 times longer than source and target")


totalLines = file_len(sourceFile)
totalExamples = file_len(sourceFile)

data = {
    'operation': 'addTm',
    'sourceLangId':sourceLangId,
    'targetLangId':targetLangId,
    'name':name
    'name':name,
    'tmLemmatized':True
}

req = urllib2.Request(address)

@@ -60,35 +66,35 @@ data = {
    'tmId':tmId
}

sentences = []
examples = []
start = time.time()
with open(sourceFile) as sourceLines, open(targetFile) as targetLines, open(alignmentsFile) as alignmentsLines:
with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
    for lineNumber in range(totalExamples):
        sourceSentence = sf.readline().strip()
        targetSentence = tf.readline().strip()

    lineNumber = 0
    for line in sourceLines:
        line = line.strip()
        if lineNumber % 3 == 1:
            currSentence.append(line)
        elif lineNumber % 3 == 2:
            currSentence.append(line)
            currSentence.reverse()
            sentences.append(currSentence)
            currSentence = []
            if len(sentences) >= BUFFER_SIZE:
                data['sentences'] = sentences
                add_data(data)
                mark = time.time()
                print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/(3*(mark-start)))
                sentences = []
        lineNumber += 1
        # skip to lines of the alignments file, these are lemmatized and we need the raw sentences from the source and target files.
        af.readline()
        af.readline()

        alignmentString = af.readline().strip()

        examples.append([sourceSentence, targetSentence, alignmentString])

        if len(examples) >= BUFFER_SIZE:
            data['examples'] = examples
            add_examples(data)
            mark = time.time()
            print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
            examples = []


if len(sentences) > 0:
    data['sentences'] = sentences
    add_data(data)
if len(examples) > 0:
    data['examples'] = examples
    add_examples(data)

end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))

print "Generating index..."
start = time.time()

tests/addLemmatizedTM.sh (new executable file, 7 lines)
@@ -0,0 +1,7 @@
#!/bin/sh

CORPUS_NAME="europarl_sample"
SRC_LANG_ID=2
TRG_LANG_ID=1

./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
@@ -16,7 +16,8 @@ data = {
    'operation': 'addTm',
    'sourceLangId':int(sys.argv[1]),
    'targetLangId':int(sys.argv[2]),
    'name':sys.argv[3]
    'name':sys.argv[3],
    'tmLemmatized':bool(int(sys.argv[4]))
}

req = urllib2.Request(address)