working lemmatization
This commit is contained in:
parent 8b0666c34d
commit 89fb77bf58
24  concordia-server/bool_param.cpp  Normal file
@@ -0,0 +1,24 @@
+#include "bool_param.hpp"
+
+
+BoolParam::BoolParam(bool value):_value(value) {
+}
+
+BoolParam::~BoolParam() {
+}
+
+const char * BoolParam::getValue() {
+    if (_value) {
+        return "t";
+    } else {
+        return "f";
+    }
+}
+
+const int BoolParam::getLength() {
+    return 1;
+}
+
+const int BoolParam::isBinary() {
+    return 0;
+}
24  concordia-server/bool_param.hpp  Normal file
@@ -0,0 +1,24 @@
+#ifndef BOOL_PARAM_HDR
+#define BOOL_PARAM_HDR
+
+#include "query_param.hpp"
+
+class BoolParam : public QueryParam {
+public:
+    /*! Constructor.
+    */
+    BoolParam(bool value);
+    /*! Destructor.
+    */
+    virtual ~BoolParam();
+
+    const char * getValue();
+
+    const int getLength();
+
+    const int isBinary();
+private:
+    bool _value;
+};
+
+#endif
@@ -5,6 +5,7 @@
 #include <iostream>
 #include <fstream>
 #include <ctime>
+#include <utility>
 
 #include <concordia/interval.hpp>
 
@@ -19,16 +20,17 @@
 ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
                                   throw(ConcordiaException) :
                                   _configFilePath(configFilePath) {
 
     std::vector<int> tmIds = _tmDAO.getTmIds();
     _concordiasMap = boost::shared_ptr<boost::ptr_map<int,Concordia> >(new boost::ptr_map<int,Concordia>());
 
     BOOST_FOREACH(int & tmId, tmIds) {
         _addTm(tmId);
     }
-    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
-    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));
 
     _lemmatizerFacade = boost::shared_ptr<LemmatizerFacade> (new LemmatizerFacade());
+
+    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap, _lemmatizerFacade));
+    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap, _lemmatizerFacade));
 }
 
 ConcordiaServer::~ConcordiaServer() {
@@ -95,6 +97,27 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
             }
         }
         _indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
+    } else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
+        std::vector<std::string> sourceSentences;
+        std::vector<std::string> targetSentences;
+        std::vector<std::string> alignmentStrings;
+        int tmId = d[TM_ID_PARAM].GetInt();
+        // loading data from json
+        const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
+        Logger::log("addAlignedLemmatizedSentences");
+        Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
+        Logger::logInt("tm id", tmId);
+        for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
+            if (sentencesArray[i].Size() != 3) {
+                JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
+                break;
+            } else {
+                sourceSentences.push_back(sentencesArray[i][0].GetString());
+                targetSentences.push_back(sentencesArray[i][1].GetString());
+                alignmentStrings.push_back(sentencesArray[i][2].GetString());
+            }
+        }
+        _indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
     } else if (operation == "lemmatize") {
         std::string sentence = _getStringParameter(d, "sentence");
         std::string languageCode = _getStringParameter(d, "languageCode");
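For reference, a request to the new operation could look like the following sketch, modeled on the parsing code above and on tests/addAlignedLemmatizedTM.py (Python 2). The server address, the TM id and the GIZA-style alignment line are placeholder values; each example is a triple of raw source sentence, raw target sentence and an alignment line built over the lemmatized source.

import json, urllib2

address = 'http://localhost:8800'  # placeholder, depends on deployment
data = {
    'operation': 'addAlignedLemmatizedSentences',
    'tmId': 1,  # placeholder TM id
    'examples': [
        # [raw source, raw target, GIZA-style alignment line over the lemmatized source]
        ['I like cats', 'lubie koty', 'NULL ({ }) i ({ 1 }) like ({ 2 }) cat ({ 3 })']
    ]
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
print json.loads(urllib2.urlopen(req, json.dumps(data)).read())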
@@ -130,7 +153,8 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
         int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
         int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
         std::string name = _getStringParameter(d, NAME_PARAM);
-        int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
+        bool lemmatized = _getBoolParameter(d, TM_LEMMATIZED_PARAM);
+        int newId = _tmDAO.addTm(sourceLangId, targetLangId, name, lemmatized);
         _addTm(newId);
 
         jsonWriter.StartObject();
@@ -179,6 +203,17 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name
     }
 }
 
+int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
+                                                  throw (ConcordiaException) {
+    rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
+    if (itr != d.MemberEnd()) {
+        bool value = itr->value.GetBool();
+        return value;
+    } else {
+        throw ConcordiaException("missing parameter: " + std::string(name));
+    }
+}
+
 void ConcordiaServer::_addTm(int tmId) {
     std::stringstream indexPath;
     indexPath << INDEX_DIRECTORY << "/tm_" << tmId;
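Creating a TM that uses lemmatization now requires the new tmLemmatized flag, read as a JSON boolean by _getBoolParameter above. A minimal request sketch, with placeholder language ids and name:

data = {
    'operation': 'addTm',
    'sourceLangId': 2,  # placeholder language ids
    'targetLangId': 1,
    'name': 'my_lemmatized_tm',  # placeholder name
    'tmLemmatized': True
}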
@@ -38,6 +38,8 @@ private:
 
     int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
 
+    int _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
+
     void _addTm(int tmId);
 
     std::string _configFilePath;
@@ -16,7 +16,9 @@
 #define SOURCE_SENTENCE_PARAM "sourceSentence"
 #define TARGET_SENTENCE_PARAM "targetSentence"
 #define TM_ID_PARAM "tmId"
+#define TM_LEMMATIZED_PARAM "tmLemmatized"
 #define SENTENCES_PARAM "sentences"
+#define EXAMPLES_PARAM "examples"
 #define SOURCE_LANG_PARAM "sourceLangId"
 #define TARGET_LANG_PARAM "targetLangId"
 #define NAME_PARAM "name"
@@ -25,6 +27,7 @@
 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
 #define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
+#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
 #define REFRESH_INDEX_OP "refreshIndex"
 #define SIMPLE_SEARCH_OP "simpleSearch"
 #define CONCORDIA_SEARCH_OP "concordiaSearch"
@@ -133,6 +133,17 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (Concor
     }
 }
 
+bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
+    try {
+        char * valueStr = PQgetvalue(result,row,col);
+        return std::string(valueStr) == "t";
+    } catch (std::exception & e) {
+        std::stringstream ss;
+        ss << "Error getting bool value. Message: " << e.what();
+        throw ConcordiaException(ss.str());
+    }
+}
+
 std::string DBconnection::getStringValue(PGresult * result, int row, int col) throw (ConcordiaException) {
     try {
         char * valueStr = PQgetvalue(result,row,col);
@@ -153,4 +164,3 @@ int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
         throw ConcordiaException(ss.str());
     }
 }
-
@@ -31,6 +31,8 @@ public:
 
     int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
 
+    bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);
+
     std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
 
     int getRowCount(PGresult * result) throw (ConcordiaException);
@@ -14,9 +14,11 @@
 #include "json_generator.hpp"
 #include "logger.hpp"
 
-IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+                                 boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                              throw(ConcordiaException):
-                                             _concordiasMap(concordiasMap) {
+                                             _concordiasMap(concordiasMap),
+                                             _lemmatizerFacade(lemmatizerFacade) {
 }
 
 IndexController::~IndexController() {
@@ -32,9 +34,10 @@ void IndexController::addSentence(
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
        if (it != _concordiasMap->end()) {
+            TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
             TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
             int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
-            it->second->addTokenizedExample(tokenizedSentence, sentenceId);
+            it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
             it->second->refreshSAfromRAM();
 
             jsonWriter.StartObject();
@@ -67,9 +70,10 @@ void IndexController::addSentences(
     try {
         boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
         if (it != _concordiasMap->end()) {
+            std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
             std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
             std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
-            it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
+            it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);
 
             jsonWriter.StartObject();
             jsonWriter.String("status");
@@ -118,6 +122,44 @@ void IndexController::addAlignedSentences(
     }
 }
 
+void IndexController::addAlignedLemmatizedSentences(
+                 rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
+                 const std::vector<std::string> & sourceSentences,
+                 const std::vector<std::string> & targetSentences,
+                 const std::vector<std::string> & alignmentStrings,
+                 const int tmId) {
+    try {
+        boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
+        if (it != _concordiasMap->end()) {
+            std::vector<std::string> lemmatizedSourceSentences;
+            std::vector<std::vector<std::vector<int> > > allAlignments;
+            _getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);
+
+            std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
+            std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
+            std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
+
+            std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
+                    _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
+            for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
+                it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
+            }
+            jsonWriter.StartObject();
+            jsonWriter.String("status");
+            jsonWriter.String("success");
+            jsonWriter.EndObject();
+        } else {
+            JsonGenerator::signalError(jsonWriter, "no such tm!");
+        }
+    } catch (ConcordiaException & e) {
+        std::stringstream errorstream;
+        errorstream << "concordia error: " << e.what();
+        JsonGenerator::signalError(jsonWriter, errorstream.str());
+    }
+}
+
+
 void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                           const int tmId) {
     try {
@@ -189,4 +231,3 @@ void IndexController::_getSourceSentencesAndAlignments(
         allAlignments.push_back(alignments);
     }
 }
-
@@ -10,6 +10,8 @@
 
 
 #include "unit_dao.hpp"
+#include "lemmatizer_facade.hpp"
+
 
 #include "rapidjson/writer.h"
 
@@ -17,7 +19,8 @@ class IndexController {
 public:
     /*! Constructor.
     */
-    explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+    explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+                             boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                              throw(ConcordiaException);
     /*! Destructor.
     */
@@ -38,6 +41,13 @@ public:
                 const std::vector<std::string> & targetSentences,
                 const int tmId);
 
+    void addAlignedLemmatizedSentences(
+                rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
+                const std::vector<std::string> & sourceSentences,
+                const std::vector<std::string> & targetSentences,
+                const std::vector<std::string> & alignmentStrings,
+                const int tmId);
+
     void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                              const int tmId);
 
@@ -49,6 +59,8 @@ private:
 
     boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
 
+    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
+
     UnitDAO _unitDAO;
 };
 
@@ -1,5 +1,7 @@
 #include "lemmatizer_facade.hpp"
 
+#include <boost/foreach.hpp>
+
 
 LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
     _lemmatizersMap = boost::ptr_map<std::string,SocketLemmatizer>();
@@ -28,3 +30,26 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
     }
 
 }
+
+std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
+    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
+    if (tmInfo.first) {
+        return lemmatizeSentence(tmInfo.second, pattern);
+    } else {
+        return pattern;
+    }
+}
+
+std::vector<std::string> LemmatizerFacade::lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId) {
+    std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
+    if (tmInfo.first) {
+        std::vector<std::string> result;
+        BOOST_FOREACH(std::string & pattern, patterns) {
+            result.push_back(lemmatizeSentence(tmInfo.second, pattern));
+        }
+        return result;
+    } else {
+        return patterns;
+    }
+
+}
@@ -2,6 +2,7 @@
 #define LEMMATIZER_FACADE_HDR
 
 #include "socket_lemmatizer.hpp"
+#include "tm_dao.hpp"
 
 #include <string>
 #include <concordia/concordia_exception.hpp>
@@ -18,8 +19,15 @@ public:
     virtual ~LemmatizerFacade();
 
     std::string lemmatizeSentence(std::string languageCode, std::string sentence);
 
+    std::string lemmatizeIfNeeded(std::string pattern, int tmId);
+
+    std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);
+
 private:
     boost::ptr_map<std::string,SocketLemmatizer> _lemmatizersMap;
 
+    TmDAO _tmDAO;
 };
 
 #endif
@@ -8,9 +8,11 @@
 #include "logger.hpp"
 
 
-SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+                                       boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
                                                 throw(ConcordiaException):
-                                                _concordiasMap(concordiasMap) {
+                                                _concordiasMap(concordiasMap),
+                                                _lemmatizerFacade(lemmatizerFacade) {
 }
 
 SearcherController::~SearcherController() {
@@ -22,6 +24,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
                                       const int tmId) {
     boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
         std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
 
         jsonWriter.StartObject();
@@ -47,6 +50,7 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
     if (it != _concordiasMap->end()) {
         if (intervals.size() > 0) {
             // std::string shortPattern = pattern.substr(intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
+            pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
             std::string shortPattern = _substrUTF8(pattern, intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
 
             Logger::log("concordiaPhraseSearch");
@@ -111,6 +115,7 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
 
     boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
         CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
 
         jsonWriter.StartObject();
@@ -146,6 +151,3 @@ std::string SearcherController::_substrUTF8(std::string source, int start, int l
 
     return result;
 }
-
-
-
@@ -10,6 +10,7 @@
 
 #include "unit_dao.hpp"
 #include "simple_search_result.hpp"
+#include "lemmatizer_facade.hpp"
 #include "rapidjson/writer.h"
 
 
@@ -17,7 +18,8 @@ class SearcherController {
 public:
     /*! Constructor.
     */
-    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+    explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
+                                boost::shared_ptr<LemmatizerFacade> LemmatizerFacade)
                                 throw(ConcordiaException);
     /*! Destructor.
     */
@@ -41,6 +43,8 @@ private:
 
     boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
 
+    boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
+
     UnitDAO _unitDAO;
 };
 
@@ -3,6 +3,7 @@
 #include "query_param.hpp"
 #include "string_param.hpp"
 #include "int_param.hpp"
+#include "bool_param.hpp"
 #include "int_array_param.hpp"
 #include "logger.hpp"
 
@@ -32,14 +33,19 @@ std::vector<int> TmDAO::getTmIds() {
 }
 
 int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
+    addTm(sourceLangId, targetLangId, name, false);
+}
+
+int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
     DBconnection connection;
     connection.startTransaction();
-    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name) values($1::integer,$2::integer,$3::text) RETURNING id";
+    std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name, lemmatized) values($1::integer,$2::integer,$3::text,$4::bool) RETURNING id";
     std::vector<QueryParam*> params;
     params.push_back(new IntParam(sourceLangId));
     params.push_back(new IntParam(targetLangId));
     params.push_back(new StringParam(name));
+    params.push_back(new BoolParam(lemmatized));
 
     PGresult * result = connection.execute(query, params);
     int newId = connection.getIntValue(result, 0, 0);
@@ -53,3 +59,18 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::stri
 
 }
 
+std::pair<bool, std::string> TmDAO::getTmInfo(int tmId) {
+    DBconnection connection;
+    connection.startTransaction();
+    std::string query = "select tm.id, tm.lemmatized, language.code from tm inner join language on language.id = tm.source_lang_id where tm.id = $1::integer;";
+    std::vector<QueryParam*> params;
+    params.push_back(new IntParam(tmId));
+    PGresult * dbResult = connection.execute(query, params);
+    bool lemmatized = connection.getBoolValue(dbResult, 0, 1);
+    std::string languageCode = connection.getStringValue(dbResult, 0, 2);
+    connection.clearResult(dbResult);
+    connection.endTransaction();
+
+    return std::pair<bool, std::string>(lemmatized, languageCode);
+
+}
@@ -3,6 +3,7 @@
 
 #include <string>
 #include <vector>
+#include <utility>
 
 #include <concordia/common/config.hpp>
 #include "db_connection.hpp"
@@ -18,8 +19,12 @@ public:
 
     int addTm(const int sourceLangId, const int targetLangId, const std::string name);
 
+    int addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized);
+
     std::vector<int> getTmIds();
 
+    std::pair<bool, std::string> getTmInfo(int tmId);
+
 private:
 
 };
@@ -50,7 +50,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
         const std::vector<TokenizedSentence> & sourceSentences,
         const std::vector<TokenizedSentence> & targetSentences,
         const std::vector<std::vector<std::vector<int> > > & allAlignments,
-        const int tmId) {
+        const int tmId) throw (ConcordiaException) {
 
     DBconnection connection;
     std::vector<SUFFIX_MARKER_TYPE> newIds;
@@ -198,7 +198,11 @@ int UnitDAO::_addAlignedUnit(
             const TokenizedSentence & sourceSentence,
             const TokenizedSentence & targetSentence,
             const std::vector<std::vector<int> > & alignments,
-            const int tmId) {
+            const int tmId) throw(ConcordiaException) {
+
+    if (sourceSentence.getTokens().size() != alignments.size()) {
+        throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
+    }
 
     std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
     std::vector<QueryParam*> params;
@@ -235,5 +239,3 @@ int UnitDAO::_addAlignedUnit(
 
     return newId;
 }
-
-
@@ -9,6 +9,7 @@
 #include <concordia/substring_occurence.hpp>
 #include <concordia/matched_pattern_fragment.hpp>
 #include <concordia/concordia_search_result.hpp>
+#include <concordia/concordia_exception.hpp>
 #include <boost/shared_ptr.hpp>
 
 #include "simple_search_result.hpp"
@@ -38,7 +39,7 @@ public:
         const std::vector<TokenizedSentence> & sourceSentences,
         const std::vector<TokenizedSentence> & targetSentences,
         const std::vector<std::vector<std::vector<int> > > & allAlignments,
-        const int tmId);
+        const int tmId) throw (ConcordiaException);
 
     std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
 
@@ -62,7 +63,7 @@ private:
         const TokenizedSentence & sourceSentence,
         const TokenizedSentence & targetSentence,
         const std::vector<std::vector<int> > & alignments,
-        const int tmId);
+        const int tmId) throw(ConcordiaException);
 };
 
 #endif
@@ -3,7 +3,8 @@ CREATE TABLE tm (
     id SERIAL PRIMARY KEY,
     source_lang_id integer,
     target_lang_id integer,
-    name varchar(40)
+    name varchar(40),
+    lemmatized bool DEFAULT false
 );
 
 DROP TABLE IF EXISTS language;
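Note that this changes concordia_server.sql, which (judging from the DROP TABLE statements) recreates the schema from scratch; an existing database would need the column added by hand. A hypothetical one-off migration using psycopg2, with placeholder connection parameters:

import psycopg2

conn = psycopg2.connect(dbname='concordia_server', user='concordia')  # placeholders
with conn, conn.cursor() as cur:
    cur.execute('ALTER TABLE tm ADD COLUMN lemmatized bool DEFAULT false;')
conn.close()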
Binary file not shown.
@@ -11,15 +11,23 @@ namespace LemmaGenSentenceLemmatizer
         static void Main(string[] args)
         {
             if (args.Length == 1)
+            {
+                try
                 {
                     SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
                     string line = Console.ReadLine();
-                    while (!string.IsNullOrEmpty(line))
+                    while (line != null)
                     {
                         Console.WriteLine(lemmatizer.lemmatizeSentence(line));
                         line = Console.ReadLine();
                     }
 
+                }
+                catch (Exception ex)
+                {
+                    Console.WriteLine("Exception occurred: " + ex.Message);
+                }
+
+
             }
             else
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,10 +1,22 @@
 SRC_LANG=en
 TRG_LANG=pl
-CORPUS_NAME=europarl
+CORPUS_NAME=europarljrc
 
 all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
 	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
-	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
+	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
+
+clean-intermediate-files:
+	rm -f corpora/$(CORPUS_NAME)/*.lem
+	rm -f corpora/$(CORPUS_NAME)/*.low
+	rm -f corpora/$(CORPUS_NAME)/*.classes
+	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
+	rm -f corpora/$(CORPUS_NAME)/*.vcb
+	rm -f corpora/$(CORPUS_NAME)/*.snt
+	rm -f corpora/$(CORPUS_NAME)/*.cooc
+	rm -f corpora/$(CORPUS_NAME)/aligned*part*
+	rm -f corpora/$(CORPUS_NAME)/giza.cfg
+
 
 clean:
 	rm -f corpora/$(CORPUS_NAME)/*.tok
168  mgiza-aligner/clean-corpus-n.perl  Executable file
@@ -0,0 +1,168 @@
+#!/usr/bin/env perl
+#
+# This file is part of moses. Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
+use warnings;
+use strict;
+use Getopt::Long;
+my $help;
+my $lc = 0; # lowercase the corpus?
+my $ignore_ratio = 0;
+my $ignore_xml = 0;
+my $enc = "utf8"; # encoding of the input and output files
+# set to anything else you wish, but I have not tested it yet
+my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
+# is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000)
+# and crashes if it encounters a word that exceeds it
+my $ratio = 9;
+
+GetOptions(
+  "help" => \$help,
+  "lowercase|lc" => \$lc,
+  "encoding=s" => \$enc,
+  "ratio=f" => \$ratio,
+  "ignore-ratio" => \$ignore_ratio,
+  "ignore-xml" => \$ignore_xml,
+  "max-word-length|mwl=s" => \$max_word_length
+) or exit(1);
+
+if (scalar(@ARGV) < 6 || $help) {
+    print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n";
+    exit;
+}
+
+my $corpus = $ARGV[0];
+my $l1 = $ARGV[1];
+my $l2 = $ARGV[2];
+my $out = $ARGV[3];
+my $min = $ARGV[4];
+my $max = $ARGV[5];
+
+my $linesRetainedFile = "";
+if (scalar(@ARGV) > 6) {
+  $linesRetainedFile = $ARGV[6];
+  open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile";
+}
+
+print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n";
+
+my $opn = undef;
+my $l1input = "$corpus.$l1";
+if (-e $l1input) {
+  $opn = $l1input;
+} elsif (-e $l1input.".gz") {
+  $opn = "gunzip -c $l1input.gz |";
+} else {
+    die "Error: $l1input does not exist";
+}
+open(F,$opn) or die "Can't open '$opn'";
+$opn = undef;
+my $l2input = "$corpus.$l2";
+if (-e $l2input) {
+  $opn = $l2input;
+} elsif (-e $l2input.".gz") {
+  $opn = "gunzip -c $l2input.gz |";
+} else {
+  die "Error: $l2input does not exist";
+}
+
+open(E,$opn) or die "Can't open '$opn'";
+
+open(FO,">$out.$l1") or die "Can't write $out.$l1";
+open(EO,">$out.$l2") or die "Can't write $out.$l2";
+
+# necessary for proper lowercasing
+my $binmode;
+if ($enc eq "utf8") {
+  $binmode = ":utf8";
+} else {
+  $binmode = ":encoding($enc)";
+}
+binmode(F, $binmode);
+binmode(E, $binmode);
+binmode(FO, $binmode);
+binmode(EO, $binmode);
+
+my $innr = 0;
+my $outnr = 0;
+my $factored_flag;
+while(my $f = <F>) {
+  $innr++;
+  print STDERR "." if $innr % 10000 == 0;
+  print STDERR "($innr)" if $innr % 100000 == 0;
+  my $e = <E>;
+  die "$corpus.$l2 is too short!" if !defined $e;
+  chomp($e);
+  chomp($f);
+  if ($innr == 1) {
+    $factored_flag = ($e =~ /\|/ || $f =~ /\|/);
+  }
+
+  #if lowercasing, lowercase
+  if ($lc) {
+    $e = lc($e);
+    $f = lc($f);
+  }
+
+  $e =~ s/\|//g unless $factored_flag;
+  $e =~ s/\s+/ /g;
+  $e =~ s/^ //;
+  $e =~ s/ $//;
+  $f =~ s/\|//g unless $factored_flag;
+  $f =~ s/\s+/ /g;
+  $f =~ s/^ //;
+  $f =~ s/ $//;
+  next if $f eq '';
+  next if $e eq '';
+
+  my $ec = &word_count($e);
+  my $fc = &word_count($f);
+  next if $ec > $max;
+  next if $fc > $max;
+  next if $ec < $min;
+  next if $fc < $min;
+  next if !$ignore_ratio && $ec/$fc > $ratio;
+  next if !$ignore_ratio && $fc/$ec > $ratio;
+  # Skip this segment if any factor is longer than $max_word_length
+  my $max_word_length_plus_one = $max_word_length + 1;
+  next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
+  next if $f =~ /[^\s\|]{$max_word_length_plus_one}/;
+
+  # An extra check: none of the factors can be blank!
+  die "There is a blank factor in $corpus.$l1 on line $innr: $f"
+    if $f =~ /[ \|]\|/;
+  die "There is a blank factor in $corpus.$l2 on line $innr: $e"
+    if $e =~ /[ \|]\|/;
+
+  $outnr++;
+  print FO $f."\n";
+  print EO $e."\n";
+
+  if ($linesRetainedFile ne "") {
+    print LINES_RETAINED $innr."\n";
+  }
+}
+
+if ($linesRetainedFile ne "") {
+  close LINES_RETAINED;
+}
+
+print STDERR "\n";
+my $e = <E>;
+die "$corpus.$l2 is too long!" if defined $e;
+
+print STDERR "Input sentences: $innr  Output sentences: $outnr\n";
+
+sub word_count {
+  my ($line) = @_;
+  if ($ignore_xml) {
+    $line =~ s/<\S[^>]*\S>/ /g;
+    $line =~ s/\s+/ /g;
+    $line =~ s/^ //g;
+    $line =~ s/ $//g;
+  }
+  my @w = split(/ /,$line);
+  return scalar @w;
+}
26  mgiza-aligner/sortGizaAlignments.py  Executable file
@@ -0,0 +1,26 @@
+#!/usr/bin/python3
+
+import sys, re
+
+examples_dict = {}
+p = re.compile("# Sentence pair \((\d+)\)")
+
+i = 0
+for line in sys.stdin:
+    line = line.strip()
+    if i % 3 == 0:
+        current_example = [line]
+        m = p.match(line)
+        if m:
+            current_key = int(m.group(1))
+        else:
+            raise Exception("Wrong line: "+line)
+    elif i % 3 == 1:
+        current_example.append(line)
+    else:
+        current_example.append(line)
+        examples_dict[current_key] = current_example
+    i+=1
+
+for key in sorted(examples_dict.keys()):
+    print ('\n'.join(examples_dict[key]))
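mgiza writes its output in several part files, and their concatenation is not ordered by sentence pair id; this is why the Makefile above now pipes the concatenated parts through this script. A small self-contained demonstration (Python 3) with mock GIZA-style input, where the sentences and scores are made up:

import subprocess

mock = (
    '# Sentence pair (2) source length 1 target length 1 alignment score : 0.5\n'
    'kot\n'
    'NULL ({ }) cat ({ 1 })\n'
    '# Sentence pair (1) source length 1 target length 1 alignment score : 0.5\n'
    'pies\n'
    'NULL ({ }) dog ({ 1 })\n'
)
out = subprocess.run(['./sortGizaAlignments.py'], input=mock,
                     capture_output=True, text=True).stdout
print(out)  # pair (1) now precedes pair (2)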
58  tests/addAlignedLemmatizedTM.py  Normal file → Executable file
@@ -21,10 +21,15 @@ def file_len(fname):
         pass
     return i + 1
 
-def add_data(data):
+def add_examples(examplesData):
     req = urllib2.Request(address)
     req.add_header('Content-Type', 'application/json')
-    json.loads(urllib2.urlopen(req, json.dumps(data)).read())
+    response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read())
+    if response['status'] == 'error':
+        raise Exception(response['message'])
+
+if len(sys.argv) != 7:
+    raise Exception("wrong number of arguments")
 
 name = sys.argv[1]
 sourceFile = sys.argv[2]
@@ -40,13 +45,14 @@ if (file_len(alignmentsFile) != 3*file_len(sourceFile)):
     raise Exception("alignments file is not exactly 3 times longer than source and target")
 
 
-totalLines = file_len(sourceFile)
+totalExamples = file_len(sourceFile)
 
 data = {
     'operation': 'addTm',
     'sourceLangId':sourceLangId,
     'targetLangId':targetLangId,
-    'name':name
+    'name':name,
+    'tmLemmatized':True
 }
 
 req = urllib2.Request(address)
@@ -60,35 +66,35 @@ data = {
     'tmId':tmId
 }
 
-sentences = []
+examples = []
 start = time.time()
-with open(sourceFile) as sourceLines, open(targetFile) as targetLines, open(alignmentsFile) as alignmentsLines:
-
-    lineNumber = 0
-    for line in sourceLines:
-        line = line.strip()
-        if lineNumber % 3 == 1:
-            currSentence.append(line)
-        elif lineNumber % 3 == 2:
-            currSentence.append(line)
-            currSentence.reverse()
-            sentences.append(currSentence)
-            currSentence = []
-        if len(sentences) >= BUFFER_SIZE:
-            data['sentences'] = sentences
-            add_data(data)
+with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
+    for lineNumber in range(totalExamples):
+        sourceSentence = sf.readline().strip()
+        targetSentence = tf.readline().strip()
+
+        # skip two lines of the alignments file, these are lemmatized and we need the raw sentences from the source and target files.
+        af.readline()
+        af.readline()
+
+        alignmentString = af.readline().strip()
+
+        examples.append([sourceSentence, targetSentence, alignmentString])
+
+        if len(examples) >= BUFFER_SIZE:
+            data['examples'] = examples
+            add_examples(data)
             mark = time.time()
-            print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/(3*(mark-start)))
-            sentences = []
-        lineNumber += 1
+            print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
+            examples = []
 
-if len(sentences) > 0:
-    data['sentences'] = sentences
-    add_data(data)
+if len(examples) > 0:
+    data['examples'] = examples
+    add_examples(data)
 
 end = time.time()
-print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
+print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))
 
 print "Generating index..."
 start = time.time()
7  tests/addLemmatizedTM.sh  Executable file
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+CORPUS_NAME="europarl_sample"
+SRC_LANG_ID=2
+TRG_LANG_ID=1
+
+./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
@@ -16,7 +16,8 @@ data = {
     'operation': 'addTm',
     'sourceLangId':int(sys.argv[1]),
     'targetLangId':int(sys.argv[2]),
-    'name':sys.argv[3]
+    'name':sys.argv[3],
+    'tmLemmatized':bool(int(sys.argv[4]))
 }
 
 req = urllib2.Request(address)