working lemmatization

Rafał Jaworski 2017-03-10 14:52:01 +01:00
parent 8b0666c34d
commit 89fb77bf58
32 changed files with 592 additions and 142 deletions

View File

@ -0,0 +1,24 @@
#include "bool_param.hpp"
BoolParam::BoolParam(bool value):_value(value) {
}
BoolParam::~BoolParam() {
}
const char * BoolParam::getValue() {
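    // PostgreSQL accepts "t"/"f" as text-format boolean literals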
if (_value) {
return "t";
} else {
return "f";
}
}
const int BoolParam::getLength() {
return 1;
}
const int BoolParam::isBinary() {
return 0;
}

View File

@ -0,0 +1,24 @@
#ifndef BOOL_PARAM_HDR
#define BOOL_PARAM_HDR
#include "query_param.hpp"
class BoolParam : public QueryParam {
public:
/*! Constructor.
*/
BoolParam(bool value);
/*! Destructor.
*/
virtual ~BoolParam();
const char * getValue();
const int getLength();
const int isBinary();
private:
bool _value;
};
#endif

View File

@ -5,6 +5,7 @@
#include <iostream>
#include <fstream>
#include <ctime>
#include <utility>
#include <concordia/interval.hpp>
@ -19,16 +20,17 @@
ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
throw(ConcordiaException) :
_configFilePath(configFilePath) {
std::vector<int> tmIds = _tmDAO.getTmIds();
_concordiasMap = boost::shared_ptr<boost::ptr_map<int,Concordia> >(new boost::ptr_map<int,Concordia>());
BOOST_FOREACH(int & tmId, tmIds) {
_addTm(tmId);
}
-    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
-    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));
+    _lemmatizerFacade = boost::shared_ptr<LemmatizerFacade> (new LemmatizerFacade());
+    _indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap, _lemmatizerFacade));
+    _searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap, _lemmatizerFacade));
}
ConcordiaServer::~ConcordiaServer() {
@ -95,6 +97,27 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
}
}
_indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
} else if (operation == ADD_ALIGNED_LEMMATIZED_SENTENCES_OP) {
std::vector<std::string> sourceSentences;
std::vector<std::string> targetSentences;
std::vector<std::string> alignmentStrings;
int tmId = d[TM_ID_PARAM].GetInt();
// loading data from json
const rapidjson::Value & sentencesArray = d[EXAMPLES_PARAM];
Logger::log("addAlignedLemmatizedSentences");
Logger::logInt("lemmatized sentences to add", sentencesArray.Size());
Logger::logInt("tm id", tmId);
for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
if (sentencesArray[i].Size() != 3) {
JsonGenerator::signalError(jsonWriter, "sentence should be an array of 3 elements");
break;
} else {
sourceSentences.push_back(sentencesArray[i][0].GetString());
targetSentences.push_back(sentencesArray[i][1].GetString());
alignmentStrings.push_back(sentencesArray[i][2].GetString());
}
}
_indexController->addAlignedLemmatizedSentences(jsonWriter, sourceSentences, targetSentences, alignmentStrings, tmId);
} else if (operation == "lemmatize") {
std::string sentence = _getStringParameter(d, "sentence");
std::string languageCode = _getStringParameter(d, "languageCode");
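For reference, a minimal sketch of the request consumed by the new addAlignedLemmatizedSentences branch above, in the style of the repo's Python 2 test scripts; the server address, the tmId and the exact alignment-line contents are assumptions (the test script further down takes the alignment line verbatim from mgiza output):

import json
import urllib2

address = 'http://localhost:8800'  # assumed server address
request = {
    'operation': 'addAlignedLemmatizedSentences',
    'tmId': 1,  # assumed id of a TM created with 'tmLemmatized': True
    'examples': [
        # each example is a 3-element array: [source sentence, target sentence, alignment line]
        ['this is a test', 'to jest test',
         'NULL ({ }) this ({ 1 }) be ({ 2 }) a ({ }) test ({ 3 })']  # illustrative GIZA-style alignment line
    ]
}
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
print json.loads(urllib2.urlopen(req, json.dumps(request)).read())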
@ -130,7 +153,8 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
std::string name = _getStringParameter(d, NAME_PARAM);
- int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
+ bool lemmatized = _getBoolParameter(d, TM_LEMMATIZED_PARAM);
+ int newId = _tmDAO.addTm(sourceLangId, targetLangId, name, lemmatized);
_addTm(newId);
jsonWriter.StartObject();
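The addTm operation now carries the flag as well; a sketch of the extended request body (ids and name are assumed values):

request = {
    'operation': 'addTm',
    'sourceLangId': 2,  # assumed language ids
    'targetLangId': 1,
    'name': 'sample_lemmatized_tm',  # assumed name
    'tmLemmatized': True
}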
@ -179,6 +203,17 @@ int ConcordiaServer::_getIntParameter(rapidjson::Document & d, const char * name
}
}
int ConcordiaServer::_getBoolParameter(rapidjson::Document & d, const char * name)
throw (ConcordiaException) {
rapidjson::Value::ConstMemberIterator itr = d.FindMember(name);
if (itr != d.MemberEnd()) {
bool value = itr->value.GetBool();
return value;
} else {
throw ConcordiaException("missing parameter: " + std::string(name));
}
}
void ConcordiaServer::_addTm(int tmId) {
std::stringstream indexPath;
indexPath << INDEX_DIRECTORY << "/tm_" << tmId;

View File

@ -38,6 +38,8 @@ private:
int _getIntParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
int _getBoolParameter(rapidjson::Document & d, const char * name) throw (ConcordiaException);
void _addTm(int tmId);
std::string _configFilePath;

View File

@ -16,7 +16,9 @@
#define SOURCE_SENTENCE_PARAM "sourceSentence"
#define TARGET_SENTENCE_PARAM "targetSentence"
#define TM_ID_PARAM "tmId"
#define TM_LEMMATIZED_PARAM "tmLemmatized"
#define SENTENCES_PARAM "sentences"
#define EXAMPLES_PARAM "examples"
#define SOURCE_LANG_PARAM "sourceLangId"
#define TARGET_LANG_PARAM "targetLangId"
#define NAME_PARAM "name"
@ -25,6 +27,7 @@
#define ADD_SENTENCE_OP "addSentence"
#define ADD_SENTENCES_OP "addSentences"
#define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
#define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
#define REFRESH_INDEX_OP "refreshIndex"
#define SIMPLE_SEARCH_OP "simpleSearch"
#define CONCORDIA_SEARCH_OP "concordiaSearch"

View File

@ -133,6 +133,17 @@ int DBconnection::getIntValue(PGresult * result, int row, int col) throw (Concor
}
}
bool DBconnection::getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException) {
try {
char * valueStr = PQgetvalue(result,row,col);
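        // libpq renders booleans in text-format results as "t" or "f"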
return std::string(valueStr) == "t";
} catch (std::exception & e) {
std::stringstream ss;
ss << "Error getting bool value. Message: " << e.what();
throw ConcordiaException(ss.str());
}
}
std::string DBconnection::getStringValue(PGresult * result, int row, int col) throw (ConcordiaException) {
try {
char * valueStr = PQgetvalue(result,row,col);
@ -153,4 +164,3 @@ int DBconnection::getRowCount(PGresult * result) throw (ConcordiaException) {
throw ConcordiaException(ss.str());
}
}

View File

@ -31,6 +31,8 @@ public:
int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
bool getBoolValue(PGresult * result, int row, int col) throw (ConcordiaException);
std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
int getRowCount(PGresult * result) throw (ConcordiaException);

View File

@ -14,9 +14,11 @@
#include "json_generator.hpp"
#include "logger.hpp"
- IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+ IndexController::IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+         boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
throw(ConcordiaException):
- _concordiasMap(concordiasMap) {
+ _concordiasMap(concordiasMap),
+ _lemmatizerFacade(lemmatizerFacade) {
}
IndexController::~IndexController() {
@ -32,9 +34,10 @@ void IndexController::addSentence(
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
TokenizedSentence tokenizedLemmatizedSentence = it->second->tokenize(_lemmatizerFacade->lemmatizeIfNeeded(sourceSentence, tmId));
TokenizedSentence tokenizedSentence = it->second->tokenize(sourceSentence);
int sentenceId = _unitDAO.addSentence(tokenizedSentence, targetSentence, tmId);
- it->second->addTokenizedExample(tokenizedSentence, sentenceId);
+ it->second->addTokenizedExample(tokenizedLemmatizedSentence, sentenceId);
it->second->refreshSAfromRAM();
jsonWriter.StartObject();
@ -67,9 +70,10 @@ void IndexController::addSentences(
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<TokenizedSentence> tokenizedLemmatizedSentences = it->second->tokenizeAll(_lemmatizerFacade->lemmatizeSentencesIfNeeded(sourceSentences, tmId));
std::vector<TokenizedSentence> tokenizedSentences = it->second->tokenizeAll(sourceSentences);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds = _unitDAO.addSentences(tokenizedSentences, targetSentences, tmId);
- it->second->addAllTokenizedExamples(tokenizedSentences, sentenceIds);
+ it->second->addAllTokenizedExamples(tokenizedLemmatizedSentences, sentenceIds);
jsonWriter.StartObject();
jsonWriter.String("status");
@ -118,6 +122,44 @@ void IndexController::addAlignedSentences(
}
}
void IndexController::addAlignedLemmatizedSentences(
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences,
const std::vector<std::string> & alignmentStrings,
const int tmId) {
try {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
std::vector<std::string> lemmatizedSourceSentences;
std::vector<std::vector<std::vector<int> > > allAlignments;
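// as with addAlignedSentences, the (here: lemmatized) source tokens and the
// alignments are both recovered from the GIZA-style alignment strings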
_getSourceSentencesAndAlignments(lemmatizedSourceSentences, allAlignments, alignmentStrings);
std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences = it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
_unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
for(int index = 0; index < tokenizedLemmatizedSourceSentences.size(); index++) {
it->second->addTokenizedExample(tokenizedLemmatizedSourceSentences.at(index), sentenceIds.at(index));
}
jsonWriter.StartObject();
jsonWriter.String("status");
jsonWriter.String("success");
jsonWriter.EndObject();
} else {
JsonGenerator::signalError(jsonWriter, "no such tm!");
}
} catch (ConcordiaException & e) {
std::stringstream errorstream;
errorstream << "concordia error: " << e.what();
JsonGenerator::signalError(jsonWriter, errorstream.str());
}
}
void IndexController::refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const int tmId) {
try {
@ -189,4 +231,3 @@ void IndexController::_getSourceSentencesAndAlignments(
allAlignments.push_back(alignments);
}
}

View File

@ -10,6 +10,8 @@
#include "unit_dao.hpp"
#include "lemmatizer_facade.hpp"
#include "rapidjson/writer.h"
@ -17,7 +19,8 @@ class IndexController {
public:
/*! Constructor.
*/
- explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+ explicit IndexController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+         boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
throw(ConcordiaException);
/*! Destructor.
*/
@ -38,6 +41,13 @@ public:
const std::vector<std::string> & targetSentences,
const int tmId);
void addAlignedLemmatizedSentences(
rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const std::vector<std::string> & sourceSentences,
const std::vector<std::string> & targetSentences,
const std::vector<std::string> & alignmentStrings,
const int tmId);
void refreshIndexFromRAM(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
const int tmId);
@ -49,6 +59,8 @@ private:
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
UnitDAO _unitDAO;
};

View File

@ -1,5 +1,7 @@
#include "lemmatizer_facade.hpp"
#include <boost/foreach.hpp>
LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
_lemmatizersMap = boost::ptr_map<std::string,SocketLemmatizer>();
@ -28,3 +30,26 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
}
}
std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
if (tmInfo.first) {
return lemmatizeSentence(tmInfo.second, pattern);
} else {
return pattern;
}
}
std::vector<std::string> LemmatizerFacade::lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId) {
std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
if (tmInfo.first) {
std::vector<std::string> result;
BOOST_FOREACH(std::string & pattern, patterns) {
result.push_back(lemmatizeSentence(tmInfo.second, pattern));
}
return result;
} else {
return patterns;
}
}
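The standalone "lemmatize" operation handled in concordia_server.cpp above exercises this facade directly; a sketch of its request body (language code and sentence are assumed values):

request = {
    'operation': 'lemmatize',
    'languageCode': 'pl',  # assumed code of a language with a reachable socket lemmatizer
    'sentence': 'To jest test'
}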

View File

@ -2,6 +2,7 @@
#define LEMMATIZER_FACADE_HDR
#include "socket_lemmatizer.hpp"
#include "tm_dao.hpp"
#include <string>
#include <concordia/concordia_exception.hpp>
@ -18,8 +19,15 @@ public:
virtual ~LemmatizerFacade();
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
std::string lemmatizeIfNeeded(std::string pattern, int tmId);
std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);
private:
boost::ptr_map<std::string,SocketLemmatizer> _lemmatizersMap;
TmDAO _tmDAO;
};
#endif

View File

@ -8,9 +8,11 @@
#include "logger.hpp"
- SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+ SearcherController::SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap,
+         boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
throw(ConcordiaException):
- _concordiasMap(concordiasMap) {
+ _concordiasMap(concordiasMap),
+ _lemmatizerFacade(lemmatizerFacade) {
}
SearcherController::~SearcherController() {
@ -22,6 +24,7 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
const int tmId) {
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
std::vector<SimpleSearchResult> results = _unitDAO.getSearchResults(it->second->simpleSearch(pattern));
jsonWriter.StartObject();
@ -47,6 +50,7 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
if (it != _concordiasMap->end()) {
if (intervals.size() > 0) {
// std::string shortPattern = pattern.substr(intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
std::string shortPattern = _substrUTF8(pattern, intervals[0].getStart(), intervals[0].getEnd() - intervals[0].getStart());
Logger::log("concordiaPhraseSearch");
@ -111,6 +115,7 @@ void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuff
boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
if (it != _concordiasMap->end()) {
pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(it->second->concordiaSearch(pattern));
jsonWriter.StartObject();
@ -146,6 +151,3 @@ std::string SearcherController::_substrUTF8(std::string source, int start, int l
return result;
}

View File

@ -10,6 +10,7 @@
#include "unit_dao.hpp"
#include "simple_search_result.hpp"
#include "lemmatizer_facade.hpp"
#include "rapidjson/writer.h"
@ -17,7 +18,8 @@ class SearcherController {
public:
/*! Constructor.
*/
- explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> >concordiasMap)
+ explicit SearcherController(boost::shared_ptr<boost::ptr_map<int,Concordia> > concordiasMap,
+         boost::shared_ptr<LemmatizerFacade> lemmatizerFacade)
throw(ConcordiaException);
/*! Destructor.
*/
@ -41,6 +43,8 @@ private:
boost::shared_ptr<boost::ptr_map<int,Concordia> > _concordiasMap;
boost::shared_ptr<LemmatizerFacade> _lemmatizerFacade;
UnitDAO _unitDAO;
};

View File

@ -3,6 +3,7 @@
#include "query_param.hpp"
#include "string_param.hpp"
#include "int_param.hpp"
#include "bool_param.hpp"
#include "int_array_param.hpp"
#include "logger.hpp"
@ -32,14 +33,19 @@ std::vector<int> TmDAO::getTmIds() {
}
int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name) {
return addTm(sourceLangId, targetLangId, name, false);
}
int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized) {
DBconnection connection;
connection.startTransaction();
std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name) values($1::integer,$2::integer,$3::text) RETURNING id";
std::string query = "INSERT INTO tm(source_lang_id, target_lang_id, name, lemmatized) values($1::integer,$2::integer,$3::text,$4::bool) RETURNING id";
std::vector<QueryParam*> params;
params.push_back(new IntParam(sourceLangId));
params.push_back(new IntParam(targetLangId));
params.push_back(new StringParam(name));
params.push_back(new BoolParam(lemmatized));
PGresult * result = connection.execute(query, params);
int newId = connection.getIntValue(result, 0, 0);
@ -53,3 +59,18 @@ int TmDAO::addTm(const int sourceLangId, const int targetLangId, const std::stri
}
std::pair<bool, std::string> TmDAO::getTmInfo(int tmId) {
DBconnection connection;
connection.startTransaction();
std::string query = "select tm.id, tm.lemmatized, language.code from tm inner join language on language.id = tm.source_lang_id where tm.id = $1::integer;";
std::vector<QueryParam*> params;
params.push_back(new IntParam(tmId));
PGresult * dbResult = connection.execute(query, params);
bool lemmatized = connection.getBoolValue(dbResult, 0, 1);
std::string languageCode = connection.getStringValue(dbResult, 0, 2);
connection.clearResult(dbResult);
connection.endTransaction();
return std::pair<bool, std::string>(lemmatized, languageCode);
}

View File

@ -3,6 +3,7 @@
#include <string>
#include <vector>
#include <utility>
#include <concordia/common/config.hpp>
#include "db_connection.hpp"
@ -18,8 +19,12 @@ public:
int addTm(const int sourceLangId, const int targetLangId, const std::string name);
int addTm(const int sourceLangId, const int targetLangId, const std::string name, bool lemmatized);
std::vector<int> getTmIds();
std::pair<bool, std::string> getTmInfo(int tmId);
private:
};

View File

@ -50,7 +50,7 @@ std::vector<SUFFIX_MARKER_TYPE> UnitDAO::addAlignedSentences(
const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<TokenizedSentence> & targetSentences,
const std::vector<std::vector<std::vector<int> > > & allAlignments,
- const int tmId) {
+ const int tmId) throw (ConcordiaException) {
DBconnection connection;
std::vector<SUFFIX_MARKER_TYPE> newIds;
@ -193,12 +193,16 @@ int UnitDAO::_addSingleSentence(
}
int UnitDAO::_addAlignedUnit(
DBconnection & connection,
const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments,
- const int tmId) {
+ const int tmId) throw(ConcordiaException) {
if (sourceSentence.getTokens().size() != alignments.size()) {
throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
}
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
std::vector<QueryParam*> params;
@ -235,5 +239,3 @@ int UnitDAO::_addAlignedUnit(
return newId;
}

View File

@ -9,6 +9,7 @@
#include <concordia/substring_occurence.hpp>
#include <concordia/matched_pattern_fragment.hpp>
#include <concordia/concordia_search_result.hpp>
#include <concordia/concordia_exception.hpp>
#include <boost/shared_ptr.hpp>
#include "simple_search_result.hpp"
@ -38,7 +39,7 @@ public:
const std::vector<TokenizedSentence> & sourceSentences,
const std::vector<TokenizedSentence> & targetSentences,
const std::vector<std::vector<std::vector<int> > > & allAlignments,
- const int tmId);
+ const int tmId) throw (ConcordiaException);
std::vector<SimpleSearchResult> getSearchResults(const std::vector<MatchedPatternFragment> & fragments);
@ -62,7 +63,7 @@ private:
const TokenizedSentence & sourceSentence,
const TokenizedSentence & targetSentence,
const std::vector<std::vector<int> > & alignments,
- const int tmId);
+ const int tmId) throw(ConcordiaException);
};
#endif

View File

@ -3,7 +3,8 @@ CREATE TABLE tm (
id SERIAL PRIMARY KEY,
source_lang_id integer,
target_lang_id integer,
- name varchar(40)
+ name varchar(40),
+ lemmatized bool DEFAULT false
);
DROP TABLE IF EXISTS language;
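Note that only the bootstrap CREATE TABLE script changes here; if an existing database is upgraded in place, an equivalent one-off migration along the lines of ALTER TABLE tm ADD COLUMN lemmatized bool DEFAULT false; would be needed (assuming PostgreSQL, which the DBconnection code targets).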

View File

@ -11,15 +11,23 @@ namespace LemmaGenSentenceLemmatizer
static void Main(string[] args)
{
if (args.Length == 1)
{
try
{
SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);
string line = Console.ReadLine();
- while (!string.IsNullOrEmpty(line))
+ while (line != null)
{
Console.WriteLine(lemmatizer.lemmatizeSentence(line));
line = Console.ReadLine();
}
}
catch (Exception ex)
{
Console.WriteLine("Exception occurred: " + ex.Message);
}
}
else

View File

@ -1,10 +1,22 @@
SRC_LANG=en
TRG_LANG=pl
- CORPUS_NAME=europarl
+ CORPUS_NAME=europarljrc
all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
- cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
+ cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
clean-intermediate-files:
rm -f corpora/$(CORPUS_NAME)/*.lem
rm -f corpora/$(CORPUS_NAME)/*.low
rm -f corpora/$(CORPUS_NAME)/*.classes
rm -f corpora/$(CORPUS_NAME)/*.classes.cats
rm -f corpora/$(CORPUS_NAME)/*.vcb
rm -f corpora/$(CORPUS_NAME)/*.snt
rm -f corpora/$(CORPUS_NAME)/*.cooc
rm -f corpora/$(CORPUS_NAME)/aligned*part*
rm -f corpora/$(CORPUS_NAME)/giza.cfg
clean:
rm -f corpora/$(CORPUS_NAME)/*.tok

mgiza-aligner/clean-corpus-n.perl (new executable file, 168 lines)
View File

@ -0,0 +1,168 @@
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $
use warnings;
use strict;
use Getopt::Long;
my $help;
my $lc = 0; # lowercase the corpus?
my $ignore_ratio = 0;
my $ignore_xml = 0;
my $enc = "utf8"; # encoding of the input and output files
# set to anything else you wish, but I have not tested it yet
my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars
# is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000)
# and crashes if it encounters a word that exceeds it
my $ratio = 9;
GetOptions(
"help" => \$help,
"lowercase|lc" => \$lc,
"encoding=s" => \$enc,
"ratio=f" => \$ratio,
"ignore-ratio" => \$ignore_ratio,
"ignore-xml" => \$ignore_xml,
"max-word-length|mwl=s" => \$max_word_length
) or exit(1);
if (scalar(@ARGV) < 6 || $help) {
print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n";
exit;
}
my $corpus = $ARGV[0];
my $l1 = $ARGV[1];
my $l2 = $ARGV[2];
my $out = $ARGV[3];
my $min = $ARGV[4];
my $max = $ARGV[5];
my $linesRetainedFile = "";
if (scalar(@ARGV) > 6) {
$linesRetainedFile = $ARGV[6];
open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile";
}
print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n";
my $opn = undef;
my $l1input = "$corpus.$l1";
if (-e $l1input) {
$opn = $l1input;
} elsif (-e $l1input.".gz") {
$opn = "gunzip -c $l1input.gz |";
} else {
die "Error: $l1input does not exist";
}
open(F,$opn) or die "Can't open '$opn'";
$opn = undef;
my $l2input = "$corpus.$l2";
if (-e $l2input) {
$opn = $l2input;
} elsif (-e $l2input.".gz") {
$opn = "gunzip -c $l2input.gz |";
} else {
die "Error: $l2input does not exist";
}
open(E,$opn) or die "Can't open '$opn'";
open(FO,">$out.$l1") or die "Can't write $out.$l1";
open(EO,">$out.$l2") or die "Can't write $out.$l2";
# necessary for proper lowercasing
my $binmode;
if ($enc eq "utf8") {
$binmode = ":utf8";
} else {
$binmode = ":encoding($enc)";
}
binmode(F, $binmode);
binmode(E, $binmode);
binmode(FO, $binmode);
binmode(EO, $binmode);
my $innr = 0;
my $outnr = 0;
my $factored_flag;
while(my $f = <F>) {
$innr++;
print STDERR "." if $innr % 10000 == 0;
print STDERR "($innr)" if $innr % 100000 == 0;
my $e = <E>;
die "$corpus.$l2 is too short!" if !defined $e;
chomp($e);
chomp($f);
if ($innr == 1) {
$factored_flag = ($e =~ /\|/ || $f =~ /\|/);
}
#if lowercasing, lowercase
if ($lc) {
$e = lc($e);
$f = lc($f);
}
$e =~ s/\|//g unless $factored_flag;
$e =~ s/\s+/ /g;
$e =~ s/^ //;
$e =~ s/ $//;
$f =~ s/\|//g unless $factored_flag;
$f =~ s/\s+/ /g;
$f =~ s/^ //;
$f =~ s/ $//;
next if $f eq '';
next if $e eq '';
my $ec = &word_count($e);
my $fc = &word_count($f);
next if $ec > $max;
next if $fc > $max;
next if $ec < $min;
next if $fc < $min;
next if !$ignore_ratio && $ec/$fc > $ratio;
next if !$ignore_ratio && $fc/$ec > $ratio;
# Skip this segment if any factor is longer than $max_word_length
my $max_word_length_plus_one = $max_word_length + 1;
next if $e =~ /[^\s\|]{$max_word_length_plus_one}/;
next if $f =~ /[^\s\|]{$max_word_length_plus_one}/;
# An extra check: none of the factors can be blank!
die "There is a blank factor in $corpus.$l1 on line $innr: $f"
if $f =~ /[ \|]\|/;
die "There is a blank factor in $corpus.$l2 on line $innr: $e"
if $e =~ /[ \|]\|/;
$outnr++;
print FO $f."\n";
print EO $e."\n";
if ($linesRetainedFile ne "") {
print LINES_RETAINED $innr."\n";
}
}
if ($linesRetainedFile ne "") {
close LINES_RETAINED;
}
print STDERR "\n";
my $e = <E>;
die "$corpus.$l2 is too long!" if defined $e;
print STDERR "Input sentences: $innr Output sentences: $outnr\n";
sub word_count {
my ($line) = @_;
if ($ignore_xml) {
$line =~ s/<\S[^>]*\S>/ /g;
$line =~ s/\s+/ /g;
$line =~ s/^ //g;
$line =~ s/ $//g;
}
my @w = split(/ /,$line);
return scalar @w;
}

View File

@ -0,0 +1,26 @@
#!/usr/bin/python3

import sys, re

examples_dict = {}

p = re.compile(r"# Sentence pair \((\d+)\)")

i = 0
for line in sys.stdin:
    line = line.strip()
    if i % 3 == 0:
        current_example = [line]
        m = p.match(line)
        if m:
            current_key = int(m.group(1))
        else:
            raise Exception("Wrong line: " + line)
    elif i % 3 == 1:
        current_example.append(line)
    else:
        current_example.append(line)
        examples_dict[current_key] = current_example
    i += 1

for key in sorted(examples_dict.keys()):
    print('\n'.join(examples_dict[key]))
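The script assumes mgiza's textual alignment output: three lines per sentence pair, with the pair number in the header line serving as the sort key (the aligned*part* files concatenated in the Makefile above come from parallel workers, so pairs arrive out of order). An illustrative block, with assumed field values:

# Sentence pair (17) source length 4 target length 3 alignment score : 0.000128
to jest test
NULL ({ }) this ({ 1 }) be ({ 2 }) a ({ }) test ({ 3 })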

tests/addAlignedLemmatizedTM.py (58 lines changed, normal file → executable file)
View File

@ -21,10 +21,15 @@ def file_len(fname):
pass
return i + 1
- def add_data(data):
+ def add_examples(examplesData):
      req = urllib2.Request(address)
      req.add_header('Content-Type', 'application/json')
-     json.loads(urllib2.urlopen(req, json.dumps(data)).read())
+     response = json.loads(urllib2.urlopen(req, json.dumps(examplesData)).read())
+     if response['status'] == 'error':
+         raise Exception(response['message'])
if len(sys.argv) != 7:
raise Exception("wrong number of arguments")
name = sys.argv[1]
sourceFile = sys.argv[2]
@ -40,13 +45,14 @@ if (file_len(alignmentsFile) != 3*file_len(sourceFile)):
raise Exception("alignments file is not exactly 3 times longer than source and target")
- totalLines = file_len(sourceFile)
+ totalExamples = file_len(sourceFile)
data = {
'operation': 'addTm',
'sourceLangId':sourceLangId,
'targetLangId':targetLangId,
-     'name':name
+     'name':name,
+     'tmLemmatized':True
}
req = urllib2.Request(address)
@ -60,35 +66,35 @@ data = {
'tmId':tmId
}
- sentences = []
+ examples = []
start = time.time()
- with open(sourceFile) as sourceLines, open(targetFile) as targetLines, open(alignmentsFile) as alignmentsLines:
-     lineNumber = 0
-     for line in sourceLines:
-         line = line.strip()
-         if lineNumber % 3 == 1:
-             currSentence.append(line)
-         elif lineNumber % 3 == 2:
-             currSentence.append(line)
-             currSentence.reverse()
-             sentences.append(currSentence)
-             currSentence = []
-             if len(sentences) >= BUFFER_SIZE:
-                 data['sentences'] = sentences
-                 add_data(data)
-                 mark = time.time()
-                 print "Added %d of %d sentences. Time elapsed: %.4f s, current speed: %.4f sentences/second" % ( (lineNumber+1)/3, totalLines/3, mark-start, (lineNumber+1)/(3*(mark-start)))
-                 sentences = []
-         lineNumber += 1
- if len(sentences) > 0:
-     data['sentences'] = sentences
-     add_data(data)
+ with open(sourceFile) as sf, open(targetFile) as tf, open(alignmentsFile) as af:
+     for lineNumber in range(totalExamples):
+         sourceSentence = sf.readline().strip()
+         targetSentence = tf.readline().strip()
+         # skip two lines of the alignments file; these are lemmatized and we need the raw sentences from the source and target files
+         af.readline()
+         af.readline()
+         alignmentString = af.readline().strip()
+         examples.append([sourceSentence, targetSentence, alignmentString])
+         if len(examples) >= BUFFER_SIZE:
+             data['examples'] = examples
+             add_examples(data)
+             mark = time.time()
+             print "Added %d of %d lemmatized examples. Time elapsed: %.4f s, current speed: %.4f examples/second" % ( (lineNumber+1), totalExamples, mark-start, (lineNumber+1)/(mark-start))
+             examples = []
+ if len(examples) > 0:
+     data['examples'] = examples
+     add_examples(data)
end = time.time()
print "Added all %d sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1)/3, end-start, (lineNumber+1)/(3*(end-start)))
print "Added all %d lemmatized sentences. Time elapsed: %.4f s, overall speed: %.4f sentences/second" % ((lineNumber+1), end-start, (lineNumber+1)/(end-start))
print "Generating index..."
start = time.time()

tests/addLemmatizedTM.sh (new executable file, 7 lines)
View File

@ -0,0 +1,7 @@
#!/bin/sh
CORPUS_NAME="europarl_sample"
SRC_LANG_ID=2
TRG_LANG_ID=1
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt

View File

@ -16,7 +16,8 @@ data = {
'operation': 'addTm',
'sourceLangId':int(sys.argv[1]),
'targetLangId':int(sys.argv[2]),
-     'name':sys.argv[3]
+     'name':sys.argv[3],
+     'tmLemmatized':bool(int(sys.argv[4]))
}
req = urllib2.Request(address)