corrected case

parent 8fffed6d58
commit 01a70fe444
@@ -69,10 +69,10 @@ function renderResult(data) {
     for(var i = 0; i < data['result']['bestOverlay'].length; i++) {
         var fragment = data['result']['bestOverlay'][i];
         //previous unmarked fragment
-        markedSentence += inputSentence.slice(lastInsertedEnd, fragment['matchedPatternStart']);
+        markedSentence += htmlEncode(inputSentence.slice(lastInsertedEnd, fragment['matchedPatternStart']));

         //the marked fragment
-        markedSentence += '<span onclick="displayDetails(this, '+i+')" class="matchedFragment">'+inputSentence.slice(fragment['matchedPatternStart'], fragment['matchedPatternEnd'])+'</span>';
+        markedSentence += '<span onclick="displayDetails(this, '+i+')" class="matchedFragment">'+htmlEncode(inputSentence.slice(fragment['matchedPatternStart'], fragment['matchedPatternEnd']))+'</span>';

         lastInsertedEnd = fragment['matchedPatternEnd'];

@@ -80,7 +80,7 @@ function renderResult(data) {
     }

     //remaining unmarked fragment
-    markedSentence += inputSentence.slice(lastInsertedEnd);
+    markedSentence += htmlEncode(inputSentence.slice(lastInsertedEnd));

     res += '<div id="result-sentence" onMouseUp="phraseSearch(this)">'+markedSentence+'</div>';

@@ -89,6 +89,12 @@ function renderResult(data) {
     return res;
 }

+function htmlEncode(value){
+    // Create a in-memory div, set its inner text (which jQuery automatically encodes)
+    // Then grab the encoded contents back out. The div never exists on the page.
+    return $('<div/>').text(value).html();
+}
+
 function renderFragment(fragment, number) {
     var result = '<div style="display:none" id="fragment'+number+'" class="fragmentDetails">';
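Note on the change above: every slice of the input sentence is now routed through the new htmlEncode helper before being concatenated into markup, so characters such as < or & in the sentence can no longer break the generated span. A rough Python sketch of the same escaping idea (illustrative only; the mark_fragment helper is hypothetical and not part of the repository, which does this in JavaScript with jQuery):

# Sketch: escape every fragment before building markup, mirroring htmlEncode().
import html

def mark_fragment(input_sentence, start, end, index):
    before = html.escape(input_sentence[:start])
    matched = html.escape(input_sentence[start:end])
    after = html.escape(input_sentence[end:])
    span = '<span onclick="displayDetails(this, %d)" class="matchedFragment">%s</span>' % (index, matched)
    return before + span + after

print(mark_fragment("5 < 6 & true", 4, 5, 0))
# -> 5 &lt; <span onclick="displayDetails(this, 0)" class="matchedFragment">6</span> &amp; true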
cat/versions_available/europarl_sample.cfg (new file, 10 lines)
@@ -0,0 +1,10 @@
+dir@#@europarl_sample
+concordia_host@#@localhost
+concordia_port@#@8800
+tmid@#@1
+desc@#@Europarl sample (1000 sentences)
+enjoy@#@Życzymy udanej pracy z systemem!
+prompt@#@Wprowadź zdanie (po polsku):
+suggestion@#@Na każde państwo członkowskie Unii Europejskiej przypada jeden komisarz.
+suggestion@#@Komisja Europejska przygotowuje raport na najbliższym posiedzeniu.
+suggestion@#@Wspólny Komitet przyjmuje swój statut.
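The version files use @#@ as the key/value separator, and keys such as suggestion may repeat. A hedged Python sketch of how a file in this format could be read (the parse_version_cfg helper is hypothetical and not part of the repository):

# Sketch: read a versions_available/*.cfg file into a dict of lists,
# because keys like "suggestion" can appear several times; "@#@" separates key and value.
from collections import defaultdict

def parse_version_cfg(path, separator="@#@"):
    config = defaultdict(list)
    with open(path, encoding="utf-8") as cfg:
        for line in cfg:
            line = line.strip()
            if line:
                key, value = line.split(separator, 1)
                config[key].append(value)
    return config

# parse_version_cfg("cat/versions_available/europarl_sample.cfg")["suggestion"]
# would return the three suggestion sentences listed above.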
cat/versions_enabled/europarl_sample.cfg (new symbolic link, 1 line)
@@ -0,0 +1 @@
+../versions_available/europarl_sample.cfg

@@ -1 +0,0 @@
-../versions_available/stocznia_enpl.cfg

@@ -1 +0,0 @@
-../versions_available/stocznia_plen.cfg
@@ -137,8 +137,8 @@ void IndexController::addAlignedLemmatizedSentences(

         std::vector<TokenizedSentence> tokenizedLemmatizedSourceSentences =
                 it->second->tokenizeAll(lemmatizedSourceSentences, true, true);
-        std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, true, false);
-        std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, true, false);
+        std::vector<TokenizedSentence> tokenizedSourceSentences = it->second->tokenizeAll(sourceSentences, false, false);
+        std::vector<TokenizedSentence> tokenizedTargetSentences = it->second->tokenizeAll(targetSentences, false, false);

         std::vector<SUFFIX_MARKER_TYPE> sentenceIds =
             _unitDAO.addAlignedSentences(tokenizedSourceSentences, tokenizedTargetSentences, allAlignments, tmId);
@@ -3,6 +3,8 @@
 #include <boost/foreach.hpp>
 #include <vector>
 #include <climits>
+#include <concordia/tokenized_sentence.hpp>
+

 #include "json_generator.hpp"
 #include "logger.hpp"
@@ -24,7 +26,8 @@ void SearcherController::simpleSearch(rapidjson::Writer<rapidjson::StringBuffer>
                                       const int tmId) {
     boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
-        pattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
+        TokenizedSentence tokenizedPattern = it->second->tokenize(pattern, false, false);
+        pattern = _lemmatizerFacade->lemmatizeIfNeeded(tokenizedPattern.getTokenizedSentence(), tmId);
         SimpleSearchResult result = _unitDAO.getSimpleSearchResult(it->second->simpleSearch(pattern, true));
         jsonWriter.StartObject();
         jsonWriter.String("status");
@@ -106,18 +109,12 @@ void SearcherController::concordiaPhraseSearch(rapidjson::Writer<rapidjson::Stri
 void SearcherController::concordiaSearch(rapidjson::Writer<rapidjson::StringBuffer> & jsonWriter,
                                          std::string & pattern,
                                          const int tmId) {
-    Logger::log("concordiaSearch");
     boost::ptr_map<int,Concordia>::iterator it = _concordiasMap->find(tmId);
     if (it != _concordiasMap->end()) {
-        std::string lemmatizedPattern = _lemmatizerFacade->lemmatizeIfNeeded(pattern, tmId);
-        Logger::logString("pattern lemmatized", lemmatizedPattern);
-        TokenizedSentence originalPattern = it->second->tokenize(pattern, true, false);
-        Logger::logInt("original pattern tokenized, token count", originalPattern.getTokens().size());
+        TokenizedSentence originalPattern = it->second->tokenize(pattern, false, false);
+        std::string lemmatizedPattern = _lemmatizerFacade->lemmatizeIfNeeded(originalPattern.getTokenizedSentence(), tmId);
         boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult = it->second->concordiaSearch(lemmatizedPattern, true);
-        Logger::log("concordia searched, result:");
-        Logger::logConcordiaSearchResult(*rawConcordiaResult);
         CompleteConcordiaSearchResult result = _unitDAO.getConcordiaResult(rawConcordiaResult, originalPattern);
-        Logger::log("result got");

         jsonWriter.StartObject();
         jsonWriter.String("status");
@@ -81,11 +81,8 @@ CompleteConcordiaSearchResult UnitDAO::getConcordiaResult(boost::shared_ptr<Conc
 }

 CompleteConcordiaSearchResult UnitDAO::getConcordiaResult(boost::shared_ptr<ConcordiaSearchResult> rawConcordiaResult, TokenizedSentence originalPattern) {
-    Logger::log("getConcordiaResult with original pattern");
     CompleteConcordiaSearchResult result(rawConcordiaResult->getBestOverlayScore());
     BOOST_FOREACH(MatchedPatternFragment fragment, rawConcordiaResult->getBestOverlay()) {
-        Logger::log("Working on fragment:");
-        Logger::logFragment(fragment);
         result.addToBestOverlay(_getResultFromFragment(fragment, originalPattern));
     }
     return result;
@@ -95,7 +92,6 @@ SimpleSearchResult UnitDAO::_getResultFromFragment(
         const MatchedPatternFragment & fragment,
         const TokenizedSentence & tokenizedPattern) {

-    Logger::log("getResultFromFragment");
     DBconnection connection;
     connection.startTransaction();

@@ -103,15 +99,11 @@ SimpleSearchResult UnitDAO::_getResultFromFragment(
     int matchedPatternEnd = 0;
     if (tokenizedPattern.getTokens().size() > 0) {
         // if it is concordia searching
-        Logger::logInt("tokenizedPattern size",tokenizedPattern.getTokens().size());
-        Logger::logInt("fragment start",fragment.getStart());
-        Logger::logInt("fragment matched length",fragment.getMatchedLength());
         matchedPatternStart = tokenizedPattern.getTokens().at(fragment.getStart()).getStart();
         matchedPatternEnd = tokenizedPattern.getTokens().at(fragment.getStart()+fragment.getMatchedLength() - 1).getEnd();
     }

     SimpleSearchResult ssResult(matchedPatternStart, matchedPatternEnd);
-    Logger::log("simple search result created");

     BOOST_FOREACH(SubstringOccurence sOccurence, fragment.getOccurences()) {
         std::string query = "SELECT id, source_segment, target_segment, source_tokens[$1::integer], source_tokens[$2::integer] FROM unit WHERE id = $3::integer;";
@@ -218,24 +210,18 @@ int UnitDAO::_addAlignedUnit (
         const std::vector<std::vector<int> > & alignments,
         const int tmId) throw(ConcordiaException) {

-    if (sourceSentence.getTokens().size() < alignments.size()) {
+    if (sourceSentence.getTokens().size() != alignments.size()) {
         // Here we check if the source sentence, taken from src.tok,
         // is shorter than alignments array.
         std::stringstream ss;
-        ss << "The size of source sentence is lower than the size of alignments array. Source sentence: " << sourceSentence.getSentence() << ", alignments size:" << alignments.size();
+        ss << "The size of source sentence is different than the size of alignments array. Source sentence: " << sourceSentence.getSentence() << ", alignments size:" << alignments.size();
         throw ConcordiaException(ss.str());
-    } else if (sourceSentence.getTokens().size() > alignments.size()) {
-        // On the other hand, alignments array can be shorter than the source tokenized
-        // sentence, because giza can truncate the sentence. In this case, we have to
-        // truncate the source sentence too.
-
-
     }

     std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
     std::vector<QueryParam*> params;
-    params.push_back(new StringParam(sourceSentence.getSentence()));
-    params.push_back(new StringParam(targetSentence.getSentence()));
+    params.push_back(new StringParam(sourceSentence.getOriginalSentence()));
+    params.push_back(new StringParam(targetSentence.getOriginalSentence()));
     params.push_back(new IntParam(tmId));
     params.push_back(new IntArrayParam(_getTokenPositions(sourceSentence)));
     params.push_back(new IntArrayParam(_getTokenPositions(targetSentence)));
@@ -2,30 +2,22 @@
 # Concordia configuration file
 #---------------------------
 #

-#-------------------------------------------------------------------------------
-# The following settings control the sentence anonymizer mechanism. It is used to
-# remove unnecessary symbols and possibly words from sentences added to index
-# and search patterns. Anonymizer removes html tags, substitutes predefined symbols
-# with a single space, removes stop words (if the option is enabled), as well as
-# named entities and special symbols. All these have to be listed in files.

 # File containing all html tags (one per line)
-html_tags_path = "@RESOURCES_DIRECTORY@/anonymizer/html_tags.txt"
+html_tags_path = "@RESOURCES_DIRECTORY@/tokenizer/html_tags.txt"

 # File containing all symbols to be replaced by spaces
-space_symbols_path = "@RESOURCES_DIRECTORY@/anonymizer/space_symbols.txt"
+space_symbols_path = "@RESOURCES_DIRECTORY@/tokenizer/space_symbols.txt"

 # If set to true, words from predefined list are removed
 stop_words_enabled = "@STOP_WORDS_ENABLED@"

 # If stop_words_enabled is true, set the path to the stop words file
-#stop_words_path = "@RESOURCES_DIRECTORY@/anonymizer/stop_words.txt"
+#stop_words_path = "@RESOURCES_DIRECTORY@/tokenizer/stop_words.txt"

 # File containing regular expressions that match named entities
-named_entities_path = "@RESOURCES_DIRECTORY@/anonymizer/named_entities.txt"
+named_entities_path = "@RESOURCES_DIRECTORY@/tokenizer/named_entities.txt"

 # File containing special symbols (one per line) to be removed
-stop_symbols_path = "@RESOURCES_DIRECTORY@/anonymizer/stop_symbols.txt"
+stop_symbols_path = "@RESOURCES_DIRECTORY@/tokenizer/stop_symbols.txt"

 ### eof
@@ -1,16 +1,16 @@
-SRC_LANG=en
-TRG_LANG=pl
+SRC_LANG=pl
+TRG_LANG=en
 CORPUS_NAME=europarl_sample
+SEPARATOR=@\#@

-all: corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
+all: corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt

-corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
+corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
 	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
 	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt

 clean-intermediate-files:
 	rm -f corpora/$(CORPUS_NAME)/*.lem
-	rm -f corpora/$(CORPUS_NAME)/*.low
 	rm -f corpora/$(CORPUS_NAME)/*.classes
 	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
 	rm -f corpora/$(CORPUS_NAME)/*.vcb
@@ -18,47 +18,51 @@ clean-intermediate-files:
 	rm -f corpora/$(CORPUS_NAME)/*.cooc
 	rm -f corpora/$(CORPUS_NAME)/aligned*part*
 	rm -f corpora/$(CORPUS_NAME)/giza.cfg
+	rm -f corpora/$(CORPUS_NAME)/pasted.txt
+	rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
+	rm -f corpora/$(CORPUS_NAME)/src_deduplicated.txt
+	rm -f corpora/$(CORPUS_NAME)/trg_deduplicated.txt
+	rm -f corpora/$(CORPUS_NAME)/src_deduplicated.tok
+	rm -f corpora/$(CORPUS_NAME)/trg_deduplicated.tok
+	rm -f corpora/$(CORPUS_NAME)/src_clean.tok
+	rm -f corpora/$(CORPUS_NAME)/trg_clean.tok

-clean:
-	rm -f corpora/$(CORPUS_NAME)/*.tok
-	rm -f corpora/$(CORPUS_NAME)/*.lem
-	rm -f corpora/$(CORPUS_NAME)/*.low
-	rm -f corpora/$(CORPUS_NAME)/*.classes
-	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
-	rm -f corpora/$(CORPUS_NAME)/*.vcb
-	rm -f corpora/$(CORPUS_NAME)/*.snt
-	rm -f corpora/$(CORPUS_NAME)/*.cooc
+clean: clean-intermediate-files
+	rm -f corpora/$(CORPUS_NAME)/src_clean.txt
+	rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
 	rm -f corpora/$(CORPUS_NAME)/aligned*
-	rm -f corpora/$(CORPUS_NAME)/giza.cfg

 corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
 	sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/' < $< > $@

-corpora/$(CORPUS_NAME)/src.low_trg.low.cooc: corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt
-	mgiza/mgizapp/bin/snt2cooc $@ corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt
+corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc: corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt
+	mgiza/mgizapp/bin/snt2cooc $@ corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt

-corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/trg.low_src.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb: corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
-	mgiza/mgizapp/bin/plain2snt corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
+corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/trg.lem_src.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb: corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem
+	mgiza/mgizapp/bin/plain2snt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem

-corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.low
+corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.lem
 	mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@

-corpora/$(CORPUS_NAME)/%.low: corpora/$(CORPUS_NAME)/%.lem
-	tr '[:upper:]' '[:lower:]' < $< > $@
-
-corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.tok
+corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg_clean.tok
 	mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(TRG_LANG) < $< > $@

-corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok
+corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src_clean.tok
 	mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@

-corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt
-	europarl/tools/tokenizer.perl -l $(SRC_LANG) < corpora/$(CORPUS_NAME)/src.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
-	europarl/tools/tokenizer.perl -l $(TRG_LANG) < corpora/$(CORPUS_NAME)/trg.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
-	./clean-corpus-n.perl corpora/$(CORPUS_NAME)/$(CORPUS_NAME) $(TRG_LANG) $(SRC_LANG) corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean 0 100
-	mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(SRC_LANG) corpora/$(CORPUS_NAME)/src.tok
-	mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(TRG_LANG) corpora/$(CORPUS_NAME)/trg.tok
-	rm corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
-	rm corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
+corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/src_deduplicated.tok corpora/$(CORPUS_NAME)/trg_deduplicated.tok
+	./clean_corpus.py $< corpora/$(CORPUS_NAME)/src_deduplicated.tok corpora/$(CORPUS_NAME)/trg_deduplicated.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)
+
+corpora/$(CORPUS_NAME)/%_deduplicated.tok: corpora/$(CORPUS_NAME)/%_deduplicated.txt
+	concordia-sentence-tokenizer -c ../concordia.cfg < $< > $@
+
+corpora/$(CORPUS_NAME)/src_deduplicated.txt corpora/$(CORPUS_NAME)/trg_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
+	./cut.py $< corpora/$(CORPUS_NAME)/src_deduplicated.txt corpora/$(CORPUS_NAME)/trg_deduplicated.txt $(SEPARATOR)
+
+corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
+	sort -k 1.13 $< | uniq -s 12 | sort > $@
+
+corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt
+	./paste.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt $(SEPARATOR)> $@
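Note on the new Makefile rules: the cleaned corpus is now built in stages. paste.py glues each source/target pair into one line prefixed with a 12-digit line index, `sort -k 1.13 | uniq -s 12 | sort` drops duplicate pairs while ignoring that prefix (the final sort restores the original order via the numeric prefix), cut.py splits the surviving pairs back into separate files, concordia-sentence-tokenizer tokenizes them, and clean_corpus.py filters by length. A hedged Python sketch of the paste-and-deduplicate step, equivalent in spirit to the shell pipeline (the paste and deduplicate helpers below are illustrative only):

# Sketch: reproduce paste.py followed by "sort -k 1.13 | uniq -s 12 | sort" in plain Python.
# Each pasted line is a 12-digit index + source + separator + target; deduplication
# compares only the text after the index and keeps the first occurrence of each pair.

def paste(src_lines, trg_lines, separator="@#@"):
    for index, (src, trg) in enumerate(zip(src_lines, trg_lines)):
        yield "%012d%s%s%s" % (index, src.strip(), separator, trg.strip())

def deduplicate(pasted_lines):
    seen = set()
    for line in pasted_lines:
        pair = line[12:]          # text after the 12-digit index
        if pair not in seen:
            seen.add(pair)
            yield line            # the index prefix keeps the original order recoverable

src = ["Ala ma kota", "Ala ma kota", "To jest dom"]
trg = ["Alice has a cat", "Alice has a cat", "This is a house"]
for line in deduplicate(paste(src, trg)):
    print(line)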
mgiza-aligner/clean_corpus.py (new executable file, 24 lines)
@@ -0,0 +1,24 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import sys
+
+max_tokens = 100
+max_ratio = 4.0
+
+separator = sys.argv[8]
+
+with open(sys.argv[1]) as pasted_file, open(sys.argv[2]) as src_deduplicated_tok, open(sys.argv[3]) as trg_deduplicated_tok, open(sys.argv[4], 'w') as src_clean, open(sys.argv[5], 'w') as trg_clean, open(sys.argv[6], 'w') as src_clean_tok, open(sys.argv[7], 'w') as trg_clean_tok:
+    for line in pasted_file:
+        src_line_orig, trg_line_orig = line.strip()[12:].split(separator)
+        src_line_tok = src_deduplicated_tok.readline().strip()
+        trg_line_tok = trg_deduplicated_tok.readline().strip()
+        src_token_count = len(src_line_tok.split())
+        trg_token_count = len(trg_line_tok.split())
+        if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
+            ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
+            if (ratio <= max_ratio):
+                src_clean.write(src_line_orig+"\n")
+                trg_clean.write(trg_line_orig+"\n")
+                src_clean_tok.write(src_line_tok+"\n")
+                trg_clean_tok.write(trg_line_tok+"\n")
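clean_corpus.py keeps a sentence pair only if both tokenized sides are non-empty, neither side exceeds max_tokens (100), and the longer side is at most max_ratio (4.0) times the shorter one. A small, standalone illustration of that acceptance test (a sketch assumed to match the script's intent, not part of the repository):

# Sketch of the per-pair filter used in clean_corpus.py.
max_tokens = 100
max_ratio = 4.0

def keep_pair(src_tok, trg_tok):
    src_count = len(src_tok.split())
    trg_count = len(trg_tok.split())
    if src_count == 0 or trg_count == 0:
        return False
    if src_count > max_tokens or trg_count > max_tokens:
        return False
    ratio = max(src_count, trg_count) / min(src_count, trg_count)
    return ratio <= max_ratio

print(keep_pair("to jest bardzo krótkie zdanie", "short"))   # False: 5 vs 1 tokens, ratio 5.0
print(keep_pair("to jest zdanie", "this is a sentence"))     # True: 3 vs 4 tokens, ratio ~1.33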
mgiza-aligner/cut.py (new executable file, 12 lines)
@@ -0,0 +1,12 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import sys
+
+separator = sys.argv[4]
+
+with open(sys.argv[1]) as pasted_file, open(sys.argv[2], 'w') as src_file, open(sys.argv[3], 'w') as trg_file:
+    for line in pasted_file:
+        src_line, trg_line = line.strip()[12:].split(separator)
+        src_file.write(src_line+"\n")
+        trg_file.write(trg_line+"\n")
@@ -1,8 +1,8 @@
 adbackoff 0
 compactadtable 1
 compactalignmentformat 0
-coocurrencefile corpora/CORPUS_NAME/src.low_trg.low.cooc
-corpusfile corpora/CORPUS_NAME/src.low_trg.low.snt
+coocurrencefile corpora/CORPUS_NAME/src.lem_trg.lem.cooc
+corpusfile corpora/CORPUS_NAME/src.lem_trg.lem.snt
 countcutoff 1e-06
 countcutoffal 1e-05
 countincreasecutoff 1e-06
@@ -84,13 +84,13 @@ probcutoff 1e-07
 probsmooth 1e-07
 readtableprefix
 restart 0
-sourcevocabularyfile corpora/CORPUS_NAME/src.low.vcb
+sourcevocabularyfile corpora/CORPUS_NAME/src.lem.vcb
 t1 1
 t2 0
 t2to3 0
 t3 0
 t345 0
-targetvocabularyfile corpora/CORPUS_NAME/trg.low.vcb
+targetvocabularyfile corpora/CORPUS_NAME/trg.lem.vcb
 tc
 testcorpusfile
 th 0
mgiza-aligner/paste.py (new executable file, 15 lines)
@@ -0,0 +1,15 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import sys
+
+separator = sys.argv[3]
+
+with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file:
+    index = 0
+    for src_line in src_file:
+        trg_line = trg_file.readline()
+        if separator in src_line or separator in trg_line:
+            raise Exception("Can not use: "+separator+" as a separator. Please set a different one in the Makefile")
+        print ("%012d%s%s%s" % (index, src_line.strip(), separator, trg_line.strip()))
+        index += 1
@@ -1,3 +0,0 @@
-[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
-[\w\._\d]+@\w+(\.\w+)* ne_email
-[0-9]+([\.\,][0-9]+)? ne_number

resources/tokenizer/named_entities.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
+[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4} ne_date
+[0-9]{4}[\.\-/][0-9]{1,2}[\.\-/][0-9]{1,2} ne_date
+[\w\._\d]+@\w+(\.\w+)* ne_email
+[0-9]+[\.\)]([0-9]+\.)+ ne_bullet
+\b[0-9]+([\.\,][0-9]+)?\b ne_number
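The rewritten named_entities.txt adds a year-first date pattern and a bullet pattern, and puts word boundaries around the number pattern, presumably so digits embedded in longer tokens are no longer matched. A quick standalone check of the new expressions (a sketch only; the file itself is consumed by Concordia's tokenizer):

# Sketch: try the new named-entity patterns against a few sample strings.
import re

patterns = [
    (r"[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4}", "ne_date"),
    (r"[0-9]{4}[\.\-/][0-9]{1,2}[\.\-/][0-9]{1,2}", "ne_date"),
    (r"[\w\._\d]+@\w+(\.\w+)*", "ne_email"),
    (r"[0-9]+[\.\)]([0-9]+\.)+", "ne_bullet"),
    (r"\b[0-9]+([\.\,][0-9]+)?\b", "ne_number"),
]

for text in ["12.05.2015", "2015-05-12", "jan.kowalski@example.com", "1.2.3.", "price 12,50"]:
    for pattern, label in patterns:
        match = re.search(pattern, text)
        if match:
            print("%-25s -> %s (%s)" % (text, label, match.group(0)))
            break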
@@ -1,7 +1,7 @@
 #!/bin/sh

-CORPUS_NAME="setimes_enhr"
-SRC_LANG_ID=2
-TRG_LANG_ID=6
+CORPUS_NAME="europarl_sample"
+SRC_LANG_ID=1
+TRG_LANG_ID=2

-./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src.tok $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg.tok $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
+./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_clean.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_clean.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt