corrections

This commit is contained in:
Rafał Jaworski 2017-03-11 21:48:25 +01:00
parent 89fb77bf58
commit 534e14db9f
5 changed files with 39 additions and 13 deletions

View File

@ -1,6 +1,7 @@
#include "unit_dao.hpp" #include "unit_dao.hpp"
#include <sstream> #include <sstream>
#include <string>
#include "query_param.hpp" #include "query_param.hpp"
#include "string_param.hpp" #include "string_param.hpp"
@ -200,8 +201,18 @@ int UnitDAO::_addAlignedUnit (
const std::vector<std::vector<int> > & alignments, const std::vector<std::vector<int> > & alignments,
const int tmId) throw(ConcordiaException) { const int tmId) throw(ConcordiaException) {
if (sourceSentence.getTokens().size() != alignments.size()) { if (sourceSentence.getTokens().size() < alignments.size()) {
throw ConcordiaException("The size of source sentence does not match the size of alignments array."); // Here we check if the source sentence, taken from src.tok,
// is shorter than alignments array.
std::stringstream ss;
ss << "The size of source sentence is lower than the size of alignments array. Source sentence: " << sourceSentence.getSentence() << ", alignments size:" << alignments.size();
throw ConcordiaException(ss.str());
} else if (sourceSentence.getTokens().size() > alignments.size()) {
// On the other hand, alignments array can be shorter than the source tokenized
// sentence, because giza can truncate the sentence. In this case, we have to
// truncate the source sentence too.
} }
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id"; std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";

View File

@ -1,13 +1,13 @@
#!/bin/sh #!/bin/sh
echo "Recreating database schema..." echo "Recreating database schema..."
psql -U concordia concordia_server -f concordia_server.sql psql -U concordia -p 6543 -h localhost concordia_server -f concordia_server.sql
echo "Inserting initial data..." echo "Inserting initial data..."
for initFile in `ls init/*` for initFile in `ls init/*`
do do
echo "Init file:" $initFile echo "Init file:" $initFile
psql -U concordia concordia_server -f $initFile psql -U concordia -p 6543 -h localhost concordia_server -f $initFile
done done
echo "Concordia server database recreation complete!" echo "Concordia server database recreation complete!"

View File

@ -1,8 +1,10 @@
SRC_LANG=en SRC_LANG=en
TRG_LANG=pl TRG_LANG=pl
CORPUS_NAME=europarljrc CORPUS_NAME=europarl_sample
all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb all: corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
@ -52,8 +54,11 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.tok
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@ mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@
corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt
europarl/tools/tokenizer.perl -l $(SRC_LANG) < $< > $@ europarl/tools/tokenizer.perl -l $(SRC_LANG) < corpora/$(CORPUS_NAME)/src.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
europarl/tools/tokenizer.perl -l $(TRG_LANG) < corpora/$(CORPUS_NAME)/trg.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt ./clean-corpus-n.perl corpora/$(CORPUS_NAME)/$(CORPUS_NAME) $(TRG_LANG) $(SRC_LANG) corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean 0 100
europarl/tools/tokenizer.perl -l $(TRG_LANG) < $< > $@ mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(SRC_LANG) corpora/$(CORPUS_NAME)/src.tok
mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(TRG_LANG) corpora/$(CORPUS_NAME)/trg.tok
rm corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
rm corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)

10
mgiza-aligner/build.sh Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
make CORPUS_NAME=setimes_enhr SRC_LANG=en TRG_LANG=hr
make CORPUS_NAME=setimes_enhr SRC_LANG=en TRG_LANG=hr clean-intermediate-files
make CORPUS_NAME=europarljrc_enpl SRC_LANG=en TRG_LANG=pl
make CORPUS_NAME=europarljrc_enpl SRC_LANG=en TRG_LANG=pl clean-intermediate-files
make CORPUS_NAME=jrc200k_plen SRC_LANG=pl TRG_LANG=en
make CORPUS_NAME=jrc200k_plen SRC_LANG=pl TRG_LANG=en clean-intermediate-files

View File

@ -1,6 +1,6 @@
#!/bin/sh #!/bin/sh
CORPUS_NAME="europarl_sample" CORPUS_NAME="europarljrc"
SRC_LANG_ID=2 SRC_LANG_ID=2
TRG_LANG_ID=1 TRG_LANG_ID=1