corrections
This commit is contained in:
parent
89fb77bf58
commit
534e14db9f
@ -1,6 +1,7 @@
|
||||
#include "unit_dao.hpp"
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "query_param.hpp"
|
||||
#include "string_param.hpp"
|
||||
@ -200,8 +201,18 @@ int UnitDAO::_addAlignedUnit (
|
||||
const std::vector<std::vector<int> > & alignments,
|
||||
const int tmId) throw(ConcordiaException) {
|
||||
|
||||
if (sourceSentence.getTokens().size() != alignments.size()) {
|
||||
throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
|
||||
if (sourceSentence.getTokens().size() < alignments.size()) {
|
||||
// Here we check if the source sentence, taken from src.tok,
|
||||
// is shorter than alignments array.
|
||||
std::stringstream ss;
|
||||
ss << "The size of source sentence is lower than the size of alignments array. Source sentence: " << sourceSentence.getSentence() << ", alignments size:" << alignments.size();
|
||||
throw ConcordiaException(ss.str());
|
||||
} else if (sourceSentence.getTokens().size() > alignments.size()) {
|
||||
// On the other hand, alignments array can be shorter than the source tokenized
|
||||
// sentence, because giza can truncate the sentence. In this case, we have to
|
||||
// truncate the source sentence too.
|
||||
|
||||
|
||||
}
|
||||
|
||||
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
|
||||
|
@ -1,13 +1,13 @@
|
||||
#!/bin/sh
|
||||
|
||||
echo "Recreating database schema..."
|
||||
psql -U concordia concordia_server -f concordia_server.sql
|
||||
psql -U concordia -p 6543 -h localhost concordia_server -f concordia_server.sql
|
||||
|
||||
echo "Inserting initial data..."
|
||||
for initFile in `ls init/*`
|
||||
do
|
||||
echo "Init file:" $initFile
|
||||
psql -U concordia concordia_server -f $initFile
|
||||
psql -U concordia -p 6543 -h localhost concordia_server -f $initFile
|
||||
done
|
||||
|
||||
echo "Concordia server database recreation complete!"
|
||||
|
@ -1,8 +1,10 @@
|
||||
SRC_LANG=en
|
||||
TRG_LANG=pl
|
||||
CORPUS_NAME=europarljrc
|
||||
CORPUS_NAME=europarl_sample
|
||||
|
||||
all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
|
||||
all: corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
|
||||
|
||||
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
|
||||
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
|
||||
cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
|
||||
|
||||
@ -52,8 +54,11 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.tok
|
||||
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok
|
||||
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@
|
||||
|
||||
corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
|
||||
europarl/tools/tokenizer.perl -l $(SRC_LANG) < $< > $@
|
||||
|
||||
corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
|
||||
europarl/tools/tokenizer.perl -l $(TRG_LANG) < $< > $@
|
||||
corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt
|
||||
europarl/tools/tokenizer.perl -l $(SRC_LANG) < corpora/$(CORPUS_NAME)/src.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
|
||||
europarl/tools/tokenizer.perl -l $(TRG_LANG) < corpora/$(CORPUS_NAME)/trg.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
|
||||
./clean-corpus-n.perl corpora/$(CORPUS_NAME)/$(CORPUS_NAME) $(TRG_LANG) $(SRC_LANG) corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean 0 100
|
||||
mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(SRC_LANG) corpora/$(CORPUS_NAME)/src.tok
|
||||
mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(TRG_LANG) corpora/$(CORPUS_NAME)/trg.tok
|
||||
rm corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
|
||||
rm corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
|
||||
|
10
mgiza-aligner/build.sh
Executable file
10
mgiza-aligner/build.sh
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/sh
|
||||
|
||||
make CORPUS_NAME=setimes_enhr SRC_LANG=en TRG_LANG=hr
|
||||
make CORPUS_NAME=setimes_enhr SRC_LANG=en TRG_LANG=hr clean-intermediate-files
|
||||
|
||||
make CORPUS_NAME=europarljrc_enpl SRC_LANG=en TRG_LANG=pl
|
||||
make CORPUS_NAME=europarljrc_enpl SRC_LANG=en TRG_LANG=pl clean-intermediate-files
|
||||
|
||||
make CORPUS_NAME=jrc200k_plen SRC_LANG=pl TRG_LANG=en
|
||||
make CORPUS_NAME=jrc200k_plen SRC_LANG=pl TRG_LANG=en clean-intermediate-files
|
@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
CORPUS_NAME="europarl_sample"
|
||||
CORPUS_NAME="europarljrc"
|
||||
SRC_LANG_ID=2
|
||||
TRG_LANG_ID=1
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user