corrections
This commit is contained in:
parent
89fb77bf58
commit
534e14db9f
@ -1,6 +1,7 @@
|
|||||||
#include "unit_dao.hpp"
|
#include "unit_dao.hpp"
|
||||||
|
|
||||||
#include<sstream>
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include "query_param.hpp"
|
#include "query_param.hpp"
|
||||||
#include "string_param.hpp"
|
#include "string_param.hpp"
|
||||||
@ -200,8 +201,18 @@ int UnitDAO::_addAlignedUnit (
|
|||||||
const std::vector<std::vector<int> > & alignments,
|
const std::vector<std::vector<int> > & alignments,
|
||||||
const int tmId) throw(ConcordiaException) {
|
const int tmId) throw(ConcordiaException) {
|
||||||
|
|
||||||
if (sourceSentence.getTokens().size() != alignments.size()) {
|
if (sourceSentence.getTokens().size() < alignments.size()) {
|
||||||
throw ConcordiaException("The size of source sentence does not match the size of alignments array.");
|
// Here we check if the source sentence, taken from src.tok,
|
||||||
|
// is shorter than alignments array.
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "The size of source sentence is lower than the size of alignments array. Source sentence: " << sourceSentence.getSentence() << ", alignments size:" << alignments.size();
|
||||||
|
throw ConcordiaException(ss.str());
|
||||||
|
} else if (sourceSentence.getTokens().size() > alignments.size()) {
|
||||||
|
// On the other hand, alignments array can be shorter than the source tokenized
|
||||||
|
// sentence, because giza can truncate the sentence. In this case, we have to
|
||||||
|
// truncate the source sentence too.
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
|
std::string query = "INSERT INTO unit(source_segment, target_segment, tm_id, source_tokens, target_tokens) values($1::text,$2::text,$3::integer,$4,$5) RETURNING id";
|
||||||
|
@ -1,13 +1,13 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
echo "Recreating database schema..."
|
echo "Recreating database schema..."
|
||||||
psql -U concordia concordia_server -f concordia_server.sql
|
psql -U concordia -p 6543 -h localhost concordia_server -f concordia_server.sql
|
||||||
|
|
||||||
echo "Inserting initial data..."
|
echo "Inserting initial data..."
|
||||||
for initFile in `ls init/*`
|
for initFile in `ls init/*`
|
||||||
do
|
do
|
||||||
echo "Init file:" $initFile
|
echo "Init file:" $initFile
|
||||||
psql -U concordia concordia_server -f $initFile
|
psql -U concordia -p 6543 -h localhost concordia_server -f $initFile
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "Concordia server database recreation complete!"
|
echo "Concordia server database recreation complete!"
|
||||||
|
@ -1,8 +1,10 @@
|
|||||||
SRC_LANG=en
|
SRC_LANG=en
|
||||||
TRG_LANG=pl
|
TRG_LANG=pl
|
||||||
CORPUS_NAME=europarljrc
|
CORPUS_NAME=europarl_sample
|
||||||
|
|
||||||
all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
|
all: corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
|
||||||
|
|
||||||
|
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
|
||||||
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
|
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
|
||||||
cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
|
cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
|
||||||
|
|
||||||
@ -52,8 +54,11 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.tok
|
|||||||
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok
|
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok
|
||||||
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@
|
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
|
corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt
|
||||||
europarl/tools/tokenizer.perl -l $(SRC_LANG) < $< > $@
|
europarl/tools/tokenizer.perl -l $(SRC_LANG) < corpora/$(CORPUS_NAME)/src.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
|
||||||
|
europarl/tools/tokenizer.perl -l $(TRG_LANG) < corpora/$(CORPUS_NAME)/trg.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
|
||||||
corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
|
./clean-corpus-n.perl corpora/$(CORPUS_NAME)/$(CORPUS_NAME) $(TRG_LANG) $(SRC_LANG) corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean 0 100
|
||||||
europarl/tools/tokenizer.perl -l $(TRG_LANG) < $< > $@
|
mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(SRC_LANG) corpora/$(CORPUS_NAME)/src.tok
|
||||||
|
mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(TRG_LANG) corpora/$(CORPUS_NAME)/trg.tok
|
||||||
|
rm corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
|
||||||
|
rm corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
|
||||||
|
10
mgiza-aligner/build.sh
Executable file
10
mgiza-aligner/build.sh
Executable file
@ -0,0 +1,10 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
make CORPUS_NAME=setimes_enhr SRC_LANG=en TRG_LANG=hr
|
||||||
|
make CORPUS_NAME=setimes_enhr SRC_LANG=en TRG_LANG=hr clean-intermediate-files
|
||||||
|
|
||||||
|
make CORPUS_NAME=europarljrc_enpl SRC_LANG=en TRG_LANG=pl
|
||||||
|
make CORPUS_NAME=europarljrc_enpl SRC_LANG=en TRG_LANG=pl clean-intermediate-files
|
||||||
|
|
||||||
|
make CORPUS_NAME=jrc200k_plen SRC_LANG=pl TRG_LANG=en
|
||||||
|
make CORPUS_NAME=jrc200k_plen SRC_LANG=pl TRG_LANG=en clean-intermediate-files
|
@ -1,6 +1,6 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
CORPUS_NAME="europarl_sample"
|
CORPUS_NAME="europarljrc"
|
||||||
SRC_LANG_ID=2
|
SRC_LANG_ID=2
|
||||||
TRG_LANG_ID=1
|
TRG_LANG_ID=1
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user