2017-01-21 17:01:15 +01:00
|
|
|
# Pipeline configuration. Any of these can be overridden on the command
# line, e.g. `make CORPUS_NAME=my_corpus SRC_LANG=de`.

# Language code handed to the tokenizer, the lemmatizer and the corpus
# cleaner for the source side.
SRC_LANG := en

# Language code handed to the same tools for the target side.
TRG_LANG := pl

# Name of the sub-directory of corpora/ that holds src.txt / trg.txt and
# receives every generated file.
CORPUS_NAME := europarl_sample
|
2017-01-21 17:01:15 +01:00
|
|
|
|
2017-03-11 21:48:25 +01:00
|
|
|
# Default goal: build the sorted word alignments plus the cleaned, tokenized
# source and target corpora.
# Declared phony so that a stray file named `all` cannot make this goal look
# up to date.
.PHONY: all
all: corpora/$(CORPUS_NAME)/aligned.txt \
     corpora/$(CORPUS_NAME)/src.tok \
     corpora/$(CORPUS_NAME)/trg.tok
|
|
|
|
|
|
|
|
# Word-align the corpus: run mgiza with the generated configuration file
# (first prerequisite), then concatenate every file matching aligned*part*
# in the corpus directory and pipe the result through sortGizaAlignments.py.
# NOTE(review): the aligned*part* files are presumably written by mgiza using
# an output prefix configured in giza.cfg -- confirm against giza.cfg.pattern.
#
# The sorted output is written to a temporary file and renamed only on
# success, so a failing sort step can never leave a truncated aligned.txt
# that Make would consider up to date on the next run.
corpora/$(CORPUS_NAME)/aligned.txt: \
    corpora/$(CORPUS_NAME)/giza.cfg \
    corpora/$(CORPUS_NAME)/src.low_trg.low.cooc \
    corpora/$(CORPUS_NAME)/src.low_trg.low.snt \
    corpora/$(CORPUS_NAME)/src.low.vcb \
    corpora/$(CORPUS_NAME)/trg.low.vcb
	mgiza/mgizapp/bin/mgiza $<
	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
|
|
|
|
|
|
|
|
# Remove the intermediate products of the pipeline while keeping the
# tokenized corpora (*.tok) and the final aligned.txt: only the aligned*part*
# pieces are deleted, not aligned.txt itself.
# Declared phony because it names an action, not a file.
.PHONY: clean-intermediate-files
clean-intermediate-files:
	rm -f corpora/$(CORPUS_NAME)/*.lem
	rm -f corpora/$(CORPUS_NAME)/*.low
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
	rm -f corpora/$(CORPUS_NAME)/*.vcb
	rm -f corpora/$(CORPUS_NAME)/*.snt
	rm -f corpora/$(CORPUS_NAME)/*.cooc
	rm -f corpora/$(CORPUS_NAME)/aligned*part*
	rm -f corpora/$(CORPUS_NAME)/giza.cfg
|
|
|
|
|
2017-01-21 17:01:15 +01:00
|
|
|
|
|
|
|
# Remove every generated file in the corpus directory, including the
# tokenized corpora (*.tok) and all aligned* files (aligned.txt as well as
# the aligned*part* pieces). The src.txt / trg.txt inputs are not touched.
# Declared phony because it names an action, not a file.
.PHONY: clean
clean:
	rm -f corpora/$(CORPUS_NAME)/*.tok
	rm -f corpora/$(CORPUS_NAME)/*.lem
	rm -f corpora/$(CORPUS_NAME)/*.low
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
	rm -f corpora/$(CORPUS_NAME)/*.vcb
	rm -f corpora/$(CORPUS_NAME)/*.snt
	rm -f corpora/$(CORPUS_NAME)/*.cooc
	rm -f corpora/$(CORPUS_NAME)/aligned*
	rm -f corpora/$(CORPUS_NAME)/giza.cfg
|
|
|
|
|
|
|
|
# Instantiate the mgiza configuration for this corpus by replacing the
# literal placeholder CORPUS_NAME in giza.cfg.pattern with the value of
# $(CORPUS_NAME). Without the `g` flag, sed replaces only the first
# occurrence on each line.
#
# Make expands $(CORPUS_NAME) before the shell runs, so the value is kept
# inside the single-quoted sed script; that way the shell never word-splits
# or glob-expands it.
corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
	sed 's/CORPUS_NAME/$(CORPUS_NAME)/' < $< > $@
|
|
|
|
|
|
|
|
# Build the co-occurrence file with snt2cooc. The target path is passed as
# the first argument, followed by the source vocabulary, the target
# vocabulary and the sentence file -- exactly the prerequisite list, in
# order, which is why $^ is used.
corpora/$(CORPUS_NAME)/src.low_trg.low.cooc: \
    corpora/$(CORPUS_NAME)/src.low.vcb \
    corpora/$(CORPUS_NAME)/trg.low.vcb \
    corpora/$(CORPUS_NAME)/src.low_trg.low.snt
	mgiza/mgizapp/bin/snt2cooc $@ $^
|
|
|
|
|
|
|
|
# Run plain2snt once on the lowercased source and target corpora. This rule
# treats both .snt sentence files and both .vcb vocabulary files as the
# products of that single invocation.
#
# An explicit rule with several targets is really one independent rule per
# target, so `make -j` could start plain2snt several times at once, all
# writing the same files. A pattern rule with several targets is instead
# treated by GNU Make as ONE recipe producing all of them. The `%` below
# only ever matches the "." in each file name.
corpora/$(CORPUS_NAME)/src.low_trg.low%snt \
corpora/$(CORPUS_NAME)/trg.low_src.low%snt \
corpora/$(CORPUS_NAME)/src.low%vcb \
corpora/$(CORPUS_NAME)/trg.low%vcb: \
    corpora/$(CORPUS_NAME)/src%low \
    corpora/$(CORPUS_NAME)/trg%low
	mgiza/mgizapp/bin/plain2snt corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low

# Recipe-less explicit rule: it names src.low and trg.low explicitly so Make
# does not classify them as intermediate files of an implicit-rule chain and
# delete them after the build. The recipe still comes from the pattern rule.
corpora/$(CORPUS_NAME)/src.low.vcb: corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
|
|
|
|
|
|
|
|
# Pattern rule: derive <name>.classes from <name>.low with mkcls. The
# lowercased corpus is handed over via -p and the target file via -V;
# -n10 is passed through unchanged.
corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.low
	mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@
|
|
|
|
|
2017-03-07 10:24:08 +01:00
|
|
|
# Pattern rule: lowercase the lemmatized corpus <name>.lem into <name>.low.
#
# GNU tr works on single bytes, so `tr '[:upper:]' '[:lower:]'` leaves
# non-ASCII uppercase letters untouched (e.g. Polish letters when
# TRG_LANG=pl). Perl's lc with -CSD decodes stdin/stdout as UTF-8 and
# lowercases by Unicode rules instead.
# NOTE(review): assumes the .lem files are UTF-8 encoded -- confirm.
corpora/$(CORPUS_NAME)/%.low: corpora/$(CORPUS_NAME)/%.lem
	perl -CSD -ne 'print lc' < $< > $@
|
|
|
|
|
2017-03-07 10:24:08 +01:00
|
|
|
# Lemmatize the tokenized target corpus: the LemmaGen sentence lemmatizer is
# run under mono with $(TRG_LANG) as its only argument, reading trg.tok on
# stdin and writing trg.lem on stdout.
corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.tok
	mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe \
		$(TRG_LANG) < $< > $@
|
|
|
|
|
|
|
|
|
|
|
|
# Lemmatize the tokenized source corpus: the LemmaGen sentence lemmatizer is
# run under mono with $(SRC_LANG) as its only argument, reading src.tok on
# stdin and writing src.lem on stdout.
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok
	mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe \
		$(SRC_LANG) < $< > $@
|
|
|
|
|
2017-03-11 21:48:25 +01:00
|
|
|
# Tokenize and clean the raw parallel corpus in one pass:
#   1. tokenize src.txt and trg.txt into the scratch files
#      $(CORPUS_NAME).$(SRC_LANG) and $(CORPUS_NAME).$(TRG_LANG);
#   2. run clean-corpus-n.perl over the scratch pair with the limits 0 and
#      100, writing $(CORPUS_NAME)_clean.<lang>;
#   3. rename the cleaned files to src.tok / trg.tok and delete the scratch
#      files.
#
# Both .tok files come out of ONE run of this recipe. Written as an explicit
# rule with two targets, `make -j` could run the recipe twice concurrently
# and the two runs would overwrite each other's scratch files. A pattern
# rule with several targets is treated by GNU Make as a single recipe that
# produces all of them; the `%` below only ever matches the "." in each
# file name.
corpora/$(CORPUS_NAME)/src%tok \
corpora/$(CORPUS_NAME)/trg%tok: \
    corpora/$(CORPUS_NAME)/src%txt \
    corpora/$(CORPUS_NAME)/trg%txt
	europarl/tools/tokenizer.perl -l $(SRC_LANG) < corpora/$(CORPUS_NAME)/src.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
	europarl/tools/tokenizer.perl -l $(TRG_LANG) < corpora/$(CORPUS_NAME)/trg.txt > corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
	./clean-corpus-n.perl corpora/$(CORPUS_NAME)/$(CORPUS_NAME) $(TRG_LANG) $(SRC_LANG) corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean 0 100
	mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(SRC_LANG) corpora/$(CORPUS_NAME)/src.tok
	mv corpora/$(CORPUS_NAME)/$(CORPUS_NAME)_clean.$(TRG_LANG) corpora/$(CORPUS_NAME)/trg.tok
	rm -f corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(SRC_LANG)
	rm -f corpora/$(CORPUS_NAME)/$(CORPUS_NAME).$(TRG_LANG)
|