concordia-server/mgiza-aligner/Makefile

88 lines
4.7 KiB
Makefile
Raw Normal View History

2017-10-10 17:27:44 +02:00
SRC_LANG=en
2017-07-28 20:52:29 +02:00
TRG_LANG=en
2017-10-10 17:27:44 +02:00
CORPUS_NAME=jrc_enes
2017-04-28 13:49:30 +02:00
SEPARATOR=@\#@
2017-10-10 17:27:44 +02:00
2017-04-29 23:14:08 +02:00
DICTIONARY_WEIGHT=5
2017-07-28 20:52:29 +02:00
2017-04-29 23:14:08 +02:00
all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
2017-03-11 21:48:25 +01:00
2017-04-28 13:49:30 +02:00
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
2017-01-21 17:01:15 +01:00
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
2017-03-10 14:52:01 +01:00
cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
clean-intermediate-files:
rm -f corpora/$(CORPUS_NAME)/*.lem
2017-04-29 23:14:08 +02:00
rm -f corpora/$(CORPUS_NAME)/*.tok
rm -f corpora/$(CORPUS_NAME)/*.dict
2017-03-10 14:52:01 +01:00
rm -f corpora/$(CORPUS_NAME)/*.classes
rm -f corpora/$(CORPUS_NAME)/*.classes.cats
rm -f corpora/$(CORPUS_NAME)/*.vcb
rm -f corpora/$(CORPUS_NAME)/*.snt
rm -f corpora/$(CORPUS_NAME)/*.cooc
rm -f corpora/$(CORPUS_NAME)/aligned*part*
2017-04-29 23:14:08 +02:00
rm -f corpora/$(CORPUS_NAME)/aligned.txt
2017-03-10 14:52:01 +01:00
rm -f corpora/$(CORPUS_NAME)/giza.cfg
2017-04-29 23:14:08 +02:00
rm -f corpora/$(CORPUS_NAME)/aligned.gizacfg
2017-04-28 13:49:30 +02:00
rm -f corpora/$(CORPUS_NAME)/pasted.txt
rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
rm -f corpora/$(CORPUS_NAME)/src_clean.txt
rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
2017-04-29 23:14:08 +02:00
clean: clean-intermediate-files
rm -f corpora/$(CORPUS_NAME)/src_final.txt
rm -f corpora/$(CORPUS_NAME)/trg_final.txt
rm -f corpora/$(CORPUS_NAME)/aligned_final.txt
2017-01-21 17:01:15 +01:00
corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/' < $< > $@
2017-04-28 13:49:30 +02:00
corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc: corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt
mgiza/mgizapp/bin/snt2cooc $@ corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt
2017-01-21 17:01:15 +01:00
2017-04-28 13:49:30 +02:00
corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/trg.lem_src.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb: corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem
mgiza/mgizapp/bin/plain2snt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem
2017-01-21 17:01:15 +01:00
2017-04-28 13:49:30 +02:00
corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.lem
2017-01-21 17:01:15 +01:00
mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@
2017-04-29 23:14:08 +02:00
corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg_clean.lem corpora/$(CORPUS_NAME)/trg.dict
cat corpora/$(CORPUS_NAME)/trg_clean.lem corpora/$(CORPUS_NAME)/trg.dict > $@
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/src.dict
cat corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/src.dict > $@
corpora/$(CORPUS_NAME)/src.dict:
./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@
2017-03-07 10:24:08 +01:00
2017-04-29 23:14:08 +02:00
corpora/$(CORPUS_NAME)/trg.dict:
./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@
2017-03-07 10:24:08 +01:00
2017-04-29 23:14:08 +02:00
corpora/$(CORPUS_NAME)/trg_clean.lem: corpora/$(CORPUS_NAME)/trg_clean.tok
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(TRG_LANG) < $< > $@
corpora/$(CORPUS_NAME)/src_clean.lem: corpora/$(CORPUS_NAME)/src_clean.tok
2017-03-07 10:24:08 +01:00
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@
2017-04-28 13:49:30 +02:00
2017-04-29 23:14:08 +02:00
corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
2017-06-26 13:45:10 +02:00
/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< > $@
2017-04-28 13:49:30 +02:00
2017-04-29 23:14:08 +02:00
corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
2017-06-26 13:45:10 +02:00
/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< > $@
2017-04-28 13:49:30 +02:00
2017-04-29 23:14:08 +02:00
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)
corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR)> $@
2017-04-28 13:49:30 +02:00
corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
sort -k 1.13 $< | uniq -s 12 | sort > $@
2017-04-29 23:14:08 +02:00
corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/aligned.txt
./extract.py $< corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt $(SEPARATOR)