concordia-server/fast-aligner/Makefile
2019-03-04 11:12:19 +01:00

45 lines
2.4 KiB
Makefile

SRC_LANG=pl
TRG_LANG=en
CORPUS_NAME=opensubtitles
SEPARATOR=@\#@
DICTIONARY_WEIGHT=3
all: corpora/$(CORPUS_NAME)/alignments.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt
clean:
rm -f corpora/$(CORPUS_NAME)/*.lem
rm -f corpora/$(CORPUS_NAME)/*.dict
rm -f corpora/$(CORPUS_NAME)/src_clean.txt
rm -f corpora/$(CORPUS_NAME)/src_clean.lem
rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
rm -f corpora/$(CORPUS_NAME)/ids_clean.txt
rm -f corpora/$(CORPUS_NAME)/falign_corpus.txt
rm -f corpora/$(CORPUS_NAME)/falign_result.txt
rm -f corpora/$(CORPUS_NAME)/alignments.txt
corpora/$(CORPUS_NAME)/alignments.txt: corpora/$(CORPUS_NAME)/falign_result.txt corpora/$(CORPUS_NAME)/src_clean.lem
./get_alignments.py corpora/$(CORPUS_NAME)/falign_result.txt corpora/$(CORPUS_NAME)/src_clean.lem > $@
corpora/$(CORPUS_NAME)/src.dict:
./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@
corpora/$(CORPUS_NAME)/trg.dict:
./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.txt
/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(SRC_LANG) > $@
corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(TRG_LANG) > $@
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict
./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG)
corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt
fast_align -i $< -d -o -v > $@