2019-02-04 15:27:56 +01:00
|
|
|
# ---------------------------------------------------------------------------
# User-tunable settings.  All of them are plain literals, so simple (`:=`)
# assignment is used; they can still be overridden on the command line,
# e.g. `make CORPUS_NAME=other`.
# ---------------------------------------------------------------------------

# Language codes handed to the helper scripts (lemmatizer, dictionary
# collector, corpus preparation).
SRC_LANG := pl
TRG_LANG := en

# Name of the sub-directory of corpora/ that holds the corpus being processed.
CORPUS_NAME := opensubtitles

# The backslash stops make from reading `#` as the start of a comment, so the
# value is the literal string @#@.
SEPARATOR := @\#@

# Third argument passed to collect_dict.py when the dictionaries are built.
DICTIONARY_WEIGHT := 3
|
|
|
|
|
2019-02-21 14:02:51 +01:00
|
|
|
# Default goal: build the alignment file together with the cleaned source
# text, cleaned source lemma file and cleaned target text of the selected
# corpus.
# Declared phony so that a stray file named `all` can never make this goal
# look up to date.
.PHONY: all
all: corpora/$(CORPUS_NAME)/alignments.txt \
     corpora/$(CORPUS_NAME)/src_clean.txt \
     corpora/$(CORPUS_NAME)/src_clean.lem \
     corpora/$(CORPUS_NAME)/trg_clean.txt
|
2019-02-04 15:27:56 +01:00
|
|
|
|
|
|
|
|
2019-02-06 15:34:08 +01:00
|
|
|
# Remove the files this Makefile generates for the selected corpus.
# The input files (src.txt, trg.txt, ids.txt) are left untouched.
# Declared phony so that a file named `clean` cannot disable the target.
.PHONY: clean
clean:
	# Lemma files and dictionaries; *.lem also covers src_clean.lem.
	rm -f corpora/$(CORPUS_NAME)/*.lem corpora/$(CORPUS_NAME)/*.dict
	# The .norm files are written by the same recipe that writes the .lem files.
	rm -f corpora/$(CORPUS_NAME)/src.norm corpora/$(CORPUS_NAME)/trg.norm
	# Outputs of prepare_corpus.py.
	rm -f corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
	rm -f corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt
	# fast_align output and the final alignments.
	rm -f corpora/$(CORPUS_NAME)/falign_result.txt corpora/$(CORPUS_NAME)/alignments.txt
|
|
|
|
|
|
|
|
|
|
|
|
# Final alignments: get_alignments.py receives the fast_align result and the
# cleaned source lemma file (in that order, via $^) and its stdout becomes the
# target.
# If the script fails, the partially written target is removed; otherwise the
# truncated file would be newer than its prerequisites and the next `make`
# would consider it up to date.
corpora/$(CORPUS_NAME)/alignments.txt: corpora/$(CORPUS_NAME)/falign_result.txt corpora/$(CORPUS_NAME)/src_clean.lem
	./get_alignments.py $^ > $@ || { rm -f $@; exit 1; }
|
2019-02-04 15:27:56 +01:00
|
|
|
|
|
|
|
# Source-side dictionary: stdout of collect_dict.py called with the language
# pair in source -> target order and DICTIONARY_WEIGHT.
# The rule has no prerequisites, so the file is built only when it is missing.
# NOTE(review): changing DICTIONARY_WEIGHT therefore does not trigger a rebuild.
# A failed run removes the partial file so it is not mistaken for a finished
# dictionary on the next invocation.
corpora/$(CORPUS_NAME)/src.dict:
	./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@ || { rm -f $@; exit 1; }
|
|
|
|
|
|
|
|
# Target-side dictionary: stdout of collect_dict.py called with the language
# pair in target -> source order and DICTIONARY_WEIGHT.
# The rule has no prerequisites, so the file is built only when it is missing.
# NOTE(review): changing DICTIONARY_WEIGHT therefore does not trigger a rebuild.
# A failed run removes the partial file so it is not mistaken for a finished
# dictionary on the next invocation.
corpora/$(CORPUS_NAME)/trg.dict:
	./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@ || { rm -f $@; exit 1; }
|
|
|
|
|
|
|
|
|
2019-06-12 14:44:50 +02:00
|
|
|
# Source side: one run of sentence_lemmatizer.py on src.txt, given the source
# language code and the paths of the two files it writes (src.norm, src.lem).
# $(@D) is the directory part of the target, i.e. corpora/$(CORPUS_NAME).
# NOTE(review): .norm presumably holds normalized text and .lem the lemmas;
# confirm against the script.
corpora/$(CORPUS_NAME)/src.norm corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.txt
	./sentence_lemmatizer.py $< $(SRC_LANG) $(@D)/src.norm $(@D)/src.lem
|
2019-02-04 15:27:56 +01:00
|
|
|
|
2019-06-12 14:44:50 +02:00
|
|
|
# Target side: one run of sentence_lemmatizer.py on trg.txt, given the target
# language code and the paths of the two files it writes (trg.norm, trg.lem).
# $(@D) is the directory part of the target, i.e. corpora/$(CORPUS_NAME).
# NOTE(review): .norm presumably holds normalized text and .lem the lemmas;
# confirm against the script.
corpora/$(CORPUS_NAME)/trg.norm corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
	./sentence_lemmatizer.py $< $(TRG_LANG) $(@D)/trg.norm $(@D)/trg.lem
|
2019-02-04 15:27:56 +01:00
|
|
|
|
|
|
|
|
2019-02-21 14:02:51 +01:00
|
|
|
# Cleaned corpus files plus the fast_align input, all written by a single run
# of prepare_corpus.py.
#
# This is deliberately a pattern rule with several targets (the stem `%`
# matches the corpus directory name).  GNU Make treats such a rule as ONE
# recipe that produces every listed file.  An explicit rule with several
# targets is instead a set of independent rules sharing a recipe, so
# `make -j all` could start prepare_corpus.py several times concurrently, each
# run writing the same output files.
#
# The recipe reads src.norm/trg.norm rather than src.txt/trg.txt.  Those files
# are written by the sentence_lemmatizer.py rules together with src.lem and
# trg.lem, which are prerequisites here, so they exist by the time this runs.
#
# $(@D) is the directory part of the target, i.e. corpora/<corpus name>.
corpora/%/src_clean.txt \
corpora/%/src_clean.lem \
corpora/%/trg_clean.txt \
corpora/%/ids_clean.txt \
corpora/%/falign_corpus.txt: \
		corpora/%/src.txt corpora/%/trg.txt corpora/%/ids.txt \
		corpora/%/src.lem corpora/%/trg.lem \
		corpora/%/src.dict corpora/%/trg.dict
	./prepare_corpus.py \
		$(@D)/src.norm $(@D)/trg.norm $(@D)/ids.txt \
		$(@D)/src.lem $(@D)/trg.lem \
		$(@D)/src.dict $(@D)/trg.dict \
		$(@D)/src_clean.txt $(@D)/src_clean.lem $(@D)/trg_clean.txt \
		$(@D)/ids_clean.txt $(@D)/falign_corpus.txt \
		$(SRC_LANG) $(TRG_LANG)
|
2019-02-04 15:27:56 +01:00
|
|
|
|
|
|
|
|
2019-02-21 14:02:51 +01:00
|
|
|
# Run fast_align on the prepared corpus ($<) with the -d -o -v options and
# capture its stdout as the result file.
# If fast_align fails, the partially written target is removed; otherwise the
# truncated file would be newer than falign_corpus.txt and the next `make`
# would treat it as a finished result.
corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt
	./fast_align -i $< -d -o -v > $@ || { rm -f $@; exit 1; }
|