added lemmatization

This commit is contained in:
Rafał Jaworski 2017-03-07 10:24:08 +01:00
parent 015a916d20
commit 4883cce8a5

View File

@ -8,6 +8,7 @@ all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc
# Remove per-corpus pipeline outputs (matched by extension) for $(CORPUS_NAME).
# `clean` names an action, not a file, so it is declared phony: a stray file
# called "clean" in the working directory must not suppress this recipe.
.PHONY: clean
clean:
	rm -f corpora/$(CORPUS_NAME)/*.tok
	rm -f corpora/$(CORPUS_NAME)/*.lem
	rm -f corpora/$(CORPUS_NAME)/*.low
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
@ -29,9 +30,16 @@ corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/trg.low_src.lo
# Build the word-class file for one corpus side from its lowercased text.
# mkcls is given the .low file via -p and the .classes target via -V; -n10 is
# passed through as-is (presumably the number of optimisation runs — confirm
# against the mkcls usage text).
corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.low
	mgiza/mgizapp/bin/mkcls \
		-n10 \
		-p$< \
		-V$@
# Lowercase the lemmatized text of a corpus side (e.g. src.lem -> src.low).
# The only prerequisite is the .lem file; .low is not built directly from the
# tokenized text.
# Output is written to a temporary file and renamed only on success, so a
# failed run cannot leave a truncated .low that make would treat as up to date.
# NOTE(review): tr implementations that operate byte-by-byte will not lowercase
# multibyte (e.g. UTF-8 accented) letters — confirm this is acceptable for the
# languages in use.
corpora/$(CORPUS_NAME)/%.low: corpora/$(CORPUS_NAME)/%.lem
	tr '[:upper:]' '[:lower:]' < $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Lemmatize the tokenized target-side text: LemmaGenSentenceLemmatizer.exe is
# run under mono with $(TRG_LANG) as its only argument, reading trg.tok on
# stdin and writing the result on stdout.
# The output goes to a temporary file that is renamed only on success, so a
# failed run (e.g. missing executable) cannot leave an empty trg.lem that make
# would consider up to date.
corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.tok
	mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(TRG_LANG) < $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Lemmatize the tokenized source-side text: LemmaGenSentenceLemmatizer.exe is
# run under mono with $(SRC_LANG) as its only argument, reading src.tok on
# stdin and writing the result on stdout.
# The output goes to a temporary file that is renamed only on success, so a
# failed run (e.g. missing executable) cannot leave an empty src.lem that make
# would consider up to date.
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.tok
	mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Tokenize the raw source-side text with europarl/tools/tokenizer.perl, passing
# $(SRC_LANG) through its -l option; src.txt is read on stdin.
# The output goes to a temporary file that is renamed only on success, so a
# failed tokenizer run cannot leave a truncated src.tok that make would
# consider up to date.
corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
	europarl/tools/tokenizer.perl -l $(SRC_LANG) < $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@