#!/bin/bash -v

# This sample script preprocesses the dev/test inputs, including tokenization,
# truecasing, and subword segmentation.
# For application to a different language pair, change the source and target
# suffixes, optionally the number of BPE operations, and the file names
# (currently, dev-0/in.tsv and test-A/in.tsv are being processed).

# The Romanian-specific normalization / diacritic removal from the original
# example has been removed from the tokenization step; you may want to add
# your own language-specific normalization.
# Also, you may want to learn BPE segmentations separately for each language,
# especially if they differ in their alphabet.

# Suffix of source language files
SRC=cs

# Suffix of target language files
TRG=en

# Number of merge operations. Network vocabulary should be slightly larger (to
# include characters), or smaller if the operations are learned on the joint
# vocabulary.
bpe_operations=85000

# path to moses decoder: https://github.com/moses-smt/mosesdecoder
mosesdecoder=/home/jakub/TAU/TAU_24/marian-dev/examples/tools/moses-scripts

# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
subword_nmt=/home/jakub/TAU/TAU_24/marian-dev/examples/tools/subword-nmt

# tokenize (tokenizer.perl's -l option expects a language code, so pass $SRC
# rather than the file extension)
cat dev-0/in.tsv \
    | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > dev-0/in.tok.tsv

cat test-A/in.tsv \
    | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > test-A/in.tok.tsv

# apply truecaser (dev/test files)
for prefix in dev-0/in test-A/in
do
    $mosesdecoder/scripts/recaser/truecase.perl -model preprocess-train/model/tc.$SRC \
        < $prefix.tok.tsv > $prefix.tc.$SRC
done
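
# The header mentions subword segmentation, but no BPE step appears above.
# The sketch below applies BPE to the truecased dev/test files; the codes
# path (preprocess-train/model/$SRC$TRG.bpe) is an assumption and should be
# replaced with the codes actually learned on the training data
# (e.g. via $subword_nmt/learn_bpe.py -s $bpe_operations).

# apply BPE (dev/test files) -- sketch, adjust the codes path as needed
for prefix in dev-0/in test-A/in
do
    $subword_nmt/apply_bpe.py -c preprocess-train/model/$SRC$TRG.bpe \
        < $prefix.tc.$SRC > $prefix.bpe.$SRC
done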