# Source path: Zad_1_lang_corpus_ML/filter_config.yaml
# File-listing metadata copied from the original file view: 28 lines, 564 B, YAML.

# Two-step corpus-cleaning pipeline for the English monolingual data.
# NOTE(review): the step/filter names match the OpusFilter configuration
# schema; confirm that OpusFilter is the tool consuming this file.
steps:
  # Step 1: drop duplicate entries from the raw corpus and write the result
  # to an intermediate file that the next step reads.
  - type: remove_duplicates
    parameters:
      inputs:
        - monolingual_data/corpora.eng.gz
      outputs:
        - monolingual_data/buff_deduped.eng.gz

  # Step 2: run the deduplicated file through the filters listed below and
  # write the surviving segments to the final output file.
  - type: filter
    parameters:
      inputs:
        - monolingual_data/buff_deduped.eng.gz
      outputs:
        - filtered_mono_data/filtered.eng.gz
      filters:
        # Length bounds measured in words (1 to 300).
        - LengthFilter:
            unit: word
            min_length: 1
            max_length: 300
        # NOTE(review): a length *ratio* needs at least two parallel inputs
        # to be meaningful; with the single input above this filter is
        # presumably a no-op. Confirm, then drop it or add the second side.
        - LengthRatioFilter:
            unit: word
            threshold: 3
        # Threshold of 40 for the long-word check (presumably a maximum
        # word length in characters; verify against the tool's docs).
        - LongWordFilter:
            threshold: 40