---
# Two-step monolingual English cleaning pipeline: de-duplicate the raw
# corpus, then run a chain of filters over the de-duplicated file.
#
# Recovered from a file-viewer listing (originally reported as 28 lines,
# 564 B, YAML) in which indentation was lost and table-cell artifacts were
# interleaved with the content.
# NOTE(review): the nesting below is reconstructed from the key names and
# looks like an OpusFilter configuration -- confirm against the consumer's
# schema before relying on it.
steps:
  # Step 1: remove duplicate entries from the raw corpus.
  - type: remove_duplicates
    parameters:
      inputs:
        - monolingual_data/corpora.eng.gz
      outputs:
        - monolingual_data/buff_deduped.eng.gz

  # Step 2: filter the output of step 1 (same path as the output above).
  - type: filter
    parameters:
      inputs:
        - monolingual_data/buff_deduped.eng.gz
      outputs:
        - filtered_mono_data/filtered.eng.gz
      filters:
        # Length bounds, counted in words.
        - LengthFilter:
            unit: word
            min_length: 1
            max_length: 300

        # NOTE(review): this step has a single input file; a length *ratio*
        # presumably needs two sides to compare, so this filter may be a
        # no-op here -- verify it is intended for monolingual data.
        - LengthRatioFilter:
            unit: word
            threshold: 3

        # NOTE(review): threshold is presumably the maximum word length in
        # characters -- confirm against the filter's documentation.
        - LongWordFilter:
            threshold: 40