#!/bin/bash
#
# Extracts the text of a document and pipes it through syntok.segmenter.
#
# Arguments:
#   $1 - input file to extract text from
#   $2 - extraction method; "from-tex" is handled specially by extract_text,
#        any other value (including none) takes the other branch.

input_file="$1"
method="$2"

extract_text() {
|
|
|
|
if [[ "$method" == "from-tex" ]]
|
|
|
|
then
|
2022-11-09 16:55:56 +01:00
|
|
|
detex "$input_file" | grep -E '\S' | grep -v 'unsrt' | perl -pne 's/^\s+| +$//g'
|
2022-06-06 19:54:30 +02:00
|
|
|
else
|
|
|
|
bash helpers/pdf-to-plain-text.sh "$input_file" | perl helpers/strip-references.pl | perl -pne 'chomp $_; $_.=" "'
|
|
|
|
fi
|
|
|
|
}
|
|
|
|
|
# Feed the extracted text to the syntok.segmenter module and print only the
# lines that contain at least one non-whitespace character.
extract_text \
  | python3 -m syntok.segmenter \
  | grep -E '[^[:space:]]'