This commit is contained in:
Filip Gralinski 2022-11-09 16:55:56 +01:00
parent cdd82857d5
commit cce71acc1b

View File

@@ -6,10 +6,10 @@ method="$2"
extract_text() { extract_text() {
if [[ "$method" == "from-tex" ]] if [[ "$method" == "from-tex" ]]
then then
detex "$input_file" | egrep '\S' | grep -v 'unsrt' | perl -pne 's/^\s+| +$//g' detex "$input_file" | grep -E '\S' | grep -v 'unsrt' | perl -pne 's/^\s+| +$//g'
else else
bash helpers/pdf-to-plain-text.sh "$input_file" | perl helpers/strip-references.pl | perl -pne 'chomp $_; $_.=" "' bash helpers/pdf-to-plain-text.sh "$input_file" | perl helpers/strip-references.pl | perl -pne 'chomp $_; $_.=" "'
fi fi
} }
extract_text | python3 -m syntok.segmenter | egrep '\S' extract_text | python3 -m syntok.segmenter | grep -E '\S'