Generalize script for extracting sentences
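The script can now optionally extract text from LaTeX sources using detex (pass `from-tex` as the second argument); otherwise it uses the existing PDF pipeline.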
This commit is contained in:
parent
d0acd0c8d4
commit
bc7b4fdf89
@ -1,3 +1,15 @@
|
||||
#!/bin/bash
|
||||
|
||||
bash helpers/pdf-to-plain-text.sh "$1" | perl helpers/strip-references.pl | perl -pne 'chomp $_; $_.=" "' | python3 -m syntok.segmenter
|
||||
# Positional arguments:
#   $1 - path of the document to extract sentences from
#   $2 - extraction method: "from-tex" runs detex on the input; any other
#        value (default "from-pdf") goes through the PDF helper pipeline
# The ${N:-...} expansions keep these assignments safe under `set -u` when an
# argument is omitted.
input_file="${1:-}"
method="${2:-from-pdf}"
|
||||
|
||||
extract_text() {
|
||||
if [[ "$method" == "from-tex" ]]
|
||||
then
|
||||
detex "$input_file" | egrep '\S' | grep -v 'unsrt' | perl -pne 's/^\s+| +$//g'
|
||||
else
|
||||
bash helpers/pdf-to-plain-text.sh "$input_file" | perl helpers/strip-references.pl | perl -pne 'chomp $_; $_.=" "'
|
||||
fi
|
||||
}
|
||||
|
||||
# Feed the extracted text to the syntok segmenter and drop output lines that
# contain only whitespace. grep -E replaces the deprecated egrep alias, and
# [^[:space:]] is the POSIX spelling of the GNU-only \S.
extract_text | python3 -m syntok.segmenter | grep -E '[^[:space:]]'
|
||||
|
Loading…
Reference in New Issue
Block a user