From bc7b4fdf891d6483efb7bb33e29b148b0a414585 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Mon, 6 Jun 2022 19:54:30 +0200
Subject: [PATCH] Generalize script for extracting sentences

Can extract text using detex now (as an option).
---
 .../helpers/get-sentences.sh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/{{cookiecutter.paper_repo_name}}/helpers/get-sentences.sh b/{{cookiecutter.paper_repo_name}}/helpers/get-sentences.sh
index ec51400..a462187 100755
--- a/{{cookiecutter.paper_repo_name}}/helpers/get-sentences.sh
+++ b/{{cookiecutter.paper_repo_name}}/helpers/get-sentences.sh
@@ -1,3 +1,15 @@
 #!/bin/bash
 
-bash helpers/pdf-to-plain-text.sh "$1" | perl helpers/strip-references.pl | perl -pne 'chomp $_; $_.=" "' | python3 -m syntok.segmenter
+input_file="$1"
+method="$2"
+
+extract_text() {
+    if [[ "$method" == "from-tex" ]]
+    then
+        detex "$input_file" | egrep '\S' | grep -v 'unsrt' | perl -pne 's/^\s+| +$//g'
+    else
+        bash helpers/pdf-to-plain-text.sh "$input_file" | perl helpers/strip-references.pl | perl -pne 'chomp $_; $_.=" "'
+    fi
+}
+
+extract_text | python3 -m syntok.segmenter | egrep '\S'