From 421d2e9797d0698dcb0234d517f74a8cc56090e0 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Fri, 17 Aug 2018 18:13:27 +0200
Subject: [PATCH] add minimalistic tokenizer

---
 src/GEval/OptionsParser.hs |  2 +-
 src/Text/Tokenizer.hs      | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs
index 46f305e..1b2a333 100644
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@@ -135,7 +135,7 @@ specParser = GEvalSpecification
                 ( long "tokenizer"
                   <> short 'T'
                   <> metavar "TOKENIZER"
-                  <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), 13a and v14 tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
+                  <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), minimalistic, 13a and v14 tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
 
 singletonMaybe :: Maybe a -> Maybe [a]
 singletonMaybe (Just x) = Just [x]
diff --git a/src/Text/Tokenizer.hs b/src/Text/Tokenizer.hs
index c036dbb..cd9a254 100644
--- a/src/Text/Tokenizer.hs
+++ b/src/Text/Tokenizer.hs
@@ -9,14 +9,17 @@ import Data.Monoid ((<>))
 
 import Text.Regex.PCRE.Heavy
 
-data Tokenizer = V13a | V14International
+data Tokenizer = Minimalistic | V13a | V14International
                   deriving (Eq)
 
 instance Show Tokenizer where
+  show Minimalistic = "minimalistic"
   show V13a = "13a"
   show V14International = "v14"
 
 instance Read Tokenizer where
+  readsPrec _ ('m':'i':'n':'i':'m':'a':'l':'i':'s':'t':'i':'c':theRest) =
+    [(Minimalistic, theRest)]
   readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
   readsPrec _ ('v':'1':'4':theRest) = [(V14International, theRest)]
 
@@ -35,6 +38,15 @@ space = " "
 
 tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
 tokenizeWithSpaces Nothing t = t
+-- very simple tokenization, punctuation marks are separated
+-- only at the beginning and end of a word
+tokenizeWithSpaces (Just Minimalistic) t = T.strip tTokenized
+  where tTokenized =
+          gsub [re|\s{2,}|] ((const space) :: T.Text -> T.Text)
+          $ gsub [re|[\w\d]+\S*[\w\d]+|[\w\d]|[^\w\s]+|]
+                 (\tok -> space <> tok <> space)
+                 t
+
 -- tokenization following the official BLEU implementation
 -- https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
 -- cf. tokenize_v14_international function in sacrebleu evaluator