From 8b7a18b4c7539a59838c99332327f9b216434cc6 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Fri, 17 Aug 2018 17:45:01 +0200
Subject: [PATCH] v14 tokenizer added

---
 src/GEval/OptionsParser.hs |  2 +-
 src/Text/Tokenizer.hs      | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs
index c15e650..46f305e 100644
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@@ -135,7 +135,7 @@ specParser = GEvalSpecification
       ( long "tokenizer"
         <> short 'T'
         <> metavar "TOKENIZER"
-        <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
+        <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), 13a and v14 tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
 
 singletonMaybe :: Maybe a -> Maybe [a]
 singletonMaybe (Just x) = Just [x]
diff --git a/src/Text/Tokenizer.hs b/src/Text/Tokenizer.hs
index 02a4f4c..c036dbb 100644
--- a/src/Text/Tokenizer.hs
+++ b/src/Text/Tokenizer.hs
@@ -9,14 +9,16 @@ import Data.Monoid ((<>))
 
 import Text.Regex.PCRE.Heavy
 
-data Tokenizer = V13a
+data Tokenizer = V13a | V14International
                  deriving (Eq)
 
 instance Show Tokenizer where
   show V13a = "13a"
+  show V14International = "v14"
 
 instance Read Tokenizer where
   readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
+  readsPrec _ ('v':'1':'4':theRest) = [(V14International, theRest)]
 
 tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
 tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer)
@@ -28,8 +30,22 @@ tokenizeTabSeparatedWithSpaces tokenizer@(Just _) t =
   $ map (tokenizeWithSpaces tokenizer)
   $ T.splitOn "\t" t
 
+space :: T.Text
+space = " "
+
 tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
 tokenizeWithSpaces Nothing t = t
+-- tokenization following the official BLEU implementation
+-- https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
+-- cf. tokenize_v14_international function in sacrebleu evaluator
+tokenizeWithSpaces (Just V14International) t =
+  T.strip tTokenized
+  where tTokenized =
+          gsub [re|\p{S}|] (\s -> space <> s <> space)
+          $ gsub [re|(\p{P})([^\d])|] (\(p:n:_) -> p <> space <> n)
+          $ gsub [re|([^\d])(\p{P})|] (\(n:p:_) -> n <> space <> p) t
+-- tokenization equivalent to mteval-v13a
+-- cf. tokenize_13a function in sacrebleu evaluator
 tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
   where tTokenized =
           gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p)
@@ -45,4 +61,3 @@ tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
           $ T.replace "\n" " "
           $ T.replace "-\n" ""
           $ T.replace "<skipped>" "" t
-        space = " " :: T.Text
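Note (not part of the patch): below is a minimal sketch of how the new code path can be exercised. It assumes Text.Tokenizer exports the Tokenizer constructors and tokenize as declared above; the module's export list is not visible in the diff, so that is an assumption, and the sample sentence is only illustrative.

{-# LANGUAGE OverloadedStrings #-}
module Main where

import qualified Data.Text    as T
import qualified Data.Text.IO as TIO

-- Assumption: Text.Tokenizer exports Tokenizer(..) and tokenize
import Text.Tokenizer (Tokenizer (..), tokenize)

main :: IO ()
main = do
  let sample = "Prices rose by 3.5%, i.e. about €2,000 (roughly)." :: T.Text
  -- "v14" is parsed via the Read instance added in this patch
  let v14 = read "v14" :: Tokenizer
  -- print both token streams so the extra splitting of Unicode
  -- punctuation (\p{P}) and symbols (\p{S}) done by v14 can be
  -- compared against 13a
  TIO.putStrLn $ T.intercalate " | " $ tokenize (Just V13a) sample
  TIO.putStrLn $ T.intercalate " | " $ tokenize (Just v14) sample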