From 57ec495bc6a304720facf84200ed5c1543fca290 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Sat, 17 Nov 2018 16:56:16 +0100
Subject: [PATCH] Fix mistakes in the implementation of 13a and v14 tokenizers.

---
 src/Text/Tokenizer.hs | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/Text/Tokenizer.hs b/src/Text/Tokenizer.hs
index cd9a254..c12a3e7 100644
--- a/src/Text/Tokenizer.hs
+++ b/src/Text/Tokenizer.hs
@@ -53,17 +53,20 @@ tokenizeWithSpaces (Just Minimalistic) t = T.strip tTokenized
 tokenizeWithSpaces (Just V14International) t = T.strip tTokenized
   where tTokenized =
-          gsub [re|\p{S}|] (\s -> space <> s <> space)
-          $ gsub [re|(\p{P})([^\d])|] (\(p:n:_) -> p <> space <> n)
-          $ gsub [re|([^\d])(\p{P})|] (\(n:p:_) -> n <> space <> p) t
+          gsub [re|\s+|] toSpace
+          $ gsub [re|\p{S}|] (\s -> space <> s <> space)
+          $ gsub [re|(\p{P})([^\d])|] (\(p:n:_) -> space <> p <> space <> n)
+          $ gsub [re|([^\d])(\p{P})|] (\(n:p:_) -> n <> space <> p <> space) t
+
 
 -- tokenization equivalent to mteval-v13a
 -- cf. tokenize_13a function in sacrebleu evaluator
 tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
   where tTokenized =
-          gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p)
-          $ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> c <> space <> p)
-          $ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p)
-          $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] (space <>) tPadded
+          gsub [re|\s+|] toSpace
+          $ gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p <> space)
+          $ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> space <> c <> space <> p)
+          $ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p <> space)
+          $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] (\s -> space <> s <> space) tPadded
         tPadded = " " <> tReplaced <> " "
         tReplaced =
           T.replace "&gt;" ">"
@@ -73,3 +76,6 @@ tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
           $ T.replace "\n" " "
           $ T.replace "-\n" ""
           $ T.replace "<skipped>" "" t
+
+toSpace :: T.Text -> T.Text
+toSpace _ = space