From 5d19fc758582b1a69ead3f51a1324abfbb1873de Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Mon, 17 Dec 2018 07:54:12 +0100 Subject: [PATCH] Add character-by-character tokenization. --- src/GEval/OptionsParser.hs | 2 +- src/Text/Tokenizer.hs | 14 +++++++++++++- test/Spec.hs | 3 +++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs index 61bc653..77a700d 100644 --- a/src/GEval/OptionsParser.hs +++ b/src/GEval/OptionsParser.hs @@ -142,7 +142,7 @@ specParser = GEvalSpecification ( long "tokenizer" <> short 'T' <> metavar "TOKENIZER" - <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), minimalistic, 13a and v14 tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." )) + <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such as BLEU), minimalistic, 13a, v14 and character-by-character tokenizers are implemented so far. Will also be used for tokenizing text into features when in --worst-features and --most-worsening-features modes." )) <*> ( optional . 
strOption $ ( long "gonito-host" <> metavar "GONITO_HOST" diff --git a/src/Text/Tokenizer.hs b/src/Text/Tokenizer.hs index c12a3e7..4113374 100644 --- a/src/Text/Tokenizer.hs +++ b/src/Text/Tokenizer.hs @@ -9,19 +9,22 @@ import Data.Monoid ((<>)) import Text.Regex.PCRE.Heavy -data Tokenizer = Minimalistic | V13a | V14International +data Tokenizer = Minimalistic | V13a | V14International | CharacterByCharacter deriving (Eq) instance Show Tokenizer where show Minimalistic = "minimalistic" show V13a = "13a" show V14International = "v14" + show CharacterByCharacter = "character-by-character" instance Read Tokenizer where readsPrec _ ('m':'i':'n':'i':'m':'a':'l':'i':'s':'t':'i':'c':theRest) = [(Minimalistic, theRest)] readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)] readsPrec _ ('v':'1':'4':theRest) = [(V14International, theRest)] + readsPrec _ ('c':'h':'a':'r':'a':'c':'t':'e':'r':'-':'b':'y':'-':'c':'h':'a':'r':'a':'c':'t':'e':'r':theRest) = + [(CharacterByCharacter, theRest)] tokenize :: Maybe Tokenizer -> T.Text -> [T.Text] tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer) @@ -77,5 +80,14 @@ tokenizeWithSpaces (Just V13a) t = T.strip tTokenized $ T.replace "-\n" "" $ T.replace "" "" t +tokenizeWithSpaces (Just CharacterByCharacter) t = T.intercalate " " + $ map T.singleton + $ map escapeSpace + $ T.unpack t + toSpace :: T.Text -> T.Text toSpace _ = space + +escapeSpace :: Char -> Char +escapeSpace ' ' = '_' +escapeSpace c = c diff --git a/test/Spec.hs b/test/Spec.hs index 83a04e1..e65fb7a 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -455,6 +455,9 @@ main = hspec $ do tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe` ["To", "be", "or", "not", "to", "be", ",", "that's", "the", "question", "."] + it "simple utterance with 'character-by-character' tokenizer" $ do + tokenize (Just CharacterByCharacter) "To be or not to be." 
`shouldBe` + ["T", "o", "_", "b", "e", "_", "o", "r", "_", "n", "o", "t", "_", "t", "o", "_", "b", "e", "."] describe "submit" $ do it "current branch" $ do runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"