Add character-by-character tokenization.

This commit is contained in:
Filip Gralinski 2018-12-17 07:54:12 +01:00
parent d671989a09
commit 5d19fc7585
3 changed files with 17 additions and 2 deletions

View File

@ -142,7 +142,7 @@ specParser = GEvalSpecification
( long "tokenizer"
<> short 'T'
<> metavar "TOKENIZER"
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), minimalistic, 13a and v14 tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), minimalistic, 13a, v14 and character-by-character tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
<*> ( optional . strOption $
( long "gonito-host"
<> metavar "GONITO_HOST"

View File

@ -9,19 +9,22 @@ import Data.Monoid ((<>))
import Text.Regex.PCRE.Heavy
data Tokenizer = Minimalistic | V13a | V14International
-- | The tokenization schemes that can be selected.
--
-- The textual names of the constructors (as produced by the 'Show'
-- instance and accepted by the 'Read' instance) are @minimalistic@,
-- @13a@, @v14@ and @character-by-character@ respectively.
data Tokenizer = Minimalistic | V13a | V14International | CharacterByCharacter
deriving (Eq)
-- | Renders a tokenizer as its short textual name.
instance Show Tokenizer where
  show tokenizer = case tokenizer of
    Minimalistic         -> "minimalistic"
    V13a                 -> "13a"
    V14International     -> "v14"
    CharacterByCharacter -> "character-by-character"
-- | Parses a tokenizer from its short textual name.
--
-- The name must appear at the very start of the input; whatever follows
-- it is returned as the unconsumed remainder. Input that does not start
-- with a known name yields an empty list of parses.
instance Read Tokenizer where
  readsPrec _ ('m':'i':'n':'i':'m':'a':'l':'i':'s':'t':'i':'c':theRest) =
    [(Minimalistic, theRest)]
  readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
  readsPrec _ ('v':'1':'4':theRest) = [(V14International, theRest)]
  readsPrec _ ('c':'h':'a':'r':'a':'c':'t':'e':'r':'-':'b':'y':'-':'c':'h':'a':'r':'a':'c':'t':'e':'r':theRest) =
    [(CharacterByCharacter, theRest)]
  -- Unknown names are reported as "no parse" rather than crashing with a
  -- non-exhaustive-pattern error, so callers such as 'reads' can handle
  -- bad input themselves.
  readsPrec _ _ = []
-- | Splits a text into tokens.
--
-- The text is first passed through 'tokenizeWithSpaces' with the given
-- (optional) tokenizer, and the result is then split with 'T.words'.
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize mTokenizer text = T.words (tokenizeWithSpaces mTokenizer text)
@ -77,5 +80,14 @@ tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
$ T.replace "-\n" ""
$ T.replace "<skipped>" "" t
-- Character-by-character: every character ends up separated from its
-- neighbours by a single space. Literal spaces are replaced through
-- 'escapeSpace' first, so they remain visible as tokens of their own.
-- Working directly on 'T.Text' avoids a round-trip through 'String'.
tokenizeWithSpaces (Just CharacterByCharacter) t =
  T.intersperse ' ' (T.map escapeSpace t)
-- | Ignores its argument and always yields 'space'.
toSpace :: T.Text -> T.Text
toSpace = const space
-- | Replaces the space character with an underscore.
--
-- Only the literal @' '@ is affected; every other character (including
-- other kinds of whitespace) is returned unchanged.
escapeSpace :: Char -> Char
escapeSpace c
  | c == ' '  = '_'
  | otherwise = c

View File

@ -455,6 +455,9 @@ main = hspec $ do
tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
["To", "be", "or", "not", "to", "be",
",", "that's", "the", "question", "."]
-- Each character is expected to come back as its own token, with every
-- space of the input showing up as a "_" token.
it "simple utterance with 'character-by-character' tokenizer" $ do
tokenize (Just CharacterByCharacter) "To be or not to be." `shouldBe`
["T", "o", "_", "b", "e", "_", "o", "r", "_", "n", "o", "t", "_", "t", "o", "_", "b", "e", "."]
describe "submit" $ do
it "current branch" $ do
runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"