Add character-by-character tokenization.
This commit is contained in:
parent
d671989a09
commit
5d19fc7585
@ -142,7 +142,7 @@ specParser = GEvalSpecification
|
|||||||
( long "tokenizer"
|
( long "tokenizer"
|
||||||
<> short 'T'
|
<> short 'T'
|
||||||
<> metavar "TOKENIZER"
|
<> metavar "TOKENIZER"
|
||||||
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), minimalistic, 13a and v14 tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
|
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), minimalistic, 13a, v14 and character-by-character tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
|
||||||
<*> ( optional . strOption $
|
<*> ( optional . strOption $
|
||||||
( long "gonito-host"
|
( long "gonito-host"
|
||||||
<> metavar "GONITO_HOST"
|
<> metavar "GONITO_HOST"
|
||||||
|
@ -9,19 +9,22 @@ import Data.Monoid ((<>))
|
|||||||
|
|
||||||
import Text.Regex.PCRE.Heavy
|
import Text.Regex.PCRE.Heavy
|
||||||
|
|
||||||
-- | Tokenization schemes that can be applied to text.
data Tokenizer
  = Minimalistic          -- ^ shown and parsed as @minimalistic@
  | V13a                  -- ^ shown and parsed as @13a@
  | V14International      -- ^ shown and parsed as @v14@
  | CharacterByCharacter  -- ^ shown and parsed as @character-by-character@
  deriving (Eq)
|
||||||
|
|
||||||
-- | Renders a tokenizer as its textual name; these are the same names
-- that the 'Read' instance for 'Tokenizer' recognises.
instance Show Tokenizer where
  show tokenizer = case tokenizer of
    Minimalistic         -> "minimalistic"
    V13a                 -> "13a"
    V14International     -> "v14"
    CharacterByCharacter -> "character-by-character"
|
||||||
|
|
||||||
-- | Parses a tokenizer from its textual name (the names produced by the
-- 'Show' instance). The name must appear at the very start of the input;
-- whatever follows it is returned as the unparsed remainder.
instance Read Tokenizer where
  readsPrec _ ('m':'i':'n':'i':'m':'a':'l':'i':'s':'t':'i':'c':theRest) =
    [(Minimalistic, theRest)]
  readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
  readsPrec _ ('v':'1':'4':theRest) = [(V14International, theRest)]
  readsPrec _ ('c':'h':'a':'r':'a':'c':'t':'e':'r':'-':'b':'y':'-':'c':'h':'a':'r':'a':'c':'t':'e':'r':theRest) =
    [(CharacterByCharacter, theRest)]
  -- Unrecognised input must produce "no parse" (an empty result list).
  -- Without this clause the function is partial and an unknown name
  -- aborts with a pattern-match failure instead of a parse error.
  readsPrec _ _ = []
|
||||||
|
|
||||||
-- | Split a text into tokens: the text is first rewritten by
-- 'tokenizeWithSpaces' for the given (optional) tokenizer, and the result
-- is then cut on whitespace with 'T.words'.
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize mTokenizer = T.words . insertSpaces
  where
    insertSpaces = tokenizeWithSpaces mTokenizer
|
||||||
@ -77,5 +80,14 @@ tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
|
|||||||
$ T.replace "-\n" ""
|
$ T.replace "-\n" ""
|
||||||
$ T.replace "<skipped>" "" t
|
$ T.replace "<skipped>" "" t
|
||||||
|
|
||||||
|
-- Character-by-character mode: every character becomes its own token, so a
-- single space is placed between each pair of adjacent characters. Each
-- character is first passed through 'escapeSpace', so that literal spaces
-- in the input survive the whitespace split that 'tokenize' performs.
-- Works directly on the 'T.Text' instead of going through a 'String' and a
-- list of one-character texts.
tokenizeWithSpaces (Just CharacterByCharacter) t =
  T.intersperse ' ' (T.map escapeSpace t)
|
||||||
|
|
||||||
-- | Discards its argument and always yields 'space'.
toSpace :: T.Text -> T.Text
toSpace = const space
|
||||||
|
|
||||||
|
-- | Replace the space character with an underscore; every other character
-- is returned unchanged.
escapeSpace :: Char -> Char
escapeSpace c
  | c == ' '  = '_'
  | otherwise = c
|
||||||
|
@ -455,6 +455,9 @@ main = hspec $ do
|
|||||||
tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
|
tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
|
||||||
["To", "be", "or", "not", "to", "be",
|
["To", "be", "or", "not", "to", "be",
|
||||||
",", "that's", "the", "question", "."]
|
",", "that's", "the", "question", "."]
|
||||||
|
-- Each character is expected as a separate token, with every space of the
-- input showing up as a "_" token.
it "simple utterance with 'character-by-character' tokenizer" $
  tokenize (Just CharacterByCharacter) "To be or not to be."
    `shouldBe` ["T", "o", "_", "b", "e", "_", "o", "r", "_", "n", "o", "t", "_", "t", "o", "_", "b", "e", "."]
|
||||||
describe "submit" $ do
|
describe "submit" $ do
|
||||||
it "current branch" $ do
|
it "current branch" $ do
|
||||||
runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"
|
runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"
|
||||||
|
Loading…
Reference in New Issue
Block a user