add minimalistic tokenizer
This commit is contained in:
parent
c79c4b356e
commit
421d2e9797
@ -135,7 +135,7 @@ specParser = GEvalSpecification
|
|||||||
( long "tokenizer"
|
( long "tokenizer"
|
||||||
<> short 'T'
|
<> short 'T'
|
||||||
<> metavar "TOKENIZER"
|
<> metavar "TOKENIZER"
|
||||||
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), 13a and v14 tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
|
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), minimalistic, 13a and v14 tokenizers are implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
|
||||||
|
|
||||||
singletonMaybe :: Maybe a -> Maybe [a]
|
singletonMaybe :: Maybe a -> Maybe [a]
|
||||||
singletonMaybe (Just x) = Just [x]
|
singletonMaybe (Just x) = Just [x]
|
||||||
|
@ -9,14 +9,17 @@ import Data.Monoid ((<>))
|
|||||||
|
|
||||||
import Text.Regex.PCRE.Heavy
|
import Text.Regex.PCRE.Heavy
|
||||||
|
|
||||||
data Tokenizer = V13a | V14International
|
data Tokenizer = Minimalistic | V13a | V14International
|
||||||
deriving (Eq)
|
deriving (Eq)
|
||||||
|
|
||||||
instance Show Tokenizer where
|
instance Show Tokenizer where
|
||||||
|
show Minimalistic = "minimalistic"
|
||||||
show V13a = "13a"
|
show V13a = "13a"
|
||||||
show V14International = "v14"
|
show V14International = "v14"
|
||||||
|
|
||||||
instance Read Tokenizer where
|
instance Read Tokenizer where
|
||||||
|
readsPrec _ ('m':'i':'n':'i':'m':'a':'l':'i':'s':'t':'i':'c':theRest) =
|
||||||
|
[(Minimalistic, theRest)]
|
||||||
readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
|
readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
|
||||||
readsPrec _ ('v':'1':'4':theRest) = [(V14International, theRest)]
|
readsPrec _ ('v':'1':'4':theRest) = [(V14International, theRest)]
|
||||||
|
|
||||||
@ -35,6 +38,15 @@ space = " "
|
|||||||
|
|
||||||
tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
|
tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
|
||||||
tokenizeWithSpaces Nothing t = t
|
tokenizeWithSpaces Nothing t = t
|
||||||
|
-- very simple tokenization: punctuation marks are split off
|
||||||
|
-- only at the beginning and end of a word; word-internal punctuation is kept
|
||||||
|
tokenizeWithSpaces (Just Minimalistic) t = T.strip tTokenized
|
||||||
|
where tTokenized =
|
||||||
|
gsub [re|\s{2,}|] ((const space) :: T.Text -> T.Text)
|
||||||
|
$ gsub [re|[\w\d]+\S*[\w\d]+|[\w\d]|[^\w\s]+|]
|
||||||
|
(\tok -> space <> tok <> space)
|
||||||
|
t
|
||||||
|
|
||||||
-- tokenization following the official BLEU implementation
|
-- tokenization following the official BLEU implementation
|
||||||
-- https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
|
-- https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
|
||||||
-- cf. tokenize_v14_international function in sacrebleu evaluator
|
-- cf. tokenize_v14_international function in sacrebleu evaluator
|
||||||
|
Loading…
Reference in New Issue
Block a user