diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs index 1569e2b..c6bd134 100644 --- a/src/GEval/Core.hs +++ b/src/GEval/Core.hs @@ -80,6 +80,7 @@ import GEval.CharMatch import GEval.BIO import GEval.ProbList import Data.Conduit.AutoDecompress +import Text.Tokenizer import qualified Data.HashMap.Strict as M @@ -213,7 +214,8 @@ data GEvalSpecification = GEvalSpecification gesExpectedFile :: String, gesInputFile :: String, gesMetrics :: [Metric], - gesPrecision :: Maybe Int} + gesPrecision :: Maybe Int, + gesTokenizer :: Maybe Tokenizer } gesMainMetric :: GEvalSpecification -> Metric gesMainMetric spec = case gesMetrics spec of @@ -284,7 +286,8 @@ defaultGEvalSpecification = GEvalSpecification { gesExpectedFile = defaultExpectedFile, gesInputFile = defaultInputFile, gesMetrics = [defaultMetric], - gesPrecision = Nothing} + gesPrecision = Nothing, + gesTokenizer = Nothing} isEmptyFile :: FilePath -> IO (Bool) isEmptyFile path = do diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs index 297234d..6193dbf 100644 --- a/src/GEval/OptionsParser.hs +++ b/src/GEval/OptionsParser.hs @@ -126,6 +126,11 @@ specParser = GEvalSpecification <> help "The name of the file with the input (applicable only for some metrics)" ) <*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader) <*> optional precisionArgParser + <*> (optional $ option auto + ( long "tokenize" + <> short 'T' + <> metavar "TOKENIZER" + <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such as BLEU), only 13a tokenizer is implemented so far" )) singletonMaybe :: Maybe a -> Maybe [a] singletonMaybe (Just x) = Just [x] diff --git a/src/Text/Tokenizer.hs b/src/Text/Tokenizer.hs index 64d609c..ff8ebea 100644 --- a/src/Text/Tokenizer.hs +++ b/src/Text/Tokenizer.hs @@ -10,12 +10,21 @@ import Data.Monoid ((<>)) import Text.Regex.PCRE.Heavy data Tokenizer = V13a + deriving (Eq) + +instance Show Tokenizer where + show V13a = "13a" + 
+instance Read Tokenizer where + readsPrec _ s = [(V13a, theRest) | ('1':'3':'a':theRest) <- [s]] tokenize :: Maybe Tokenizer -> T.Text -> [T.Text] -tokenize Nothing t = T.words t -tokenize (Just V13a) t = T.words tWithSpaces - where tWithSpaces = T.strip tTokenized - tTokenized = +tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer) + +tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text +tokenizeWithSpaces Nothing t = t +tokenizeWithSpaces (Just V13a) t = T.strip tTokenized + where tTokenized = gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p) $ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> c <> space <> p) $ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p) diff --git a/test/Spec.hs b/test/Spec.hs index 1270087..53f129a 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -71,6 +71,8 @@ main = hspec $ do runGEvalTest "bleu-perfect" `shouldReturnAlmost` 1.0000 it "empty translation" $ runGEvalTest "bleu-empty" `shouldReturnAlmost` 0.0000 + it "with tokenization" $ + runGEvalTest "bleu-with-tokenization" `shouldReturnAlmost` 0.6501914150070065 describe "Accuracy" $ do it "simple example" $ runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6 diff --git a/test/bleu-with-tokenization/bleu-with-tokenization-solution/test-A/out.tsv b/test/bleu-with-tokenization/bleu-with-tokenization-solution/test-A/out.tsv new file mode 100644 index 0000000..28a7ed1 --- /dev/null +++ b/test/bleu-with-tokenization/bleu-with-tokenization-solution/test-A/out.tsv @@ -0,0 +1,3 @@ +Do you like pickles? +John who is a plumber +Alica has a cat. 
diff --git a/test/bleu-with-tokenization/bleu-with-tokenization/config.txt b/test/bleu-with-tokenization/bleu-with-tokenization/config.txt new file mode 100644 index 0000000..b615171 --- /dev/null +++ b/test/bleu-with-tokenization/bleu-with-tokenization/config.txt @@ -0,0 +1 @@ +--metric BLEU --tokenize 13a diff --git a/test/bleu-with-tokenization/bleu-with-tokenization/test-A/expected.tsv b/test/bleu-with-tokenization/bleu-with-tokenization/test-A/expected.tsv new file mode 100644 index 0000000..6a01ca8 --- /dev/null +++ b/test/bleu-with-tokenization/bleu-with-tokenization/test-A/expected.tsv @@ -0,0 +1,3 @@ +Do you like cucumbers? +John, who is a plumber +Alica has a cat.