Add a "just tokenize" special command and rename --tokenize to --tokenizer.

* src/GEval/Core.hs: add the JustTokenize constructor to GEvalSpecialCommand
  and a Haddock comment on the type.
* src/GEval/LineByLine.hs: export justTokenize, which reads standard input,
  decodes it leniently as UTF-8, splits it into lines, maps
  tokenizeWithSpaces (Just tokenizer) over each line and writes the
  re-encoded lines to standard output; with no tokenizer it calls error.
* src/GEval/OptionsParser.hs: add the --just-tokenize / -j flag, dispatch it
  in runGEval''', rename the long option --tokenize to --tokenizer, and
  reword the --diff help text.
* test/bleu-with-tokenization/bleu-with-tokenization/config.txt: use
  --tokenizer.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. The hunk line counts were used to place the
line breaks; the indentation of every line and the position of the blank
context lines are reconstructed and must be confirmed against the target
tree before applying (`git apply --ignore-whitespace` only helps with the
indentation part).
NOTE(review): the help text added for --just-tokenize was corrected
("output. rather" became "output, rather"), so the post-image hash on the
OptionsParser.hs index line no longer describes the resulting blob.

diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index c95a15d..c6a2246 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -230,10 +230,12 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
 getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
                             where outDirectory = gesOutDirectory spec
 
+-- | Special command, not just running the regular evaluation.
+-- See OptionsParser.hs for more information.
 data GEvalSpecialCommand = Init
                            | LineByLine | WorstFeatures
                            | Diff FilePath | MostWorseningFeatures FilePath
-                           | PrintVersion
+                           | PrintVersion | JustTokenize
 
 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest
 
diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs
index 05d0599..3942fe9 100644
--- a/src/GEval/LineByLine.hs
+++ b/src/GEval/LineByLine.hs
@@ -15,16 +15,19 @@ module GEval.LineByLine
         runMostWorseningFeatures,
         runDiffGeneralized,
         LineRecord(..),
-        ResultOrdering(..)
+        ResultOrdering(..),
+        justTokenize
        ) where
 
 import GEval.Core
+import Text.Tokenizer
 
 import Data.Conduit.AutoDecompress (doNothing)
 
 import Data.Conduit
 import qualified Data.Conduit.List as CL
 import qualified Data.Conduit.Combinators as CC
+import qualified Data.Conduit.Text as CT
 import Data.Text
 import Data.Text.Encoding
 import Data.Conduit.Rank
@@ -288,3 +291,16 @@ gevalLineByLineSource metric preprocess inputSource expectedSource outSource =
                               (LineInFile expectedSource lineNo exp)
                               (LineInFile outSource lineNo out)
            return $ LineRecord inp exp out lineNo s
+
+justTokenize :: Maybe Tokenizer -> IO ()
+justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option"
+justTokenize (Just tokenizer) =
+             runResourceT
+             $ runConduit
+             $ CC.stdin
+               .| CC.decodeUtf8Lenient
+               .| CT.lines
+               .| CC.map (tokenizeWithSpaces (Just tokenizer))
+               .| CC.unlines
+               .| CC.encodeUtf8
+               .| CC.stdout
diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs
index 6193dbf..3847b3b 100644
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@@ -58,12 +58,17 @@ optionsParser = GEvalOptions
           ( long "diff"
             <> short 'd'
             <> metavar "OTHER-OUT"
-            <> help "compare results"))
+            <> help "Compare results of evaluations (line by line) for two outputs."))
        <|>
        (MostWorseningFeatures <$> strOption
           ( long "most-worsening-features"
             <> short 'm'
-            <> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems.")))
+            <> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems."))
+       <|>
+       (flag' JustTokenize
+          ( long "just-tokenize"
+            <> short 'j'
+            <> help "Just tokenise standard input and print out the tokens (separated by spaces) on the standard output, rather than do any evaluation. The --tokenizer option must be given.")))
 
    <*> ((flag' FirstTheWorst
          (long "sort"
@@ -127,7 +132,7 @@ specParser = GEvalSpecification
   <*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
   <*> optional precisionArgParser
   <*> (optional $ option auto
-       ( long "tokenize"
+       ( long "tokenizer"
          <> short 'T'
          <> metavar "TOKENIZER"
          <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
@@ -222,6 +227,9 @@ runGEval''' (Just (Diff otherOut)) ordering spec = do
 runGEval''' (Just (MostWorseningFeatures otherOut)) ordering spec = do
   runMostWorseningFeatures ordering otherOut spec
   return Nothing
+runGEval''' (Just JustTokenize) _ spec = do
+  justTokenize (gesTokenizer spec)
+  return Nothing
 
 initChallenge :: GEvalSpecification -> IO ()
 initChallenge spec = case gesExpectedDirectory spec of
diff --git a/test/bleu-with-tokenization/bleu-with-tokenization/config.txt b/test/bleu-with-tokenization/bleu-with-tokenization/config.txt
index b615171..008cb29 100644
--- a/test/bleu-with-tokenization/bleu-with-tokenization/config.txt
+++ b/test/bleu-with-tokenization/bleu-with-tokenization/config.txt
@@ -1 +1 @@
---metric BLEU --tokenize 13a
+--metric BLEU --tokenizer 13a