add --just-tokenize option

2018-08-17 16:57:47 +02:00 · 2018-08-17 16:57:47 +02:00 · 0871b57bbc
commit 0871b57bbc
parent 3a68324a6e
4 changed files with 32 additions and 6 deletions
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -230,10 +230,12 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
 -- Use the explicitly given expected directory when there is one;
 -- otherwise the out directory is used as the expected directory.
 getExpectedDirectory spec =
   fromMaybe (gesOutDirectory spec) (gesExpectedDirectory spec)
 -- | Special command, not just running the regular evaluation.
 -- See OptionsParser.hs for more information.
 --
 -- The 'FilePath' carried by 'Diff' and 'MostWorseningFeatures' is the other
 -- output to compare against (the OTHER-OUT argument of --diff).
 -- 'JustTokenize' (the --just-tokenize flag) only tokenizes standard input
 -- instead of running any evaluation.
 data GEvalSpecialCommand = Init
                            | LineByLine | WorstFeatures
                            | Diff FilePath | MostWorseningFeatures FilePath
-                           | PrintVersion
+                           | PrintVersion | JustTokenize
 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest
--- a/src/GEval/LineByLine.hs
+++ b/src/GEval/LineByLine.hs
@ -15,16 +15,19 @@ module GEval.LineByLine
        runMostWorseningFeatures,
        runDiffGeneralized,
        LineRecord(..),
-        ResultOrdering(..)
+        ResultOrdering(..),
        justTokenize
       ) where
 import GEval.Core
 import Text.Tokenizer
 import Data.Conduit.AutoDecompress (doNothing)
 import Data.Conduit
 import qualified Data.Conduit.List as CL
 import qualified Data.Conduit.Combinators as CC
 import qualified Data.Conduit.Text as CT
 import Data.Text
 import Data.Text.Encoding
 import Data.Conduit.Rank
@ -288,3 +291,16 @@ gevalLineByLineSource metric preprocess inputSource expectedSource outSource =
                                                                (LineInFile expectedSource lineNo exp)
                                                                (LineInFile outSource lineNo out)
          return $ LineRecord inp exp out lineNo s
 -- | Tokenize standard input line by line and print the result on standard
 -- output, without running any evaluation.
 --
 -- The input bytes are decoded leniently as UTF-8 and split into lines. Each
 -- line is passed through 'tokenizeWithSpaces' with the given tokenizer, then
 -- written back newline-terminated and UTF-8 encoded.
 --
 -- Calls 'error' when no tokenizer is supplied.
 justTokenize :: Maybe Tokenizer -> IO ()
 justTokenize mTokenizer = case mTokenizer of
   Nothing ->
     error "a tokenizer must be specified with --tokenizer option"
   Just tokenizer ->
     runResourceT . runConduit $
       CC.stdin
         .| CC.decodeUtf8Lenient
         .| CT.lines
         .| CC.map (tokenizeWithSpaces (Just tokenizer))
         .| CC.unlines
         .| CC.encodeUtf8
         .| CC.stdout
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@ -58,12 +58,17 @@ optionsParser = GEvalOptions
                    ( long "diff"
                      <> short 'd'
                      <> metavar "OTHER-OUT"
-                      <> help "compare results"))
+                      <> help "Compare results of evaluations (line by line) for two outputs."))
                <|>
                (MostWorseningFeatures <$> strOption
                    ( long "most-worsening-features"
                      <> short 'm'
-                      <> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems.")))
+                      <> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems."))
                <|>
                (flag' JustTokenize
                    ( long "just-tokenize"
                      <> short 'j'
                      <> help "Just tokenise standard input and print out the tokens (separated by spaces) on the standard output. rather than do any evaluation. The --tokenizer option must be given.")))
   <*> ((flag' FirstTheWorst
         (long "sort"
@ -127,7 +132,7 @@ specParser = GEvalSpecification
  <*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
  <*> optional precisionArgParser
  <*> (optional $ option auto
-       ( long "tokenize"
+       ( long "tokenizer"
         <> short 'T'
         <> metavar "TOKENIZER"
         <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
@ -222,6 +227,9 @@ runGEval''' (Just (Diff otherOut)) ordering spec = do
 runGEval''' (Just (MostWorseningFeatures otherOut)) ordering spec = do
  runMostWorseningFeatures ordering otherOut spec
  return Nothing
 runGEval''' (Just JustTokenize) _ spec = do
  justTokenize (gesTokenizer spec)
  return Nothing
 initChallenge :: GEvalSpecification -> IO ()
 initChallenge spec = case gesExpectedDirectory spec of
--- a/test/bleu-with-tokenization/bleu-with-tokenization/config.txt
+++ b/test/bleu-with-tokenization/bleu-with-tokenization/config.txt
@ -1 +1 @@
--metric BLEU --tokenize 13a
+--metric BLEU --tokenizer 13a
`@ -1 +1 @@`
	`--metric BLEU --tokenize 13a`	`--metric BLEU --tokenizer 13a`