add --just-tokenize option

2018-08-17 16:57:47 +02:00 · 2018-08-17 16:57:47 +02:00 · 0871b57bbc
commit 0871b57bbc
parent 3a68324a6e
4 changed files with 32 additions and 6 deletions
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -230,10 +230,12 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
 getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
                            where outDirectory = gesOutDirectory spec

+-- | A special command, i.e. something other than just running the regular evaluation.
+-- See OptionsParser.hs for the command-line flags selecting each command (e.g. @--just-tokenize@ for 'JustTokenize').
 data GEvalSpecialCommand = Init
                           | LineByLine | WorstFeatures
                           | Diff FilePath | MostWorseningFeatures FilePath
-                           | PrintVersion
+                           | PrintVersion | JustTokenize -- JustTokenize: only tokenize standard input, no evaluation

 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest

--- a/src/GEval/LineByLine.hs
+++ b/src/GEval/LineByLine.hs
@ -15,16 +15,19 @@ module GEval.LineByLine
        runMostWorseningFeatures,
        runDiffGeneralized,
        LineRecord(..),
-        ResultOrdering(..)
+        ResultOrdering(..),
+        justTokenize
       ) where

 import GEval.Core
+import Text.Tokenizer

 import Data.Conduit.AutoDecompress (doNothing)

 import Data.Conduit
 import qualified Data.Conduit.List as CL
 import qualified Data.Conduit.Combinators as CC
+import qualified Data.Conduit.Text as CT
 import Data.Text
 import Data.Text.Encoding
 import Data.Conduit.Rank
@ -288,3 +291,16 @@ gevalLineByLineSource metric preprocess inputSource expectedSource outSource =
                                                                (LineInFile expectedSource lineNo exp)
                                                                (LineInFile outSource lineNo out)
          return $ LineRecord inp exp out lineNo s
+
+justTokenize :: Maybe Tokenizer -> IO ()  -- tokenize standard input line by line, writing the result to standard output
+justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option"  -- no tokenizer supplied: abort with this message
+justTokenize (Just tokenizer) =
+             runResourceT
+             $ runConduit
+             $ CC.stdin                                        -- raw bytes from standard input
+               .| CC.decodeUtf8Lenient                         -- bytes -> Text (lenient UTF-8 decoding)
+               .| CT.lines                                     -- split the text stream into lines
+               .| CC.map (tokenizeWithSpaces (Just tokenizer)) -- apply the given tokenizer to each line
+               .| CC.unlines                                   -- re-terminate every processed line with a newline
+               .| CC.encodeUtf8                                -- Text -> UTF-8 bytes
+               .| CC.stdout                                    -- bytes to standard output
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@ -58,12 +58,17 @@ optionsParser = GEvalOptions
                    ( long "diff"
                      <> short 'd'
                      <> metavar "OTHER-OUT"
-                      <> help "compare results"))
+                      <> help "Compare results of evaluations (line by line) for two outputs."))
                <|>
                (MostWorseningFeatures <$> strOption
                    ( long "most-worsening-features"
                      <> short 'm'
-                      <> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems.")))
+                      <> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems."))
+                <|>
+                (flag' JustTokenize
+                    ( long "just-tokenize"
+                      <> short 'j'
+                      <> help "Just tokenize standard input and print out the tokens (separated by spaces) on the standard output, rather than do any evaluation. The --tokenizer option must be given.")))

   <*> ((flag' FirstTheWorst
         (long "sort"
@ -127,7 +132,7 @@ specParser = GEvalSpecification
  <*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
  <*> optional precisionArgParser
  <*> (optional $ option auto
-       ( long "tokenize"
+       ( long "tokenizer"
         <> short 'T'
         <> metavar "TOKENIZER"
         <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
@ -222,6 +227,9 @@ runGEval''' (Just (Diff otherOut)) ordering spec = do
 runGEval''' (Just (MostWorseningFeatures otherOut)) ordering spec = do
  runMostWorseningFeatures ordering otherOut spec
  return Nothing
+runGEval''' (Just JustTokenize) _ spec = do
+  justTokenize (gesTokenizer spec)
+  return Nothing

 initChallenge :: GEvalSpecification -> IO ()
 initChallenge spec = case gesExpectedDirectory spec of
--- a/test/bleu-with-tokenization/bleu-with-tokenization/config.txt
+++ b/test/bleu-with-tokenization/bleu-with-tokenization/config.txt
@ -1 +1 @@
--metric BLEU --tokenize 13a
+--metric BLEU --tokenizer 13a