add --just-tokenize option

This commit is contained in:
Filip Gralinski 2018-08-17 16:57:47 +02:00
parent 3a68324a6e
commit 0871b57bbc
4 changed files with 32 additions and 6 deletions

View File

@ -230,10 +230,12 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
getExpectedDirectory spec =
  -- Use the explicitly configured expected directory when there is one;
  -- otherwise fall back to the out directory of the specification.
  case gesExpectedDirectory spec of
    Just expectedDirectory -> expectedDirectory
    Nothing -> gesOutDirectory spec
-- | Special command, not just running the regular evaluation.
-- See OptionsParser.hs for more information.
--
-- Each constructor is declared exactly once; declaring the same constructor
-- twice in one data type is rejected by the compiler.
data GEvalSpecialCommand
  = Init
  | LineByLine
  | WorstFeatures
  -- Compare results of evaluations (line by line) for two outputs; the
  -- argument is the other output to compare against.
  | Diff FilePath
  -- Print a ranking of the features that worsen the score the most when
  -- comparing outputs from two systems; the argument is the other output.
  | MostWorseningFeatures FilePath
  | PrintVersion
  -- Just tokenize standard input and print the tokens on standard output
  -- instead of doing any evaluation (a tokenizer must be specified).
  | JustTokenize
data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest

View File

@ -15,16 +15,19 @@ module GEval.LineByLine
runMostWorseningFeatures,
runDiffGeneralized,
LineRecord(..),
ResultOrdering(..)
ResultOrdering(..),
justTokenize
) where
import GEval.Core
import Text.Tokenizer
import Data.Conduit.AutoDecompress (doNothing)
import Data.Conduit
import qualified Data.Conduit.List as CL
import qualified Data.Conduit.Combinators as CC
import qualified Data.Conduit.Text as CT
import Data.Text
import Data.Text.Encoding
import Data.Conduit.Rank
@ -288,3 +291,16 @@ gevalLineByLineSource metric preprocess inputSource expectedSource outSource =
(LineInFile expectedSource lineNo exp)
(LineInFile outSource lineNo out)
return $ LineRecord inp exp out lineNo s
-- | Tokenize standard input line by line and write the result to standard
-- output, without running any evaluation.
--
-- The input is decoded leniently as UTF-8 and split into lines; every line is
-- passed through 'tokenizeWithSpaces' with the given tokenizer, terminated
-- with a newline again and written out encoded as UTF-8.
--
-- Calls 'error' when no tokenizer is given.
justTokenize :: Maybe Tokenizer -> IO ()
justTokenize maybeTokenizer =
  case maybeTokenizer of
    Nothing -> error "a tokenizer must be specified with --tokenizer option"
    Just tokenizer ->
      let tokenizeLine = tokenizeWithSpaces (Just tokenizer)
          pipeline =
            CC.stdin
              .| CC.decodeUtf8Lenient
              .| CT.lines
              .| CC.map tokenizeLine
              .| CC.unlines
              .| CC.encodeUtf8
              .| CC.stdout
       in runResourceT (runConduit pipeline)

View File

@ -58,12 +58,17 @@ optionsParser = GEvalOptions
( long "diff"
<> short 'd'
<> metavar "OTHER-OUT"
<> help "compare results"))
<> help "Compare results of evaluations (line by line) for two outputs."))
<|>
(MostWorseningFeatures <$> strOption
( long "most-worsening-features"
<> short 'm'
<> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems.")))
<> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems."))
<|>
(flag' JustTokenize
( long "just-tokenize"
<> short 'j'
<> help "Just tokenise standard input and print out the tokens (separated by spaces) on the standard output. rather than do any evaluation. The --tokenizer option must be given.")))
<*> ((flag' FirstTheWorst
(long "sort"
@ -127,7 +132,7 @@ specParser = GEvalSpecification
<*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
<*> optional precisionArgParser
<*> (optional $ option auto
( long "tokenize"
( long "tokenizer"
<> short 'T'
<> metavar "TOKENIZER"
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
@ -222,6 +227,9 @@ runGEval''' (Just (Diff otherOut)) ordering spec = do
-- Special command: run the most-worsening-features report against the other
-- output; the result of this clause is always Nothing.
runGEval''' (Just (MostWorseningFeatures otherOut)) ordering spec =
  runMostWorseningFeatures ordering otherOut spec >> return Nothing
-- Special command: only tokenize standard input with the tokenizer taken from
-- the specification; the ordering argument is ignored and the result of this
-- clause is always Nothing.
runGEval''' (Just JustTokenize) _ spec =
  justTokenize (gesTokenizer spec) >> return Nothing
initChallenge :: GEvalSpecification -> IO ()
initChallenge spec = case gesExpectedDirectory spec of

View File

@ -1 +1 @@
--metric BLEU --tokenize 13a
--metric BLEU --tokenizer 13a