add --just-tokenize option
This commit is contained in:
parent
3a68324a6e
commit
0871b57bbc
@ -230,10 +230,12 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
|
|||||||
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
|
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
|
||||||
where outDirectory = gesOutDirectory spec
|
where outDirectory = gesOutDirectory spec
|
||||||
|
|
||||||
|
-- | Special command, not just running the regular evaluation.
|
||||||
|
-- See OptionsParser.hs for more information.
|
||||||
data GEvalSpecialCommand = Init
|
data GEvalSpecialCommand = Init
|
||||||
| LineByLine | WorstFeatures
|
| LineByLine | WorstFeatures
|
||||||
| Diff FilePath | MostWorseningFeatures FilePath
|
| Diff FilePath | MostWorseningFeatures FilePath
|
||||||
| PrintVersion
|
| PrintVersion | JustTokenize
|
||||||
|
|
||||||
data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest
|
data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest
|
||||||
|
|
||||||
|
@ -15,16 +15,19 @@ module GEval.LineByLine
|
|||||||
runMostWorseningFeatures,
|
runMostWorseningFeatures,
|
||||||
runDiffGeneralized,
|
runDiffGeneralized,
|
||||||
LineRecord(..),
|
LineRecord(..),
|
||||||
ResultOrdering(..)
|
ResultOrdering(..),
|
||||||
|
justTokenize
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import GEval.Core
|
import GEval.Core
|
||||||
|
import Text.Tokenizer
|
||||||
|
|
||||||
import Data.Conduit.AutoDecompress (doNothing)
|
import Data.Conduit.AutoDecompress (doNothing)
|
||||||
|
|
||||||
import Data.Conduit
|
import Data.Conduit
|
||||||
import qualified Data.Conduit.List as CL
|
import qualified Data.Conduit.List as CL
|
||||||
import qualified Data.Conduit.Combinators as CC
|
import qualified Data.Conduit.Combinators as CC
|
||||||
|
import qualified Data.Conduit.Text as CT
|
||||||
import Data.Text
|
import Data.Text
|
||||||
import Data.Text.Encoding
|
import Data.Text.Encoding
|
||||||
import Data.Conduit.Rank
|
import Data.Conduit.Rank
|
||||||
@ -288,3 +291,16 @@ gevalLineByLineSource metric preprocess inputSource expectedSource outSource =
|
|||||||
(LineInFile expectedSource lineNo exp)
|
(LineInFile expectedSource lineNo exp)
|
||||||
(LineInFile outSource lineNo out)
|
(LineInFile outSource lineNo out)
|
||||||
return $ LineRecord inp exp out lineNo s
|
return $ LineRecord inp exp out lineNo s
|
||||||
|
|
||||||
|
-- | Tokenize standard input line by line and print the result on
-- standard output, without running any evaluation.
--
-- The input stream is decoded with 'CC.decodeUtf8Lenient', split into
-- lines, each line is passed through 'tokenizeWithSpaces' with the
-- requested tokenizer, and the lines are then re-joined with
-- 'CC.unlines', encoded as UTF-8 and written out.
--
-- A tokenizer is mandatory here: when 'Nothing' is given, the function
-- stops with an 'error' asking for the --tokenizer option.
justTokenize :: Maybe Tokenizer -> IO ()
justTokenize mTokenizer =
  case mTokenizer of
    Nothing ->
      error "a tokenizer must be specified with --tokenizer option"
    Just _ ->
      -- In this branch 'mTokenizer' is known to be a 'Just', so it can be
      -- handed to 'tokenizeWithSpaces' as is.
      runResourceT . runConduit $
           CC.stdin
        .| CC.decodeUtf8Lenient
        .| CT.lines
        .| CC.map (tokenizeWithSpaces mTokenizer)
        .| CC.unlines
        .| CC.encodeUtf8
        .| CC.stdout
|
||||||
|
@ -58,12 +58,17 @@ optionsParser = GEvalOptions
|
|||||||
( long "diff"
|
( long "diff"
|
||||||
<> short 'd'
|
<> short 'd'
|
||||||
<> metavar "OTHER-OUT"
|
<> metavar "OTHER-OUT"
|
||||||
<> help "compare results"))
|
<> help "Compare results of evaluations (line by line) for two outputs."))
|
||||||
<|>
|
<|>
|
||||||
(MostWorseningFeatures <$> strOption
|
(MostWorseningFeatures <$> strOption
|
||||||
( long "most-worsening-features"
|
( long "most-worsening-features"
|
||||||
<> short 'm'
|
<> short 'm'
|
||||||
<> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems.")))
|
<> help "Print a ranking of the \"most worsening\" features, i.e. features that worsen the score the most when comparing outputs from two systems."))
|
||||||
|
<|>
|
||||||
|
(flag' JustTokenize
|
||||||
|
( long "just-tokenize"
|
||||||
|
<> short 'j'
|
||||||
|
<> help "Just tokenise standard input and print out the tokens (separated by spaces) on the standard output. rather than do any evaluation. The --tokenizer option must be given.")))
|
||||||
|
|
||||||
<*> ((flag' FirstTheWorst
|
<*> ((flag' FirstTheWorst
|
||||||
(long "sort"
|
(long "sort"
|
||||||
@ -127,7 +132,7 @@ specParser = GEvalSpecification
|
|||||||
<*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
|
<*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
|
||||||
<*> optional precisionArgParser
|
<*> optional precisionArgParser
|
||||||
<*> (optional $ option auto
|
<*> (optional $ option auto
|
||||||
( long "tokenize"
|
( long "tokenizer"
|
||||||
<> short 'T'
|
<> short 'T'
|
||||||
<> metavar "TOKENIZER"
|
<> metavar "TOKENIZER"
|
||||||
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
|
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
|
||||||
@ -222,6 +227,9 @@ runGEval''' (Just (Diff otherOut)) ordering spec = do
|
|||||||
runGEval''' (Just (MostWorseningFeatures otherOut)) ordering spec = do
|
runGEval''' (Just (MostWorseningFeatures otherOut)) ordering spec = do
|
||||||
runMostWorseningFeatures ordering otherOut spec
|
runMostWorseningFeatures ordering otherOut spec
|
||||||
return Nothing
|
return Nothing
|
||||||
|
-- JustTokenize special command: run 'justTokenize' with the tokenizer taken
-- from the specification. The ordering argument is ignored and no
-- evaluation result is produced ('Nothing' is returned).
runGEval''' (Just JustTokenize) _ spec =
  justTokenize (gesTokenizer spec) >> return Nothing
|
||||||
|
|
||||||
initChallenge :: GEvalSpecification -> IO ()
|
initChallenge :: GEvalSpecification -> IO ()
|
||||||
initChallenge spec = case gesExpectedDirectory spec of
|
initChallenge spec = case gesExpectedDirectory spec of
|
||||||
|
@ -1 +1 @@
|
|||||||
--metric BLEU --tokenize 13a
|
--metric BLEU --tokenizer 13a
|
||||||
|
Loading…
Reference in New Issue
Block a user