WIP
This commit is contained in:
parent
8388ab4d27
commit
d3da3a0ca5
@ -80,6 +80,7 @@ import GEval.CharMatch
|
||||
import GEval.BIO
|
||||
import GEval.ProbList
|
||||
import Data.Conduit.AutoDecompress
|
||||
import Text.Tokenizer
|
||||
|
||||
import qualified Data.HashMap.Strict as M
|
||||
|
||||
@ -213,7 +214,8 @@ data GEvalSpecification = GEvalSpecification
|
||||
gesExpectedFile :: String,
|
||||
gesInputFile :: String,
|
||||
gesMetrics :: [Metric],
|
||||
gesPrecision :: Maybe Int}
|
||||
gesPrecision :: Maybe Int,
|
||||
gesTokenizer :: Maybe Tokenizer }
|
||||
|
||||
gesMainMetric :: GEvalSpecification -> Metric
|
||||
gesMainMetric spec = case gesMetrics spec of
|
||||
@ -284,7 +286,8 @@ defaultGEvalSpecification = GEvalSpecification {
|
||||
gesExpectedFile = defaultExpectedFile,
|
||||
gesInputFile = defaultInputFile,
|
||||
gesMetrics = [defaultMetric],
|
||||
gesPrecision = Nothing}
|
||||
gesPrecision = Nothing,
|
||||
gesTokenizer = Nothing}
|
||||
|
||||
isEmptyFile :: FilePath -> IO (Bool)
|
||||
isEmptyFile path = do
|
||||
|
@ -126,6 +126,11 @@ specParser = GEvalSpecification
|
||||
<> help "The name of the file with the input (applicable only for some metrics)" )
|
||||
<*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
|
||||
<*> optional precisionArgParser
|
||||
<*> (optional $ option auto
|
||||
( long "tokenize"
|
||||
<> short 'T'
|
||||
<> metavar "TOKENIZER"
|
||||
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
|
||||
|
||||
singletonMaybe :: Maybe a -> Maybe [a]
|
||||
singletonMaybe (Just x) = Just [x]
|
||||
|
@ -10,12 +10,21 @@ import Data.Monoid ((<>))
|
||||
import Text.Regex.PCRE.Heavy
|
||||
|
||||
-- | Tokenization schemes known to this module.
--
-- 'V13a' is the only scheme declared so far.
data Tokenizer
  = V13a
  deriving (Eq)
|
||||
|
||||
-- | Renders a tokenizer under its short name: 'V13a' is shown as @13a@.
instance Show Tokenizer where
  show tokenizer = case tokenizer of
    V13a -> "13a"
|
||||
|
||||
-- | Parses a tokenizer from its short name, the inverse of the 'Show'
-- instance: input starting with @13a@ yields 'V13a' together with the
-- unconsumed remainder of the string.
--
-- Any other input yields an empty result list ("no parse"), which is what
-- 'reads' and the readers built on top of it expect on failure.
instance Read Tokenizer where
  readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
  -- Without this catch-all an unknown name would raise a pattern-match
  -- failure at runtime instead of being reported as an ordinary parse error.
  readsPrec _ _ = []
|
||||
|
||||
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
|
||||
tokenize Nothing t = T.words t
|
||||
tokenize (Just V13a) t = T.words tWithSpaces
|
||||
where tWithSpaces = T.strip tTokenized
|
||||
tTokenized =
|
||||
tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer)
|
||||
|
||||
tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
|
||||
tokenizeWithSpaces Nothing t = t
|
||||
tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
|
||||
where tTokenized =
|
||||
gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p)
|
||||
$ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> c <> space <> p)
|
||||
$ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p)
|
||||
|
@ -71,6 +71,8 @@ main = hspec $ do
|
||||
runGEvalTest "bleu-perfect" `shouldReturnAlmost` 1.0000
|
||||
it "empty translation" $
|
||||
runGEvalTest "bleu-empty" `shouldReturnAlmost` 0.0000
|
||||
it "with tokenization" $
|
||||
runGEvalTest "bleu-with-tokenization" `shouldReturnAlmost` 0.6501914150070065
|
||||
describe "Accuracy" $ do
|
||||
it "simple example" $
|
||||
runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
|
||||
|
@ -0,0 +1,3 @@
|
||||
Do you like pickles?
|
||||
John who is a plumber
|
||||
Alica has a cat.
|
|
@ -0,0 +1 @@
|
||||
--metric BLEU --tokenize 13a
|
@ -0,0 +1,3 @@
|
||||
Do you like cucumbers?
|
||||
John, who is a plumber
|
||||
Alica has a cat.
|
|
Loading…
Reference in New Issue
Block a user