WIP
This commit is contained in:
parent
8388ab4d27
commit
d3da3a0ca5
@ -80,6 +80,7 @@ import GEval.CharMatch
|
|||||||
import GEval.BIO
|
import GEval.BIO
|
||||||
import GEval.ProbList
|
import GEval.ProbList
|
||||||
import Data.Conduit.AutoDecompress
|
import Data.Conduit.AutoDecompress
|
||||||
|
import Text.Tokenizer
|
||||||
|
|
||||||
import qualified Data.HashMap.Strict as M
|
import qualified Data.HashMap.Strict as M
|
||||||
|
|
||||||
@ -213,7 +214,8 @@ data GEvalSpecification = GEvalSpecification
|
|||||||
gesExpectedFile :: String,
|
gesExpectedFile :: String,
|
||||||
gesInputFile :: String,
|
gesInputFile :: String,
|
||||||
gesMetrics :: [Metric],
|
gesMetrics :: [Metric],
|
||||||
gesPrecision :: Maybe Int}
|
gesPrecision :: Maybe Int,
|
||||||
|
gesTokenizer :: Maybe Tokenizer }
|
||||||
|
|
||||||
gesMainMetric :: GEvalSpecification -> Metric
|
gesMainMetric :: GEvalSpecification -> Metric
|
||||||
gesMainMetric spec = case gesMetrics spec of
|
gesMainMetric spec = case gesMetrics spec of
|
||||||
@ -284,7 +286,8 @@ defaultGEvalSpecification = GEvalSpecification {
|
|||||||
gesExpectedFile = defaultExpectedFile,
|
gesExpectedFile = defaultExpectedFile,
|
||||||
gesInputFile = defaultInputFile,
|
gesInputFile = defaultInputFile,
|
||||||
gesMetrics = [defaultMetric],
|
gesMetrics = [defaultMetric],
|
||||||
gesPrecision = Nothing}
|
gesPrecision = Nothing,
|
||||||
|
gesTokenizer = Nothing}
|
||||||
|
|
||||||
isEmptyFile :: FilePath -> IO (Bool)
|
isEmptyFile :: FilePath -> IO (Bool)
|
||||||
isEmptyFile path = do
|
isEmptyFile path = do
|
||||||
|
@ -126,6 +126,11 @@ specParser = GEvalSpecification
|
|||||||
<> help "The name of the file with the input (applicable only for some metrics)" )
|
<> help "The name of the file with the input (applicable only for some metrics)" )
|
||||||
<*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
|
<*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
|
||||||
<*> optional precisionArgParser
|
<*> optional precisionArgParser
|
||||||
|
<*> (optional $ option auto
|
||||||
|
( long "tokenize"
|
||||||
|
<> short 'T'
|
||||||
|
<> metavar "TOKENIZER"
|
||||||
|
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
|
||||||
|
|
||||||
singletonMaybe :: Maybe a -> Maybe [a]
|
singletonMaybe :: Maybe a -> Maybe [a]
|
||||||
singletonMaybe (Just x) = Just [x]
|
singletonMaybe (Just x) = Just [x]
|
||||||
|
@ -10,12 +10,21 @@ import Data.Monoid ((<>))
|
|||||||
import Text.Regex.PCRE.Heavy
|
import Text.Regex.PCRE.Heavy
|
||||||
|
|
||||||
data Tokenizer = V13a
|
data Tokenizer = V13a
|
||||||
|
deriving (Eq)
|
||||||
|
|
||||||
|
instance Show Tokenizer where
|
||||||
|
show V13a = "13a"
|
||||||
|
|
||||||
|
-- | Parses the textual name of a tokenizer. The only recognised spelling
-- is the prefix @13a@; whatever follows it is returned as the unconsumed
-- remainder of the input.
instance Read Tokenizer where
  readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
  -- Any other input yields "no parse" instead of a pattern-match failure,
  -- so that 'reads'-based callers can report an ordinary parse error.
  readsPrec _ _ = []
|
||||||
|
|
||||||
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
|
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
|
||||||
tokenize Nothing t = T.words t
|
tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer)
|
||||||
tokenize (Just V13a) t = T.words tWithSpaces
|
|
||||||
where tWithSpaces = T.strip tTokenized
|
tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
|
||||||
tTokenized =
|
tokenizeWithSpaces Nothing t = t
|
||||||
|
tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
|
||||||
|
where tTokenized =
|
||||||
gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p)
|
gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p)
|
||||||
$ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> c <> space <> p)
|
$ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> c <> space <> p)
|
||||||
$ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p)
|
$ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p)
|
||||||
|
@ -71,6 +71,8 @@ main = hspec $ do
|
|||||||
runGEvalTest "bleu-perfect" `shouldReturnAlmost` 1.0000
|
runGEvalTest "bleu-perfect" `shouldReturnAlmost` 1.0000
|
||||||
it "empty translation" $
|
it "empty translation" $
|
||||||
runGEvalTest "bleu-empty" `shouldReturnAlmost` 0.0000
|
runGEvalTest "bleu-empty" `shouldReturnAlmost` 0.0000
|
||||||
|
it "with tokenization" $
|
||||||
|
runGEvalTest "bleu-with-tokenization" `shouldReturnAlmost` 0.6501914150070065
|
||||||
describe "Accuracy" $ do
|
describe "Accuracy" $ do
|
||||||
it "simple example" $
|
it "simple example" $
|
||||||
runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
|
runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
|
||||||
|
@ -0,0 +1,3 @@
|
|||||||
|
Do you like pickles?
|
||||||
|
John who is a plumber
|
||||||
|
Alica has a cat.
|
|
@ -0,0 +1 @@
|
|||||||
|
--metric BLEU --tokenize 13a
|
@ -0,0 +1,3 @@
|
|||||||
|
Do you like cucumbers?
|
||||||
|
John, who is a plumber
|
||||||
|
Alica has a cat.
|
|
Loading…
Reference in New Issue
Block a user