This commit is contained in:
Filip Gralinski 2018-08-13 07:38:46 +02:00
parent 8388ab4d27
commit d3da3a0ca5
7 changed files with 32 additions and 6 deletions

View File

@ -80,6 +80,7 @@ import GEval.CharMatch
import GEval.BIO
import GEval.ProbList
import Data.Conduit.AutoDecompress
import Text.Tokenizer
import qualified Data.HashMap.Strict as M
@ -213,7 +214,8 @@ data GEvalSpecification = GEvalSpecification
gesExpectedFile :: String,
gesInputFile :: String,
gesMetrics :: [Metric],
gesPrecision :: Maybe Int}
gesPrecision :: Maybe Int,
gesTokenizer :: Maybe Tokenizer }
gesMainMetric :: GEvalSpecification -> Metric
gesMainMetric spec = case gesMetrics spec of
@ -284,7 +286,8 @@ defaultGEvalSpecification = GEvalSpecification {
gesExpectedFile = defaultExpectedFile,
gesInputFile = defaultInputFile,
gesMetrics = [defaultMetric],
gesPrecision = Nothing}
gesPrecision = Nothing,
gesTokenizer = Nothing}
isEmptyFile :: FilePath -> IO (Bool)
isEmptyFile path = do

View File

@ -126,6 +126,11 @@ specParser = GEvalSpecification
<> help "The name of the file with the input (applicable only for some metrics)" )
<*> ((flip fromMaybe) <$> (singletonMaybe <$> altMetricReader) <*> metricReader)
<*> optional precisionArgParser
<*> (optional $ option auto
( long "tokenize"
<> short 'T'
<> metavar "TOKENIZER"
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
singletonMaybe :: Maybe a -> Maybe [a]
singletonMaybe (Just x) = Just [x]

View File

@ -10,12 +10,21 @@ import Data.Monoid ((<>))
import Text.Regex.PCRE.Heavy
-- | Tokenization schemes that can be requested before evaluation.
-- 'V13a' is the only scheme declared here; its external name is @13a@.
data Tokenizer = V13a
  deriving (Eq)

-- | Renders a tokenizer under its external name (@13a@ for 'V13a').
instance Show Tokenizer where
  show V13a = "13a"

-- | Parses the external name produced by 'show'.
--
-- Input that starts with @13a@ yields 'V13a' together with the unconsumed
-- remainder; any other input yields no parse.
instance Read Tokenizer where
  readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
  -- An unrecognised name must be reported as "no parse" (the empty list).
  -- Without this equation 'reads' would raise a pattern-match failure, which
  -- callers relying on 'reads' cannot turn into an ordinary parse error.
  readsPrec _ _ = []
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize Nothing t = T.words t
tokenize (Just V13a) t = T.words tWithSpaces
where tWithSpaces = T.strip tTokenized
tTokenized =
tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer)
tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
tokenizeWithSpaces Nothing t = t
tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
where tTokenized =
gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p)
$ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> c <> space <> p)
$ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p)

View File

@ -71,6 +71,8 @@ main = hspec $ do
runGEvalTest "bleu-perfect" `shouldReturnAlmost` 1.0000
it "empty translation" $
runGEvalTest "bleu-empty" `shouldReturnAlmost` 0.0000
it "with tokenization" $
runGEvalTest "bleu-with-tokenization" `shouldReturnAlmost` 0.6501914150070065
describe "Accuracy" $ do
it "simple example" $
runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6

View File

@ -0,0 +1,3 @@
Do you like pickles?
John who is a plumber
Alica has a cat.
1 Do you like pickles?
2 John who is a plumber
3 Alica has a cat.

View File

@ -0,0 +1 @@
--metric BLEU --tokenize 13a

View File

@ -0,0 +1,3 @@
Do you like cucumbers?
John, who is a plumber
Alica has a cat.
1 Do you like cucumbers?
2 John, who is a plumber
3 Alica has a cat.