From d67061c230bfdf2970bb4defdd301f8d6bcd6bc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Filip=20Grali=C5=84ski?= <filip.gralinski@applica.ai>
Date: Sat, 2 Nov 2019 11:11:53 +0100
Subject: [PATCH] Hande itemStep via dependent types

---
 src/GEval/Clippings.hs        |   5 +
 src/GEval/Common.hs           |  62 +++++++++++
 src/GEval/Core.hs             | 199 +++++-----------------------------
 src/GEval/CreateChallenge.hs  |   3 +-
 src/GEval/MetricsMechanics.hs | 128 +++++++++++++++++++++-
 src/GEval/Validation.hs       |   2 +-
 6 files changed, 226 insertions(+), 173 deletions(-)

diff --git a/src/GEval/Clippings.hs b/src/GEval/Clippings.hs
index 9b3b527..eb7c86a 100644
--- a/src/GEval/Clippings.hs
+++ b/src/GEval/Clippings.hs
@@ -14,6 +14,7 @@ import Data.Maybe (catMaybes)
 import Debug.Trace
 
 import GEval.Common
+import GEval.PrecisionRecall (maxMatch)
 
 newtype PageNumber = PageNumber Int
         deriving (Eq, Show)
@@ -160,3 +161,7 @@ getLeftovers (Rectangle (Point x0 y0) (Point x1 y1))
                          Rectangle (Point x0' y0') (Point (x0 - 1) y1'),
                          Rectangle (Point (x1 + 1) y0') (Point x1' y1')]
   where validRectangle (Rectangle (Point x0 y0) (Point x1 y1)) = x0 <= x1 && y0 <= y1
+
+clippEUMatchStep (clippingSpecs, clippings) = (maxMatch matchClippingToSpec clippingSpecs clippings,
+                                               Prelude.length clippingSpecs,
+                                               Prelude.length clippings)
diff --git a/src/GEval/Common.hs b/src/GEval/Common.hs
index 6fe958d..80a1ede 100644
--- a/src/GEval/Common.hs
+++ b/src/GEval/Common.hs
@@ -4,6 +4,9 @@ module GEval.Common
 import qualified Data.Text as T
 import Data.Text.Read as TR
 
+import Data.Word
+import Control.Exception
+
 import Data.Attoparsec.Text
 
 type MetricValue = Double
@@ -79,3 +82,62 @@ class AEq a where
 
 instance AEq Double where
     x =~ y = abs ( x - y ) < (1.0e-4 :: Double)
+
+itemAbsoluteError :: (Double, Double) -> Double
+itemAbsoluteError (exp, out) = abs (exp-out)
+
+itemSquaredError :: (Double, Double) -> Double
+itemSquaredError (exp, out) = (exp-out)**2
+
+itemLogLossError :: (Double, Double) -> Double
+itemLogLossError (exp, out)
+  | exp' > 0.5 = - (log out')
+  | otherwise = - (log (1 - out'))
+  where exp' = normalizeAsProb exp
+        out' = normalizeAsProb out
+        normalizeAsProb v
+          | v >= 1.0 = 1.0
+          | v <= 0.0 = 0.0
+          | otherwise = v
+
+data GEvalException = NoExpectedFile FilePath
+                      | NoOutFile FilePath
+                      | NoExpectedDirectory FilePath
+                      | NoOutDirectory FilePath
+                      | NoExpectedTestDirectory FilePath
+                      | NoOutTestDirectory FilePath
+                      | NoInputFile FilePath
+                      | FileAlreadyThere FilePath
+                      | TooFewLines
+                      | TooManyLines
+                      | TooFewLinesInInput
+                      | TooManyLinesInInput
+                      | EmptyOutput
+                      | UnexpectedData Word32 String
+                      | UnexpectedMultipleOutputs
+                      | OtherException String
+                      deriving (Eq)
+
+instance Exception GEvalException
+
+instance Show GEvalException where
+  show (NoExpectedFile filePath) = somethingWrongWithFilesMessage "No file with the expected results" filePath
+  show (NoOutFile filePath) = somethingWrongWithFilesMessage "No file with the test results" filePath
+  show (NoExpectedDirectory filePath) = somethingWrongWithFilesMessage "No directory with the expected results" filePath
+  show (NoOutDirectory filePath) = somethingWrongWithFilesMessage "No directory with the test results" filePath
+  show (NoExpectedTestDirectory filePath) = somethingWrongWithFilesMessage "No test subdirectory with the expected results" filePath
+  show (NoOutTestDirectory filePath) = somethingWrongWithFilesMessage "No test subdirectory with the results obtained" filePath
+  show (NoInputFile filePath) = somethingWrongWithFilesMessage "No file with the input" filePath
+  show (FileAlreadyThere filePath) = somethingWrongWithFilesMessage "File already there" filePath
+  show TooFewLines = "Too few lines in the output file"
+  show TooManyLines = "Too many lines in the output file"
+  show TooFewLinesInInput = "Too few lines in the input file"
+  show TooManyLinesInInput = "Too many lines in the input file"
+  show EmptyOutput = "The output file is empty"
+  show (UnexpectedData lineNo message) = "Line " ++ (show lineNo) ++ ": Unexpected data [" ++ message ++ "]"
+  show UnexpectedMultipleOutputs = "Multiple outputs are not possible in this mode, use -o option to select an output file"
+  show (OtherException message) = message
+
+somethingWrongWithFilesMessage :: String -> FilePath -> String
+somethingWrongWithFilesMessage msg filePath = Prelude.concat
+                                [ msg, ": `", filePath, "`" ]
diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index 494d28e..89bb99a 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -19,7 +19,6 @@ module GEval.Core
       GEvalSpecification(..),
       ResultOrdering(..),
       GEvalOptions(..),
-      GEvalException(..),
       defaultGEvalSpecification,
       defaultOutDirectory,
       defaultTestName,
@@ -44,8 +43,7 @@ module GEval.Core
       getDataDecoder,
       threeLineSource,
       extensionsHandled,
-      isEmptyFile,
-      somethingWrongWithFilesMessage
+      isEmptyFile
     ) where
 
 import Debug.Trace
@@ -66,7 +64,6 @@ import Data.Text
 import Data.Text.Read as TR
 import Control.Applicative
 import Control.Exception
-import Text.Read (readMaybe)
 import Control.Conditional (unlessM, whenM)
 import qualified System.Directory as D
 import System.Posix
@@ -204,47 +201,6 @@ data GEvalOptions = GEvalOptions
                       geoBlackBoxDebugginsOptions :: BlackBoxDebuggingOptions,
                       geoGraphFile :: Maybe FilePath }
 
-data GEvalException = NoExpectedFile FilePath
-                      | NoOutFile FilePath
-                      | NoExpectedDirectory FilePath
-                      | NoOutDirectory FilePath
-                      | NoExpectedTestDirectory FilePath
-                      | NoOutTestDirectory FilePath
-                      | NoInputFile FilePath
-                      | FileAlreadyThere FilePath
-                      | TooFewLines
-                      | TooManyLines
-                      | TooFewLinesInInput
-                      | TooManyLinesInInput
-                      | EmptyOutput
-                      | UnexpectedData Word32 String
-                      | UnexpectedMultipleOutputs
-                      | OtherException String
-                      deriving (Eq)
-
-instance Exception GEvalException
-
-instance Show GEvalException where
-  show (NoExpectedFile filePath) = somethingWrongWithFilesMessage "No file with the expected results" filePath
-  show (NoOutFile filePath) = somethingWrongWithFilesMessage "No file with the test results" filePath
-  show (NoExpectedDirectory filePath) = somethingWrongWithFilesMessage "No directory with the expected results" filePath
-  show (NoOutDirectory filePath) = somethingWrongWithFilesMessage "No directory with the test results" filePath
-  show (NoExpectedTestDirectory filePath) = somethingWrongWithFilesMessage "No test subdirectory with the expected results" filePath
-  show (NoOutTestDirectory filePath) = somethingWrongWithFilesMessage "No test subdirectory with the results obtained" filePath
-  show (NoInputFile filePath) = somethingWrongWithFilesMessage "No file with the input" filePath
-  show (FileAlreadyThere filePath) = somethingWrongWithFilesMessage "File already there" filePath
-  show TooFewLines = "Too few lines in the output file"
-  show TooManyLines = "Too many lines in the output file"
-  show TooFewLinesInInput = "Too few lines in the input file"
-  show TooManyLinesInInput = "Too many lines in the input file"
-  show EmptyOutput = "The output file is empty"
-  show (UnexpectedData lineNo message) = "Line " ++ (show lineNo) ++ ": Unexpected data [" ++ message ++ "]"
-  show UnexpectedMultipleOutputs = "Multiple outputs are not possible in this mode, use -o option to select an output file"
-  show (OtherException message) = message
-
-somethingWrongWithFilesMessage :: String -> FilePath -> String
-somethingWrongWithFilesMessage msg filePath = Prelude.concat
-                                [ msg, ": `", filePath, "`" ]
 
 defaultGEvalSpecification = GEvalSpecification {
   gesOutDirectory = defaultOutDirectory,
@@ -513,12 +469,10 @@ gevalCoreOnSources Spearman _ = gevalCoreByCorrelationMeasure spearman
 
 gevalCoreOnSources (ProbabilisticMultiLabelFMeasure beta) _ = generalizedProbabilisticFMeasure beta
                                                                                                SAProbabilisticMultiLabelFMeasure
-                                                                                               (Right . (\(ProbList es) -> es) . parseIntoProbList)
   where intoWords = Right . Data.Text.words
 
 gevalCoreOnSources (ProbabilisticSoftFMeasure beta) _ = generalizedProbabilisticFMeasure beta
                                                                                          SAProbabilisticSoftFMeasure
-                                                                                         parseObtainedAnnotations
 
 -- and now more typical metrics, which:
 -- 1) parse the expected output
@@ -527,30 +481,27 @@ gevalCoreOnSources (ProbabilisticSoftFMeasure beta) _ = generalizedProbabilistic
 -- 4) aggregate the results
 -- 5) apply some final funtion on the aggregate
 -- 6) create a graph using the aggregate (applicable only to some metrics)
-gevalCoreOnSources Likelihood _ = gevalCoreWithoutInput SALikelihood itemLogLossError averageC logLossToLikehood noGraph
+gevalCoreOnSources Likelihood _ = gevalCoreWithoutInput SALikelihood averageC logLossToLikehood noGraph
 
 gevalCoreOnSources MultiLabelLikelihood _ = gevalCoreWithoutInput SAMultiLabelLikelihood
-                                                                  (uncurry countLogLossOnProbList)
                                                                   averageC
                                                                   logLossToLikehood
                                                                   noGraph
     where
       intoWords = Right . Data.Text.words
 
-gevalCoreOnSources MSE _ = gevalCoreWithoutInput SAMSE itemSquaredError averageC id noGraph
+gevalCoreOnSources MSE _ = gevalCoreWithoutInput SAMSE averageC id noGraph
 
-gevalCoreOnSources RMSE _ = gevalCoreWithoutInput SARMSE itemSquaredError averageC (** 0.5) noGraph
+gevalCoreOnSources RMSE _ = gevalCoreWithoutInput SARMSE averageC (** 0.5) noGraph
 
-gevalCoreOnSources MAE _ = gevalCoreWithoutInput SAMAE itemAbsoluteError averageC id noGraph
+gevalCoreOnSources MAE _ = gevalCoreWithoutInput SAMAE averageC id noGraph
 
-gevalCoreOnSources SMAPE _ = gevalCoreWithoutInput SASMAPE smape averageC (* 100.0) noGraph
-  where smape (exp, out) = (abs (exp-out)) `safeDoubleDiv` ((abs exp) + (abs out))
+gevalCoreOnSources SMAPE _ = gevalCoreWithoutInput SASMAPE averageC (* 100.0) noGraph
 
-gevalCoreOnSources LogLoss _ = gevalCoreWithoutInput SALogLoss itemLogLossError averageC id noGraph
+gevalCoreOnSources LogLoss _ = gevalCoreWithoutInput SALogLoss averageC id noGraph
 
-gevalCoreOnSources BLEU _ = gevalCoreWithoutInput SABLEU bleuCombine bleuAgg bleuFinal noGraph
+gevalCoreOnSources BLEU _ = gevalCoreWithoutInput SABLEU bleuAgg bleuFinal noGraph
   where bleuFinal (p1, p2, p3, p4, rl, l1, l2, l3, l4) = ((p1 /. l1) * (p2 /. l2) * (p3 /. l3) * (p4 /. l4)) ** 0.25 * (brevityPenalty l1 rl)
-        bleuCombine (refs, sen) = bleuStep refs sen
         bleuAgg = CC.foldl bleuFuse (0, 0, 0, 0, 0,  0, 0, 0, 0)
         bleuFuse (a1, a2, a3, a4, a5, a6, a7, a8, a9) (b1, b2, b3, b4, b5, b6, b7, b8, b9) = (a1+b1, a2+b2, a3+b3, a4+b4, a5+b5, a6+b6, a7+b7, a8+b8, a9+b9)
         brevityPenalty c r
@@ -558,63 +509,19 @@ gevalCoreOnSources BLEU _ = gevalCoreWithoutInput SABLEU bleuCombine bleuAgg ble
           | c == 0 && r > 0 = 0.0
           | otherwise = exp (1.0 - (r /. c))
 
-gevalCoreOnSources GLEU _ = gevalCoreWithoutInput SAGLEU gleuCombine gleuAgg gleuFinal noGraph
+gevalCoreOnSources GLEU _ = gevalCoreWithoutInput SAGLEU gleuAgg gleuFinal noGraph
   where gleuFinal (m, t) = m /. t
-        gleuCombine (refs, sen) = gleuStep refs sen
         gleuAgg = CC.foldl gleuFuse (0, 0)
         gleuFuse (a1, a2) (b1, b2) = (a1+b1, a2+b2)
 
-gevalCoreOnSources WER _ = gevalCoreWithoutInput SAWER (uncurry werStep) averageC id noGraph
+gevalCoreOnSources WER _ = gevalCoreWithoutInput SAWER averageC id noGraph
 
-gevalCoreOnSources Accuracy _ = gevalCoreWithoutInput SAAccuracy hitOrMiss averageC id noGraph
-                      where hitOrMiss (exp, got) =
-                              -- first try to parse what we got as a probability distribution
-                              -- (like the one used for Likelikehood/LogLossHashed metric)
-                              case parseWordSpecs got of
-                                Right wordSpecs -> if Prelude.null pairs
-                                                   then 0.0
-                                                   else indicator (exp == (snd $ Prelude.maximum pairs))
-                                                 where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
-                                Left _ ->  indicator ((normalizeProbForAccuracy exp got) == exp)
-                                          -- if the expected value is 0 or 1 treat values
-                                          -- between 0.0 and 1.0 as probabilities
-                                          -- for the positive outcome
-                            normalizeProbForAccuracy :: Text -> Text -> Text
-                            normalizeProbForAccuracy exp got
-                              | exp == (pack "1") = case tryReadingAsFloat got of
-                                            Just p -> if p >= 0.5 && p <= 1.0 then exp else got
-                                            Nothing -> got
-                              | exp == (pack "0") = case tryReadingAsFloat got of
-                                            Just p -> if p < 0.5 && p >= 0.0 then exp else got
-                                            Nothing -> got
-                              | otherwise = got
-                            tryReadingAsFloat :: Text -> Maybe Float
-                            tryReadingAsFloat = readMaybe . unpack
+gevalCoreOnSources Accuracy _ = gevalCoreWithoutInput SAAccuracy averageC id noGraph
 
-gevalCoreOnSources (FMeasure beta) _ = gevalCoreWithoutInput SAFMeasure getCount countAgg (fMeasureOnCounts beta) noGraph
-  where -- output value could be a probability (for compatibility with other measures)
-        getCount :: (Bool, Bool) -> (Int, Int, Int)
-        getCount (True, True)   = (1, 1, 1)
-        getCount (True, False)  = (0, 1, 0)
-        getCount (False, True)  = (0, 0, 1)
-        getCount (False, False) = (0, 0, 0)
+gevalCoreOnSources (FMeasure beta) _ = gevalCoreWithoutInput SAFMeasure countAgg (fMeasureOnCounts beta) noGraph
 
-gevalCoreOnSources (MacroFMeasure beta) _ = gevalCoreWithoutInput SAMacroFMeasure getClassesInvolved gatherClassC macroAverageOnCounts noGraph
-                      where predicted got =
-                              -- first try to parse what we got as a probability distribution
-                              -- (like the one used for Likelikehood/LogLossHashed metric)
-                              case parseWordSpecs got of
-                                Right wordSpecs -> if Prelude.null pairs
-                                                   then Nothing
-                                                   else Just $ snd $ Prelude.maximum pairs
-                                                 where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
-                                Left _ -> Just got
-                            getClassesInvolved (Just a, Nothing) = (Nothing, Just a, Nothing)
-                            getClassesInvolved (Nothing, Just b) = (Nothing, Nothing, Just b) -- should not occur, for completeness
-                            getClassesInvolved (Just a, Just b) = if a == b
-                                                                     then (Just a, Just a, Just a)
-                                                                     else (Nothing, Just a, Just b)
-                            gatherClassC = CC.foldl gatherClassCombiner (M.empty, M.empty, M.empty)
+gevalCoreOnSources (MacroFMeasure beta) _ = gevalCoreWithoutInput SAMacroFMeasure gatherClassC macroAverageOnCounts noGraph
+                      where gatherClassC = CC.foldl gatherClassCombiner (M.empty, M.empty, M.empty)
                             gatherClassCombiner (tpMap, expectedMap, gotMap) (tp, expected, got) =
                               (insertMaybeToMap tp tpMap,
                                insertMaybeToMap expected expectedMap,
@@ -629,66 +536,39 @@ gevalCoreOnSources (MacroFMeasure beta) _ = gevalCoreWithoutInput SAMacroFMeasur
                                $ M.keys expectedMap) / (fromIntegral $ Prelude.length $ M.keys expectedMap)
 
 gevalCoreOnSources (SoftFMeasure beta) _ = gevalCoreWithoutInput SASoftFMeasure
-                                                                 getSoftCounts
                                                                  countAgg
                                                                  (fMeasureOnCounts beta)
                                                                  noGraph
-                      where getSoftCounts (expected, got) = (weightedMaxMatch matchScore expected got,
-                                                             Prelude.length expected,
-                                                             Prelude.length got)
 
 gevalCoreOnSources (Soft2DFMeasure beta) _ = gevalCoreWithoutInput SASoft2DFMeasure
-                                                                   count2DFScore
-                                                                   averageC
+                                                                   (CC.map (fMeasureOnCounts beta) .| averageC)
                                                                    id
                                                                    noGraph
-                      where
-                        count2DFScore (expected, got) = fMeasureOnCounts beta (tpArea, expArea, gotArea)
-                                                        where tpArea = coveredBy expected got
-                                                              expArea = totalArea expected
-                                                              gotArea = totalArea got
 
-gevalCoreOnSources ClippEU _ = gevalCoreWithoutInput SAClippEU matchStep clippeuAgg finalStep noGraph
+gevalCoreOnSources ClippEU _ = gevalCoreWithoutInput SAClippEU clippeuAgg finalStep noGraph
   where
-    matchStep (clippingSpecs, clippings) = (maxMatch matchClippingToSpec clippingSpecs clippings,
-                                            Prelude.length clippingSpecs,
-                                            Prelude.length clippings)
     clippeuAgg = CC.foldl countFolder (0, 0, 0)
     finalStep counts = f2MeasureOnCounts counts
 
-gevalCoreOnSources NMI _ = gevalCoreWithoutInput SANMI id (CC.foldl updateConfusionMatrix M.empty) normalizedMutualInformationFromConfusionMatrix noGraph
+gevalCoreOnSources NMI _ = gevalCoreWithoutInput SANMI (CC.foldl updateConfusionMatrix M.empty) normalizedMutualInformationFromConfusionMatrix noGraph
 
 gevalCoreOnSources MAP _ = gevalCoreWithoutInput SAMAP
-                                                 (\(e,g) -> calculateMAPForOneResult e g)
                                                  averageC
                                                  id
                                                  noGraph
 
-gevalCoreOnSources BIOF1 _ = gevalCoreWithoutInput SABIOF1 (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts noGraph
+gevalCoreOnSources BIOF1 _ = gevalCoreWithoutInput SABIOF1 countAgg f1MeasureOnCounts noGraph
 
-gevalCoreOnSources BIOF1Labels _ = gevalCoreWithoutInput SABIOF1Labels (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts noGraph
+gevalCoreOnSources BIOF1Labels _ = gevalCoreWithoutInput SABIOF1Labels countAgg f1MeasureOnCounts noGraph
 
 gevalCoreOnSources TokenAccuracy _ = gevalCoreWithoutInput SATokenAccuracy
-                                                           countHitsAndTotals
                                                            hitsAndTotalsAgg
                                                            (\(hits, total) -> hits /. total)
                                                            noGraph
-   where countHitsAndTotals :: ([Text], [Text]) -> (Int, Int)
-         countHitsAndTotals (es, os) =
-             if Prelude.length os /= Prelude.length es
-               then throw $ OtherException "wrong number of tokens"
-               else Prelude.foldl matchFun
-                                  (0, 0)
-                                  (Prelude.zip es os)
-         matchFun :: (Int, Int) -> (Text, Text) -> (Int, Int)
-         matchFun (h, t) (e, o)
-           | e == (pack "*") = (h, t)
-           | o `Prelude.elem` (splitOn (pack ";") e) = (h + 1, t + 1)
-           | otherwise = (h, t + 1)
+   where
          hitsAndTotalsAgg = CC.foldl (\(h1, t1) (h2, t2) -> (h1 + h2, t1 + t2)) (0, 0)
 
 gevalCoreOnSources MultiLabelLogLoss _ = gevalCoreWithoutInput SAMultiLabelLogLoss
-                                                               (uncurry countLogLossOnProbList)
                                                                averageC
                                                                id
                                                                noGraph
@@ -703,11 +583,10 @@ helperLogLossHashed nbOfBits finalStep expectedLineSource outLineSource =
           Right _ -> Right t
           Left m -> Left m
 
-generalizedProbabilisticFMeasure beta metric parseEntities = gevalCoreWithoutInput metric
-                                                                                   getProbabilisticCounts
-                                                                                   probabilisticSoftAgg
-                                                                                   (fMeasureOnProbabilisticCounts beta)
-                                                                                   loessGraph
+generalizedProbabilisticFMeasure beta metric = gevalCoreWithoutInput metric
+                                               probabilisticSoftAgg
+                                               (fMeasureOnProbabilisticCounts beta)
+                                               loessGraph
   where probabilisticSoftAgg :: Monad m => ConduitM ([Double], [Double], Double, Int) o m ([Double], [Double], Double, Int)
         probabilisticSoftAgg = CC.foldl probabilisticSoftFolder ([], [], fromInteger 0, 0)
         probabilisticSoftFolder (r1, p1, g1, e1) (r2, p2, g2, e2) = (r1 ++ r2, p1 ++ p2, g1 + g2, e1 + e2)
@@ -732,7 +611,7 @@ gevalCoreByCorrelationMeasure :: (MonadUnliftIO m, MonadThrow m, MonadIO m) =>
                                 LineSource (ResourceT m) ->  -- ^ source to read the output
                                 m (MetricOutput)             -- ^ metric values for the output against the expected output
 gevalCoreByCorrelationMeasure correlationFunction =
-  gevalCoreWithoutInput SAPearson id correlationC finalStep noGraph
+  gevalCoreWithoutInput SAPearson correlationC finalStep noGraph
   where correlationC = CC.foldl (flip (:)) []
         finalStep pairs = correlationFunction $ V.fromList pairs
 
@@ -749,20 +628,18 @@ skipLineNumber fun = fun . snd
 -- | A helper function to run evaluation when the input is not needed to calculate the metric value.
 gevalCoreWithoutInput :: (MonadUnliftIO m, MonadThrow m, MonadIO m)
                       => SAMetric t
-                      -> ((ParsedExpectedType t, ParsedOutputType t) -> c)             -- ^ function which combines parsed values into a single value
-                                                   -- (will be launched for each item, e.g. an error/cost function
-                                                   -- could be calculated here)
-                      -> (ConduitT c Void (ResourceT m) d)  -- ^ a Conduit which aggregates all the combined values into
+                      -> (ConduitT (ItemIntermediateRepresentationType t) Void (ResourceT m) d)  -- ^ a Conduit which aggregates all the combined values into
                                                    -- a "total" value
                       -> (d -> Double)             -- ^ function to transform the "total" value into the final score
                       -> (d -> Maybe GraphSeries)
                       -> LineSource (ResourceT m)  -- ^ source to read the expected output
                       -> LineSource (ResourceT m)  -- ^ source to read the output
                       -> m (MetricOutput)           -- ^ metric values for the output against the expected output
-gevalCoreWithoutInput smetric itemStep aggregator finalStep generateGraph expectedLineStream outLineStream =
-  gevalCoreWithoutInputOnItemTargets (liftOp expParser) (liftOp outParser) itemStep aggregator finalStep generateGraph expectedLineStream outLineStream
+gevalCoreWithoutInput smetric aggregator finalStep generateGraph expectedLineStream outLineStream =
+  gevalCoreWithoutInputOnItemTargets (liftOp expParser) (liftOp outParser) iStep aggregator finalStep generateGraph expectedLineStream outLineStream
   where expParser = expectedParser smetric
         outParser = outputParser smetric
+        iStep = itemStep smetric
 
 gevalCoreWithoutInputOnItemTargets :: (MonadUnliftIO m, MonadThrow m, MonadIO m)
                       => (ItemTarget -> Either String a) -- ^ parser for values in the expected output
@@ -922,21 +799,3 @@ items (LineSource lineSource itemDecoder preprocess _ _) parser =
 linesAsItems :: MonadResource m => LineSource m -> ConduitT () (SourceItem Text) m ()
 linesAsItems (LineSource lineSource _ _ _ _) =
   (lineSource .| CL.map Got) >> yield Done
-
-itemAbsoluteError :: (Double, Double) -> Double
-itemAbsoluteError (exp, out) = abs (exp-out)
-
-itemSquaredError :: (Double, Double) -> Double
-itemSquaredError (exp, out) = (exp-out)**2
-
-
-itemLogLossError :: (Double, Double) -> Double
-itemLogLossError (exp, out)
-  | exp' > 0.5 = - (log out')
-  | otherwise = - (log (1 - out'))
-  where exp' = normalizeAsProb exp
-        out' = normalizeAsProb out
-        normalizeAsProb v
-          | v >= 1.0 = 1.0
-          | v <= 0.0 = 0.0
-          | otherwise = v
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index 85430eb..5c2b266 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -7,7 +7,8 @@ module GEval.CreateChallenge
 
 import GEval.Metric
 import GEval.EvaluationScheme
-import GEval.Core (GEvalSpecification(..), GEvalException(..), configFileName, gesMainMetric, defaultTestName)
+import GEval.Common (GEvalException(..))
+import GEval.Core (GEvalSpecification(..), configFileName, gesMainMetric, defaultTestName)
 import GEval.Submit (tokenFileName)
 import qualified System.Directory as D
 import Control.Conditional (whenM)
diff --git a/src/GEval/MetricsMechanics.hs b/src/GEval/MetricsMechanics.hs
index f8c05a5..dc52eb2 100644
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@@ -11,7 +11,19 @@ module GEval.MetricsMechanics
 
 import Data.Singletons.TH
 
+import Text.Read (readMaybe)
+
 import GEval.Metric
+import GEval.Common
+import GEval.BLEU (bleuStep, gleuStep)
+import GEval.WER (werStep)
+import GEval.Clippings (totalArea, coveredBy, clippEUMatchStep)
+import GEval.BIO (gatherCountsForBIO)
+
+import GEval.Probability
+import GEval.PrecisionRecall (weightedMaxMatch, fMeasureOnCounts, calculateMAPForOneResult, getProbabilisticCounts, getCounts)
+
+import Control.Exception
 
 import Data.Text
 import Data.Text.Read as TR
@@ -25,7 +37,7 @@ import GEval.Annotation (Annotation, ObtainedAnnotation, parseAnnotations, parse
 import GEval.Clippings (Clipping, ClippingSpec, LabeledClipping, lineClippingsParser, lineClippingSpecsParser, lineLabeledClippingsParser)
 import GEval.BIO (TaggedEntity, parseBioSequenceIntoEntities, parseBioSequenceIntoEntitiesWithoutNormalization)
 import GEval.LogLossHashed (parseWordSpecs, wordSpecToPair)
-import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..))
+import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countLogLossOnProbList)
 
 -- | Helper type so that singleton can be used.
 -- | (The problem is that some metrics are parametrized by Double
@@ -184,6 +196,60 @@ outputParser SAMultiLabelFMeasure = intoWords
 outputParser SAMultiLabelLogLoss = Right . parseIntoProbList
 outputParser SAMultiLabelLikelihood = Right . parseIntoProbList
 
+type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
+  ItemIntermediateRepresentationType ABLEU = (Int, Int, Int, Int, Int, Int, Int, Int, Int)
+  ItemIntermediateRepresentationType AGLEU = (Int, Int)
+  ItemIntermediateRepresentationType AFMeasure = (Int, Int, Int)
+  ItemIntermediateRepresentationType AMacroFMeasure = (Maybe Text, Maybe Text, Maybe Text)
+  ItemIntermediateRepresentationType ASoftFMeasure = (Double, Int, Int)
+  ItemIntermediateRepresentationType ASoft2DFMeasure = (Integer, Integer, Integer)
+  ItemIntermediateRepresentationType AClippEU = (Int, Int, Int)
+  ItemIntermediateRepresentationType ANMI = (Text, Text)
+  ItemIntermediateRepresentationType ABIOF1 = (Int, Int, Int)
+  ItemIntermediateRepresentationType ABIOF1Labels = (Int, Int, Int)
+  ItemIntermediateRepresentationType ATokenAccuracy = (Int, Int)
+  ItemIntermediateRepresentationType AProbabilisticMultiLabelFMeasure = ([Double], [Double], Double, Int)
+  ItemIntermediateRepresentationType AProbabilisticSoftFMeasure = ([Double], [Double], Double, Int)
+  ItemIntermediateRepresentationType APearson = (Double, Double)
+  ItemIntermediateRepresentationType ASpearman = (Double, Double)
+  ItemIntermediateRepresentationType AMultiLabelFMeasure = (Int, Int, Int)
+  ItemIntermediateRepresentationType ALogLossHashed = (Text, Text)
+  ItemIntermediateRepresentationType ALikelihoodHashed = (Text, Text)
+  ItemIntermediateRepresentationType ACharMatch = (Text, Text)
+  ItemIntermediateRepresentationType t = Double
+
+itemStep :: SAMetric t -> (ParsedExpectedType t, ParsedOutputType t) -> ItemIntermediateRepresentationType t
+itemStep SARMSE = itemSquaredError
+itemStep SAMSE = itemSquaredError
+itemStep SAPearson = id
+itemStep SASpearman = id
+itemStep SABLEU = uncurry bleuStep
+itemStep SAGLEU = uncurry gleuStep
+itemStep SAWER = uncurry werStep
+itemStep SAAccuracy = hitOrMiss
+itemStep SAClippEU = clippEUMatchStep
+itemStep SAFMeasure = getCount
+itemStep SAMacroFMeasure = getClassesInvolved
+itemStep SASoftFMeasure = getSoftCounts
+itemStep SAProbabilisticMultiLabelFMeasure = getProbabilisticCounts
+itemStep SAProbabilisticSoftFMeasure = getProbabilisticCounts
+itemStep SASoft2DFMeasure = getSoft2DCounts
+itemStep SANMI = id
+itemStep SALogLossHashed = id
+itemStep SALikelihoodHashed = id
+itemStep SACharMatch = id
+itemStep SAMAP = uncurry calculateMAPForOneResult
+itemStep SALogLoss = itemLogLossError
+itemStep SALikelihood = itemLogLossError
+itemStep SABIOF1 = uncurry gatherCountsForBIO
+itemStep SABIOF1Labels = uncurry gatherCountsForBIO
+itemStep SATokenAccuracy = countHitsAndTotals
+itemStep SAMAE = itemAbsoluteError
+itemStep SASMAPE = smape
+itemStep SAMultiLabelFMeasure = getCounts (==)
+itemStep SAMultiLabelLogLoss = uncurry countLogLossOnProbList
+itemStep SAMultiLabelLikelihood = uncurry countLogLossOnProbList
+
 
 doubleParser = getValue . TR.double
 
@@ -233,3 +299,63 @@ controlledParse parser t =
   case parseOnly parser t of
     (Right v) -> Right v
     (Left _) -> Left "cannot parse line"
+
+smape (exp, out) = (abs (exp-out)) `safeDoubleDiv` ((abs exp) + (abs out))
+
+hitOrMiss (exp, got) =
+  -- first try to parse what we got as a probability distribution
+  -- (like the one used for Likelikehood/LogLossHashed metric)
+  case parseWordSpecs got of
+    Right wordSpecs -> if Prelude.null pairs
+                      then 0.0
+                      else indicator (exp == (snd $ Prelude.maximum pairs))
+      where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
+    Left _ ->  indicator ((normalizeProbForAccuracy exp got) == exp)
+              -- if the expected value is 0 or 1 treat values
+              -- between 0.0 and 1.0 as probabilities
+              -- for the positive outcome
+  where  normalizeProbForAccuracy :: Text -> Text -> Text
+         normalizeProbForAccuracy exp got
+           | exp == (pack "1") = case tryReadingAsFloat got of
+                                  Just p -> if p >= 0.5 && p <= 1.0 then exp else got
+                                  Nothing -> got
+           | exp == (pack "0") = case tryReadingAsFloat got of
+                                  Just p -> if p < 0.5 && p >= 0.0 then exp else got
+                                  Nothing -> got
+           | otherwise = got
+         tryReadingAsFloat :: Text -> Maybe Float
+         tryReadingAsFloat = readMaybe . unpack
+
+getCount :: (Bool, Bool) -> (Int, Int, Int)
+getCount (True, True)   = (1, 1, 1)
+getCount (True, False)  = (0, 1, 0)
+getCount (False, True)  = (0, 0, 1)
+getCount (False, False) = (0, 0, 0)
+
+getClassesInvolved (Just a, Nothing) = (Nothing, Just a, Nothing)
+getClassesInvolved (Nothing, Just b) = (Nothing, Nothing, Just b) -- should not occur, for completeness
+getClassesInvolved (Just a, Just b) = if a == b
+                                      then (Just a, Just a, Just a)
+                                      else (Nothing, Just a, Just b)
+
+getSoftCounts (expected, got) = (weightedMaxMatch matchScore expected got,
+                                 Prelude.length expected,
+                                 Prelude.length got)
+
+getSoft2DCounts (expected, got) = (tpArea, expArea, gotArea)
+  where tpArea = coveredBy expected got
+        expArea = totalArea expected
+        gotArea = totalArea got
+
+countHitsAndTotals :: ([Text], [Text]) -> (Int, Int)
+countHitsAndTotals (es, os) =
+  if Prelude.length os /= Prelude.length es
+  then throw $ OtherException "wrong number of tokens"
+  else Prelude.foldl matchFun
+                     (0, 0)
+                     (Prelude.zip es os)
+  where  matchFun :: (Int, Int) -> (Text, Text) -> (Int, Int)
+         matchFun (h, t) (e, o)
+           | e == (pack "*") = (h, t)
+           | o `Prelude.elem` (splitOn (pack ";") e) = (h + 1, t + 1)
+           | otherwise = (h, t + 1)
diff --git a/src/GEval/Validation.hs b/src/GEval/Validation.hs
index ef2e981..a9b89e6 100644
--- a/src/GEval/Validation.hs
+++ b/src/GEval/Validation.hs
@@ -6,7 +6,7 @@ module GEval.Validation
 
 import GEval.Metric
 import GEval.EvaluationScheme
-import GEval.Core (GEvalSpecification(..), GEvalException(..), somethingWrongWithFilesMessage, isEmptyFile, geval, defaultInputFile, defaultExpectedFile, defaultOutFile)
+import GEval.Core (GEvalSpecification(..), isEmptyFile, geval, defaultInputFile, defaultExpectedFile, defaultOutFile)
 import GEval.Common
 import qualified System.Directory as D