Merge branch 'master' into bootstrap

2020-01-18 18:09:19 +01:00 · 2020-01-18 18:09:19 +01:00 · 608b1f9d73
commit 608b1f9d73
parent d83c1eeac1 68dc2fad0c
35 changed files with 642 additions and 80 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,39 @@
 ## 1.28.0.0
 * Add `s` flag for substitution
 ## 1.27.0.0
 * Results are formatted in cross-tables (if possible)
 ## 1.26.0.0
 * Change the meaning of WER (WER is calculated for the whole set now
  - similar to the way BLEU is calculated)
 * Use `Mean/WER` if you want the old meaning (average of per-item results)
 ## 1.25.0.0
 * Add --oracle-item-based
 ## 1.24.0.0
 * Introduce metric priorities
 * Use "Cartesian" strings in metrics
 ## 1.23.0.0
 * New style of train data is preferred
  - `in.tsv` and `expected.tsv` instead of `train.tsv`
  - though this is not required as sometimes training data look different than test data
  - `--validate` option was changed accordingly
 ## 1.22.1.0
 * Add "Mean/" meta-metric (for the time being working only with MultiLabel-F-measure)
 * Add :S flag
 ## 1.22.0.0
 * Add SegmentAccuracy
--- a/app/Main.hs
+++ b/app/Main.hs
@ -16,6 +16,8 @@ import System.Exit
 import Data.Conduit.SmartSource
 import Data.SplitIntoCrossTabs
 import System.FilePath
 import Data.List (intercalate, sort)
@ -23,6 +25,7 @@ import Data.List (intercalate, sort)
 import qualified Data.Text as T
 import Data.Map.Strict as M
 import qualified Data.Map.Lazy as LM
 import Data.Set as S
 main :: IO ()
@ -79,7 +82,27 @@ showTheResult' opts [val] = putStrLn $ formatTheResult (gesPrecision $ geoSpec o
 showTheResult' opts [] = do
  hPutStrLn stderr "no metric given, use --metric option"
  exitFailure
-showTheResult' opts vals =  mapM_ putStrLn $ Prelude.map (formatTheMetricAndResult (gesPrecision $ geoSpec opts)) $ zip (gesMetrics $ geoSpec opts) vals
+showTheResult' opts vals =  mapM_ putStrLn
                            $ intercalate [""]
                            $ Prelude.map (formatCrossTable (gesPrecision $ geoSpec opts))
                            $ splitIntoTablesWithValues (T.pack "metric") (T.pack "value") mapping metricLabels
  where mapping = LM.fromList $ zip metricLabels vals
        metricLabels = Prelude.map T.pack $ Prelude.map evaluationSchemeName $ gesMetrics $ geoSpec opts
 formatCrossTable :: Maybe Int -> TableWithValues MetricResult -> [String]
 formatCrossTable mPrecision (TableWithValues [_, _] body) =
  -- actually we won't print metric/value header
  -- (1) to keep backward-compatible with the previous version
  -- (2) to be concise
  Prelude.map (formatCrossTableLine mPrecision) body
 formatCrossTable mPrecision (TableWithValues header body) =
  (intercalate "\t" $ Prelude.map T.unpack header) : Prelude.map (formatCrossTableLine mPrecision) body
 formatCrossTableLine :: Maybe Int -> (T.Text, [MetricResult]) -> String
 formatCrossTableLine mPrecision (rowName, values) =
  intercalate "\t" ((T.unpack rowName):Prelude.map (formatTheResult mPrecision) values)
 formatSourceSpec :: SourceSpec -> String
 formatSourceSpec (FilePathSpec fp) = dropExtensions $ takeFileName fp
--- a/default.nix
+++ b/default.nix
@ -26,7 +26,7 @@ let
  stack2nix-script = import "${static-haskell-nix}/static-stack2nix-builder/stack2nix-script.nix" {
    inherit pkgs;
    stack-project-dir = toString ./.; # where stack.yaml is
-    hackageSnapshot = "2019-05-08T00:00:00Z"; # pins e.g. extra-deps without hashes or revisions
+    hackageSnapshot = "2020-01-03T00:00:00Z"; # pins e.g. extra-deps without hashes or revisions
  };
  static-stack2nix-builder = import "${static-haskell-nix}/static-stack2nix-builder/default.nix" {
--- a/geval.cabal
+++ b/geval.cabal
@ -1,5 +1,5 @@
 name:                geval
-version:             1.22.0.0
+version:             1.28.0.0
 synopsis:            Machine learning evaluation tools
 description:         Please see README.md
 homepage:            http://github.com/name/project
@ -49,6 +49,8 @@ library
                     , GEval.Selector
                     , Data.Statistics.Loess
                     , Data.Statistics.Calibration
                     , Data.CartesianStrings
                     , Data.SplitIntoCrossTabs
                     , Paths_geval
  build-depends:       base >= 4.7 && < 5
                     , cond
@ -102,6 +104,7 @@ library
                     , temporary
                     , utf8-string
                     , singletons
                     , ordered-containers
  default-language:    Haskell2010
 executable geval
--- a/src/Data/CartesianStrings.hs
+++ b/src/Data/CartesianStrings.hs
@ -0,0 +1,42 @@
 {-# LANGUAGE OverloadedStrings #-}
 module Data.CartesianStrings
       (parseCartesianString,
        concatCartesianStrings,
        CartesianStrings(..))
       where
 import Data.List (findIndex)
 import Data.List.Split (splitOn)
 -- A helper library for parsing strings representing sets of strings
 -- obtained via a Cartesian product, e.g.:
 -- - "foo" represents just ["foo"]
 -- - "a-{foo,bar,baz}-b" represents ["a-foo-b", "a-bar-b", "a-baz-b"]
 -- - "{foo,bar,baz}-{x,y}-{0,1,2}" represents a set containing 18 strings
 cartProd :: [a] -> [b] -> [(a, b)]
 cartProd xs ys = [(x,y) | x <- xs, y <- ys]
 parseCartesianString :: String -> [String]
 parseCartesianString s =
  case findIndex (=='{') s of
    Just begIx ->
      let pref = take begIx s
          c = drop (begIx + 1) s
      in case findIndex (=='}') c of
           Just endIx ->
             let inf = take endIx c
                 current = splitOn "," inf
                 rest = parseCartesianString $ drop (endIx + 1) c
             in map (uncurry (++)) $ cartProd (map (pref++) current) rest
    Nothing -> [s]
 data CartesianStrings a = CartesianStrings [a]
  deriving (Eq)
 instance Read a => Read (CartesianStrings a) where
  readsPrec _ s = [(CartesianStrings (map read $ parseCartesianString s), "")]
 concatCartesianStrings :: [CartesianStrings a] -> [a]
 concatCartesianStrings = concat . map (\(CartesianStrings ss) -> ss)
--- a/src/Data/SplitIntoCrossTabs.hs
+++ b/src/Data/SplitIntoCrossTabs.hs
@ -0,0 +1,148 @@
 {-# LANGUAGE OverloadedStrings #-}
 module Data.SplitIntoCrossTabs
       (splitIntoCrossTabs,
        splitIntoTablesWithValues,
        CrossTab(..),
        TableWithValues(..),
        TextFrag(..))
       where
 import qualified Data.Text as T
 import Data.Text (Text)
 import qualified Data.Set.Ordered as OS
 import qualified Data.Map.Ordered as OM
 import qualified Data.Set as S
 import qualified Data.Foldable as F
 import qualified Data.Map as M
 import qualified Data.Map.Lazy as LM
 import Debug.Trace
 import Data.List (unfoldr, sortBy, maximumBy, minimumBy)
 data TableWithValues a = TableWithValues [Text] [(Text, [a])]
 data CrossTab = SingleItem Text | CrossTab [TextFrag] [TextFrag]
  deriving (Show, Eq)
 data TextFrag = Prefix Text | Suffix Text
  deriving (Show, Eq, Ord)
 splitIntoTablesWithValues :: Text
                            -> Text
                            -> LM.Map Text a -- ^ map from which values will be taken,
                                            -- deliberately a lazy map so that
                                            -- values could be shown one by one
                            -> [Text]
                            -> [TableWithValues a]
 splitIntoTablesWithValues defaultMainHeader defaultSecondaryHeader mapping =
  joinSingleItems . map (convertIntoTableWithValues defaultMainHeader defaultSecondaryHeader mapping) . splitIntoCrossTabs
  where joinSingleItems (TableWithValues h@[_, _] arows : TableWithValues [_, _] brows : rest) =
          joinSingleItems (TableWithValues h (arows ++ brows) : rest)
        joinSingleItems (e : rest) = e : joinSingleItems rest
        joinSingleItems [] = []
 convertIntoTableWithValues :: Text -> Text -> LM.Map Text a -> CrossTab -> TableWithValues a
 convertIntoTableWithValues defaultMainHeader defaultSecondaryHeader mapping (SingleItem t) =
  TableWithValues [defaultMainHeader, defaultSecondaryHeader] [(t, [mapping LM.! t])]
 convertIntoTableWithValues defaultMainHeader defaultSecondaryHeader mapping (CrossTab rowNames columnNames) =
  TableWithValues (T.empty : (map toText columnNames)) (map processRow rowNames)
  where processRow rowName = (toText rowName, map (\colName -> mapping LM.! (combineFrags rowName colName)) columnNames)
 splitIntoCrossTabs :: [Text] -> [CrossTab]
 splitIntoCrossTabs inputs =
  map preferVertical
  $ map snd
  $ sortBy (\(r1,_) (r2, _) -> r1 `compare` r2)
  $ map (getRank inputRanks)
  $ unfoldr extractTheBestCrossTab inputs
  where inputRanks = M.fromList $ zip inputs [1..]
 preferVertical :: CrossTab -> CrossTab
 preferVertical s@(SingleItem _) = s
 preferVertical c@(CrossTab rowNames columnNames)
  | length rowNames < length columnNames = CrossTab columnNames rowNames
  | otherwise = c
 getRank :: M.Map Text Int -> CrossTab -> (Int, CrossTab)
 getRank ranks c = (bestRank, c)
  where bestRank = minimum
                   $ map (ranks M.!)
                   $ S.toList
                   $ toSet c
 extractTheBestCrossTab :: [Text] -> Maybe (CrossTab, [Text])
 extractTheBestCrossTab [] = Nothing
 extractTheBestCrossTab ts = Just (theBestCrossTab, rest)
  where theBestCrossTab = findTheBestCrossTab ts
        rest = filter (`S.notMember` (toSet theBestCrossTab)) ts
 findTheBestCrossTab :: [Text] -> CrossTab
 findTheBestCrossTab ts = case orderedEntries of
  [] -> SingleItem defaultSingleton
  _ -> maximumBy (\t1 t2 -> crossTabSize t1 `compare` crossTabSize t2)
      $ map (findTheBestCrossTabForTextPart (SingleItem defaultSingleton) orderedEntries)
      $ map snd orderedEntries
  where mapping = gatherTextParts ts
        orderedEntries = sortBy entryComparator
                         $ filter (\(_, (_, tset)) -> OS.size tset >= 2)
                         $ zip [1..] (OM.assocs mapping)
        (defaultSingleton:_) = ts
 thenCmp :: Ordering -> Ordering -> Ordering
 thenCmp EQ o2 = o2
 thenCmp o1 _  = o1
 entryComparator (r1, (_, s1)) (r2, (_, s2)) = (OS.size s2 `compare` OS.size s1)
                                                      `thenCmp`
                                                    (r1 `compare` r2)
 findTheBestCrossTabForTextPart :: CrossTab -> [(Int, (TextFrag, OS.OSet TextFrag))] -> (TextFrag, OS.OSet TextFrag) -> CrossTab
 findTheBestCrossTabForTextPart defaultCrossTab entries chosenEntry@(t, tset) = if crossTabSize bestCrossTabFound > 1
                                                                               then bestCrossTabFound
                                                                               else defaultCrossTab
  where bestCrossTabFound = foldr step (CrossTab [] (F.toList tset)) entriesOrderedByIntersection
        entriesOrderedByIntersection =
          sortBy entryComparator
          $ filter (\(_, (_, tset')) -> OS.size tset' >= 2)
          $ map (\(r, (t', tset')) -> (r, (t', tset' OS.|/\ tset))) entries
        step (_, (t', tset')) currentTab@(CrossTab frags common) = selectedTab
            where newTab = CrossTab newT (F.toList newCommon)
                  newT = t':frags
                  newCommon = (OS.fromList common) OS.|/\ tset'
                  selectedTab = if crossTabSize newTab >= crossTabSize currentTab
                                then newTab
                                else currentTab
 crossTabSize :: CrossTab -> Int
 crossTabSize (SingleItem _) = 1
 crossTabSize (CrossTab [] _) = 0
 crossTabSize (CrossTab _ []) = 0
 -- tables really start from 2x2
 crossTabSize (CrossTab [_] _) = 0
 crossTabSize (CrossTab _ [_]) = 0
 crossTabSize (CrossTab rows columns) = length rows * length columns
 toSet :: CrossTab -> S.Set Text
 toSet (SingleItem t) = S.singleton t
 toSet (CrossTab rowNames columnNames) = S.fromList [rName `combineFrags` cName | rName <- rowNames, cName <- columnNames]
 toText :: TextFrag -> Text
 toText (Prefix prefix) = T.stripEnd prefix
 toText (Suffix prefix) = T.stripStart prefix
 combineFrags :: TextFrag -> TextFrag -> Text
 combineFrags (Prefix prefix) (Suffix suffix) = prefix <> suffix
 combineFrags (Suffix suffix) (Prefix prefix) = prefix <> suffix
 combineFrags _ _ = error $ "incompatible text fragments"
 getTextParts :: Text -> [(TextFrag, TextFrag)]
 getTextParts t = [(Prefix (T.take ix t), Suffix (T.drop ix t)) | ix <- [1..(T.length t)-1]]
 gatherTextParts :: [Text] -> OM.OMap TextFrag (OS.OSet TextFrag)
 gatherTextParts = (gather OS.singleton (OS.|<>)) . concat . (map getTextParts)
 gather :: Ord a => (b -> c) -> (c -> c -> c) -> [(a, b)] -> OM.OMap a c
 gather createEntry combine = foldr extend OM.empty
  where extend (k, v) m = OM.unionWithL (\_ v1 v2 -> combine v1 v2) (OM.singleton (k, (createEntry v))) m
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -39,6 +39,7 @@ module GEval.Core
      checkMultipleOuts,
      checkMultipleOutsCore,
      gesMainMetric,
      gesMainScheme,
      gesPreprocess,
      getDataDecoder,
      threeLineSource,
@ -160,6 +161,7 @@ data GEvalSpecification = GEvalSpecification
                            gesTestName :: String,
                            gesSelector :: Maybe Selector,
                            gesOutFile :: String,
                            gesAltOutFiles :: Maybe [String],
                            gesExpectedFile :: String,
                            gesInputFile :: String,
                            gesMetrics :: [EvaluationScheme],
@ -177,6 +179,11 @@ gesMainMetric spec = case gesMetrics spec of
  (scheme:_) -> evaluationSchemeMetric scheme
  otherwise -> error "no metric given"
 gesMainScheme :: GEvalSpecification -> EvaluationScheme
 gesMainScheme spec = case gesMetrics spec of
  (scheme:_) -> scheme
  otherwise -> error "no metric given"
 gesPreprocess :: GEvalSpecification -> (Text -> Text)
 gesPreprocess spec = tokenizeTabSeparatedWithSpaces (gesTokenizer spec)
@ -191,6 +198,7 @@ data GEvalSpecialCommand = Init
                           | Diff FilePath | MostWorseningFeatures FilePath
                           | PrintVersion | JustTokenize | Submit
                           | Validate | ListMetrics
                           | OracleItemBased
 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest
@ -209,6 +217,7 @@ defaultGEvalSpecification = GEvalSpecification {
  gesTestName = defaultTestName,
  gesSelector = Nothing,
  gesOutFile = defaultOutFile,
  gesAltOutFiles = Nothing,
  gesExpectedFile = defaultExpectedFile,
  gesInputFile = defaultInputFile,
  gesMetrics = [EvaluationScheme defaultMetric []],
@ -453,6 +462,37 @@ gevalCoreOnSources CharMatch inputLineSource = helper inputLineSource
 gevalCoreOnSources (LogLossHashed nbOfBits) _ = helperLogLossHashed nbOfBits id
 gevalCoreOnSources (LikelihoodHashed nbOfBits) _ = helperLogLossHashed nbOfBits logLossToLikehood
 gevalCoreOnSources (Mean (MultiLabelFMeasure beta)) _
  = gevalCoreWithoutInputOnItemTargets (Right . intoWords)
                                       (Right . getWords)
                                       ((fMeasureOnCounts beta) . (getCounts (==)))
                                       averageC
                                       id
                                       noGraph
    where
      -- repeated as below, as it will be refactored into dependent types soon anyway
      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
      intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
      intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
 gevalCoreOnSources (Mean WER) _
  = gevalCoreWithoutInputOnItemTargets (Right . intoWords)
                                       (Right . getWords)
                                       ((uncurry (/.)) . (uncurry werStep))
                                       averageC
                                       id
                                       noGraph
    where
      -- repeated as below, as it will be refactored into dependent types soon anyway
      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
      intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
      intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
 gevalCoreOnSources (Mean _) _ = error $ "Mean/ meta-metric defined only for MultiLabel-F1 and WER for the time being"
 -- only MultiLabel-F1 handled for JSONs for the time being...
 gevalCoreOnSources (MultiLabelFMeasure beta) _ = gevalCoreWithoutInputOnItemTargets (Right . intoWords)
                                                                            (Right . getWords)
@ -511,9 +551,12 @@ gevalCoreOnSources BLEU _ = gevalCoreWithoutInput SABLEU bleuAgg bleuFinal noGra
 gevalCoreOnSources GLEU _ = gevalCoreWithoutInput SAGLEU gleuAgg gleuFinal noGraph
  where gleuFinal (m, t) = m /. t
        gleuAgg = CC.foldl gleuFuse (0, 0)
-        gleuFuse (a1, a2) (b1, b2) = (a1+b1, a2+b2)
+        gleuFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
-gevalCoreOnSources WER _ = gevalCoreWithoutInput SAWER averageC id noGraph
+gevalCoreOnSources WER _ = gevalCoreWithoutInput SAWER werAgg werFinal noGraph
  where werAgg = CC.foldl werFuse (0, 0)
        werFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
        werFinal (errors, ref) = errors /. ref
 gevalCoreOnSources Accuracy _ = gevalCoreWithoutInput SAAccuracy averageC id noGraph
@ -753,7 +796,10 @@ continueGEvalCalculations SAGLEU GLEU = defineContinuation gleuAgg gleuFinal noG
        gleuAgg = CC.foldl gleuFuse (0, 0)
        gleuFuse (a1, a2) (b1, b2) = (a1+b1, a2+b2)
-continueGEvalCalculations SAWER WER = defineContinuation averageC id noGraph
+continueGEvalCalculations SAWER WER = defineContinuation werAgg werFinal noGraph
  where werAgg = CC.foldl werFuse (0, 0)
        werFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
        werFinal (errors, ref) = errors /. ref
 continueGEvalCalculations SAAccuracy Accuracy = defineContinuation averageC id noGraph
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@ -19,6 +19,9 @@ import Control.Exception
 import Control.Monad.Trans.Resource
 import Data.String.Here
 import Data.List (intercalate)
 import Data.List.Split (splitOn)
 createChallenge :: Bool -> FilePath -> GEvalSpecification -> IO ()
 createChallenge withDataFiles expectedDirectory spec = do
  D.createDirectoryIfMissing False expectedDirectory
@ -31,7 +34,7 @@ createChallenge withDataFiles expectedDirectory spec = do
  if withDataFiles
    then
     do
-      createFile (trainDirectory </> "train.tsv") $ trainContents metric
+      createTrainFiles metric trainDirectory expectedFile
      createFile (devDirectory </> "in.tsv") $ devInContents metric
      createFile (devDirectory </> expectedFile) $ devExpectedContents metric
@ -50,12 +53,23 @@ createChallenge withDataFiles expectedDirectory spec = do
        testDirectory = expectedDirectory </> testName
        expectedFile = gesExpectedFile spec
 createTrainFiles :: Metric -> FilePath -> FilePath -> IO ()
 createTrainFiles metric@(LogLossHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
 createTrainFiles metric@(LikelihoodHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
 createTrainFiles metric trainDirectory expectedFile = do
  createFile (trainDirectory </> "in.tsv") $ trainInContents metric
  createFile (trainDirectory </> expectedFile) $ trainExpectedContents metric
 createSingleTrainFile metric trainDirectory =
  createFile (trainDirectory </> "train.tsv") $ trainContents metric
 createFile :: FilePath -> String -> IO ()
 createFile filePath contents = do
  whenM (D.doesFileExist filePath) $ throwM $ FileAlreadyThere filePath
  writeFile filePath contents
 readmeMDContents :: Metric -> String -> String
 readmeMDContents (Mean metric) testName = readmeMDContents metric testName
 readmeMDContents GLEU testName = readmeMDContents BLEU testName
 readmeMDContents BLEU testName = [i|
 GEval sample machine translation challenge
@ -413,7 +427,22 @@ configContents schemes precision testName = unwords (Prelude.map (\scheme -> ("-
    where precisionOpt Nothing = ""
          precisionOpt (Just p) = " --precision " ++ (show p)
 -- Originally train content was in one file, to avoid large changes
 -- for the time being we are using the original function.
 trainInContents :: Metric -> String
 trainInContents metric = unlines
                         $ map (intercalate "\t")
                         $ map tail
                         $ map (splitOn "\t")
                         $ lines
                         $ trainContents metric
 trainExpectedContents :: Metric -> String
 trainExpectedContents metric = unlines $ map head $ map (splitOn "\t") $ lines $ trainContents metric
 trainContents :: Metric -> String
 trainContents (Mean metric) = trainContents metric
 trainContents GLEU = trainContents BLEU
 trainContents BLEU = [hereLit|alussa loi jumala taivaan ja maan	he mea hanga na te atua i te timatanga te rangi me te whenua
 ja maa oli autio ja tyhjä , ja pimeys oli syvyyden päällä	a kahore he ahua o te whenua , i takoto kau ; he pouri ano a runga i te mata o te hohonu
@ -482,7 +511,7 @@ trainContents LogLoss = [hereLit|0.0	Hell, no!!!
 trainContents BIOF1Labels = trainContents BIOF1
 trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND	My name is Bond , James Bond
 O O O O O	There is no name here
-B-firstname/JOHN I-surname/VON I-surname/NEUMANN	John von Nueman
+B-firstname/JOHN B-surname/VON I-surname/NEUMANN	John von Nueman
 |]
 trainContents TokenAccuracy = [hereLit|* V N	I like cats
 * * V * N	I can see the rainbow
@ -500,10 +529,12 @@ Love and hate	LOVE HATE
 I am sad	SADNESS
 I am so sad and hateful	SADNESS HATE
 |]
-trainContents (Soft2DFMeasure _) = trainContents ClippEU
+trainContents (Soft2DFMeasure _) = [hereLit|2/0,0,10,150	foo.djvu
 trainContents ClippEU = [hereLit|2/0,0,10,150	foo.djvu
 1/30,40,100,1000	bar.djvu
 |]
 trainContents ClippEU = [hereLit|1/30,40,100,1000/10	bar.djvu
 2/30,40,500,600/10	foo.djvu
 |]
 trainContents _ = [hereLit|0.06	0.39	0	0.206
 1.00	1.00	1	0.017
 317.8	5.20	67	0.048
@ -511,6 +542,7 @@ trainContents _ = [hereLit|0.06        0.39    0       0.206
 |]
 devInContents :: Metric -> String
 devInContents (Mean metric) = devInContents metric
 devInContents GLEU = devInContents BLEU
 devInContents BLEU = [hereLit|ja jumala sanoi : " tulkoon valkeus " , ja valkeus tuli
 ja jumala näki , että valkeus oli hyvä ; ja jumala erotti valkeuden pimeydestä
@ -578,6 +610,7 @@ devInContents _ = [hereLit|0.72	0	0.007
 |]
 devExpectedContents :: Metric -> String
 devExpectedContents (Mean metric) = devExpectedContents metric
 devExpectedContents GLEU = devExpectedContents BLEU
 devExpectedContents BLEU = [hereLit|a ka ki te atua , kia marama : na ka marama
 a ka kite te atua i te marama , he pai : a ka wehea e te atua te marama i te pouri
@ -647,6 +680,7 @@ devExpectedContents _ = [hereLit|0.82
 |]
 testInContents :: Metric -> String
 testInContents (Mean metric) = testInContents metric
 testInContents GLEU = [hereLit|Alice has a black
 |]
 testInContents BLEU = [hereLit|ja jumala kutsui valkeuden päiväksi , ja pimeyden hän kutsui yöksi
@ -717,6 +751,7 @@ testInContents _ = [hereLit|0.72	0	0.007
 |]
 testExpectedContents :: Metric -> String
 testExpectedContents (Mean metric) = testExpectedContents metric
 testExpectedContents BLEU = [hereLit|na ka huaina e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
 a ko te ahiahi , ko te ata , he ra kotahi
 |]
--- a/src/GEval/EvaluationScheme.hs
+++ b/src/GEval/EvaluationScheme.hs
@ -1,22 +1,33 @@
 module GEval.EvaluationScheme
-  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName, PreprocessingOperation(..))
+  (EvaluationScheme(..),
   evaluationSchemeMetric,
   applyPreprocessingOperations,
   evaluationSchemeName,
   evaluationSchemePriority,
   PreprocessingOperation(..))
  where
 import GEval.Metric
 import Text.Regex.PCRE.Heavy
 import Text.Regex.PCRE.Light.Base (Regex(..))
-import Data.Text (Text(..), concat, toLower, toUpper, pack, unpack)
+import Data.Text (Text(..), concat, toLower, toUpper, pack, unpack, words, unwords)
-import Data.List (intercalate, break)
+import Data.List (intercalate, break, sort)
 import Data.Either
-import Data.Maybe (fromMaybe)
+import Data.Maybe (fromMaybe, catMaybes)
 import qualified Data.ByteString.UTF8 as BSU
 data EvaluationScheme = EvaluationScheme Metric [PreprocessingOperation]
  deriving (Eq)
-data PreprocessingOperation = RegexpMatch Regex | LowerCasing | UpperCasing | SetName Text
+data PreprocessingOperation = RegexpMatch Regex
                              | LowerCasing
                              | UpperCasing
                              | Sorting
                              | SetName Text
                              | SetPriority Int
                              | RegexpSubstition Regex Text
  deriving (Eq)
 leftParameterBracket :: Char
@ -39,16 +50,37 @@ readOps ('l':theRest) = (LowerCasing:ops, theRest')
 readOps ('u':theRest) = (UpperCasing:ops, theRest')
    where (ops, theRest') = readOps theRest
 readOps ('m':theRest) = handleParametrizedOp (RegexpMatch . (fromRight undefined) . ((flip compileM) []) . BSU.fromString) theRest
 readOps ('S':theRest) = (Sorting:ops, theRest')
    where (ops, theRest') = readOps theRest
 readOps ('N':theRest) = handleParametrizedOp (SetName . pack) theRest
 readOps ('P':theRest) = handleParametrizedOp (SetPriority . read) theRest
 readOps ('s':theRest) = handleParametrizedBinaryOp (\a b -> RegexpSubstition (fromRight undefined $ compileM (BSU.fromString a) []) (pack b)) theRest
 readOps s = ([], s)
 handleParametrizedOp :: (String -> PreprocessingOperation) -> String -> ([PreprocessingOperation], String)
-handleParametrizedOp constructor (leftParameterBracket:theRest) =
+handleParametrizedOp constructor theRest =
-  case break (== rightParameterBracket) theRest of
+  case parseParameter theRest of
-    (s, []) -> ([], s)
+    (Nothing, s) -> ([], s)
-    (param, (_:theRest')) -> let (ops, theRest'') = readOps theRest'
+    (Just param, theRest') -> let (ops, theRest'') = readOps theRest'
                             in ((constructor param):ops, theRest'')
-handleParametrizedOp _ s = ([], s)
+
 handleParametrizedBinaryOp :: (String -> String -> PreprocessingOperation) -> String -> ([PreprocessingOperation], String)
 handleParametrizedBinaryOp constructor theRest =
  case parseParameter theRest of
    (Nothing, s) -> ([], s)
    (Just paramA, theRest') ->
      case parseParameter theRest' of
        (Nothing, s) -> ([], s)
        (Just paramB, theRest'') -> let (ops, theRest''') = readOps theRest''
                                   in ((constructor paramA paramB):ops, theRest''')
 parseParameter :: String -> (Maybe String, String)
 parseParameter (leftParameterBracket:theRest) =
  case break (== rightParameterBracket) theRest of
    (s, []) -> (Nothing, s)
    (param, (_:theRest')) -> (Just param, theRest')
 parseParameter s = (Nothing, s)
 instance Show EvaluationScheme where
  show (EvaluationScheme metric operations) = (show metric) ++ (if null operations
@ -58,10 +90,21 @@ instance Show EvaluationScheme where
 evaluationSchemeName :: EvaluationScheme -> String
 evaluationSchemeName scheme@(EvaluationScheme metric operations) = fromMaybe (show scheme) (findNameSet operations)
 evaluationSchemePriority scheme@(EvaluationScheme _ operations) = fromMaybe defaultPriority (findPrioritySet operations)
  where defaultPriority = 1
 findNameSet :: [PreprocessingOperation] -> Maybe String
-findNameSet [] = Nothing
+findNameSet ops = case names of
-findNameSet ((SetName name):_) = Just (unpack name)
+  [] -> Nothing
-findNameSet (_:ops) = findNameSet ops
+  _ -> Just $ intercalate " " names
  where names = catMaybes $ map extractName ops
        extractName (SetName n) = Just (unpack n)
        extractName _ = Nothing
 findPrioritySet :: [PreprocessingOperation] -> Maybe Int
 findPrioritySet [] = Nothing
 findPrioritySet ((SetPriority p):_) = Just p
 findPrioritySet (_:ops) = findPrioritySet ops
 evaluationSchemeMetric :: EvaluationScheme -> Metric
 evaluationSchemeMetric (EvaluationScheme metric _) = metric
@ -70,10 +113,31 @@ instance Show PreprocessingOperation where
  show (RegexpMatch (Regex _ regexp)) = parametrizedOperation "m" (BSU.toString regexp)
  show LowerCasing = "l"
  show UpperCasing = "u"
  show Sorting = "S"
  show (SetName t) = parametrizedOperation "N" (unpack t)
  show (SetPriority p) = parametrizedOperation "P" (show p)
  show (RegexpSubstition (Regex _ regexp) s) = "s" ++ (formatParameter $ BSU.toString regexp) ++ (formatParameter $ unpack s)
 applySubstitution :: Regex -> Text -> Text -> Text
 applySubstitution r substitution t =
  gsub r (handleRefs substitution) t
 handleRefs :: Text -> Text -> [Text] -> Text
 handleRefs substitution mainMatch subMatches = gsub refRegexp handleRef substitution
  where Right refRegexp = compileM (BSU.fromString "\\\\\\d+") []
        indexables = mainMatch : subMatches
        handleRef :: Text -> Text
        handleRef ref =
          let ix = (read $ tail $ unpack ref)
          in if ix >= length indexables
             then (pack "")
             else indexables !! ix
 parametrizedOperation :: String -> String -> String
-parametrizedOperation opCode opArg = opCode ++ [leftParameterBracket] ++ opArg ++ [rightParameterBracket]
+parametrizedOperation opCode opArg = opCode ++ (formatParameter opArg)
 formatParameter :: String -> String
 formatParameter p = [leftParameterBracket] ++ p ++ [rightParameterBracket]
 applyPreprocessingOperations :: EvaluationScheme -> Text -> Text
 applyPreprocessingOperations (EvaluationScheme _ operations) t = foldl (flip applyPreprocessingOperation) t operations
@ -82,4 +146,7 @@ applyPreprocessingOperation :: PreprocessingOperation -> Text -> Text
 applyPreprocessingOperation (RegexpMatch regex) = Data.Text.concat . (map fst) . (scan regex)
 applyPreprocessingOperation LowerCasing = toLower
 applyPreprocessingOperation UpperCasing = toUpper
 applyPreprocessingOperation Sorting = Data.Text.unwords . sort . Data.Text.words
 applyPreprocessingOperation (SetName _) = id
 applyPreprocessingOperation (SetPriority _) = id
 applyPreprocessingOperation (RegexpSubstition regex substition) = applySubstitution regex substition
--- a/src/GEval/LineByLine.hs
+++ b/src/GEval/LineByLine.hs
@ -16,11 +16,14 @@ module GEval.LineByLine
        runDiffGeneralized,
        LineRecord(..),
        ResultOrdering(..),
-        justTokenize
+        justTokenize,
        worstFeaturesPipeline,
        runOracleItemBased
       ) where
 import GEval.Core
 import GEval.Common
 import GEval.EvaluationScheme
 import Text.Tokenizer
 import Data.Conduit.AutoDecompress (doNothing)
@ -33,10 +36,11 @@ import Data.Text
 import Data.Text.Encoding
 import Data.Conduit.Rank
 import Data.Maybe (fromMaybe)
 import Data.Either (rights)
 import qualified Data.Vector as V
-import Data.List (sortBy, sortOn, sort, concat)
+import Data.List (sortBy, sortOn, sort, concat, maximumBy)
 import Control.Monad.IO.Class
 import Control.Monad.Trans.Resource
@ -83,7 +87,6 @@ parseReferenceEntry :: Text -> (Integer, Text)
 parseReferenceEntry line = (read $ unpack refId, t)
  where [refId, t] = splitOn "\t" line
 runLineByLine :: ResultOrdering -> Maybe String -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
 runLineByLine ordering featureFilter spec bbdo = runLineByLineGeneralized ordering spec consum
   where consum :: Maybe References -> ConduitT LineRecord Void (ResourceT IO) ()
@ -107,28 +110,26 @@ runFeatureFilter (Just feature) spec bbdo mReferences = CC.map (\l -> (fakeRank,
        checkFeature feature (_, LineWithFactors _ _ fs) = feature `elem` (Prelude.map show fs)
 runWorstFeatures :: ResultOrdering -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
-runWorstFeatures ordering spec bbdo = runLineByLineGeneralized ordering' spec (\mReferences -> worstFeaturesPipeline False spec bbdo mReferences)
+runWorstFeatures ordering spec bbdo = runLineByLineGeneralized ordering' spec (\mReferences -> worstFeaturesPipeline False spec bbdo mReferences consumFeatures)
  where ordering' = forceSomeOrdering ordering
 consumFeatures = CL.map (encodeUtf8 . formatFeatureWithPValue)
                 .| CC.unlinesAscii
                 .| CC.stdout
-
+worstFeaturesPipeline :: Bool
-worstFeaturesPipeline :: Bool -> GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT LineRecord Void (ResourceT IO) ()
+                        -> GEvalSpecification
-worstFeaturesPipeline reversed spec bbdo mReferences = rank (lessByMetric reversed $ gesMainMetric spec)
+                        -> BlackBoxDebuggingOptions
                        -> Maybe References
                        -> ConduitT FeatureWithPValue Void (ResourceT IO) ()
                        -> ConduitT LineRecord Void (ResourceT IO) ()
 worstFeaturesPipeline reversed spec bbdo mReferences consum = rank (lessByMetric reversed $ gesMainMetric spec)
                                      .| evalStateC 0 (extractFeaturesAndPValues spec bbdo mReferences)
                                      .| CC.filter (\(FeatureWithPValue _ p _ _) -> not $ isNaN p) -- NaN values would poison sorting
                                      .| gobbleAndDo (sortBy featureOrder)
                                      .| filtreCartesian (bbdoCartesian bbdo)
-                                      .| CL.map (encodeUtf8 . formatFeatureWithPValue)
+                                      .| consum
-                                      .| CC.unlinesAscii
+  where  featureOrder (FeatureWithPValue _ p1 _ _) (FeatureWithPValue _ p2 _ _) =
                                      .| CC.stdout
  where  formatOutput (LineRecord inp exp out _ score) = Data.Text.intercalate "\t" [
           formatScore score,
           escapeTabs inp,
           escapeTabs exp,
           escapeTabs out]
         formatScore :: MetricValue -> Text
         formatScore = Data.Text.pack . printf "%f"
         featureOrder (FeatureWithPValue _ p1 _ _) (FeatureWithPValue _ p2 _ _) =
           p1 `compare` p2
 -- for commands like --worst-features we need some ordering (KeepTheOriginalOrder
@ -359,8 +360,9 @@ runLineByLineGeneralized ordering spec consum = do
  (inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
  gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
  where metric = gesMainMetric spec
        scheme = gesMainScheme spec
        mSelector = gesSelector spec
-        preprocess = gesPreprocess spec
+        preprocess = (gesPreprocess spec) . (applyPreprocessingOperations scheme)
        sorter KeepTheOriginalOrder = doNothing
        sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
        sortOrder FirstTheWorst TheHigherTheBetter = compareScores
@ -393,6 +395,31 @@ runDiff ordering featureFilter otherOut spec bbdo = runDiffGeneralized ordering
        formatScoreDiff :: Double -> Text
        formatScoreDiff = Data.Text.pack . printf "%f"
 runOracleItemBased :: GEvalSpecification -> IO ()
 runOracleItemBased spec = runMultiOutputGeneralized spec consum
  where consum = CL.map picker .| format
        picker = maximumBy (\(LineRecord _ _ _ _ scoreA) (LineRecord _ _ _ _ scoreB) -> metricCompare metric scoreA scoreB)
        format = CL.map (encodeUtf8 . formatOutput)
                 .| CC.unlinesAscii
                 .| CC.stdout
        formatOutput (LineRecord _ _ out _ _) = out
        metric = gesMainMetric spec
 runMultiOutputGeneralized :: GEvalSpecification -> ConduitT [LineRecord] Void (ResourceT IO) () -> IO ()
 runMultiOutputGeneralized spec consum = do
  (inputSource, expectedSource, outSource) <- checkAndGetFilesSingleOut True spec
  let (Just altOuts) = gesAltOutFiles spec
  altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
  let altSourceSpecs = rights altSourceSpecs'
  let sourceSpecs = (outSource:altSourceSpecs)
  let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess inputSource expectedSource) sourceSpecs
  runResourceT $ runConduit $
    (sequenceSources sources .| consum)
  where metric = gesMainMetric spec
        scheme = gesMainScheme spec
        preprocess = (gesPreprocess spec) . (applyPreprocessingOperations scheme)
        mSelector = gesSelector spec
 runMostWorseningFeatures :: ResultOrdering -> FilePath -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
 runMostWorseningFeatures ordering otherOut spec bbdo  = runDiffGeneralized ordering' otherOut spec consum
  where ordering' = forceSomeOrdering ordering
@ -402,7 +429,7 @@ runMostWorseningFeatures ordering otherOut spec bbdo  = runDiffGeneralized order
          FirstTheBest -> True
        consum :: Maybe References -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
        consum = \mReferences -> CC.map prepareFakeLineRecord
-                                .| (worstFeaturesPipeline reversed spec bbdo mReferences)
+                                .| (worstFeaturesPipeline reversed spec bbdo mReferences consumFeatures)
        prepareFakeLineRecord :: (LineRecord, LineRecord) -> LineRecord
        prepareFakeLineRecord (LineRecord _ _ _ _ scorePrev, LineRecord inp exp out c score) =
          LineRecord inp exp out c (score - scorePrev)
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@ -8,7 +8,8 @@ module GEval.Metric
   bestPossibleValue,
   perfectOutLineFromExpectedLine,
   fixedNumberOfColumnsInExpected,
-   fixedNumberOfColumnsInInput)
+   fixedNumberOfColumnsInInput,
   metricCompare)
  where
 import Data.Word
@ -28,7 +29,12 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
              | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
              | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
              | MultiLabelLogLoss | MultiLabelLikelihood
-              | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double | ProbabilisticSoftFMeasure Double | Soft2DFMeasure Double
+              | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double
              | ProbabilisticSoftFMeasure Double | Soft2DFMeasure Double
              -- it would be better to avoid infinite recursion here
              -- `Mean (Mean BLEU)` is not useful, but as it would mean
              -- a larger refactor, we will postpone this
              | Mean Metric
              deriving (Eq)
 instance Show Metric where
@ -73,8 +79,12 @@ instance Show Metric where
  show (MultiLabelFMeasure beta) = "MultiLabel-F" ++ (show beta)
  show MultiLabelLogLoss = "MultiLabel-Logloss"
  show MultiLabelLikelihood = "MultiLabel-Likelihood"
  show (Mean metric) = "Mean/" ++ (show metric)
 instance Read Metric where
  readsPrec p ('M':'e':'a':'n':'/':theRest) = case readsPrec p theRest of
    [(metric, theRest)] -> [(Mean metric, theRest)]
    _ -> []
  readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
  readsPrec _ ('M':'S':'E':theRest) = [(MSE, theRest)]
  readsPrec _ ('P':'e':'a':'r':'s':'o':'n':theRest) = [(Pearson, theRest)]
@ -162,6 +172,12 @@ getMetricOrdering SMAPE = TheLowerTheBetter
 getMetricOrdering (MultiLabelFMeasure _) = TheHigherTheBetter
 getMetricOrdering MultiLabelLogLoss = TheLowerTheBetter
 getMetricOrdering MultiLabelLikelihood = TheHigherTheBetter
 getMetricOrdering (Mean metric) = getMetricOrdering metric
 metricCompare :: Metric -> MetricValue -> MetricValue -> Ordering
 metricCompare metric a b = metricCompare' (getMetricOrdering metric) a b
  where metricCompare' TheHigherTheBetter a b = a `compare` b
        metricCompare' TheLowerTheBetter a b = b `compare` a
 bestPossibleValue :: Metric -> MetricValue
 bestPossibleValue metric = case getMetricOrdering metric of
@ -169,18 +185,21 @@ bestPossibleValue metric = case getMetricOrdering metric of
  TheHigherTheBetter -> 1.0
 fixedNumberOfColumnsInExpected :: Metric -> Bool
 fixedNumberOfColumnsInExpected (Mean metric) = fixedNumberOfColumnsInExpected metric
 fixedNumberOfColumnsInExpected MAP = False
 fixedNumberOfColumnsInExpected BLEU = False
 fixedNumberOfColumnsInExpected GLEU = False
 fixedNumberOfColumnsInExpected _ = True
 fixedNumberOfColumnsInInput :: Metric -> Bool
 fixedNumberOfColumnsInInput (Mean metric) = fixedNumberOfColumnsInInput metric
 fixedNumberOfColumnsInInput (SoftFMeasure _) = False
 fixedNumberOfColumnsInInput (ProbabilisticSoftFMeasure _) = False
 fixedNumberOfColumnsInInput (Soft2DFMeasure _) = False
 fixedNumberOfColumnsInInput _ = True
 perfectOutLineFromExpectedLine :: Metric -> Text -> Text
 perfectOutLineFromExpectedLine (Mean metric) t = perfectOutLineFromExpectedLine metric t
 perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0"
 perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0"
 perfectOutLineFromExpectedLine BLEU t = getFirstColumn t
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@ -222,6 +222,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
  ItemIntermediateRepresentationType ALogLossHashed = (Text, Text)
  ItemIntermediateRepresentationType ALikelihoodHashed = (Text, Text)
  ItemIntermediateRepresentationType ACharMatch = (Text, Text)
  ItemIntermediateRepresentationType AWER = (Int, Int)
  ItemIntermediateRepresentationType t = Double
 itemStep :: SAMetric t -> (ParsedExpectedType t, ParsedOutputType t) -> ItemIntermediateRepresentationType t
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@ -48,6 +48,7 @@ listOfAvailableMetrics = [RMSE,
                          MultiLabelFMeasure 1.0,
                          MultiLabelFMeasure 2.0,
                          MultiLabelFMeasure 0.25,
                          Mean (MultiLabelFMeasure 1.0),
                          ProbabilisticMultiLabelFMeasure 1.0,
                          ProbabilisticMultiLabelFMeasure 2.0,
                          ProbabilisticMultiLabelFMeasure 0.25,
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@ -39,6 +39,7 @@ import GEval.Validation
 import Data.List (intercalate)
 import Data.Conduit.SmartSource
 import Data.CartesianStrings
 fullOptionsParser = info (helper <*> optionsParser)
       (fullDesc
@ -94,6 +95,10 @@ optionsParser = GEvalOptions
                (flag' ListMetrics
                    ( long "list-metrics"
                      <> help "List all metrics with their descriptions"))
                <|>
                (flag' OracleItemBased
                    ( long "oracle-item-based"
                      <> help "Generate the best possible output considering outputs given by --out-file and --alt-out-file options (and peeking into the expected file)."))
                )
   <*> ((flag' FirstTheWorst
@ -151,6 +156,10 @@ specParser = GEvalSpecification
    <> showDefault
    <> metavar "OUT"
    <> help "The name of the file to be evaluated" )
  <*> (optional $ some $ strOption
        ( long "alt-out-file"
          <> metavar "OUT"
          <> help "Alternative output file, makes sense only for some options, e.g. --oracle-item-based"))
  <*> strOption
  ( long "expected-file"
    <> short 'e'
@ -249,14 +258,12 @@ sel (Just m) _ = m
 metricReader :: Parser [EvaluationScheme]
-metricReader = many $ option auto         -- actually `some` should be used instead of `many`, the problem is that
+metricReader = concatCartesianStrings <$>
  (many $ option auto         -- actually `some` should be used instead of `many`, the problem is that
    ( long "metric"           -- --metric might be in the config.txt file...
      <> short 'm'
      <> metavar "METRIC"
-                 <> help ("Metric to be used, e.g.:" ++ helpMetricParameterMetricsList))
+      <> help ("Metric to be used, e.g.:" ++ helpMetricParameterMetricsList)))
 --                   RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
 altMetricReader :: Parser (Maybe EvaluationScheme)
 altMetricReader = optional $ option auto
@ -362,6 +369,9 @@ runGEval''' (Just Validate) _ _ spec _ _ = do
 runGEval''' (Just ListMetrics) _ _ _ _ _ = do
  listMetrics
  return Nothing
 runGEval''' (Just OracleItemBased) _ _ spec _ _ = do
  runOracleItemBased spec
  return Nothing
 getGraphFilename :: Int -> FilePath -> FilePath
 getGraphFilename 0 fp = fp
--- a/src/GEval/Validation.hs
+++ b/src/GEval/Validation.hs
@ -76,16 +76,13 @@ validationChallenge challengeDirectory spec = do
  checkCorrectFile gitignoreFile
  checkCorrectFile readmeFile
  testDirectories <- findTestDirs challengeDirectory
-  checkTestDirectories mainMetric testDirectories
+  checkTestDirectories spec testDirectories
-  checkTrainDirectory mainMetric challengeDirectory
+  checkTrainDirectory spec challengeDirectory
  mapM_ (runOnTest spec) testDirectories
  where
    configFile = challengeDirectory </> "config.txt"
    gitignoreFile = challengeDirectory </> ".gitignore"
    readmeFile = challengeDirectory </> "README.md"
    mainMetric = evaluationSchemeMetric $ head $ gesMetrics spec
 checkCorrectFile :: FilePath -> IO ()
 checkCorrectFile filePath = do
@ -147,7 +144,8 @@ never :: FindClause Bool
 never = depth ==? 0
 testDirFilter :: FindClause Bool
-testDirFilter = (SFF.fileType ==? Directory) &&? (SFF.fileName ~~? "dev-*" ||? SFF.fileName ~~? "test-*")
+testDirFilter = (SFF.fileType ==? Directory) &&? (SFF.fileName ~~? "dev-*"
                                                  ||? SFF.fileName ~~? "test-*")
 fileFilter :: String -> FindClause Bool
 fileFilter fileName = (SFF.fileType ==? RegularFile) &&? (SFF.fileName ~~? fileName ||? SFF.fileName ~~? fileName ++ exts)
@ -155,12 +153,12 @@ fileFilter fileName = (SFF.fileType ==? RegularFile) &&? (SFF.fileName ~~? fileN
    exts = Prelude.concat [ "(", intercalate "|" compressedFilesHandled, ")" ]
-checkTestDirectories :: Metric -> [FilePath] -> IO ()
+checkTestDirectories :: GEvalSpecification -> [FilePath] -> IO ()
 checkTestDirectories _ [] = throwM NoTestDirectories
-checkTestDirectories metric directories = mapM_ (checkTestDirectory metric) directories
+checkTestDirectories spec directories = mapM_ (checkTestDirectory spec) directories
-checkTestDirectory :: Metric -> FilePath -> IO ()
+checkTestDirectory :: GEvalSpecification -> FilePath -> IO ()
-checkTestDirectory metric directoryPath = do
+checkTestDirectory spec directoryPath = do
  inputFiles <- findInputFiles directoryPath
  when (null inputFiles) $ throw $ NoInputFile inputFile
  when (length inputFiles > 1) $ throw $ TooManyInputFiles inputFiles
@ -180,21 +178,29 @@ checkTestDirectory metric directoryPath = do
  outputFiles <- findOutputFiles directoryPath
  unless (null outputFiles) $ throw $ OutputFileDetected outputFiles
  runOnTest spec directoryPath
  where
    metric = evaluationSchemeMetric $ head $ gesMetrics spec
    inputFile = directoryPath </> defaultInputFile
    expectedFile = directoryPath </> defaultExpectedFile
-checkTrainDirectory :: Metric -> FilePath -> IO ()
+checkTrainDirectory :: GEvalSpecification -> FilePath -> IO ()
-checkTrainDirectory metric challengeDirectory = do
+checkTrainDirectory spec challengeDirectory = do
  let trainDirectory = challengeDirectory </> "train"
  whenM (doesDirectoryExist trainDirectory) $ do
    trainFiles <- findTrainFiles trainDirectory
-    when (null trainFiles) $ throw $ NoInputFile "train.tsv"
+    if (not $ null trainFiles)
-    when (length trainFiles > 1) $ throw $ TooManyTrainFiles trainFiles
+    then
-    let [trainFile] = trainFiles
+     do
-    checkCorrectFile trainFile
+      putStrLn "WARNING: Found old-style train file `train.tsv`, whereas the same convention as in"
-    when (fixedNumberOfColumnsInInput metric && fixedNumberOfColumnsInExpected metric) $ do
+      putStrLn "WARNING: test directories if preferred (`in.tsv` and `expected.tsv`)."
-      checkColumns trainFile
+      putStrLn "WARNING: (Though, there might still be some cases when `train.tsv` is needed, e.g. for training LMs.)"
    else
     do
      runOnTest spec trainDirectory
 checkColumns :: FilePath -> IO ()
 checkColumns filePath = do
--- a/src/GEval/WER.hs
+++ b/src/GEval/WER.hs
@ -5,8 +5,8 @@ module GEval.WER
 import Data.Array
 import GEval.Common
-werStep :: Eq a => [a] -> [a] -> Double
+werStep :: Eq a => [a] -> [a] -> (Int, Int)
-werStep expected got = (fromIntegral $ distance expected got) `safeDoubleDiv` (fromIntegral $ length expected)
+werStep expected got = (distance expected got, length expected)
 -- see https://stackoverflow.com/questions/6718787/levenshtein-distance-cost
 distance u v = memo ! (m, n)
--- a/stack.yaml
+++ b/stack.yaml
@ -1,5 +1,5 @@
 flags: {}
 packages:
 - '.'
-extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1]
+extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1,'ordered-containers-0.2.2@sha256:ebf2be3f592d9cf148ea6b8375f8af97148d44f82d8d04476899285e965afdbf,810']
 resolver: lts-12.26
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -64,6 +64,8 @@ import qualified Data.Vector.Unboxed as DVU
 import qualified Statistics.Matrix.Types as SMT
 import Data.Statistics.Loess (loess)
 import Data.Statistics.Calibration (calibration)
 import Data.CartesianStrings (parseCartesianString)
 import Data.SplitIntoCrossTabs (splitIntoCrossTabs, CrossTab(..), TextFrag(..))
 informationRetrievalBookExample :: [(String, Int)]
 informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
@ -127,6 +129,8 @@ main = hspec $ do
      runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
    it "with probs" $
      runGEvalTest "accuracy-probs" `shouldReturnAlmost` 0.4
    it "sorted" $
      runGEvalTest "accuracy-on-sorted" `shouldReturnAlmost` 0.75
  describe "F-measure" $ do
    it "simple example" $
      runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857
@ -326,12 +330,17 @@ main = hspec $ do
      runGEvalTest "multilabel-f1-with-probs" `shouldReturnAlmost` 0.615384615384615
    it "labels given with probs and numbers" $ do
      runGEvalTest "multilabel-f1-with-probs-and-numbers" `shouldReturnAlmost` 0.6666666666666
  describe "Mean/MultiLabel-F" $ do
    it "simple" $ do
      runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5
  describe "MultiLabel-Likelihood" $ do
    it "simple" $ do
      runGEvalTest "multilabel-likelihood-simple" `shouldReturnAlmost` 0.115829218528827
  describe "Preprocessing operations" $ do
    it "F1 with preprocessing" $ do
      runGEvalTest "f1-with-preprocessing" `shouldReturnAlmost` 0.57142857142857
    it "Regexp substition" $ do
      runGEvalTest "accuracy-with-flags" `shouldReturnAlmost` 0.8
  describe "evaluating single lines" $ do
    it "RMSE" $ do
      (MetricOutput (SimpleRun v) _) <- gevalCoreOnSingleLines RMSE id RawItemTarget
@ -439,6 +448,7 @@ main = hspec $ do
            gesTestName = "test-A",
            gesSelector = Nothing,
            gesOutFile = "out.tsv",
            gesAltOutFiles = Nothing,
            gesExpectedFile = "expected.tsv",
            gesInputFile = "in.tsv",
            gesMetrics = [EvaluationScheme Likelihood []],
@ -666,6 +676,41 @@ main = hspec $ do
      calibration [True, False] [0.0, 1.0] `shouldBeAlmost` 0.0
      calibration [True, False, False, True, False] [0.0, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0
      calibration [False, True, True, True, True, False, False, True, False] [0.25, 0.25, 0.0, 0.25, 0.25, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0
  describe "Cartesian strings" $ do
    it "singleton" $ do
      (parseCartesianString "foo") `shouldBe` ["foo"]
    it "simple" $ do
      parseCartesianString "a-{foo,bar,baz}-b" `shouldBe` ["a-foo-b", "a-bar-b", "a-baz-b"]
    it "3x2" $ do
      parseCartesianString "a-{foo,bar,baz}-{b,c}" `shouldBe` ["a-foo-b", "a-foo-c", "a-bar-b",
                                                               "a-bar-c", "a-baz-b", "a-baz-c" ]
    it "3x2x3" $ do
      parseCartesianString "{foo,bar,ba}-{b,c}-{0,1,2}x" `shouldBe` ["foo-b-0x", "foo-b-1x", "foo-b-2x",
                                                                      "foo-c-0x", "foo-c-1x", "foo-c-2x",
                                                                      "bar-b-0x", "bar-b-1x", "bar-b-2x",
                                                                      "bar-c-0x", "bar-c-1x", "bar-c-2x",
                                                                      "ba-b-0x", "ba-b-1x", "ba-b-2x",
                                                                      "ba-c-0x", "ba-c-1x", "ba-c-2x" ]
  describe "cross-tabs" $ do
    it "singleton" $ do
      splitIntoCrossTabs ["abababab"] `shouldBe` [SingleItem "abababab"]
    it "too small" $ do
      splitIntoCrossTabs ["aabb", "aacc"] `shouldBe` [SingleItem "aabb", SingleItem "aacc"]
    it "two tables" $ do
      splitIntoCrossTabs ["yABC", "xx00", "yABD", "ZC", "xx11", "yy00", "yy11", "ZD"] `shouldBe` [
                                         CrossTab [Prefix "yAB", Prefix "Z"] [Suffix "C", Suffix "D"],
                                         CrossTab [Prefix "xx", Prefix "yy"] [Suffix "00", Suffix "11"]]
    it "simple" $ do
      splitIntoCrossTabs ["aabsolutely",
                          "aaafoo",
                          "other",
                          "aaabaz",
                          "aaabaq",
                          "bbbfoo",
                          "bbbbaz",
                          "bbbbaq"] `shouldBe` [SingleItem "aabsolutely",
                                                CrossTab [Suffix "foo", Suffix "baz", Suffix "baq"] [Prefix "aaa", Prefix "bbb"],
                                                SingleItem "other"]
 checkConduitPure conduit inList expList = do
  let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
--- a/test/accuracy-on-sorted/accuracy-on-sorted-solution/test-A/out.tsv
+++ b/test/accuracy-on-sorted/accuracy-on-sorted-solution/test-A/out.tsv
@ -0,0 +1,4 @@
 foo baz bar
 xyz aaa
 2 a:1 3
--- a/test/accuracy-on-sorted/accuracy-on-sorted/config.txt
+++ b/test/accuracy-on-sorted/accuracy-on-sorted/config.txt
@ -0,0 +1 @@
 --metric Accuracy:S
--- a/test/accuracy-on-sorted/accuracy-on-sorted/test-A/expected.tsv
+++ b/test/accuracy-on-sorted/accuracy-on-sorted/test-A/expected.tsv
@ -0,0 +1,4 @@
 bar baz foo
 xyz
 a:1 2 3
--- a/test/accuracy-with-flags/accuracy-with-flags-solution/test-A/out.tsv
+++ b/test/accuracy-with-flags/accuracy-with-flags-solution/test-A/out.tsv
@ -0,0 +1,5 @@
 b88 b901
 a100
 a93
 t34
 y23
--- a/test/accuracy-with-flags/accuracy-with-flags/config.txt
+++ b/test/accuracy-with-flags/accuracy-with-flags/config.txt
@ -0,0 +1 @@
 --metric Accuracy:s<[abc](\d+)><!\1>
--- a/test/accuracy-with-flags/accuracy-with-flags/test-A/expected.tsv
+++ b/test/accuracy-with-flags/accuracy-with-flags/test-A/expected.tsv
@ -0,0 +1,5 @@
 a88 b901
 c100
 b93
 t34
 z23
--- a/test/mean-multilabel-f1-simple/mean-multilabel-f1-simple-solution/test-A/out.tsv
+++ b/test/mean-multilabel-f1-simple/mean-multilabel-f1-simple-solution/test-A/out.tsv
@ -0,0 +1,4 @@
 foo bar baz
 uuu
 foo bar baz
 qqq aaa
--- a/test/mean-multilabel-f1-simple/mean-multilabel-f1-simple/config.txt
+++ b/test/mean-multilabel-f1-simple/mean-multilabel-f1-simple/config.txt
@ -0,0 +1 @@
 --metric Mean/MultiLabel-F1
--- a/test/mean-multilabel-f1-simple/mean-multilabel-f1-simple/test-A/expected.tsv
+++ b/test/mean-multilabel-f1-simple/mean-multilabel-f1-simple/test-A/expected.tsv
@ -0,0 +1,4 @@
 foo bar baz
 foo
 qqq qqq
--- a/test/oracle-item-based/oracle-item-based-solution/test-A/out-X.tsv
+++ b/test/oracle-item-based/oracle-item-based-solution/test-A/out-X.tsv
@ -0,0 +1,4 @@
 A
 C
 D
 D
--- a/test/oracle-item-based/oracle-item-based-solution/test-A/out-Y.tsv
+++ b/test/oracle-item-based/oracle-item-based-solution/test-A/out-Y.tsv
@ -0,0 +1,4 @@
 D
 C
 B
 A
--- a/test/oracle-item-based/oracle-item-based-solution/test-A/out.tsv
+++ b/test/oracle-item-based/oracle-item-based-solution/test-A/out.tsv
@ -0,0 +1,4 @@
 B
 A
 C
 A
--- a/test/oracle-item-based/oracle-item-based/config.txt
+++ b/test/oracle-item-based/oracle-item-based/config.txt
@ -0,0 +1 @@
 --metric Accuracy
--- a/test/oracle-item-based/oracle-item-based/test-A/expected.tsv
+++ b/test/oracle-item-based/oracle-item-based/test-A/expected.tsv
@ -0,0 +1,4 @@
 A
 B
 C
 D
--- a/test/oracle-item-based/oracle-item-based/test-A/in.tsv
+++ b/test/oracle-item-based/oracle-item-based/test-A/in.tsv
@ -0,0 +1,4 @@
 1
 2
 3
 4
--- a/test/oracle-item-based/oracle-item-based/test-A/out-O.tsv
+++ b/test/oracle-item-based/oracle-item-based/test-A/out-O.tsv
@ -0,0 +1,4 @@
 A
 C
 C
 D
--- a/test/wer-simple/wer-simple/config.txt
+++ b/test/wer-simple/wer-simple/config.txt
@ -1 +1 @@
--metric WER
+--metric Mean/WER