geval/test/Spec.hs

687 lines
34 KiB
Haskell
Raw Normal View History

2016-08-02 08:37:29 +02:00
{-# LANGUAGE OverloadedStrings #-}
2018-08-01 22:39:34 +02:00
{-# LANGUAGE FlexibleContexts #-}
2016-08-02 08:37:29 +02:00
2015-08-19 22:14:34 +02:00
import Test.Hspec
2019-08-10 13:00:29 +02:00
import GEval.Metric
2015-08-23 08:14:47 +02:00
import GEval.Core
2019-01-10 22:53:43 +01:00
import GEval.Common
import GEval.EvaluationScheme
2015-08-23 08:14:47 +02:00
import GEval.OptionsParser
2015-08-24 22:23:35 +02:00
import GEval.BLEU
2016-08-02 08:37:29 +02:00
import GEval.ClippEU
import GEval.PrecisionRecall
2017-03-24 18:26:12 +01:00
import GEval.ClusteringMetrics
2018-05-15 09:38:13 +02:00
import GEval.BIO
2018-05-26 21:10:22 +02:00
import GEval.LineByLine
import GEval.ParseParams
2018-08-27 17:57:07 +02:00
import GEval.Submit
2018-08-11 22:59:10 +02:00
import Text.Tokenizer
import Text.WordShape
2016-08-02 08:37:29 +02:00
import Data.Attoparsec.Text
import Options.Applicative
2016-08-02 08:37:29 +02:00
import Data.Text
import Text.EditDistance
2018-10-17 17:52:43 +02:00
import GEval.Annotation
2019-01-23 13:00:37 +01:00
import GEval.BlackBoxDebugging
import GEval.FeatureExtractor
2019-02-14 16:35:41 +01:00
import GEval.Selector
2019-08-10 13:00:29 +02:00
import GEval.CreateChallenge
import GEval.Validation
import Data.Map.Strict
2018-05-26 21:10:22 +02:00
import Data.Conduit.List (consume)
2018-08-27 17:57:07 +02:00
import System.Directory
import System.Process
import System.Exit
import System.IO
import System.IO.Temp
import System.IO.Silently
2019-01-23 13:00:37 +01:00
import Data.List (sort)
2015-08-19 23:24:19 +02:00
import qualified Test.HUnit as HU
2015-08-19 22:14:34 +02:00
2018-10-17 17:52:43 +02:00
import qualified Data.IntSet as IS
2019-01-27 20:18:39 +01:00
import qualified Data.Vector as V
2018-10-17 17:52:43 +02:00
2018-05-12 10:53:21 +02:00
import Data.Conduit.SmartSource
2018-08-01 22:39:34 +02:00
import Data.Conduit.Rank
2018-05-12 10:53:21 +02:00
import qualified Data.Conduit.Text as CT
import Data.Conduit
import Control.Monad.Trans.Resource
import qualified Data.Conduit.List as CL
2018-08-01 22:39:34 +02:00
import qualified Data.Conduit.Combinators as CC
2018-05-12 10:53:21 +02:00
2019-01-27 20:18:39 +01:00
import Statistics.Distribution (cumulative)
import Statistics.Distribution.Normal (normalDistr)
import Data.Statistics.Kendall (kendall, kendallZ)
import qualified Data.Vector.Unboxed as DVU
import qualified Statistics.Matrix.Types as SMT
import Data.Statistics.Loess (loess)
import Data.Statistics.Calibration (calibration)
2019-01-27 20:18:39 +01:00
2017-03-25 22:11:23 +01:00
informationRetrievalBookExample :: [(String, Int)]
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1),
("x", 2), ("o", 2), ("o", 2),
("x", 3), ("d", 3), ("d", 3)]
perfectClustering :: [(Int, Char)]
perfectClustering = [(0, 'a'), (2, 'b'), (3, 'c'), (2, 'b'), (2, 'b'), (1, 'd'), (0, 'a')]
stupidClusteringOneBigCluster :: [(Int, Int)]
stupidClusteringOneBigCluster = [(0, 2), (2, 2), (1, 2), (2, 2), (0, 2), (0, 2), (0, 2), (0, 2), (1, 2), (1, 2)]
stupidClusteringManySmallClusters :: [(Int, Int)]
stupidClusteringManySmallClusters = [(0, 0), (2, 1), (1, 2), (2, 3), (0, 4), (0, 5), (0, 6), (0, 7), (1, 8), (1, 9)]
2015-08-17 23:32:00 +02:00
main :: IO ()
2015-08-19 22:14:34 +02:00
main = hspec $ do
describe "root mean square error" $ do
2015-08-19 22:14:34 +02:00
it "simple test" $ do
[(_, ((MetricOutput val _):_))] <- geval $ defaultGEvalSpecification {gesExpectedDirectory=Just "test/rmse-simple/rmse-simple", gesOutDirectory="test/rmse-simple/rmse-simple-solution"}
val `shouldBeAlmost` 0.64549722436790
describe "mean square error" $ do
2015-11-06 22:02:01 +01:00
it "simple test with arguments" $
runGEvalTest "mse-simple" `shouldReturnAlmost` 0.4166666666666667
2018-06-13 12:30:11 +02:00
describe "mean absolute error" $ do
it "simple test with arguments" $
runGEvalTest "mae-simple" `shouldReturnAlmost` 1.5
describe "SMAPE" $ do
it "simple test" $
2019-02-12 08:36:52 +01:00
runGEvalTest "smape-simple" `shouldReturnAlmost` 45.1851851851852
describe "Spearman's rank correlation coefficient" $ do
it "simple test" $ do
runGEvalTest "spearman-simple" `shouldReturnAlmost` (- 0.5735)
2015-08-24 23:40:40 +02:00
describe "BLEU" $ do
2015-11-06 22:02:01 +01:00
it "trivial example from Wikipedia" $
runGEvalTest "bleu-trivial" `shouldReturnAlmost` 0.0
it "complex example" $
runGEvalTest "bleu-complex" `shouldReturnAlmost` 0.6211
it "perfect translation" $
runGEvalTest "bleu-perfect" `shouldReturnAlmost` 1.0000
2017-10-04 21:56:17 +02:00
it "empty translation" $
runGEvalTest "bleu-empty" `shouldReturnAlmost` 0.0000
2018-08-13 07:38:46 +02:00
it "with tokenization" $
runGEvalTest "bleu-with-tokenization" `shouldReturnAlmost` 0.6501914150070065
2018-09-11 08:03:07 +02:00
describe "GLEU" $ do
it "simple example" $
runGEvalTest "gleu-simple" `shouldReturnAlmost` 0.462962962962963
2018-09-12 20:37:44 +02:00
it "empty translation" $
runGEvalTest "gleu-empty" `shouldReturnAlmost` 0.0
it "perfect translation" $
runGEvalTest "gleu-perfect" `shouldReturnAlmost` 1.0
2018-09-25 08:13:57 +02:00
describe "WER" $ do
it "simple example" $
runGEvalTest "wer-simple" `shouldReturnAlmost` 0.5555555555
2015-10-31 19:05:23 +01:00
describe "Accuracy" $ do
2015-11-06 22:02:01 +01:00
it "simple example" $
runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
2018-04-07 21:13:37 +02:00
it "with probs" $
runGEvalTest "accuracy-probs" `shouldReturnAlmost` 0.4
2016-12-03 09:18:04 +01:00
describe "F-measure" $ do
it "simple example" $
runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857
it "perfect classifier" $
runGEvalTest "f-measure-perfect" `shouldReturnAlmost` 1.0
it "stupid classifier" $
runGEvalTest "f-measure-stupid" `shouldReturnAlmost` 0.0
it "all false" $
runGEvalTest "f-measure-all-false" `shouldReturnAlmost` 1.0
it "F2-measure" $
runGEvalTest "f2-simple" `shouldReturnAlmost` 0.714285714
2018-09-27 16:33:35 +02:00
describe "Macro-F-measure" $ do
it "simple example" $
runGEvalTest "macro-f1-simple" `shouldReturnAlmost` 0.266666
it "perfect soltion" $
runGEvalTest "macro-f-measure-perfect" `shouldReturnAlmost` 1.00000
2018-10-23 16:26:05 +02:00
describe "TokenAccuracy" $ do
it "simple example" $ do
runGEvalTest "token-accuracy-simple" `shouldReturnAlmost` 0.5
2015-08-24 22:23:35 +02:00
describe "precision count" $ do
it "simple test" $ do
precisionCount [["Alice", "has", "a", "cat" ]] ["Ala", "has", "cat"] `shouldBe` 2
it "none found" $ do
precisionCount [["Alice", "has", "a", "cat" ]] ["for", "bar", "baz"] `shouldBe` 0
it "multiple values" $ do
precisionCount [["bar", "bar", "bar", "bar", "foo", "xyz", "foo"]] ["foo", "bar", "foo", "baz", "bar", "foo"] `shouldBe` 4
it "multiple refs" $ do
precisionCount [["foo", "baz"], ["bar"], ["baz", "xyz"]] ["foo", "bar", "foo"] `shouldBe` 2
describe "purity (in flat clustering)" $ do
it "the example from Information Retrieval Book" $ do
2017-03-25 22:11:23 +01:00
purity informationRetrievalBookExample `shouldBeAlmost` 0.70588
describe "NMI (in flat clustering)" $ do
it "the example from Information Retrieval Book" $ do
normalizedMutualInformation informationRetrievalBookExample `shouldBeAlmost` 0.36456
it "perfect clustering" $ do
normalizedMutualInformation perfectClustering `shouldBeAlmost` 1.0
it "stupid clustering with one big cluster" $ do
normalizedMutualInformation stupidClusteringOneBigCluster `shouldBeAlmost` 0.0
it "stupid clustering with many small clusters" $ do
normalizedMutualInformation stupidClusteringManySmallClusters `shouldBeAlmost` 0.61799
2017-03-26 08:01:19 +02:00
describe "NMI challenge" $ do
it "complex test" $ do
runGEvalTest "nmi-complex" `shouldReturnAlmost` 0.36456
2017-04-01 12:24:36 +02:00
describe "LogLossHashed challenge" $ do
it "simple example" $ do
runGEvalTest "log-loss-hashed-simple" `shouldReturnAlmost` 2.398479083333333
2017-04-03 10:07:58 +02:00
it "example with unnormalized values" $ do
runGEvalTest "log-loss-hashed-not-normalized" `shouldReturnAlmost` 1.0468455186722887
2018-05-15 08:07:47 +02:00
it "with probs instead of log probs" $ do
runGEvalTest "log-loss-hashed-probs" `shouldReturnAlmost` 4.11631293099392
it "with probs instead of log probs (with normalization)" $ do
runGEvalTest "log-loss-hashed-probs-normalized" `shouldReturnAlmost` 1.55537749098853
2018-05-16 20:59:40 +02:00
it "with log probs whose probs are summing up to less than 1.0" $ do
runGEvalTest "log-loss-hashed-normalization" `shouldReturnAlmost` 5.16395069238851
2018-05-17 15:21:03 +02:00
describe "LikelihoodHashed challenge" $ do
it "example with unnormalized values" $ do
runGEvalTest "likelihood-hashed-not-normalized" `shouldReturnAlmost` 0.351043364110715
2018-05-16 20:59:40 +02:00
describe "reading options" $ do
it "can get the metric" $ do
extractMetric "bleu-complex" `shouldReturn` (Just BLEU)
2015-11-06 21:57:36 +01:00
describe "error handling" $ do
it "too few lines are handled" $ do
runGEvalTest "error-too-few-lines" `shouldThrow` (== TooFewLines)
it "too many lines are handled" $ do
runGEvalTest "error-too-many-lines" `shouldThrow` (== TooManyLines)
2015-11-06 22:42:08 +01:00
it "empty output is handled" $ do
runGEvalTest "empty-output" `shouldThrow` (== EmptyOutput)
2015-11-06 23:14:10 +01:00
it "unexpected data is handled" $
2018-01-13 15:06:09 +01:00
runGEvalTest "unexpected-data" `shouldThrow` (== UnexpectedData 3 "input does not start with a digit")
2015-11-06 23:24:46 +01:00
it "unwanted data is handled" $
2018-01-13 15:06:09 +01:00
runGEvalTest "unwanted-data" `shouldThrow` (== UnexpectedData 2 "number expected")
describe "precision and recall" $ do
it "null test" $ do
precision neverMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.0
recall neverMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.0
f1Measure neverMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.0
it "basic test" $ do
precision testMatchFun ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.3333333333333333
recall testMatchFun ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.66666666666666666
f1Measure testMatchFun ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.444444444444444
2016-08-02 07:17:57 +02:00
it "perfect result" $ do
precision alwaysMatch ['a', 'b', 'c'] [0, 1, 2] `shouldBeAlmost` 1.0
recall alwaysMatch ['a', 'b', 'c'] [0, 1, 2] `shouldBeAlmost` 1.0
f1Measure alwaysMatch ['a', 'b', 'c'] [0, 1, 2] `shouldBeAlmost` 1.0
it "full match" $ do
precision alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.5
recall alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 1.0
f1Measure alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3 , 4, 5] `shouldBeAlmost` 0.66666666666666
2018-10-17 17:52:43 +02:00
describe "max match" $ do
it "simple" $ do
maxMatch (==) [1,2,2] [3,2] `shouldBe` 1
maxMatch (==) [3,2] [1,2,2] `shouldBe` 1
2016-08-02 08:37:29 +02:00
describe "ClippEU" $ do
it "parsing rectangles" $ do
let (Right r) = parseOnly (lineClippingsParser <* endOfInput) "2/0,0,2,3 10/20,30,40,50 18/0,1,500,3 "
r `shouldBe` [Clipping (PageNumber 2) (Rectangle (Point 0 0) (Point 2 3)),
Clipping (PageNumber 10) (Rectangle (Point 20 30) (Point 40 50)),
Clipping (PageNumber 18) (Rectangle (Point 0 1) (Point 500 3))]
it "no rectangles" $ do
let (Right r) = parseOnly (lineClippingsParser <* endOfInput) ""
r `shouldBe` []
it "just spaces" $ do
let (Right r) = parseOnly lineClippingsParser " "
r `shouldBe` []
it "parsing specs" $ do
let (Right r) = parseOnly lineClippingSpecsParser " 2/0,0,2,3/5 10/20,30,40,50/10"
2016-08-02 09:48:58 +02:00
r `shouldBe` [ClippingSpec (PageNumber 2) (Rectangle (Point 5 5) (Point 0 0))
2016-08-02 08:37:29 +02:00
(Rectangle (Point 0 0) (Point 7 8)),
2016-08-02 09:48:58 +02:00
ClippingSpec (PageNumber 10) (Rectangle (Point 30 40) (Point 30 40))
2016-08-02 08:37:29 +02:00
(Rectangle (Point 10 20) (Point 50 60))]
2016-08-02 09:48:58 +02:00
it "full test" $ do
runGEvalTest "clippeu-simple" `shouldReturnAlmost` 0.399999999999
2016-12-03 09:18:04 +01:00
describe "evaluation metric specification is parsed" $ do
it "for simple names" $ do
let metrics = [RMSE, MSE, BLEU, Accuracy, ClippEU]
let parsedMetrics = Prelude.map (read . show) metrics
metrics `shouldBe` parsedMetrics
it "for F-Measure" $ do
read "F2" `shouldBe` (FMeasure 2.0)
read "F1" `shouldBe` (FMeasure 1.0)
read "F0.5" `shouldBe` (FMeasure 0.5)
2018-10-17 17:52:43 +02:00
describe "Soft-F1" $ do
it "simple test" $ do
runGEvalTest "soft-f1-simple" `shouldReturnAlmost` 0.33333333333333
it "perfect test" $ do
runGEvalTest "soft-f1-perfect" `shouldReturnAlmost` 1.0
2019-03-12 09:14:50 +01:00
describe "Probabilistic-Soft-F1" $ do
it "simple test" $ do
runGEvalTest "probabilistic-soft-f1-simple" `shouldReturnAlmost` 0.33333333333333
it "simple test with perfect calibration" $ do
runGEvalTest "probabilistic-soft-f1-calibrated" `shouldReturnAlmost` 0.88888888888
describe "test edit-distance library" $ do
it "for handling UTF8" $ do
levenshteinDistance defaultEditCosts "źdźbło" "źd好bło" `shouldBe` 1
levenshteinDistance defaultEditCosts "źdźbło" "źdźcło" `shouldBe` 1
describe "CharMatch" $ do
it "simple test" $ do
runGEvalTest "charmatch-simple" `shouldReturnAlmost` 0.3571428571428571
it "perfect solution" $ do
runGEvalTest "charmatch-perfect" `shouldReturnAlmost` 1.0
it "more complex test" $ do
runGEvalTest "charmatch-complex" `shouldReturnAlmost` 0.1923076923076923
it "broken test without input" $ do
runGEvalTest "charmatch-no-input" `shouldThrow` (== NoInputFile "test/charmatch-no-input/charmatch-no-input/test-A/in.tsv")
2017-12-12 07:54:21 +01:00
describe "MAP" $ do
it "one result" $ do
(calculateMAPForOneResult ["Berlin", "London", "Warsaw"]
["Warsaw", "Moscow", "Berlin", "Prague"]) `shouldBeAlmost` 0.55555555
it "check whether you cannot cheat with duplicated results" $ do
(calculateMAPForOneResult ["one", "two"]
["one", "one"]) `shouldBeAlmost` 0.5
it "simple test" $ do
runGEvalTest "map-simple" `shouldReturnAlmost` 0.444444444
2018-04-07 08:29:58 +02:00
describe "LogLoss" $ do
it "simple" $ do
runGEvalTest "logloss-simple" `shouldReturnAlmost` 0.31824
it "perfect" $ do
runGEvalTest "logloss-perfect" `shouldReturnAlmost` 0.0
2018-05-17 15:21:03 +02:00
describe "Likelihood" $ do
it "simple" $ do
runGEvalTest "likelihood-simple" `shouldReturnAlmost` 0.72742818469866
2018-07-26 13:01:10 +02:00
describe "MultiLabel-F" $ do
it "simple" $ do
runGEvalTest "multilabel-f1-simple" `shouldReturnAlmost` 0.66666666666
it "simple F2" $ do
runGEvalTest "multilabel-f2-simple" `shouldReturnAlmost` 0.441176470588235
it "labels given with probs" $ do
runGEvalTest "multilabel-f1-with-probs" `shouldReturnAlmost` 0.615384615384615
it "labels given with probs and numbers" $ do
runGEvalTest "multilabel-f1-with-probs-and-numbers" `shouldReturnAlmost` 0.6666666666666
describe "MultiLabel-Likelihood" $ do
it "simple" $ do
runGEvalTest "multilabel-likelihood-simple" `shouldReturnAlmost` 0.115829218528827
describe "Preprocessing operations" $ do
it "F1 with preprocessing" $ do
runGEvalTest "f1-with-preprocessing" `shouldReturnAlmost` 0.57142857142857
describe "evaluating single lines" $ do
it "RMSE" $ do
(MetricOutput v _) <- gevalCoreOnSingleLines RMSE id RawItemTarget
(LineInFile (FilePathSpec "stub1") 1 "blabla")
RawItemTarget
(LineInFile (FilePathSpec "stub2") 1 "3.4")
RawItemTarget
(LineInFile (FilePathSpec "stub3") 1 "2.6")
v `shouldBeAlmost` 0.8
2018-10-17 17:52:43 +02:00
describe "Annotation format" $ do
it "just parse" $ do
parseAnnotations "foo:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo" (IS.fromList [3,7,8,9,10]),
2019-03-12 09:14:50 +01:00
Annotation "baz" (IS.fromList [4,5,6])]
2018-10-17 17:52:43 +02:00
it "empty" $ do
parseAnnotations "" `shouldBe` Right []
it "empty (just spaces)" $ do
parseAnnotations " " `shouldBe` Right []
it "match score" $ do
2019-03-12 09:14:50 +01:00
matchScore (Annotation "x" (IS.fromList [3..6])) (ObtainedAnnotation (Annotation "y" (IS.fromList [3..6])) 1.0) `shouldBeAlmost` 0.0
matchScore (Annotation "x" (IS.fromList [3..6])) (ObtainedAnnotation (Annotation "x" (IS.fromList [3..6])) 1.0) `shouldBeAlmost` 1.0
matchScore (Annotation "x" (IS.fromList [123..140])) (ObtainedAnnotation (Annotation "x" (IS.fromList [125..130])) 1.0) `shouldBeAlmost` 0.33333
matchScore (Annotation "x" (IS.fromList [3..4])) (ObtainedAnnotation (Annotation "x" (IS.fromList [2..13])) 1.0) `shouldBeAlmost` 0.1666666
2018-05-15 09:38:13 +02:00
describe "BIO format" $ do
it "just parse" $ do
let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name"
r `shouldBe` [Outside,
Beginning "city" (Just "NEW_YORK"),
Inside "city" Nothing,
Beginning "city" (Just "KALISZ"),
Inside "city" Nothing,
Outside,
Beginning "name" Nothing]
it "simplest entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing]
it "multi-word entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-date I-date"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" Nothing]
it "multi-word entity with normalized text" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-date/FOO I-date/BAR"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" (Just "FOO_BAR")]
it "simplest entity with something outside" $ do
let (Right ents) = parseBioSequenceIntoEntities "O B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "city" Nothing]
it "another simple case" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing,
TaggedEntity (TaggedSpan 2 2) "city" Nothing]
it "just parse into entities" $ do
let (Right ents) = parseBioSequenceIntoEntities "O O B-city/LOS_ANGELES I-city B-city/KLUCZBORK O B-name O B-person/JOHN I-person/VON I-person/NEUMANN"
ents `shouldBe` [TaggedEntity (TaggedSpan 3 4) "city" (Just "LOS_ANGELES"),
TaggedEntity (TaggedSpan 5 5) "city" (Just "KLUCZBORK"),
TaggedEntity (TaggedSpan 7 7) "name" (Nothing),
TaggedEntity (TaggedSpan 9 11) "person" (Just "JOHN_VON_NEUMANN")]
it "another entity parse" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-month/JULY B-month/JULY O O B-foo/bar"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "month" (Just "JULY"),
TaggedEntity (TaggedSpan 2 2) "month" (Just "JULY"),
TaggedEntity (TaggedSpan 5 5) "foo" (Just "bar")]
it "another entity parse" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "city" (Just "LOS_ANGELES"),
TaggedEntity (TaggedSpan 4 5) "city" (Just "NEW_YORK")]
it "parse entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-surname/BROWN B-surname/SMITH"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
it "parse entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "O B-surname/SMITH"
ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
it "check counting" $ do
gatherCountsForBIO [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2)
it "check F1 on a more complicated example" $ do
runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625
2018-05-29 22:04:19 +02:00
it "check F1 on labels only" $ do
runGEvalTest "bio-f1-complex-labels" `shouldReturnAlmost` 0.6666666666
2018-05-15 09:38:13 +02:00
it "calculate F1" $ do
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
it "calculate F1 with underscores rather than minus signs" $ do
runGEvalTest "bio-f1-simple-underscores" `shouldReturnAlmost` 0.5
2018-05-15 09:38:13 +02:00
it "check perfect score" $ do
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
2018-05-25 14:44:19 +02:00
it "check inconsistent input" $ do
runGEvalTest "bio-f1-error" `shouldThrow` (== UnexpectedData 2 "inconsistent label sequence `B-NAME/JOHN I-FOO/SMITH I-FOO/X`")
2018-05-17 08:26:57 +02:00
describe "automatic decompression" $ do
it "more complex test" $ do
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
2019-02-13 17:53:30 +01:00
describe "handling jsonl format" $ do
it "simple test" $
2019-02-14 19:01:53 +01:00
runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.571428571428
2018-05-26 21:10:22 +02:00
describe "line by line mode" $ do
let sampleChallenge =
GEvalSpecification
{ gesOutDirectory = "test/likelihood-simple/likelihood-simple-solution",
gesExpectedDirectory = Just "test/likelihood-simple/likelihood-simple",
gesTestName = "test-A",
2019-02-13 17:53:30 +01:00
gesSelector = Nothing,
2018-05-26 21:10:22 +02:00
gesOutFile = "out.tsv",
gesExpectedFile = "expected.tsv",
gesInputFile = "in.tsv",
gesMetrics = [EvaluationScheme Likelihood []],
2018-08-13 10:09:55 +02:00
gesPrecision = Nothing,
gesTokenizer = Nothing,
gesGonitoHost = Nothing,
2018-12-07 09:22:55 +01:00
gesToken = Nothing,
2019-05-23 16:16:05 +02:00
gesGonitoGitAnnexRemote = Nothing,
gesReferences = Nothing }
2018-05-26 21:10:22 +02:00
it "simple test" $ do
2019-05-23 16:16:05 +02:00
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge (const Data.Conduit.List.consume)
2018-05-26 21:10:22 +02:00
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",
"bar",
"baz",
"baq"]
it "test sorting" $ do
2019-05-23 16:16:05 +02:00
results <- runLineByLineGeneralized FirstTheWorst sampleChallenge (const Data.Conduit.List.consume)
Prelude.head (Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results) `shouldBe` "baq"
2018-06-02 11:29:54 +02:00
describe "handle --alt-metric option" $ do
it "accuracy instead of likelihood" $ do
runGEvalTestExtraOptions ["--alt-metric", "Accuracy"] "likelihood-simple" `shouldReturnAlmost` 0.75
it "accuracy instead of log loss" $ do
runGEvalTestExtraOptions ["--alt-metric", "Accuracy"] "log-loss-hashed-probs" `shouldReturnAlmost` 0.4
2018-05-12 10:53:21 +02:00
describe "smart sources" $ do
2018-06-02 20:24:34 +02:00
it "smart specs are obtained" $ do
getSmartSourceSpec "foo" "" "" `shouldReturn` Left NoSpecGiven
getSmartSourceSpec "foo" "out.tsv" "-" `shouldReturn` Right Stdin
getSmartSourceSpec "foo" "out.sv" "http://gonito.net/foo" `shouldReturn` (Right $ Http "http://gonito.net/foo")
getSmartSourceSpec "foo" "in.tsv" "https://gonito.net" `shouldReturn` (Right $ Https "https://gonito.net")
2018-05-12 10:53:21 +02:00
it "sources are accessed" $ do
2018-06-02 20:24:34 +02:00
readFromSmartSource "baz" "out.tsv" "test/files/foo.txt" `shouldReturn` ["foo\n"]
2018-06-02 23:27:49 +02:00
readFromSmartSource "" "" "https://httpbin.org/robots.txt" `shouldReturn`
["User-agent: *\nDisallow: /deny\n"]
describe "parse model params from filenames" $ do
it "no params 1" $ do
parseParamsFromFilePath "out.tsv" `shouldBe` OutputFileParsed "out" Data.Map.Strict.empty
it "no params 2" $ do
parseParamsFromFilePath "out.tsv.xz" `shouldBe` OutputFileParsed "out" Data.Map.Strict.empty
it "no params 3" $ do
parseParamsFromFilePath "out-test-foo_bar.tsv" `shouldBe` OutputFileParsed "out-test-foo_bar" Data.Map.Strict.empty
it "one parameter" $ do
parseParamsFromFilePath "out-nb_epochs=123.tsv" `shouldBe`
OutputFileParsed "out" (Data.Map.Strict.fromList [("nb_epochs", "123")])
it "complex" $ do
parseParamsFromFilePath "out-nb_epochs = 12,foo=off, bar-baz =10.tsv" `shouldBe`
OutputFileParsed "out" (Data.Map.Strict.fromList [("nb_epochs", "12"),
("foo", "off"),
("bar-baz", "10")])
2018-07-10 12:10:02 +02:00
it "empty val" $ do
parseParamsFromFilePath "out-nb_epochs=1,foo=,bar-baz=8.tsv" `shouldBe`
OutputFileParsed "out" (Data.Map.Strict.fromList [("nb_epochs", "1"),
("foo", ""),
("bar-baz", "8")])
2018-08-01 22:39:34 +02:00
describe "ranking" $ do
it "simple case" $ do
checkConduitPure (rank (\(a,_) (b,_) -> a < b)) [(3.0::Double, "foo"::String),
(10.0, "bar"),
(12.0, "baz")]
[(1.0, (3.0::Double, "foo"::String)),
(2.0, (10.0, "bar")),
(3.0, (12.0, "baz"))]
it "one item" $ do
checkConduitPure (rank (\(a,_) (b,_) -> a < b)) [(5.0::Double, "foo"::String)]
[(1.0, (5.0::Double, "foo"::String))]
it "take between" $ do
checkConduitPure (rank (<)) [3.0::Double, 5.0, 5.0, 10.0]
[(1.0::Double, 3.0),
(2.5, 5.0),
(2.5, 5.0),
(4.0, 10.0)]
2018-08-03 08:23:55 +02:00
it "two sequences" $ do
checkConduitPure (rank (<)) [4.5::Double, 4.5, 4.5, 6.1, 6.1]
[(2.0::Double, 4.4),
(2.0, 4.5),
(2.0, 4.5),
(4.5, 6.1),
(4.5, 6.1)]
it "series at the beginning" $ do
checkConduitPure (rank (<)) [10.0::Double, 10.0, 13.0, 14.0]
[(1.5::Double, 10.0),
(1.5, 10.0),
(3.0, 13.0),
(4.0, 14.0)]
it "inverted" $ do
checkConduitPure (rank (>)) [3.0::Double, 3.0, 2.0, 1.0]
[(1.5::Double, 3.0),
(1.5, 3.0),
(3.0, 2.0),
(4.0, 1.0)]
2018-08-11 22:59:10 +02:00
describe "tokenizer" $ do
it "simple utterance with '13a' tokenizer" $ do
tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
["To", "be", "or", "not", "to", "be",
",", "that's", "the", "question", "."]
it "simple utterance with 'character-by-character' tokenizer" $ do
tokenize (Just CharacterByCharacter) "To be or not to be." `shouldBe`
["T", "o", "_", "b", "e", "_", "o", "r", "_", "n", "o", "t", "_", "t", "o", "_", "b", "e", "."]
describe "shapify" $ do
it "simple tests" $ do
shapify "Poznań" `shouldBe` (WordShape "Aa+")
shapify "2019" `shouldBe` (WordShape "9999")
shapify "Ala ma (czarnego) kota?" `shouldBe` (WordShape "Aa+ a+ (a+( a+.")
shapify "" `shouldBe` (WordShape "")
shapify "PCMCIA" `shouldBe` (WordShape "A+")
shapify "a" `shouldBe` (WordShape "a")
shapify "B5" `shouldBe` (WordShape "A9")
2019-08-10 13:00:29 +02:00
describe "create challenges and validate them" $ do
(flip mapM_) listOfAvailableMetrics $ \metric -> do
it (show metric) $ do
withSystemTempDirectory "geval-validation-test" $ \tempDir -> do
let spec = defaultGEvalSpecification {
gesExpectedDirectory = Just tempDir,
gesMetrics = [EvaluationScheme metric []],
2019-08-10 13:00:29 +02:00
gesPrecision = Just 4 }
createChallenge True tempDir spec
validationChallenge tempDir spec
2018-08-27 17:57:07 +02:00
describe "submit" $ do
it "current branch" $ do
runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"
it "challengeId" $ do
runGitTest "challengeId-test" (
\_ -> do
path <- makeAbsolute "challenge01"
setCurrentDirectory path
getChallengeId) `shouldReturn` "challenge01"
it "everything committed - positive" $ do
runGitTest "everythingCommitted-test-pos" (\_ -> checkEverythingCommitted) `shouldReturn` ()
it "everything committed - negative" $ do
hSilence [stderr] $ runGitTest "everythingCommitted-test-neg" (\_ -> checkEverythingCommitted) `shouldThrow` (== ExitFailure 1)
it "remote synced - positive" $ do
runGitTest "remoteSynced-test-pos" (\_ -> checkRemoteSynced) `shouldReturn` ()
it "remote synced - negative" $ do
hSilence [stderr] $ runGitTest "remoteSynced-test-neg" (\_ -> checkRemoteSynced) `shouldThrow` (== ExitFailure 1)
it "remote url" $ do
runGitTest "remoteUrl-test" (\_ -> getRemoteUrl "origin") `shouldReturn` "git@git.example.com:example/example.git"
it "repo root" $ do
runGitTest "repoRoot-test" (
\path -> do
subpath <- makeAbsolute "A/B"
setCurrentDirectory subpath
root <- getRepoRoot
return $ root == path
) `shouldReturn` True
it "no token" $ do
runGitTest "token-test-no" (\_ -> readToken) `shouldReturn` Nothing
it "read token" $ do
runGitTest "token-test-yes" (\_ -> readToken) `shouldReturn` (Just "AAAA")
it "write-read token" $ do
runGitTest "token-test-no" (
\_ -> do
writeToken "BBBB"
token <- readToken
return $ token == (Just "BBBB")
) `shouldReturn` True
2019-01-23 13:00:37 +01:00
describe "extracting features" $ do
it "extract factors" $ do
let bbdo = BlackBoxDebuggingOptions {
bbdoMinFrequency = 1,
bbdoWordShapes = False,
bbdoBigrams = True,
bbdoCartesian = False,
bbdoMinCartesianFrequency = Nothing,
bbdoConsiderNumericalFeatures = True }
2019-05-23 16:16:05 +02:00
(sort $ extractFactorsFromTabbed Nothing bbdo Nothing "in" "I like this\t34.3\ttests") `shouldBe` [
2019-01-23 13:00:37 +01:00
PeggedFactor (FeatureTabbedNamespace "in" 1)
2019-01-26 17:18:41 +01:00
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "I"))),
2019-01-23 13:00:37 +01:00
PeggedFactor (FeatureTabbedNamespace "in" 1)
2019-01-26 17:18:41 +01:00
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "like"))),
2019-01-23 13:00:37 +01:00
PeggedFactor (FeatureTabbedNamespace "in" 1)
2019-01-26 17:18:41 +01:00
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "this"))),
2019-01-23 13:00:37 +01:00
PeggedFactor (FeatureTabbedNamespace "in" 1)
2019-01-26 17:18:41 +01:00
(SimpleExistentialFactor (BigramFactor (TextFactor "I") (TextFactor "like"))),
2019-01-23 13:00:37 +01:00
PeggedFactor (FeatureTabbedNamespace "in" 1)
2019-01-26 17:18:41 +01:00
(SimpleExistentialFactor (BigramFactor (TextFactor "like") (TextFactor "this"))),
2019-01-23 13:00:37 +01:00
PeggedFactor (FeatureTabbedNamespace "in" 1)
(NumericalFactor Nothing 11),
PeggedFactor (FeatureTabbedNamespace "in" 2)
2019-01-26 17:18:41 +01:00
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "34.3"))),
2019-01-23 13:00:37 +01:00
PeggedFactor (FeatureTabbedNamespace "in" 2)
(NumericalFactor (Just 34.3) 4),
PeggedFactor (FeatureTabbedNamespace "in" 3)
2019-01-26 17:18:41 +01:00
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))),
2019-01-23 13:00:37 +01:00
PeggedFactor (FeatureTabbedNamespace "in" 3)
(NumericalFactor Nothing 5) ]
2019-01-27 20:18:39 +01:00
describe "Kendall's tau" $ do
it "tau" $ do
kendall (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-0.47140452079103173)
it "z" $ do
kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742)
it "p-value" $ do
(2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827
describe "Loess" $ do
it "simple" $ do
loess (DVU.fromList [0.2, 0.6, 1.0])
(DVU.fromList [-0.6, 0.2, 1.0])
0.4 `shouldBeAlmost` (-0.2)
describe "Calibration" $ do
it "empty list" $ do
calibration [] [] `shouldBeAlmost` 1.0
it "one element" $ do
calibration [True] [1.0] `shouldBeAlmost` 1.0
calibration [False] [0.0] `shouldBeAlmost` 1.0
calibration [True] [0.0] `shouldBeAlmost` 0.0
calibration [False] [1.0] `shouldBeAlmost` 0.0
calibration [True] [0.7] `shouldBeAlmost` 0.7
calibration [True] [0.3] `shouldBeAlmost` 0.3
calibration [False] [0.7] `shouldBeAlmost` 0.3
calibration [False] [0.3] `shouldBeAlmost` 0.7
it "perfect calibration" $ do
calibration [True, True, False] [0.5, 1.0, 0.5] `shouldBeAlmost` 1.0
it "totally wrong" $ do
calibration [True, False] [0.0, 1.0] `shouldBeAlmost` 0.0
calibration [True, False, False, True, False] [0.0, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0
calibration [False, True, True, True, True, False, False, True, False] [0.25, 0.25, 0.0, 0.25, 0.25, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0
2018-08-01 22:39:34 +02:00
checkConduitPure conduit inList expList = do
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
mapM_ (\(o,e) -> (fst o) `shouldBeAlmost` (fst e)) $ Prelude.zip outList expList
2018-05-12 10:53:21 +02:00
2018-06-02 20:24:34 +02:00
readFromSmartSource :: FilePath -> FilePath -> String -> IO [String]
readFromSmartSource defaultDir defaultFile specS = do
(Right spec) <- getSmartSourceSpec defaultDir defaultFile specS
let source = smartSource spec
2018-06-20 11:57:11 +02:00
contents <- runResourceT $ runConduit (source .| CT.decodeUtf8Lenient .| CL.consume)
2018-05-12 10:53:21 +02:00
return $ Prelude.map unpack contents
2015-11-06 23:24:46 +01:00
neverMatch :: Char -> Int -> Bool
neverMatch _ _ = False
2016-08-02 07:17:57 +02:00
alwaysMatch :: Char -> Int -> Bool
alwaysMatch _ _ = True
testMatchFun :: Char -> Int -> Bool
testMatchFun 'a' 1 = True
testMatchFun 'a' 2 = True
testMatchFun 'a' 3 = True
testMatchFun 'b' 1 = True
testMatchFun 'c' 1 = True
testMatchFun _ _ = False
extractVal :: (Either (ParserResult GEvalOptions) (Maybe [(SourceSpec, [MetricValue])])) -> IO MetricValue
extractVal (Right (Just ([(_, val:_)]))) = return val
2018-09-12 20:37:44 +02:00
extractVal (Right Nothing) = return $ error "no metrics???"
extractVal (Right (Just [])) = return $ error "emtpy metric list???"
extractVal (Left result) = do
handleParseResult result
return $ error "something wrong"
2015-08-19 23:24:19 +02:00
2018-06-02 11:29:54 +02:00
runGEvalTest = runGEvalTestExtraOptions []
runGEvalTestExtraOptions extraOptions testName = (runGEval ([
2015-11-06 21:57:36 +01:00
"--expected-directory",
"test/" ++ testName ++ "/" ++ testName,
"--out-directory",
2018-06-02 11:29:54 +02:00
"test/" ++ testName ++ "/" ++ testName ++ "-solution"] ++ extraOptions)) >>= extractVal
2015-11-06 21:57:36 +01:00
extractMetric :: String -> IO (Maybe Metric)
extractMetric testName = do
result <- getOptions ["--expected-directory", "test/" ++ testName ++ "/" ++ testName]
return $ case result of
Left _ -> Nothing
2018-06-08 12:38:45 +02:00
Right opts -> Just $ gesMainMetric $ geoSpec opts
2015-08-19 23:24:19 +02:00
(@=~?) :: (Show a, AEq a) => a -> a -> HU.Assertion
(@=~?) actual expected = expected =~ actual HU.@? assertionMsg
2015-08-19 23:24:19 +02:00
where
assertionMsg = "Expected : " ++ show expected ++
"\nActual : " ++ show actual
shouldBeAlmost got expected = got @=~? expected
2015-08-19 23:24:19 +02:00
shouldReturnAlmost :: (AEq a, Show a, Eq a) => IO a -> a -> Expectation
shouldReturnAlmost action expected = action >>= (@=~? expected)
2018-08-27 17:57:07 +02:00
runGitTest :: String -> (FilePath -> IO a) -> IO a
runGitTest name callback = do
2018-10-22 13:32:36 +02:00
withSystemTempDirectory "geval-submit-test" $ \temp -> do
2018-08-27 17:57:07 +02:00
copyFile ("test/_submit-tests/" ++ name ++ ".tar") (temp ++ "/" ++ name ++ ".tar")
withCurrentDirectory temp $ do
callCommand $ "tar xf " ++ name ++ ".tar"
let testRoot = temp ++ "/" ++ name
withCurrentDirectory testRoot $ do
callback testRoot