{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleContexts #-} import Test.Hspec import GEval.Core import GEval.OptionsParser import GEval.BLEU import GEval.ClippEU import GEval.PrecisionRecall import GEval.ClusteringMetrics import GEval.BIO import GEval.LineByLine import GEval.ParseParams import Data.Attoparsec.Text import Options.Applicative import Data.Text import Text.EditDistance import Data.Map.Strict import Data.Conduit.List (consume) import qualified Test.HUnit as HU import Data.Conduit.SmartSource import Data.Conduit.Rank import qualified Data.Conduit.Text as CT import Data.Conduit import Control.Monad.Trans.Resource import qualified Data.Conduit.List as CL import qualified Data.Conduit.Combinators as CC informationRetrievalBookExample :: [(String, Int)] informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3), ("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1), ("x", 2), ("o", 2), ("o", 2), ("x", 3), ("d", 3), ("d", 3)] perfectClustering :: [(Int, Char)] perfectClustering = [(0, 'a'), (2, 'b'), (3, 'c'), (2, 'b'), (2, 'b'), (1, 'd'), (0, 'a')] stupidClusteringOneBigCluster :: [(Int, Int)] stupidClusteringOneBigCluster = [(0, 2), (2, 2), (1, 2), (2, 2), (0, 2), (0, 2), (0, 2), (0, 2), (1, 2), (1, 2)] stupidClusteringManySmallClusters :: [(Int, Int)] stupidClusteringManySmallClusters = [(0, 0), (2, 1), (1, 2), (2, 3), (0, 4), (0, 5), (0, 6), (0, 7), (1, 8), (1, 9)] main :: IO () main = hspec $ do describe "root mean square error" $ do it "simple test" $ do [(_, (val:_))] <- geval $ defaultGEvalSpecification {gesExpectedDirectory=Just "test/rmse-simple/rmse-simple", gesOutDirectory="test/rmse-simple/rmse-simple-solution"} val `shouldBeAlmost` 0.64549722436790 describe "mean square error" $ do it "simple test with arguments" $ runGEvalTest "mse-simple" `shouldReturnAlmost` 0.4166666666666667 describe "mean absolute error" $ do it "simple test with arguments" $ runGEvalTest "mae-simple" `shouldReturnAlmost` 1.5 describe "BLEU" $ do it "trivial example from Wikipedia" $ runGEvalTest "bleu-trivial" `shouldReturnAlmost` 0.0 it "complex example" $ runGEvalTest "bleu-complex" `shouldReturnAlmost` 0.6211 it "perfect translation" $ runGEvalTest "bleu-perfect" `shouldReturnAlmost` 1.0000 it "empty translation" $ runGEvalTest "bleu-empty" `shouldReturnAlmost` 0.0000 describe "Accuracy" $ do it "simple example" $ runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6 it "with probs" $ runGEvalTest "accuracy-probs" `shouldReturnAlmost` 0.4 describe "F-measure" $ do it "simple example" $ runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857 it "perfect classifier" $ runGEvalTest "f-measure-perfect" `shouldReturnAlmost` 1.0 it "stupid classifier" $ runGEvalTest "f-measure-stupid" `shouldReturnAlmost` 0.0 it "all false" $ runGEvalTest "f-measure-all-false" `shouldReturnAlmost` 1.0 it "F2-measure" $ runGEvalTest "f2-simple" `shouldReturnAlmost` 0.714285714 describe "precision count" $ do it "simple test" $ do precisionCount [["Alice", "has", "a", "cat" ]] ["Ala", "has", "cat"] `shouldBe` 2 it "none found" $ do precisionCount [["Alice", "has", "a", "cat" ]] ["for", "bar", "baz"] `shouldBe` 0 it "multiple values" $ do precisionCount [["bar", "bar", "bar", "bar", "foo", "xyz", "foo"]] ["foo", "bar", "foo", "baz", "bar", "foo"] `shouldBe` 4 it "multiple refs" $ do precisionCount [["foo", "baz"], ["bar"], ["baz", "xyz"]] ["foo", "bar", "foo"] `shouldBe` 2 describe "purity (in flat clustering)" $ do it "the example from Information Retrieval Book" $ do purity informationRetrievalBookExample `shouldBeAlmost` 0.70588 describe "NMI (in flat clustering)" $ do it "the example from Information Retrieval Book" $ do normalizedMutualInformation informationRetrievalBookExample `shouldBeAlmost` 0.36456 it "perfect clustering" $ do normalizedMutualInformation perfectClustering `shouldBeAlmost` 1.0 it "stupid clustering with one big cluster" $ do normalizedMutualInformation stupidClusteringOneBigCluster `shouldBeAlmost` 0.0 it "stupid clustering with many small clusters" $ do normalizedMutualInformation stupidClusteringManySmallClusters `shouldBeAlmost` 0.61799 describe "NMI challenge" $ do it "complex test" $ do runGEvalTest "nmi-complex" `shouldReturnAlmost` 0.36456 describe "LogLossHashed challenge" $ do it "simple example" $ do runGEvalTest "log-loss-hashed-simple" `shouldReturnAlmost` 2.398479083333333 it "example with unnormalized values" $ do runGEvalTest "log-loss-hashed-not-normalized" `shouldReturnAlmost` 1.0468455186722887 it "with probs instead of log probs" $ do runGEvalTest "log-loss-hashed-probs" `shouldReturnAlmost` 4.11631293099392 it "with probs instead of log probs (with normalization)" $ do runGEvalTest "log-loss-hashed-probs-normalized" `shouldReturnAlmost` 1.55537749098853 it "with log probs whose probs are summing up to less than 1.0" $ do runGEvalTest "log-loss-hashed-normalization" `shouldReturnAlmost` 5.16395069238851 describe "LikelihoodHashed challenge" $ do it "example with unnormalized values" $ do runGEvalTest "likelihood-hashed-not-normalized" `shouldReturnAlmost` 0.351043364110715 describe "reading options" $ do it "can get the metric" $ do extractMetric "bleu-complex" `shouldReturn` (Just BLEU) describe "error handling" $ do it "too few lines are handled" $ do runGEvalTest "error-too-few-lines" `shouldThrow` (== TooFewLines) it "too many lines are handled" $ do runGEvalTest "error-too-many-lines" `shouldThrow` (== TooManyLines) it "empty output is handled" $ do runGEvalTest "empty-output" `shouldThrow` (== EmptyOutput) it "unexpected data is handled" $ runGEvalTest "unexpected-data" `shouldThrow` (== UnexpectedData 3 "input does not start with a digit") it "unwanted data is handled" $ runGEvalTest "unwanted-data" `shouldThrow` (== UnexpectedData 2 "number expected") describe "precision and recall" $ do it "null test" $ do precision neverMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.0 recall neverMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.0 f1Measure neverMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.0 it "basic test" $ do precision testMatchFun ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.3333333333333333 recall testMatchFun ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.66666666666666666 f1Measure testMatchFun ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.444444444444444 it "perfect result" $ do precision alwaysMatch ['a', 'b', 'c'] [0, 1, 2] `shouldBeAlmost` 1.0 recall alwaysMatch ['a', 'b', 'c'] [0, 1, 2] `shouldBeAlmost` 1.0 f1Measure alwaysMatch ['a', 'b', 'c'] [0, 1, 2] `shouldBeAlmost` 1.0 it "full match" $ do precision alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.5 recall alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 1.0 f1Measure alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3 , 4, 5] `shouldBeAlmost` 0.66666666666666 describe "ClippEU" $ do it "parsing rectangles" $ do let (Right r) = parseOnly (lineClippingsParser <* endOfInput) "2/0,0,2,3 10/20,30,40,50 18/0,1,500,3 " r `shouldBe` [Clipping (PageNumber 2) (Rectangle (Point 0 0) (Point 2 3)), Clipping (PageNumber 10) (Rectangle (Point 20 30) (Point 40 50)), Clipping (PageNumber 18) (Rectangle (Point 0 1) (Point 500 3))] it "no rectangles" $ do let (Right r) = parseOnly (lineClippingsParser <* endOfInput) "" r `shouldBe` [] it "just spaces" $ do let (Right r) = parseOnly lineClippingsParser " " r `shouldBe` [] it "parsing specs" $ do let (Right r) = parseOnly lineClippingSpecsParser " 2/0,0,2,3/5 10/20,30,40,50/10" r `shouldBe` [ClippingSpec (PageNumber 2) (Rectangle (Point 5 5) (Point 0 0)) (Rectangle (Point 0 0) (Point 7 8)), ClippingSpec (PageNumber 10) (Rectangle (Point 30 40) (Point 30 40)) (Rectangle (Point 10 20) (Point 50 60))] it "full test" $ do runGEvalTest "clippeu-simple" `shouldReturnAlmost` 0.399999999999 describe "evaluation metric specification is parsed" $ do it "for simple names" $ do let metrics = [RMSE, MSE, BLEU, Accuracy, ClippEU] let parsedMetrics = Prelude.map (read . show) metrics metrics `shouldBe` parsedMetrics it "for F-Measure" $ do read "F2" `shouldBe` (FMeasure 2.0) read "F1" `shouldBe` (FMeasure 1.0) read "F0.5" `shouldBe` (FMeasure 0.5) describe "test edit-distance library" $ do it "for handling UTF8" $ do levenshteinDistance defaultEditCosts "źdźbło" "źd好bło" `shouldBe` 1 levenshteinDistance defaultEditCosts "źdźbło" "źdźcło" `shouldBe` 1 describe "CharMatch" $ do it "simple test" $ do runGEvalTest "charmatch-simple" `shouldReturnAlmost` 0.3571428571428571 it "perfect solution" $ do runGEvalTest "charmatch-perfect" `shouldReturnAlmost` 1.0 it "more complex test" $ do runGEvalTest "charmatch-complex" `shouldReturnAlmost` 0.1923076923076923 it "broken test without input" $ do runGEvalTest "charmatch-no-input" `shouldThrow` (== NoInputFile "test/charmatch-no-input/charmatch-no-input/test-A/in.tsv") describe "MAP" $ do it "one result" $ do (calculateMAPForOneResult ["Berlin", "London", "Warsaw"] ["Warsaw", "Moscow", "Berlin", "Prague"]) `shouldBeAlmost` 0.55555555 it "check whether you cannot cheat with duplicated results" $ do (calculateMAPForOneResult ["one", "two"] ["one", "one"]) `shouldBeAlmost` 0.5 it "simple test" $ do runGEvalTest "map-simple" `shouldReturnAlmost` 0.444444444 describe "LogLoss" $ do it "simple" $ do runGEvalTest "logloss-simple" `shouldReturnAlmost` 0.31824 it "perfect" $ do runGEvalTest "logloss-perfect" `shouldReturnAlmost` 0.0 describe "Likelihood" $ do it "simple" $ do runGEvalTest "likelihood-simple" `shouldReturnAlmost` 0.72742818469866 describe "MultiLabel-F" $ do it "simple" $ do runGEvalTest "multilabel-f1-simple" `shouldReturnAlmost` 0.66666666666 it "simple F2" $ do runGEvalTest "multilabel-f2-simple" `shouldReturnAlmost` 0.441176470588235 describe "evaluating single lines" $ do it "RMSE" $ do gevalCoreOnSingleLines RMSE (LineInFile (FilePathSpec "stub1") 1 "blabla") (LineInFile (FilePathSpec "stub2") 1 "3.4") (LineInFile (FilePathSpec "stub3") 1 "2.6") `shouldReturnAlmost` 0.8 describe "BIO format" $ do it "just parse" $ do let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name" r `shouldBe` [Outside, Beginning "city" (Just "NEW_YORK"), Inside "city" Nothing, Beginning "city" (Just "KALISZ"), Inside "city" Nothing, Outside, Beginning "name" Nothing] it "simplest entity" $ do let (Right ents) = parseBioSequenceIntoEntities "B-city" ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing] it "multi-word entity" $ do let (Right ents) = parseBioSequenceIntoEntities "B-date I-date" ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" Nothing] it "multi-word entity with normalized text" $ do let (Right ents) = parseBioSequenceIntoEntities "B-date/FOO I-date/BAR" ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" (Just "FOO_BAR")] it "simplest entity with something outside" $ do let (Right ents) = parseBioSequenceIntoEntities "O B-city" ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "city" Nothing] it "another simple case" $ do let (Right ents) = parseBioSequenceIntoEntities "B-city B-city" ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing, TaggedEntity (TaggedSpan 2 2) "city" Nothing] it "just parse into entities" $ do let (Right ents) = parseBioSequenceIntoEntities "O O B-city/LOS_ANGELES I-city B-city/KLUCZBORK O B-name O B-person/JOHN I-person/VON I-person/NEUMANN" ents `shouldBe` [TaggedEntity (TaggedSpan 3 4) "city" (Just "LOS_ANGELES"), TaggedEntity (TaggedSpan 5 5) "city" (Just "KLUCZBORK"), TaggedEntity (TaggedSpan 7 7) "name" (Nothing), TaggedEntity (TaggedSpan 9 11) "person" (Just "JOHN_VON_NEUMANN")] it "another entity parse" $ do let (Right ents) = parseBioSequenceIntoEntities "B-month/JULY B-month/JULY O O B-foo/bar" ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "month" (Just "JULY"), TaggedEntity (TaggedSpan 2 2) "month" (Just "JULY"), TaggedEntity (TaggedSpan 5 5) "foo" (Just "bar")] it "another entity parse" $ do let (Right ents) = parseBioSequenceIntoEntities "B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK" ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "city" (Just "LOS_ANGELES"), TaggedEntity (TaggedSpan 4 5) "city" (Just "NEW_YORK")] it "parse entity" $ do let (Right ents) = parseBioSequenceIntoEntities "B-surname/BROWN B-surname/SMITH" ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"), TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] it "parse entity" $ do let (Right ents) = parseBioSequenceIntoEntities "O B-surname/SMITH" ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] it "check counting" $ do gatherCountsForBIO [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"), TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2) it "check F1 on a more complicated example" $ do runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625 it "check F1 on labels only" $ do runGEvalTest "bio-f1-complex-labels" `shouldReturnAlmost` 0.6666666666 it "calculate F1" $ do runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5 it "calculate F1 with underscores rather than minus signs" $ do runGEvalTest "bio-f1-simple-underscores" `shouldReturnAlmost` 0.5 it "check perfect score" $ do runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0 it "check inconsistent input" $ do runGEvalTest "bio-f1-error" `shouldThrow` (== UnexpectedData 2 "inconsistent label sequence `B-NAME/JOHN I-FOO/SMITH I-FOO/X`") describe "automatic decompression" $ do it "more complex test" $ do runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923 describe "line by line mode" $ do let sampleChallenge = GEvalSpecification { gesOutDirectory = "test/likelihood-simple/likelihood-simple-solution", gesExpectedDirectory = Just "test/likelihood-simple/likelihood-simple", gesTestName = "test-A", gesOutFile = "out.tsv", gesExpectedFile = "expected.tsv", gesInputFile = "in.tsv", gesMetrics = [Likelihood], gesPrecision = Nothing } it "simple test" $ do results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge Data.Conduit.List.consume Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo", "bar", "baz", "baq"] it "test sorting" $ do results <- runLineByLineGeneralized FirstTheWorst sampleChallenge Data.Conduit.List.consume Prelude.head (Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results) `shouldBe` "baq" describe "handle --alt-metric option" $ do it "accuracy instead of likelihood" $ do runGEvalTestExtraOptions ["--alt-metric", "Accuracy"] "likelihood-simple" `shouldReturnAlmost` 0.75 it "accuracy instead of log loss" $ do runGEvalTestExtraOptions ["--alt-metric", "Accuracy"] "log-loss-hashed-probs" `shouldReturnAlmost` 0.4 describe "smart sources" $ do it "smart specs are obtained" $ do getSmartSourceSpec "foo" "" "" `shouldReturn` Left NoSpecGiven getSmartSourceSpec "foo" "out.tsv" "-" `shouldReturn` Right Stdin getSmartSourceSpec "foo" "out.sv" "http://gonito.net/foo" `shouldReturn` (Right $ Http "http://gonito.net/foo") getSmartSourceSpec "foo" "in.tsv" "https://gonito.net" `shouldReturn` (Right $ Https "https://gonito.net") it "sources are accessed" $ do readFromSmartSource "baz" "out.tsv" "test/files/foo.txt" `shouldReturn` ["foo\n"] readFromSmartSource "" "" "https://httpbin.org/robots.txt" `shouldReturn` ["User-agent: *\nDisallow: /deny\n"] describe "parse model params from filenames" $ do it "no params 1" $ do parseParamsFromFilePath "out.tsv" `shouldBe` OutputFileParsed "out" Data.Map.Strict.empty it "no params 2" $ do parseParamsFromFilePath "out.tsv.xz" `shouldBe` OutputFileParsed "out" Data.Map.Strict.empty it "no params 3" $ do parseParamsFromFilePath "out-test-foo_bar.tsv" `shouldBe` OutputFileParsed "out-test-foo_bar" Data.Map.Strict.empty it "one parameter" $ do parseParamsFromFilePath "out-nb_epochs=123.tsv" `shouldBe` OutputFileParsed "out" (Data.Map.Strict.fromList [("nb_epochs", "123")]) it "complex" $ do parseParamsFromFilePath "out-nb_epochs = 12,foo=off, bar-baz =10.tsv" `shouldBe` OutputFileParsed "out" (Data.Map.Strict.fromList [("nb_epochs", "12"), ("foo", "off"), ("bar-baz", "10")]) it "empty val" $ do parseParamsFromFilePath "out-nb_epochs=1,foo=,bar-baz=8.tsv" `shouldBe` OutputFileParsed "out" (Data.Map.Strict.fromList [("nb_epochs", "1"), ("foo", ""), ("bar-baz", "8")]) describe "ranking" $ do it "simple case" $ do checkConduitPure (rank (\(a,_) (b,_) -> a < b)) [(3.0::Double, "foo"::String), (10.0, "bar"), (12.0, "baz")] [(1.0, (3.0::Double, "foo"::String)), (2.0, (10.0, "bar")), (3.0, (12.0, "baz"))] it "one item" $ do checkConduitPure (rank (\(a,_) (b,_) -> a < b)) [(5.0::Double, "foo"::String)] [(1.0, (5.0::Double, "foo"::String))] it "take between" $ do checkConduitPure (rank (<)) [3.0::Double, 5.0, 5.0, 10.0] [(1.0::Double, 3.0), (2.5, 5.0), (2.5, 5.0), (4.0, 10.0)] checkConduitPure conduit inList expList = do let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList mapM_ (\(o,e) -> (fst o) `shouldBeAlmost` (fst e)) $ Prelude.zip outList expList readFromSmartSource :: FilePath -> FilePath -> String -> IO [String] readFromSmartSource defaultDir defaultFile specS = do (Right spec) <- getSmartSourceSpec defaultDir defaultFile specS let source = smartSource spec contents <- runResourceT (source $$ CT.decodeUtf8Lenient =$ CL.consume) return $ Prelude.map unpack contents neverMatch :: Char -> Int -> Bool neverMatch _ _ = False alwaysMatch :: Char -> Int -> Bool alwaysMatch _ _ = True testMatchFun :: Char -> Int -> Bool testMatchFun 'a' 1 = True testMatchFun 'a' 2 = True testMatchFun 'a' 3 = True testMatchFun 'b' 1 = True testMatchFun 'c' 1 = True testMatchFun _ _ = False extractVal :: (Either (ParserResult GEvalOptions) (Maybe [(SourceSpec, [MetricValue])])) -> IO MetricValue extractVal (Right (Just ([(_, val:_)]))) = return val runGEvalTest = runGEvalTestExtraOptions [] runGEvalTestExtraOptions extraOptions testName = (runGEval ([ "--expected-directory", "test/" ++ testName ++ "/" ++ testName, "--out-directory", "test/" ++ testName ++ "/" ++ testName ++ "-solution"] ++ extraOptions)) >>= extractVal extractMetric :: String -> IO (Maybe Metric) extractMetric testName = do result <- getOptions ["--expected-directory", "test/" ++ testName ++ "/" ++ testName] return $ case result of Left _ -> Nothing Right opts -> Just $ gesMainMetric $ geoSpec opts class AEq a where (=~) :: a -> a -> Bool instance AEq Double where x =~ y = abs ( x - y ) < (1.0e-4 :: Double) (@=~?) :: (Show a, AEq a) => a -> a -> HU.Assertion (@=~?) actual expected = expected =~ actual HU.@? assertionMsg where assertionMsg = "Expected : " ++ show expected ++ "\nActual : " ++ show actual shouldBeAlmost got expected = got @=~? expected shouldReturnAlmost :: (AEq a, Show a, Eq a) => IO a -> a -> Expectation shouldReturnAlmost action expected = action >>= (@=~? expected)