From 03aacdef98ed566074e21c49ca942fbfe1b12c8f Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Sun, 17 Nov 2019 21:59:20 +0100 Subject: [PATCH] Add SegmentAccuracy metric --- src/GEval/Annotation.hs | 40 ++++++++++++++++++- src/GEval/Core.hs | 7 ++++ src/GEval/CreateChallenge.hs | 28 +++++++++++++ src/GEval/Metric.hs | 5 ++- src/GEval/MetricsMeta.hs | 33 ++++++++++++--- test/Spec.hs | 8 ++++ .../test-A/out.tsv | 3 ++ .../segment-accuracy-simple/config.txt | 1 + .../test-A/expected.tsv | 3 ++ 9 files changed, 119 insertions(+), 9 deletions(-) create mode 100644 test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv create mode 100644 test/segment-accuracy-simple/segment-accuracy-simple/config.txt create mode 100644 test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv diff --git a/src/GEval/Annotation.hs b/src/GEval/Annotation.hs index abc8b59..950c93d 100644 --- a/src/GEval/Annotation.hs +++ b/src/GEval/Annotation.hs @@ -4,11 +4,12 @@ module GEval.Annotation (parseAnnotations, Annotation(..), parseObtainedAnnotations, ObtainedAnnotation(..), - matchScore, intSetParser) + matchScore, intSetParser, segmentAccuracy, parseSegmentAnnotations) where import qualified Data.IntSet as IS import qualified Data.Text as T +import Data.Set (intersection, fromList) import Data.Attoparsec.Text import Data.Attoparsec.Combinator @@ -17,11 +18,12 @@ import GEval.Common (sepByWhitespaces, (/.)) import GEval.Probability import Data.Char import Data.Maybe (fromMaybe) +import Data.Either (partitionEithers) import GEval.PrecisionRecall(weightedMaxMatching) data Annotation = Annotation T.Text IS.IntSet - deriving (Eq, Show) + deriving (Eq, Show, Ord) data ObtainedAnnotation = ObtainedAnnotation Annotation Double deriving (Eq, Show) @@ -52,6 +54,36 @@ obtainedAnnotationParser = do parseAnnotations :: T.Text -> Either String [Annotation] parseAnnotations t = parseOnly (annotationsParser <* endOfInput) t +parseSegmentAnnotations :: T.Text -> Either 
String [Annotation] +parseSegmentAnnotations t = case parseAnnotationsWithColons t of + Left m -> Left m + Right annotations -> if areSegmentsDisjoint annotations + then (Right annotations) + else (Left "Overlapping segments") + +areSegmentsDisjoint :: [Annotation] -> Bool +areSegmentsDisjoint = areIntSetsDisjoint . map (\(Annotation _ s) -> s) + +areIntSetsDisjoint :: [IS.IntSet] -> Bool +areIntSetsDisjoint ss = snd $ foldr step (IS.empty, True) ss + where step _ w@(_, False) = w + step s (u, True) = (s `IS.union` u, s `IS.disjoint` u) + +-- unfortunately, attoparsec does not seem to back-track properly +-- so we need a special function if labels can contain colons +parseAnnotationsWithColons :: T.Text -> Either String [Annotation] +parseAnnotationsWithColons t = case partitionEithers (map parseAnnotationWithColons $ T.words t) of + ([], annotations) -> Right annotations + ((firstProblem:_), _) -> Left firstProblem + +parseAnnotationWithColons :: T.Text -> Either String Annotation +parseAnnotationWithColons t = if T.null label + then Left "Colon expected" + else case parseOnly (intSetParser <* endOfInput) position of + Left m -> Left m + Right s -> Right (Annotation (T.init label) s) + where (label, position) = T.breakOnEnd ":" t + annotationsParser :: Parser [Annotation] annotationsParser = sepByWhitespaces annotationParser @@ -70,3 +102,7 @@ intervalParser = do startIx <- decimal endIx <- (string "-" *> decimal <|> pure startIx) pure $ IS.fromList [startIx..endIx] + +segmentAccuracy :: [Annotation] -> [Annotation] -> Double +segmentAccuracy expected output = if null expected then 1.0 {- no expected segments: nothing to miss, and this avoids 0/0 = NaN poisoning the per-item average -} else (fromIntegral $ length matched) / (fromIntegral $ length expected) + where matched = (fromList expected) `intersection` (fromList output) diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs index 1897fb7..4611671 100644 --- a/src/GEval/Core.hs +++ b/src/GEval/Core.hs @@ -706,6 +706,13 @@ gevalCoreOnSources TokenAccuracy _ = gevalCoreWithoutInput intoTokens | otherwise = (h, t + 1) hitsAndTotalsAgg = CC.foldl
(\(h1, t1) (h2, t2) -> (h1 + h2, t1 + t2)) (0, 0) +gevalCoreOnSources SegmentAccuracy _ = gevalCoreWithoutInput parseSegmentAnnotations + parseSegmentAnnotations + (uncurry segmentAccuracy) + averageC + id + noGraph + gevalCoreOnSources MultiLabelLogLoss _ = gevalCoreWithoutInput intoWords (Right . parseIntoProbList) (uncurry countLogLossOnProbList) diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs index 8b501a3..3a915e5 100644 --- a/src/GEval/CreateChallenge.hs +++ b/src/GEval/CreateChallenge.hs @@ -297,6 +297,19 @@ in the expected file (but not in the output file). |] ++ (commonReadmeMDContents testName) +readmeMDContents SegmentAccuracy testName = [i| +Segment a sentence and tag with POS tags +======================================== + +This is a sample, toy challenge for SegmentAccuracy. + +For each sentence, give a sequence of POS tags, each one with +its position (1-indexed). For instance, `N:1-10` means a noun +starting from the beginning (the first character) up to the tenth +character (inclusively).
+ +|] ++ (commonReadmeMDContents testName) + readmeMDContents (ProbabilisticMultiLabelFMeasure beta) testName = readmeMDContents (MultiLabelFMeasure beta) testName readmeMDContents (MultiLabelFMeasure beta) testName = [i| Tag names and their component @@ -473,6 +486,9 @@ B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman trainContents TokenAccuracy = [hereLit|* V N I like cats * * V * N I can see the rainbow |] +trainContents SegmentAccuracy = [hereLit|Art:1-3 N:5-11 V:12-13 A:15-19 The student's smart +N:1-6 N:8-10 V:12-13 A:15-18 Mary's dog is nice +|] trainContents (ProbabilisticMultiLabelFMeasure beta) = trainContents (MultiLabelFMeasure beta) trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith person/3,4,5 first-name/4 surname/5 Steven bloody Brown person/1,3 first-name/1 surname/3 @@ -540,6 +556,9 @@ Mr Jan Kowalski devInContents TokenAccuracy = [hereLit|The cats on the mat Ala has a cat |] +devInContents SegmentAccuracy = [hereLit|John is smart +Mary's intelligent +|] devInContents (ProbabilisticMultiLabelFMeasure beta) = devInContents (MultiLabelFMeasure beta) devInContents (MultiLabelFMeasure _) = [hereLit|Jan Kowalski is here I see him @@ -604,6 +623,9 @@ O B-firstname/JAN B-surname/KOWALSKI devExpectedContents TokenAccuracy = [hereLit|* N * * N N V * N |] +devExpectedContents SegmentAccuracy = [hereLit|N:1-4 V:6-7 A:9-13 +N:1-4 V:5-6 A:8-18 +|] devExpectedContents (ProbabilisticMultiLabelFMeasure beta) = devExpectedContents (MultiLabelFMeasure beta) devExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,2 first-name/1 surname/2 @@ -673,6 +695,9 @@ No name here testInContents TokenAccuracy = [hereLit|I have cats I know |] +testInContents SegmentAccuracy = [hereLit|Mary's cat is old +John is young +|] testInContents (ProbabilisticMultiLabelFMeasure beta) = testInContents (MultiLabelFMeasure beta) testInContents (MultiLabelFMeasure _) = [hereLit|John bloody Smith Nobody is there @@ -738,6 +763,9 @@ O O O
testExpectedContents TokenAccuracy = [hereLit|* V N * V |] +testExpectedContents SegmentAccuracy = [hereLit|N:1-6 N:8-10 V:12-13 A:15-17 +N:1-4 V:6-7 A:9-13 +|] testExpectedContents (ProbabilisticMultiLabelFMeasure beta) = testExpectedContents (MultiLabelFMeasure beta) testExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,3 first-name/1 surname/3 diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs index 0a53a61..b87c599 100644 --- a/src/GEval/Metric.hs +++ b/src/GEval/Metric.hs @@ -26,7 +26,7 @@ import Data.Attoparsec.Text (parseOnly) data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | ClippEU | FMeasure Double | MacroFMeasure Double | NMI | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood - | BIOF1 | BIOF1Labels | TokenAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double + | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double | MultiLabelLogLoss | MultiLabelLikelihood | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double | ProbabilisticSoftFMeasure Double | Soft2DFMeasure Double deriving (Eq) @@ -67,6 +67,7 @@ instance Show Metric where show BIOF1 = "BIO-F1" show BIOF1Labels = "BIO-F1-Labels" show TokenAccuracy = "TokenAccuracy" + show SegmentAccuracy = "SegmentAccuracy" show MAE = "MAE" show SMAPE = "SMAPE" show (MultiLabelFMeasure beta) = "MultiLabel-F" ++ (show beta) @@ -118,6 +119,7 @@ instance Read Metric where readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)] readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)] readsPrec _ ('T':'o':'k':'e':'n':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(TokenAccuracy, theRest)] + readsPrec _ ('S':'e':'g':'m':'e':'n':'t':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(SegmentAccuracy, theRest)] readsPrec _ ('M':'A':'E':theRest) = [(MAE, theRest)] readsPrec _ ('S':'M':'A':'P':'E':theRest) = [(SMAPE, theRest)] 
readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'o':'g':'L':'o':'s':'s':theRest) = [(MultiLabelLogLoss, theRest)] @@ -154,6 +156,7 @@ getMetricOrdering Likelihood = TheHigherTheBetter getMetricOrdering BIOF1 = TheHigherTheBetter getMetricOrdering BIOF1Labels = TheHigherTheBetter getMetricOrdering TokenAccuracy = TheHigherTheBetter +getMetricOrdering SegmentAccuracy = TheHigherTheBetter getMetricOrdering MAE = TheLowerTheBetter getMetricOrdering SMAPE = TheLowerTheBetter getMetricOrdering (MultiLabelFMeasure _) = TheHigherTheBetter diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs index 8747bd9..21659ab 100644 --- a/src/GEval/MetricsMeta.hs +++ b/src/GEval/MetricsMeta.hs @@ -63,6 +63,7 @@ listOfAvailableMetrics = [RMSE, BIOF1, BIOF1Labels, TokenAccuracy, + SegmentAccuracy, SoftFMeasure 1.0, SoftFMeasure 2.0, SoftFMeasure 0.25, @@ -94,6 +95,7 @@ isMetricDescribed (SoftFMeasure _) = True isMetricDescribed (Soft2DFMeasure _) = True isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True isMetricDescribed GLEU = True +isMetricDescribed SegmentAccuracy = True isMetricDescribed _ = False getEvaluationSchemeDescription :: EvaluationScheme -> String @@ -134,7 +136,11 @@ metric on a corpus level but does not have its drawbacks for our per sentence reward objective. see: https://arxiv.org/pdf/1609.08144.pdf |] - +getMetricDescription SegmentAccuracy = + [i|Accuracy counted for segments, i.e. labels with positions. +The percentage of labels in the ground truth retrieved in the actual output is returned. +Accuracy is calculated separately for each item and then averaged. 
+|] outContents :: Metric -> String outContents (SoftFMeasure _) = [hereLit|inwords:1-4 @@ -147,7 +153,10 @@ outContents (ProbabilisticMultiLabelFMeasure _) = [hereLit|first-name/1:0.8 surn surname/1:0.4 first-name/3:0.9 |] -outContents GLEU = [hereLit|Alice has a black +outContents GLEU = [hereLit|Alice has a black +|] +outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17 +N:1-4 V:6-7 A:9-13 |] expectedScore :: EvaluationScheme -> MetricValue @@ -165,6 +174,8 @@ expectedScore (EvaluationScheme (ProbabilisticMultiLabelFMeasure beta) []) in weightedHarmonicMean beta precision recall expectedScore (EvaluationScheme GLEU []) = 0.7142857142857143 +expectedScore (EvaluationScheme SegmentAccuracy []) + = 0.875 helpMetricParameterMetricsList :: String helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of @@ -213,7 +224,14 @@ the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page numbe formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed. |] -formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words. +formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words. +|] +formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas the position is a list of +1-based indexes or spans separated by commas (spans are inclusive +ranges, e.g. "10-14"). For instance, "foo:bar:2,4-7,10" is a +label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that no +overlapping segments can be returned (evaluation will fail in +such a case).
|] scoreExplanation :: EvaluationScheme -> Maybe String @@ -227,13 +245,16 @@ Hence, recall is 247500/902500=0.274 and precision - 247500/(20000+912000+240000 for the second item is 0.238 and the F-score for the whole set is (0 + 0.238)/2 = 0.119.|] scoreExplanation (EvaluationScheme (ProbabilisticMultiLabelFMeasure _) []) = Nothing scoreExplanation (EvaluationScheme GLEU []) - = Just [hereLit|To find out GLEU score we first count number of tp (true positives) fp(false positives) and fn(false negatives). + = Just [hereLit|To find out GLEU score we first count number of tp (true positives) fp(false positives) and fn(false negatives). We have 4 matching unigrams ("Alice", "has", "a", "black") , 3 bigrams ("Alice has", "has a", "a black"), 2 trigrams ("Alice has a", "has a black") and 1 tetragram ("Alice has a black"), -so tp=10. We have no fp, therefore fp=0. There are 4 fn - ("cat", "black cat", "a black cat", "has a black cat"). +so tp=10. We have no fp, therefore fp=0. There are 4 fn - ("cat", "black cat", "a black cat", "has a black cat"). Now we have to calculate precision and recall: - Precision is tp / (tp+fp) = 10/(10+0) = 1, + Precision is tp / (tp+fp) = 10/(10+0) = 1, recall is tp / (tp+fn) = 10 / (10+4) = 10/14 =~ 0.71428... The GLEU score is min(precision,recall)=0.71428 |] +scoreExplanation (EvaluationScheme SegmentAccuracy []) + = Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correctly (accuracy is 3/4=0.75). +The second item was retrieved perfectly (accuracy is 1.0).
Hence, the average is (0.75+1.0)/2=0.875.|] pasteLines :: String -> String -> String pasteLines a b = printf "%-35s %s\n" a b diff --git a/test/Spec.hs b/test/Spec.hs index 1fccc45..dc68beb 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -146,6 +146,9 @@ main = hspec $ do describe "TokenAccuracy" $ do it "simple example" $ do runGEvalTest "token-accuracy-simple" `shouldReturnAlmost` 0.5 + describe "SegmentAccuracy" $ do + it "simple test" $ do + runGEvalTest "segment-accuracy-simple" `shouldReturnAlmost` 0.4444444 describe "precision count" $ do it "simple test" $ do precisionCount [["Alice", "has", "a", "cat" ]] ["Ala", "has", "cat"] `shouldBe` 2 @@ -342,6 +345,11 @@ main = hspec $ do it "just parse" $ do parseAnnotations "foo:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo" (IS.fromList [3,7,8,9,10]), Annotation "baz" (IS.fromList [4,5,6])] + it "just parse with colons" $ do + parseSegmentAnnotations "foo:x:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo:x" (IS.fromList [3,7,8,9,10]), + Annotation "baz" (IS.fromList [4,5,6])] + it "reject overlapping segments" $ do + parseSegmentAnnotations "foo:x:3,7-10 baz:2-6" `shouldBe` Left "Overlapping segments" it "just parse 2" $ do parseAnnotations "inwords:1-3 indigits:5" `shouldBe` Right [Annotation "inwords" (IS.fromList [1,2,3]), Annotation "indigits" (IS.fromList [5])] diff --git a/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv b/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv new file mode 100644 index 0000000..4af8b51 --- /dev/null +++ b/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv @@ -0,0 +1,3 @@ +foo:0 baq:1-2 baz:3 +aaa:0-1 +xyz:0 bbb:x:1 diff --git a/test/segment-accuracy-simple/segment-accuracy-simple/config.txt b/test/segment-accuracy-simple/segment-accuracy-simple/config.txt new file mode 100644 index 0000000..2f838f0 --- /dev/null +++ b/test/segment-accuracy-simple/segment-accuracy-simple/config.txt @@ -0,0 +1 @@
+--metric SegmentAccuracy diff --git a/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv b/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv new file mode 100644 index 0000000..bc95bcb --- /dev/null +++ b/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv @@ -0,0 +1,3 @@ +foo:0 bar:1-2 baz:3 +aaa:0-2 +xyz:0 bbb:x:1 ccc:x:2