Add SegmentAccuracy metric

2019-11-17 21:59:20 +01:00 · 2019-11-17 21:59:20 +01:00 · 03aacdef98
commit 03aacdef98
parent fb74f568bb
9 changed files with 119 additions and 9 deletions
--- a/src/GEval/Annotation.hs
+++ b/src/GEval/Annotation.hs
@ -4,11 +4,12 @@
 module GEval.Annotation
       (parseAnnotations, Annotation(..),
        parseObtainedAnnotations, ObtainedAnnotation(..),
-        matchScore, intSetParser)
+        matchScore, intSetParser, segmentAccuracy, parseSegmentAnnotations)
       where
 import qualified Data.IntSet as IS
 import qualified Data.Text as T
 import Data.Set (intersection, fromList)
 import Data.Attoparsec.Text
 import Data.Attoparsec.Combinator
@ -17,11 +18,12 @@ import GEval.Common (sepByWhitespaces, (/.))
 import GEval.Probability
 import Data.Char
 import Data.Maybe (fromMaybe)
 import Data.Either (partitionEithers)
 import GEval.PrecisionRecall(weightedMaxMatching)
 data Annotation = Annotation T.Text IS.IntSet
-                  deriving (Eq, Show)
+                  deriving (Eq, Show, Ord)
 data ObtainedAnnotation = ObtainedAnnotation Annotation Double
                          deriving (Eq, Show)
@ -52,6 +54,36 @@ obtainedAnnotationParser = do
 parseAnnotations :: T.Text -> Either String [Annotation]
 parseAnnotations t = parseOnly (annotationsParser <* endOfInput) t
 parseSegmentAnnotations :: T.Text -> Either String [Annotation]
 parseSegmentAnnotations t = case parseAnnotationsWithColons t of
  Left m -> Left m
  Right annotations -> if areSegmentsDisjoint annotations
                      then (Right annotations)
                      else (Left "Overlapping segments")
 areSegmentsDisjoint :: [Annotation] -> Bool
 areSegmentsDisjoint = areIntSetsDisjoint . map (\(Annotation _ s) -> s)
 areIntSetsDisjoint :: [IS.IntSet] -> Bool
 areIntSetsDisjoint ss = snd $ foldr step (IS.empty, True) ss
  where step _ w@(_, False) = w
        step s (u, True) = (s `IS.union` u, s `IS.disjoint` u)
 -- unfortunately, attoparsec does not seem to back-track properly
 -- so we need a special function if labels can contain colons
 parseAnnotationsWithColons :: T.Text -> Either String [Annotation]
 parseAnnotationsWithColons t = case partitionEithers (map parseAnnotationWithColons $ T.words t) of
  ([], annotations) -> Right annotations
  ((firstProblem:_), _) -> Left firstProblem
 parseAnnotationWithColons :: T.Text -> Either String Annotation
 parseAnnotationWithColons t = if T.null label
                              then Left "Colon expected"
                              else case parseOnly (intSetParser <* endOfInput) position of
                                     Left m -> Left m
                                     Right s -> Right (Annotation (T.init label) s)
  where (label, position) = T.breakOnEnd ":" t
 annotationsParser :: Parser [Annotation]
 annotationsParser = sepByWhitespaces annotationParser
@ -70,3 +102,7 @@ intervalParser = do
  startIx <- decimal
  endIx <- (string "-" *> decimal <|> pure startIx)
  pure $ IS.fromList [startIx..endIx]
 segmentAccuracy :: [Annotation] -> [Annotation] -> Double
 segmentAccuracy expected output = (fromIntegral $ length matched) / (fromIntegral $ length expected)
  where matched = (fromList expected) `intersection` (fromList output)
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -706,6 +706,13 @@ gevalCoreOnSources TokenAccuracy _ = gevalCoreWithoutInput intoTokens
           | otherwise = (h, t + 1)
         hitsAndTotalsAgg = CC.foldl (\(h1, t1) (h2, t2) -> (h1 + h2, t1 + t2)) (0, 0)
 gevalCoreOnSources SegmentAccuracy _ = gevalCoreWithoutInput parseSegmentAnnotations
                                                             parseSegmentAnnotations
                                                             (uncurry segmentAccuracy)
                                                             averageC
                                                             id
                                                             noGraph
 gevalCoreOnSources MultiLabelLogLoss _ = gevalCoreWithoutInput intoWords
                                                       (Right . parseIntoProbList)
                                                       (uncurry countLogLossOnProbList)
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@ -297,6 +297,19 @@ in the expected file (but not in the output file).
 |] ++ (commonReadmeMDContents testName)
 readmeMDContents SegmentAccuracy testName = [i|
 Segment a sentence and tag with POS tags
 ========================================
 This is a sample, toy challenge for SegmentAccuracy.
 For each sentence, give a sequence of POS tags, each one with
 its position (1-indexed). For instance, `N:1-10` means a nouns
 starting from the beginning (the first character) up to to the tenth
 character (inclusively).
 |] ++ (commonReadmeMDContents testName)
 readmeMDContents (ProbabilisticMultiLabelFMeasure beta) testName = readmeMDContents (MultiLabelFMeasure beta) testName
 readmeMDContents (MultiLabelFMeasure beta) testName = [i|
 Tag names and their component
@ -473,6 +486,9 @@ B-firstname/JOHN I-surname/VON I-surname/NEUMANN	John von Nueman
 trainContents TokenAccuracy = [hereLit|* V N	I like cats
 * * V * N	I can see the rainbow
 |]
 trainContents SegmentAccuracy = [hereLit|Art:1-3 N:5-11 V:12-13 A:15-19	The student's smart
 N:1-6 N:8-10 V:12-13 A:15-18	Mary's dog is nice
 |]
 trainContents (ProbabilisticMultiLabelFMeasure beta) = trainContents (MultiLabelFMeasure beta)
 trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith	person/3,4,5 first-name/4 surname/5
 Steven bloody Brown	person/1,3 first-name/1 surname/3
@ -540,6 +556,9 @@ Mr Jan Kowalski
 devInContents TokenAccuracy = [hereLit|The cats on the mat
 Ala has a cat
 |]
 devInContents SegmentAccuracy = [hereLit|John is smart
 Mary's intelligent
 |]
 devInContents (ProbabilisticMultiLabelFMeasure beta) = devInContents (MultiLabelFMeasure beta)
 devInContents (MultiLabelFMeasure _) = [hereLit|Jan Kowalski is here
 I see him
@ -604,6 +623,9 @@ O B-firstname/JAN B-surname/KOWALSKI
 devExpectedContents TokenAccuracy = [hereLit|* N * * N
 N V * N
 |]
 devExpectedContents SegmentAccuracy = [hereLit|N:1-4 V:6-7 A:9-13
 N:1-4 V:6-7 A:9-19
 |]
 devExpectedContents (ProbabilisticMultiLabelFMeasure beta) = devExpectedContents (MultiLabelFMeasure beta)
 devExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,2 first-name/1 surname/2
@ -673,6 +695,9 @@ No name here
 testInContents TokenAccuracy = [hereLit|I have cats
 I know
 |]
 testInContents SegmentAccuracy = [hereLit|Mary's cat is old
 John is young
 |]
 testInContents (ProbabilisticMultiLabelFMeasure beta) = testInContents (MultiLabelFMeasure beta)
 testInContents (MultiLabelFMeasure _) = [hereLit|John bloody Smith
 Nobody is there
@ -738,6 +763,9 @@ O O O
 testExpectedContents TokenAccuracy = [hereLit|* V N
 * V
 |]
 testExpectedContents SegmentAccuracy = [hereLit|N:1-6 N:8-10 V:12-13 A:15-17
 N:1-4 V:6-7 A:9-13
 |]
 testExpectedContents (ProbabilisticMultiLabelFMeasure beta) = testExpectedContents (MultiLabelFMeasure beta)
 testExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,3 first-name/1 surname/3
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@ -26,7 +26,7 @@ import Data.Attoparsec.Text (parseOnly)
 data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | ClippEU
              | FMeasure Double | MacroFMeasure Double | NMI
              | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
-              | BIOF1 | BIOF1Labels | TokenAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
+              | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
              | MultiLabelLogLoss | MultiLabelLikelihood
              | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double | ProbabilisticSoftFMeasure Double | Soft2DFMeasure Double
              deriving (Eq)
@ -67,6 +67,7 @@ instance Show Metric where
  show BIOF1 = "BIO-F1"
  show BIOF1Labels = "BIO-F1-Labels"
  show TokenAccuracy = "TokenAccuracy"
  show SegmentAccuracy = "SegmentAccuracy"
  show MAE = "MAE"
  show SMAPE = "SMAPE"
  show (MultiLabelFMeasure beta) = "MultiLabel-F" ++ (show beta)
@ -118,6 +119,7 @@ instance Read Metric where
  readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)]
  readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
  readsPrec _ ('T':'o':'k':'e':'n':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(TokenAccuracy, theRest)]
  readsPrec _ ('S':'e':'g':'m':'e':'n':'t':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(SegmentAccuracy, theRest)]
  readsPrec _ ('M':'A':'E':theRest) = [(MAE, theRest)]
  readsPrec _ ('S':'M':'A':'P':'E':theRest) = [(SMAPE, theRest)]
  readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'o':'g':'L':'o':'s':'s':theRest) = [(MultiLabelLogLoss, theRest)]
@ -154,6 +156,7 @@ getMetricOrdering Likelihood = TheHigherTheBetter
 getMetricOrdering BIOF1 = TheHigherTheBetter
 getMetricOrdering BIOF1Labels = TheHigherTheBetter
 getMetricOrdering TokenAccuracy = TheHigherTheBetter
 getMetricOrdering SegmentAccuracy = TheHigherTheBetter
 getMetricOrdering MAE = TheLowerTheBetter
 getMetricOrdering SMAPE = TheLowerTheBetter
 getMetricOrdering (MultiLabelFMeasure _) = TheHigherTheBetter
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@ -63,6 +63,7 @@ listOfAvailableMetrics = [RMSE,
                          BIOF1,
                          BIOF1Labels,
                          TokenAccuracy,
                          SegmentAccuracy,
                          SoftFMeasure 1.0,
                          SoftFMeasure 2.0,
                          SoftFMeasure 0.25,
@ -94,6 +95,7 @@ isMetricDescribed (SoftFMeasure _) = True
 isMetricDescribed (Soft2DFMeasure _) = True
 isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
 isMetricDescribed GLEU = True
 isMetricDescribed SegmentAccuracy = True
 isMetricDescribed _ = False
 getEvaluationSchemeDescription :: EvaluationScheme -> String
@ -134,7 +136,11 @@ metric on a corpus level but does not have its drawbacks for our per
 sentence reward objective.
 see: https://arxiv.org/pdf/1609.08144.pdf
 |]
-
+getMetricDescription SegmentAccuracy =
  [i|Accuracy counted for segments, i.e. labels with positions.
 The percentage of labels in the ground truth retrieved in the actual output is returned.
 Accuracy is calculated separately for each item and then averaged.
 |]
 outContents :: Metric -> String
 outContents (SoftFMeasure _) = [hereLit|inwords:1-4
@ -147,7 +153,10 @@ outContents (ProbabilisticMultiLabelFMeasure _) = [hereLit|first-name/1:0.8 surn
 surname/1:0.4
 first-name/3:0.9
 |]
-outContents GLEU = [hereLit|Alice has a black 
+outContents GLEU = [hereLit|Alice has a black
 |]
 outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17
 N:1-4 V:6-7 A:9-13
 |]
 expectedScore :: EvaluationScheme -> MetricValue
@ -165,6 +174,8 @@ expectedScore (EvaluationScheme (ProbabilisticMultiLabelFMeasure beta) [])
      in weightedHarmonicMean beta precision recall
 expectedScore (EvaluationScheme GLEU [])
  = 0.7142857142857143
 expectedScore (EvaluationScheme SegmentAccuracy [])
  = 0.875
 helpMetricParameterMetricsList :: String
 helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@ -213,7 +224,14 @@ the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page numbe
 formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability
 can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
 |]
-formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words. 
+formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words.
 |]
 formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas is a list of
 1-based indexes or spans separated by commas (spans are inclusive
 ranges, e.g. "10-14"). For instance, "foo:bar:2,4-7,10" is a
 label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that no
 overlapping segments can be returned (evaluation will fail in
 such a case).
 |]
 scoreExplanation :: EvaluationScheme -> Maybe String
@ -227,13 +245,16 @@ Hence, recall is 247500/902500=0.274 and precision - 247500/(20000+912000+240000
 for the second item is 0.238 and the F-score for the whole set is (0 + 0.238)/2 = 0.119.|]
 scoreExplanation (EvaluationScheme (ProbabilisticMultiLabelFMeasure _) []) = Nothing
 scoreExplanation (EvaluationScheme GLEU [])
-  = Just [hereLit|To find out GLEU score we first count number of tp (true positives) fp(false positives) and fn(false negatives). 
+  = Just [hereLit|To find out GLEU score we first count number of tp (true positives) fp(false positives) and fn(false negatives).
  We have 4 matching unigrams ("Alice", "has", "a", "black") , 3 bigrams ("Alice has", "has a", "a black"), 2 trigrams ("Alice has a", "has a black") and 1 tetragram ("Alice has a black"),
-so tp=10. We have no fp, therefore fp=0. There are 4 fn - ("cat", "black cat", "a black cat", "has a black cat"). 
+so tp=10. We have no fp, therefore fp=0. There are 4 fn - ("cat", "black cat", "a black cat", "has a black cat").
 Now we have to calculate precision and recall:
-  Precision is tp / (tp+fp) = 10/(10+0) = 1, 
+  Precision is tp / (tp+fp) = 10/(10+0) = 1,
  recall is tp / (tp+fn) = 10 / (10+4) = 10/14 =~ 0.71428...
  The GLEU score is min(precision,recall)=0.71428 |]
 scoreExplanation (EvaluationScheme SegmentAccuracy [])
  = Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correcly (accuracy is 3/4=0.75).
 The second item was retrieved perfectly (accuracy is 1.0). Hence, the average is (0.75+1.0)/2=0.875.|]
 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -146,6 +146,9 @@ main = hspec $ do
  describe "TokenAccuracy" $ do
    it "simple example" $ do
       runGEvalTest "token-accuracy-simple" `shouldReturnAlmost` 0.5
  describe "SegmentAccuracy" $ do
    it "simple test" $ do
      runGEvalTest "segment-accuracy-simple" `shouldReturnAlmost` 0.4444444
  describe "precision count" $ do
    it "simple test" $ do
      precisionCount [["Alice", "has", "a", "cat" ]] ["Ala", "has", "cat"] `shouldBe` 2
@ -342,6 +345,11 @@ main = hspec $ do
    it "just parse" $ do
      parseAnnotations "foo:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo" (IS.fromList [3,7,8,9,10]),
                                                              Annotation "baz" (IS.fromList [4,5,6])]
    it "just parse wit colons" $ do
      parseSegmentAnnotations "foo:x:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo:x" (IS.fromList [3,7,8,9,10]),
                                                                       Annotation "baz" (IS.fromList [4,5,6])]
    it "just parse wit colons" $ do
      parseSegmentAnnotations "foo:x:3,7-10 baz:2-6" `shouldBe` Left "Overlapping segments"
    it "just parse 2" $ do
      parseAnnotations "inwords:1-3 indigits:5" `shouldBe` Right [Annotation "inwords" (IS.fromList [1,2,3]),
                                                                  Annotation "indigits" (IS.fromList [5])]
--- a/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv
+++ b/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv
@ -0,0 +1,3 @@
 foo:0 baq:1-2 baz:3
 aaa:0-1
 xyz:0 bbb:x:1
--- a/test/segment-accuracy-simple/segment-accuracy-simple/config.txt
+++ b/test/segment-accuracy-simple/segment-accuracy-simple/config.txt
@ -0,0 +1 @@
 --metric SegmentAccuracy
--- a/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv
+++ b/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv
@ -0,0 +1,3 @@
 foo:0 bar:1-2 baz:3
 aaa:0-2
 xyz:0 bbb:x:1 ccc:x:2