From 03aacdef98ed566074e21c49ca942fbfe1b12c8f Mon Sep 17 00:00:00 2001
From: Filip Gralinski <filipg@amu.edu.pl>
Date: Sun, 17 Nov 2019 21:59:20 +0100
Subject: [PATCH] Add SegmentAccuracy metric

---
 src/GEval/Annotation.hs                       | 40 ++++++++++++++++++-
 src/GEval/Core.hs                             |  7 ++++
 src/GEval/CreateChallenge.hs                  | 28 +++++++++++++
 src/GEval/Metric.hs                           |  5 ++-
 src/GEval/MetricsMeta.hs                      | 33 ++++++++++++---
 test/Spec.hs                                  |  8 ++++
 .../test-A/out.tsv                            |  3 ++
 .../segment-accuracy-simple/config.txt        |  1 +
 .../test-A/expected.tsv                       |  3 ++
 9 files changed, 119 insertions(+), 9 deletions(-)
 create mode 100644 test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv
 create mode 100644 test/segment-accuracy-simple/segment-accuracy-simple/config.txt
 create mode 100644 test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv

diff --git a/src/GEval/Annotation.hs b/src/GEval/Annotation.hs
index abc8b59..950c93d 100644
--- a/src/GEval/Annotation.hs
+++ b/src/GEval/Annotation.hs
@@ -4,11 +4,12 @@
 module GEval.Annotation
        (parseAnnotations, Annotation(..),
         parseObtainedAnnotations, ObtainedAnnotation(..),
-        matchScore, intSetParser)
+        matchScore, intSetParser, segmentAccuracy, parseSegmentAnnotations)
        where
 
 import qualified Data.IntSet as IS
 import qualified Data.Text as T
+import Data.Set (intersection, fromList)
 
 import Data.Attoparsec.Text
 import Data.Attoparsec.Combinator
@@ -17,11 +18,12 @@ import GEval.Common (sepByWhitespaces, (/.))
 import GEval.Probability
 import Data.Char
 import Data.Maybe (fromMaybe)
+import Data.Either (partitionEithers)
 
 import GEval.PrecisionRecall(weightedMaxMatching)
 
 data Annotation = Annotation T.Text IS.IntSet
-                  deriving (Eq, Show)
+                  deriving (Eq, Show, Ord)
 
 data ObtainedAnnotation = ObtainedAnnotation Annotation Double
                           deriving (Eq, Show)
@@ -52,6 +54,36 @@ obtainedAnnotationParser = do
 parseAnnotations :: T.Text -> Either String [Annotation]
 parseAnnotations t = parseOnly (annotationsParser <* endOfInput) t
 
+parseSegmentAnnotations :: T.Text -> Either String [Annotation]
+parseSegmentAnnotations t = case parseAnnotationsWithColons t of
+  Left m -> Left m
+  Right annotations -> if areSegmentsDisjoint annotations
+                      then (Right annotations)
+                      else (Left "Overlapping segments")
+
+areSegmentsDisjoint :: [Annotation] -> Bool
+areSegmentsDisjoint = areIntSetsDisjoint . map (\(Annotation _ s) -> s)
+
+areIntSetsDisjoint :: [IS.IntSet] -> Bool
+areIntSetsDisjoint ss = snd $ foldr step (IS.empty, True) ss
+  where step _ w@(_, False) = w
+        step s (u, True) = (s `IS.union` u, s `IS.disjoint` u)
+
+-- unfortunately, attoparsec does not seem to back-track properly
+-- so we need a special function if labels can contain colons
+parseAnnotationsWithColons :: T.Text -> Either String [Annotation]
+parseAnnotationsWithColons t = case partitionEithers (map parseAnnotationWithColons $ T.words t) of
+  ([], annotations) -> Right annotations
+  ((firstProblem:_), _) -> Left firstProblem
+
+parseAnnotationWithColons :: T.Text -> Either String Annotation
+parseAnnotationWithColons t = if T.null label
+                              then Left "Colon expected"
+                              else case parseOnly (intSetParser <* endOfInput) position of
+                                     Left m -> Left m
+                                     Right s -> Right (Annotation (T.init label) s)
+  where (label, position) = T.breakOnEnd ":" t
+
 annotationsParser :: Parser [Annotation]
 annotationsParser = sepByWhitespaces annotationParser
 
@@ -70,3 +102,14 @@ intervalParser = do
   startIx <- decimal
   endIx <- (string "-" *> decimal <|> pure startIx)
   pure $ IS.fromList [startIx..endIx]
+
+-- | Fraction of the expected segments that also occur (same label and
+-- exactly the same positions) in the output.
+--
+-- An item with no expected segments is scored 1.0 (nothing could be
+-- missed); dividing by zero would yield NaN and poison the average
+-- computed over all items.
+segmentAccuracy :: [Annotation] -> [Annotation] -> Double
+segmentAccuracy [] _ = 1.0
+segmentAccuracy expected output = (fromIntegral $ length matched) / (fromIntegral $ length expected)
+  where matched = (fromList expected) `intersection` (fromList output)
diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index 1897fb7..4611671 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -706,6 +706,13 @@ gevalCoreOnSources TokenAccuracy _ = gevalCoreWithoutInput intoTokens
            | otherwise = (h, t + 1)
          hitsAndTotalsAgg = CC.foldl (\(h1, t1) (h2, t2) -> (h1 + h2, t1 + t2)) (0, 0)
 
+gevalCoreOnSources SegmentAccuracy _ = gevalCoreWithoutInput parseSegmentAnnotations
+                                                             parseSegmentAnnotations
+                                                             (uncurry segmentAccuracy)
+                                                             averageC
+                                                             id
+                                                             noGraph
+
 gevalCoreOnSources MultiLabelLogLoss _ = gevalCoreWithoutInput intoWords
                                                        (Right . parseIntoProbList)
                                                        (uncurry countLogLossOnProbList)
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index 8b501a3..3a915e5 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -297,6 +297,19 @@ in the expected file (but not in the output file).
 
 |] ++ (commonReadmeMDContents testName)
 
+readmeMDContents SegmentAccuracy testName = [i|
+Segment a sentence and tag with POS tags
+========================================
+
+This is a sample, toy challenge for SegmentAccuracy.
+
+For each sentence, give a sequence of POS tags, each one with
+its position (1-indexed). For instance, `N:1-10` means a noun
+starting from the beginning (the first character) up to the tenth
+character (inclusively).
+
+|] ++ (commonReadmeMDContents testName)
+
 readmeMDContents (ProbabilisticMultiLabelFMeasure beta) testName = readmeMDContents (MultiLabelFMeasure beta) testName
 readmeMDContents (MultiLabelFMeasure beta) testName = [i|
 Tag names and their component
@@ -473,6 +486,9 @@ B-firstname/JOHN I-surname/VON I-surname/NEUMANN	John von Nueman
 trainContents TokenAccuracy = [hereLit|* V N	I like cats
 * * V * N	I can see the rainbow
 |]
+trainContents SegmentAccuracy = [hereLit|Art:1-3 N:5-11 V:12-13 A:15-19	The student's smart
+N:1-6 N:8-10 V:12-13 A:15-18	Mary's dog is nice
+|]
 trainContents (ProbabilisticMultiLabelFMeasure beta) = trainContents (MultiLabelFMeasure beta)
 trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith	person/3,4,5 first-name/4 surname/5
 Steven bloody Brown	person/1,3 first-name/1 surname/3
@@ -540,6 +556,9 @@ Mr Jan Kowalski
 devInContents TokenAccuracy = [hereLit|The cats on the mat
 Ala has a cat
 |]
+devInContents SegmentAccuracy = [hereLit|John is smart
+Mary's intelligent
+|]
 devInContents (ProbabilisticMultiLabelFMeasure beta) = devInContents (MultiLabelFMeasure beta)
 devInContents (MultiLabelFMeasure _) = [hereLit|Jan Kowalski is here
 I see him
@@ -604,6 +623,9 @@ O B-firstname/JAN B-surname/KOWALSKI
 devExpectedContents TokenAccuracy = [hereLit|* N * * N
 N V * N
 |]
+devExpectedContents SegmentAccuracy = [hereLit|N:1-4 V:6-7 A:9-13
+N:1-4 V:5-6 A:8-18
+|]
 devExpectedContents (ProbabilisticMultiLabelFMeasure beta) = devExpectedContents (MultiLabelFMeasure beta)
 devExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,2 first-name/1 surname/2
 
@@ -673,6 +695,9 @@ No name here
 testInContents TokenAccuracy = [hereLit|I have cats
 I know
 |]
+testInContents SegmentAccuracy = [hereLit|Mary's cat is old
+John is young
+|]
 testInContents (ProbabilisticMultiLabelFMeasure beta) = testInContents (MultiLabelFMeasure beta)
 testInContents (MultiLabelFMeasure _) = [hereLit|John bloody Smith
 Nobody is there
@@ -738,6 +763,9 @@ O O O
 testExpectedContents TokenAccuracy = [hereLit|* V N
 * V
 |]
+testExpectedContents SegmentAccuracy = [hereLit|N:1-6 N:8-10 V:12-13 A:15-17
+N:1-4 V:6-7 A:9-13
+|]
 testExpectedContents (ProbabilisticMultiLabelFMeasure beta) = testExpectedContents (MultiLabelFMeasure beta)
 testExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,3 first-name/1 surname/3
 
diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs
index 0a53a61..b87c599 100644
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@@ -26,7 +26,7 @@ import Data.Attoparsec.Text (parseOnly)
 data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | ClippEU
               | FMeasure Double | MacroFMeasure Double | NMI
               | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
-              | BIOF1 | BIOF1Labels | TokenAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
+              | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
               | MultiLabelLogLoss | MultiLabelLikelihood
               | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double | ProbabilisticSoftFMeasure Double | Soft2DFMeasure Double
               deriving (Eq)
@@ -67,6 +67,7 @@ instance Show Metric where
   show BIOF1 = "BIO-F1"
   show BIOF1Labels = "BIO-F1-Labels"
   show TokenAccuracy = "TokenAccuracy"
+  show SegmentAccuracy = "SegmentAccuracy"
   show MAE = "MAE"
   show SMAPE = "SMAPE"
   show (MultiLabelFMeasure beta) = "MultiLabel-F" ++ (show beta)
@@ -118,6 +119,7 @@ instance Read Metric where
   readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)]
   readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
   readsPrec _ ('T':'o':'k':'e':'n':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(TokenAccuracy, theRest)]
+  readsPrec _ ('S':'e':'g':'m':'e':'n':'t':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(SegmentAccuracy, theRest)]
   readsPrec _ ('M':'A':'E':theRest) = [(MAE, theRest)]
   readsPrec _ ('S':'M':'A':'P':'E':theRest) = [(SMAPE, theRest)]
   readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'o':'g':'L':'o':'s':'s':theRest) = [(MultiLabelLogLoss, theRest)]
@@ -154,6 +156,7 @@ getMetricOrdering Likelihood = TheHigherTheBetter
 getMetricOrdering BIOF1 = TheHigherTheBetter
 getMetricOrdering BIOF1Labels = TheHigherTheBetter
 getMetricOrdering TokenAccuracy = TheHigherTheBetter
+getMetricOrdering SegmentAccuracy = TheHigherTheBetter
 getMetricOrdering MAE = TheLowerTheBetter
 getMetricOrdering SMAPE = TheLowerTheBetter
 getMetricOrdering (MultiLabelFMeasure _) = TheHigherTheBetter
diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs
index 8747bd9..21659ab 100644
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@@ -63,6 +63,7 @@ listOfAvailableMetrics = [RMSE,
                           BIOF1,
                           BIOF1Labels,
                           TokenAccuracy,
+                          SegmentAccuracy,
                           SoftFMeasure 1.0,
                           SoftFMeasure 2.0,
                           SoftFMeasure 0.25,
@@ -94,6 +95,7 @@ isMetricDescribed (SoftFMeasure _) = True
 isMetricDescribed (Soft2DFMeasure _) = True
 isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
 isMetricDescribed GLEU = True
+isMetricDescribed SegmentAccuracy = True
 isMetricDescribed _ = False
 
 getEvaluationSchemeDescription :: EvaluationScheme -> String
@@ -134,7 +136,11 @@ metric on a corpus level but does not have its drawbacks for our per
 sentence reward objective.
 see: https://arxiv.org/pdf/1609.08144.pdf
 |]
-
+getMetricDescription SegmentAccuracy =
+  [i|Accuracy counted for segments, i.e. labels with positions.
+The percentage of labels in the ground truth retrieved in the actual output is returned.
+Accuracy is calculated separately for each item and then averaged.
+|]
 
 outContents :: Metric -> String
 outContents (SoftFMeasure _) = [hereLit|inwords:1-4
@@ -147,7 +153,10 @@ outContents (ProbabilisticMultiLabelFMeasure _) = [hereLit|first-name/1:0.8 surn
 surname/1:0.4
 first-name/3:0.9
 |]
-outContents GLEU = [hereLit|Alice has a black 
+outContents GLEU = [hereLit|Alice has a black
+|]
+outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17
+N:1-4 V:6-7 A:9-13
 |]
 
 expectedScore :: EvaluationScheme -> MetricValue
@@ -165,6 +174,8 @@ expectedScore (EvaluationScheme (ProbabilisticMultiLabelFMeasure beta) [])
       in weightedHarmonicMean beta precision recall
 expectedScore (EvaluationScheme GLEU [])
   = 0.7142857142857143
+expectedScore (EvaluationScheme SegmentAccuracy [])
+  = 0.875
 
 helpMetricParameterMetricsList :: String
 helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@@ -213,7 +224,14 @@ the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page numbe
 formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability
 can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
 |]
-formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words. 
+formatDescription GLEU = [hereLit|In each line there is a space-separated sentence of words.
+|]
+formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas the position part (after the last colon) is a list of
+1-based indexes or spans separated by commas (spans are inclusive
+ranges, e.g. "10-14"). For instance, "foo:bar:2,4-7,10" is a
+label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that no
+overlapping segments can be returned (evaluation will fail in
+such a case).
 |]
 
 scoreExplanation :: EvaluationScheme -> Maybe String
@@ -227,13 +245,16 @@ Hence, recall is 247500/902500=0.274 and precision - 247500/(20000+912000+240000
 for the second item is 0.238 and the F-score for the whole set is (0 + 0.238)/2 = 0.119.|]
 scoreExplanation (EvaluationScheme (ProbabilisticMultiLabelFMeasure _) []) = Nothing
 scoreExplanation (EvaluationScheme GLEU [])
-  = Just [hereLit|To find out GLEU score we first count number of tp (true positives) fp(false positives) and fn(false negatives). 
+  = Just [hereLit|To find out GLEU score we first count number of tp (true positives) fp(false positives) and fn(false negatives).
   We have 4 matching unigrams ("Alice", "has", "a", "black") , 3 bigrams ("Alice has", "has a", "a black"), 2 trigrams ("Alice has a", "has a black") and 1 tetragram ("Alice has a black"),
-so tp=10. We have no fp, therefore fp=0. There are 4 fn - ("cat", "black cat", "a black cat", "has a black cat"). 
+so tp=10. We have no fp, therefore fp=0. There are 4 fn - ("cat", "black cat", "a black cat", "has a black cat").
 Now we have to calculate precision and recall:
-  Precision is tp / (tp+fp) = 10/(10+0) = 1, 
+  Precision is tp / (tp+fp) = 10/(10+0) = 1,
   recall is tp / (tp+fn) = 10 / (10+4) = 10/14 =~ 0.71428...
   The GLEU score is min(precision,recall)=0.71428 |]
+scoreExplanation (EvaluationScheme SegmentAccuracy [])
+  = Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correctly (accuracy is 3/4=0.75).
+The second item was retrieved perfectly (accuracy is 1.0). Hence, the average is (0.75+1.0)/2=0.875.|]
 
 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b
diff --git a/test/Spec.hs b/test/Spec.hs
index 1fccc45..dc68beb 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -146,6 +146,9 @@ main = hspec $ do
   describe "TokenAccuracy" $ do
     it "simple example" $ do
        runGEvalTest "token-accuracy-simple" `shouldReturnAlmost` 0.5
+  describe "SegmentAccuracy" $ do
+    it "simple test" $ do
+      runGEvalTest "segment-accuracy-simple" `shouldReturnAlmost` 0.4444444
   describe "precision count" $ do
     it "simple test" $ do
       precisionCount [["Alice", "has", "a", "cat" ]] ["Ala", "has", "cat"] `shouldBe` 2
@@ -342,6 +345,11 @@ main = hspec $ do
     it "just parse" $ do
       parseAnnotations "foo:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo" (IS.fromList [3,7,8,9,10]),
                                                               Annotation "baz" (IS.fromList [4,5,6])]
+    it "just parse with colons" $ do
+      parseSegmentAnnotations "foo:x:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo:x" (IS.fromList [3,7,8,9,10]),
+                                                                       Annotation "baz" (IS.fromList [4,5,6])]
+    it "reject overlapping segments" $ do
+      parseSegmentAnnotations "foo:x:3,7-10 baz:2-6" `shouldBe` Left "Overlapping segments"
     it "just parse 2" $ do
       parseAnnotations "inwords:1-3 indigits:5" `shouldBe` Right [Annotation "inwords" (IS.fromList [1,2,3]),
                                                                   Annotation "indigits" (IS.fromList [5])]
diff --git a/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv b/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv
new file mode 100644
index 0000000..4af8b51
--- /dev/null
+++ b/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv
@@ -0,0 +1,3 @@
+foo:0 baq:1-2 baz:3
+aaa:0-1
+xyz:0 bbb:x:1
diff --git a/test/segment-accuracy-simple/segment-accuracy-simple/config.txt b/test/segment-accuracy-simple/segment-accuracy-simple/config.txt
new file mode 100644
index 0000000..2f838f0
--- /dev/null
+++ b/test/segment-accuracy-simple/segment-accuracy-simple/config.txt
@@ -0,0 +1 @@
+--metric SegmentAccuracy
diff --git a/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv b/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv
new file mode 100644
index 0000000..bc95bcb
--- /dev/null
+++ b/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv
@@ -0,0 +1,3 @@
+foo:0 bar:1-2 baz:3
+aaa:0-2
+xyz:0 bbb:x:1 ccc:x:2