Add SegmentAccuracy metric
parent fb74f568bb
commit 03aacdef98
@@ -4,11 +4,12 @@
module GEval.Annotation
       (parseAnnotations, Annotation(..),
        parseObtainedAnnotations, ObtainedAnnotation(..),
        matchScore, intSetParser)
        matchScore, intSetParser, segmentAccuracy, parseSegmentAnnotations)
  where

import qualified Data.IntSet as IS
import qualified Data.Text as T
import Data.Set (intersection, fromList)

import Data.Attoparsec.Text
import Data.Attoparsec.Combinator
@@ -17,11 +18,12 @@ import GEval.Common (sepByWhitespaces, (/.))
import GEval.Probability
import Data.Char
import Data.Maybe (fromMaybe)
import Data.Either (partitionEithers)

import GEval.PrecisionRecall(weightedMaxMatching)

data Annotation = Annotation T.Text IS.IntSet
                  deriving (Eq, Show)
                  deriving (Eq, Show, Ord)

data ObtainedAnnotation = ObtainedAnnotation Annotation Double
                          deriving (Eq, Show)
@@ -52,6 +54,36 @@ obtainedAnnotationParser = do
parseAnnotations :: T.Text -> Either String [Annotation]
parseAnnotations t = parseOnly (annotationsParser <* endOfInput) t

parseSegmentAnnotations :: T.Text -> Either String [Annotation]
parseSegmentAnnotations t = case parseAnnotationsWithColons t of
  Left m -> Left m
  Right annotations -> if areSegmentsDisjoint annotations
                       then (Right annotations)
                       else (Left "Overlapping segments")

areSegmentsDisjoint :: [Annotation] -> Bool
areSegmentsDisjoint = areIntSetsDisjoint . map (\(Annotation _ s) -> s)

areIntSetsDisjoint :: [IS.IntSet] -> Bool
areIntSetsDisjoint ss = snd $ foldr step (IS.empty, True) ss
  where step _ w@(_, False) = w
        step s (u, True) = (s `IS.union` u, s `IS.disjoint` u)

-- unfortunately, attoparsec does not seem to back-track properly
-- so we need a special function if labels can contain colons
parseAnnotationsWithColons :: T.Text -> Either String [Annotation]
parseAnnotationsWithColons t = case partitionEithers (map parseAnnotationWithColons $ T.words t) of
  ([], annotations) -> Right annotations
  ((firstProblem:_), _) -> Left firstProblem

parseAnnotationWithColons :: T.Text -> Either String Annotation
parseAnnotationWithColons t = if T.null label
                              then Left "Colon expected"
                              else case parseOnly (intSetParser <* endOfInput) position of
                                     Left m -> Left m
                                     Right s -> Right (Annotation (T.init label) s)
  where (label, position) = T.breakOnEnd ":" t

annotationsParser :: Parser [Annotation]
annotationsParser = sepByWhitespaces annotationParser
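The helpers above split on whitespace and then break each token on its last colon, so labels may themselves contain colons, and overlapping position sets are rejected. A rough GHCi sketch of the intended behaviour (illustrative only; the printed form follows the derived Show instances and mirrors the new specs further down in this diff):

    ghci> :set -XOverloadedStrings
    ghci> parseSegmentAnnotations "foo:x:3,7-10 baz:4-6"
    Right [Annotation "foo:x" (fromList [3,7,8,9,10]),Annotation "baz" (fromList [4,5,6])]
    ghci> parseSegmentAnnotations "foo:x:3,7-10 baz:2-6"
    Left "Overlapping segments"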
@@ -70,3 +102,7 @@ intervalParser = do
  startIx <- decimal
  endIx <- (string "-" *> decimal <|> pure startIx)
  pure $ IS.fromList [startIx..endIx]

segmentAccuracy :: [Annotation] -> [Annotation] -> Double
segmentAccuracy expected output = (fromIntegral $ length matched) / (fromIntegral $ length expected)
  where matched = (fromList expected) `intersection` (fromList output)
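segmentAccuracy is the fraction of expected annotations that also occur, exactly, in the output: both lists are turned into Data.Set values (hence the new Ord instance for Annotation) and intersected. A small illustrative check, not part of the commit, assuming the module and its IS alias are in scope:

    ghci> let expected = [Annotation "N" (IS.fromList [1..6]), Annotation "V" (IS.fromList [12,13])]
    ghci> let output   = [Annotation "N" (IS.fromList [1..6]), Annotation "V" (IS.fromList [6,7])]
    ghci> segmentAccuracy expected output
    0.5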
@@ -706,6 +706,13 @@ gevalCoreOnSources TokenAccuracy _ = gevalCoreWithoutInput intoTokens
                     | otherwise = (h, t + 1)
          hitsAndTotalsAgg = CC.foldl (\(h1, t1) (h2, t2) -> (h1 + h2, t1 + t2)) (0, 0)

gevalCoreOnSources SegmentAccuracy _ = gevalCoreWithoutInput parseSegmentAnnotations
                                                             parseSegmentAnnotations
                                                             (uncurry segmentAccuracy)
                                                             averageC
                                                             id
                                                             noGraph

gevalCoreOnSources MultiLabelLogLoss _ = gevalCoreWithoutInput intoWords
                                                               (Right . parseIntoProbList)
                                                               (uncurry countLogLossOnProbList)
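This clause parses the expected line and the output line with the same parseSegmentAnnotations, scores each pair with segmentAccuracy, and averages the per-item scores (averageC), with no further postprocessing (id) and no graph output. A hedged, self-contained mirror of that computation outside the conduit pipeline (hypothetical helper, not part of the commit; assumes the GEval.Annotation exports above and Data.Text imported qualified as T):

    -- Hypothetical sketch: parse each expected/output line pair, score it,
    -- and average over items (assumes at least one item).
    segmentAccuracyOnLines :: [T.Text] -> [T.Text] -> Either String Double
    segmentAccuracyOnLines expectedLines outputLines = do
      scores <- mapM scoreLine (zip expectedLines outputLines)
      return (sum scores / fromIntegral (length scores))
      where scoreLine (e, o) = segmentAccuracy <$> parseSegmentAnnotations e
                                               <*> parseSegmentAnnotations o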
@@ -297,6 +297,19 @@ in the expected file (but not in the output file).

|] ++ (commonReadmeMDContents testName)

readmeMDContents SegmentAccuracy testName = [i|
Segment a sentence and tag with POS tags
========================================

This is a sample, toy challenge for SegmentAccuracy.

For each sentence, give a sequence of POS tags, each one with
its position (1-indexed). For instance, `N:1-10` means a noun
starting from the beginning (the first character) up to the tenth
character (inclusive).

|] ++ (commonReadmeMDContents testName)

readmeMDContents (ProbabilisticMultiLabelFMeasure beta) testName = readmeMDContents (MultiLabelFMeasure beta) testName
readmeMDContents (MultiLabelFMeasure beta) testName = [i|
Tag names and their component
@@ -473,6 +486,9 @@ B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman
trainContents TokenAccuracy = [hereLit|* V N I like cats
* * V * N I can see the rainbow
|]
trainContents SegmentAccuracy = [hereLit|Art:1-3 N:5-11 V:12-13 A:15-19 The student's smart
N:1-6 N:8-10 V:12-13 A:15-18 Mary's dog is nice
|]
trainContents (ProbabilisticMultiLabelFMeasure beta) = trainContents (MultiLabelFMeasure beta)
trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith person/3,4,5 first-name/4 surname/5
Steven bloody Brown person/1,3 first-name/1 surname/3
@@ -540,6 +556,9 @@ Mr Jan Kowalski
devInContents TokenAccuracy = [hereLit|The cats on the mat
Ala has a cat
|]
devInContents SegmentAccuracy = [hereLit|John is smart
Mary's intelligent
|]
devInContents (ProbabilisticMultiLabelFMeasure beta) = devInContents (MultiLabelFMeasure beta)
devInContents (MultiLabelFMeasure _) = [hereLit|Jan Kowalski is here
I see him
@@ -604,6 +623,9 @@ O B-firstname/JAN B-surname/KOWALSKI
devExpectedContents TokenAccuracy = [hereLit|* N * * N
N V * N
|]
devExpectedContents SegmentAccuracy = [hereLit|N:1-4 V:6-7 A:9-13
N:1-4 V:6-7 A:9-19
|]
devExpectedContents (ProbabilisticMultiLabelFMeasure beta) = devExpectedContents (MultiLabelFMeasure beta)
devExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,2 first-name/1 surname/2

@@ -673,6 +695,9 @@ No name here
testInContents TokenAccuracy = [hereLit|I have cats
I know
|]
testInContents SegmentAccuracy = [hereLit|Mary's cat is old
John is young
|]
testInContents (ProbabilisticMultiLabelFMeasure beta) = testInContents (MultiLabelFMeasure beta)
testInContents (MultiLabelFMeasure _) = [hereLit|John bloody Smith
Nobody is there
@@ -738,6 +763,9 @@ O O O
testExpectedContents TokenAccuracy = [hereLit|* V N
* V
|]
testExpectedContents SegmentAccuracy = [hereLit|N:1-6 N:8-10 V:12-13 A:15-17
N:1-4 V:6-7 A:9-13
|]
testExpectedContents (ProbabilisticMultiLabelFMeasure beta) = testExpectedContents (MultiLabelFMeasure beta)
testExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,3 first-name/1 surname/3
@@ -26,7 +26,7 @@ import Data.Attoparsec.Text (parseOnly)
data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | ClippEU
              | FMeasure Double | MacroFMeasure Double | NMI
              | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
              | BIOF1 | BIOF1Labels | TokenAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
              | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
              | MultiLabelLogLoss | MultiLabelLikelihood
              | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double | ProbabilisticSoftFMeasure Double | Soft2DFMeasure Double
              deriving (Eq)
@@ -67,6 +67,7 @@ instance Show Metric where
  show BIOF1 = "BIO-F1"
  show BIOF1Labels = "BIO-F1-Labels"
  show TokenAccuracy = "TokenAccuracy"
  show SegmentAccuracy = "SegmentAccuracy"
  show MAE = "MAE"
  show SMAPE = "SMAPE"
  show (MultiLabelFMeasure beta) = "MultiLabel-F" ++ (show beta)
@@ -118,6 +119,7 @@ instance Read Metric where
  readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)]
  readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
  readsPrec _ ('T':'o':'k':'e':'n':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(TokenAccuracy, theRest)]
  readsPrec _ ('S':'e':'g':'m':'e':'n':'t':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(SegmentAccuracy, theRest)]
  readsPrec _ ('M':'A':'E':theRest) = [(MAE, theRest)]
  readsPrec _ ('S':'M':'A':'P':'E':theRest) = [(SMAPE, theRest)]
  readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'o':'g':'L':'o':'s':'s':theRest) = [(MultiLabelLogLoss, theRest)]
@@ -154,6 +156,7 @@ getMetricOrdering Likelihood = TheHigherTheBetter
getMetricOrdering BIOF1 = TheHigherTheBetter
getMetricOrdering BIOF1Labels = TheHigherTheBetter
getMetricOrdering TokenAccuracy = TheHigherTheBetter
getMetricOrdering SegmentAccuracy = TheHigherTheBetter
getMetricOrdering MAE = TheLowerTheBetter
getMetricOrdering SMAPE = TheLowerTheBetter
getMetricOrdering (MultiLabelFMeasure _) = TheHigherTheBetter
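With the new constructor wired into Show, Read and getMetricOrdering, the metric name round-trips like the other metrics; a quick illustrative GHCi check (not part of the commit):

    ghci> show SegmentAccuracy
    "SegmentAccuracy"
    ghci> (read "SegmentAccuracy" :: Metric) == SegmentAccuracy
    True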
@@ -63,6 +63,7 @@ listOfAvailableMetrics = [RMSE,
                          BIOF1,
                          BIOF1Labels,
                          TokenAccuracy,
                          SegmentAccuracy,
                          SoftFMeasure 1.0,
                          SoftFMeasure 2.0,
                          SoftFMeasure 0.25,
@@ -94,6 +95,7 @@ isMetricDescribed (SoftFMeasure _) = True
isMetricDescribed (Soft2DFMeasure _) = True
isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
isMetricDescribed GLEU = True
isMetricDescribed SegmentAccuracy = True
isMetricDescribed _ = False

getEvaluationSchemeDescription :: EvaluationScheme -> String
@@ -134,7 +136,11 @@ metric on a corpus level but does not have its drawbacks for our per
sentence reward objective.
see: https://arxiv.org/pdf/1609.08144.pdf
|]

getMetricDescription SegmentAccuracy =
  [i|Accuracy counted for segments, i.e. labels with positions.
The percentage of labels in the ground truth retrieved in the actual output is returned.
Accuracy is calculated separately for each item and then averaged.
|]

outContents :: Metric -> String
outContents (SoftFMeasure _) = [hereLit|inwords:1-4
@@ -149,6 +155,9 @@ first-name/3:0.9
|]
outContents GLEU = [hereLit|Alice has a black
|]
outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17
N:1-4 V:6-7 A:9-13
|]

expectedScore :: EvaluationScheme -> MetricValue
expectedScore (EvaluationScheme (SoftFMeasure beta) [])
@@ -165,6 +174,8 @@ expectedScore (EvaluationScheme (ProbabilisticMultiLabelFMeasure beta) [])
    in weightedHarmonicMean beta precision recall
expectedScore (EvaluationScheme GLEU [])
  = 0.7142857142857143
expectedScore (EvaluationScheme SegmentAccuracy [])
  = 0.875

helpMetricParameterMetricsList :: String
helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
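The 0.875 here follows from the sample outContents above scored against testExpectedContents SegmentAccuracy (shown earlier in this diff): the first item matches 3 of its 4 expected segments, the second matches all of them. Illustrative arithmetic:

    ghci> (3/4 + 1) / 2
    0.875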
@@ -215,6 +226,13 @@ can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
|]
formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words.
|]
formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas the position part is a list of
1-based indexes or spans separated by commas (spans are inclusive
ranges, e.g. "10-14"). For instance, "foo:bar:2,4-7,10" is a
label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that no
overlapping segments can be returned (evaluation will fail in
such a case).
|]

scoreExplanation :: EvaluationScheme -> Maybe String
scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
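The format described above is exactly what parseSegmentAnnotations accepts; an illustrative GHCi check of the example string (not part of the commit, OverloadedStrings enabled as in the earlier sketch):

    ghci> parseSegmentAnnotations "foo:bar:2,4-7,10"
    Right [Annotation "foo:bar" (fromList [2,4,5,6,7,10])]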
@@ -234,6 +252,9 @@ Now we have to calculate precision and recall:
Precision is tp / (tp+fp) = 10/(10+0) = 1,
recall is tp / (tp+fn) = 10 / (10+4) = 10/14 =~ 0.71428...
The GLEU score is min(precision,recall)=0.71428 |]
scoreExplanation (EvaluationScheme SegmentAccuracy [])
  = Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correctly (accuracy is 3/4=0.75).
The second item was retrieved perfectly (accuracy is 1.0). Hence, the average is (0.75+1.0)/2=0.875.|]

pasteLines :: String -> String -> String
pasteLines a b = printf "%-35s %s\n" a b
@@ -146,6 +146,9 @@ main = hspec $ do
  describe "TokenAccuracy" $ do
    it "simple example" $ do
      runGEvalTest "token-accuracy-simple" `shouldReturnAlmost` 0.5
  describe "SegmentAccuracy" $ do
    it "simple test" $ do
      runGEvalTest "segment-accuracy-simple" `shouldReturnAlmost` 0.4444444
  describe "precision count" $ do
    it "simple test" $ do
      precisionCount [["Alice", "has", "a", "cat" ]] ["Ala", "has", "cat"] `shouldBe` 2
@@ -342,6 +345,11 @@ main = hspec $ do
    it "just parse" $ do
      parseAnnotations "foo:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo" (IS.fromList [3,7,8,9,10]),
                                                              Annotation "baz" (IS.fromList [4,5,6])]
    it "just parse with colons" $ do
      parseSegmentAnnotations "foo:x:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo:x" (IS.fromList [3,7,8,9,10]),
                                                                       Annotation "baz" (IS.fromList [4,5,6])]
    it "just parse with colons" $ do
      parseSegmentAnnotations "foo:x:3,7-10 baz:2-6" `shouldBe` Left "Overlapping segments"
    it "just parse 2" $ do
      parseAnnotations "inwords:1-3 indigits:5" `shouldBe` Right [Annotation "inwords" (IS.fromList [1,2,3]),
                                                                  Annotation "indigits" (IS.fromList [5])]
@@ -0,0 +1,3 @@
foo:0 baq:1-2 baz:3
aaa:0-1
xyz:0 bbb:x:1
@@ -0,0 +1 @@
--metric SegmentAccuracy
@@ -0,0 +1,3 @@
foo:0 bar:1-2 baz:3
aaa:0-2
xyz:0 bbb:x:1 ccc:x:2
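For reference, the 0.4444444 expected by the new spec above follows from these fixtures. The file order is not labelled in this rendering, but only one reading is consistent with the score: the first three-line listing is the system output and the last one is the gold standard. Illustrative arithmetic:

    -- item 1: foo:0 and baz:3 match, baq:1-2 /= bar:1-2   -> 2/3
    -- item 2: aaa:0-1 /= aaa:0-2                          -> 0/1
    -- item 3: xyz:0 and bbb:x:1 match, ccc:x:2 is missed  -> 2/3
    ghci> (2/3 + 0/1 + 2/3) / 3
    0.4444444444444444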