diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7c857bf..63513ed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,8 @@
+## 1.22.0.0
+
+* Add SegmentAccuracy
+
 ## 1.21.0.0
 
 * Add Probabilistic-MultiLabel-F-measure
diff --git a/README.md b/README.md
index 1d60508..026ec26 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 GEval is a Haskell library and a stand-alone tool for evaluating the
 results of solutions to machine learning challenges as defined in the
-[Gonito](https://gonito.net) platform. Also could be used outside the
+[Gonito](https://gonito.net) platform. It can also be used outside the
 context of Gonito.net challenges, assuming the test data is given in
 simple TSV (tab-separated values) files.
 
@@ -14,6 +14,29 @@ The official repository is `git://gonito.net/geval`, browsable at
 
 ## Installing
 
+### The easy way: just download the fully static GEval binary
+
+(Assuming you have a 64-bit Linux.)
+
+    wget https://gonito.net/get/bin/geval
+    chmod u+x geval
+    ./geval --help
+
+#### On Windows
+
+For Windows, use Windows PowerShell:
+
+    wget https://gonito.net/get/bin/geval
+
+Next, go to the folder where you downloaded `geval`, right-click the `geval` file,
+go to `Properties` and, in the `Security` section, grant full access to the folder.
+
+Alternatively, you can use `icacls "folder path to geval" /grant USER:`
+
+This is a fully static binary; it should work on any 64-bit Linux or 64-bit Windows.
+
+### Build from scratch
+
 You need [Haskell Stack](https://github.com/commercialhaskell/stack).
 You could install Stack with your package manager or with:
 
@@ -36,6 +59,8 @@ order to run `geval` you need to either add `$HOME/.local/bin` to
 
     PATH="$HOME/.local/bin" geval ...
 
+On Windows, you should add a new global environment variable named `geval` set to the same path as above.
+
 ### Troubleshooting
 
 If you see a message like this:
@@ -64,15 +89,32 @@ In case the lzma package is not installed on your Linux, you need to run (assumi
 
     sudo apt-get install pkg-config liblzma-dev libpq-dev libpcre3-dev
 
-### Plan B — just download the GEval binary
+#### Windows issues
 
-(Assuming you have a 64-bit Linux.)
+If you see this message on Windows while executing the `stack test` command:
 
-    wget https://gonito.net/get/bin/geval
-    chmod u+x geval
-    ./geval --help
+    In the dependencies for geval-1.21.1.0:
+        unix needed, but the stack configuration has no specified version
+    In the dependencies for lzma-0.0.0.3:
+        lzma-clib needed, but the stack configuration has no specified version
 
-This is a fully static binary, it should work on any 64-bit Linux.
+You should replace `unix` with `unix-compat` in the `geval.cabal` file,
+because the `unix` package is not supported on Windows.
+
+You should also add `lzma-clib-5.2.2` and `unix-compat-0.5.2` to the `extra-deps` section of the `stack.yaml` file.
+
+If you see a message about missing pkg-config on Windows, you should download two packages from this site:
+http://ftp.gnome.org/pub/gnome/binaries/win32/dependencies/
+These packages are:
+ - pkg-config (the newest version)
+ - gettext-runtime (the newest version)
+Extract the `pkg-config.exe` file to a directory in the Windows PATH.
+Extract the `intl.dll` file from gettext-runtime.
+
+You should also download the glib package from http://ftp.gnome.org/pub/gnome/binaries/win32/glib/2.28
+and extract the `libglib-2.0-0.dll` file.
+
+Put all these files in one directory, for example `C:\MinGW\bin`.
 
 ## Quick tour
 
@@ -189,7 +231,7 @@ But why were double quotes so problematic in German-English translation?!
 Well, look at the second-worst feature — `''` in the _output_! Oops, it
 seems like a very stupid post-processing mistake was made and no
 double quote was correctly generated,
-which decreased the score a little bit for each sentence in which the
+which decreased the score a little for each sentence in which the
 quote was expected.
 
 When I fixed this simple bug, the BLEU metric increased from 0.27358
@@ -502,9 +544,9 @@ submitted. The suggested way to do this is as follows:
    `test-A/expected.tsv` added. This branch should be accessible by
    Gonito platform, but should be kept “hidden” for regular users (or
    at least they should be kindly asked not to peek there). It is
-   recommended (though not obligatory) that this branch contain all
+   recommended (though not obligatory) that this branch contains all
   the source codes and data used to generate the train/dev/test sets.
-   (Use [git-annex](https://git-annex.branchable.com/) if you have really big files there.)
+   (Use [git-annex](https://git-annex.branchable.com/) if you have huge files there.)
 
 Branch (1) should be the parent of the branch (2), for instance,
 the repo (for the toy “planets” challenge) could be created as follows:
@@ -567,7 +609,7 @@ be nice and commit also your source codes.
     git push mine master
 
 Then let Gonito pull them and evaluate your results, either manually clicking
-"submit" at the Gonito web site or using `--submit` option (see below).
+"submit" at the Gonito website or using the `--submit` option (see below).
 
 ### Submitting a solution to a Gonito platform with GEval
 
diff --git a/geval.cabal b/geval.cabal
index 0939f45..32fb0f5 100644
--- a/geval.cabal
+++ b/geval.cabal
@@ -1,5 +1,5 @@
 name:                geval
-version:             1.21.1.0
+version:             1.22.0.0
 synopsis:            Machine learning evaluation tools
 description:         Please see README.md
 homepage:            http://github.com/name/project
diff --git a/src/GEval/Annotation.hs b/src/GEval/Annotation.hs
index abc8b59..950c93d 100644
--- a/src/GEval/Annotation.hs
+++ b/src/GEval/Annotation.hs
@@ -4,11 +4,12 @@
 module GEval.Annotation
        (parseAnnotations, Annotation(..),
         parseObtainedAnnotations, ObtainedAnnotation(..),
-        matchScore, intSetParser)
+        matchScore, intSetParser, segmentAccuracy, parseSegmentAnnotations)
        where
 
 import qualified Data.IntSet as IS
 import qualified Data.Text as T
+import Data.Set (intersection, fromList)
 
 import Data.Attoparsec.Text
 import Data.Attoparsec.Combinator
@@ -17,11 +18,12 @@ import GEval.Common (sepByWhitespaces, (/.))
 import GEval.Probability
 import Data.Char
 import Data.Maybe (fromMaybe)
+import Data.Either (partitionEithers)
 
 import GEval.PrecisionRecall(weightedMaxMatching)
 
 data Annotation = Annotation T.Text IS.IntSet
-                  deriving (Eq, Show)
+                  deriving (Eq, Show, Ord)
 
 data ObtainedAnnotation = ObtainedAnnotation Annotation Double
                           deriving (Eq, Show)
@@ -52,6 +54,36 @@ obtainedAnnotationParser = do
 
 parseAnnotations :: T.Text -> Either String [Annotation]
 parseAnnotations t = parseOnly (annotationsParser <* endOfInput) t
 
+parseSegmentAnnotations :: T.Text -> Either String [Annotation]
+parseSegmentAnnotations t = case parseAnnotationsWithColons t of
+  Left m -> Left m
+  Right annotations -> if areSegmentsDisjoint annotations
+                       then (Right annotations)
+                       else (Left "Overlapping segments")
+
+areSegmentsDisjoint :: [Annotation] -> Bool
+areSegmentsDisjoint = areIntSetsDisjoint . map (\(Annotation _ s) -> s)
+
+areIntSetsDisjoint :: [IS.IntSet] -> Bool
+areIntSetsDisjoint ss = snd $ foldr step (IS.empty, True) ss
+  where step _ w@(_, False) = w
+        step s (u, True) = (s `IS.union` u, s `IS.disjoint` u)
+
+-- unfortunately, attoparsec does not seem to backtrack properly,
+-- so we need a special function if labels can contain colons
+parseAnnotationsWithColons :: T.Text -> Either String [Annotation]
+parseAnnotationsWithColons t = case partitionEithers (map parseAnnotationWithColons $ T.words t) of
+  ([], annotations) -> Right annotations
+  ((firstProblem:_), _) -> Left firstProblem
+
+parseAnnotationWithColons :: T.Text -> Either String Annotation
+parseAnnotationWithColons t = if T.null label
+                              then Left "Colon expected"
+                              else case parseOnly (intSetParser <* endOfInput) position of
+                                Left m -> Left m
+                                Right s -> Right (Annotation (T.init label) s)
+  where (label, position) = T.breakOnEnd ":" t
+
 annotationsParser :: Parser [Annotation]
 annotationsParser = sepByWhitespaces annotationParser
 
@@ -70,3 +102,7 @@ intervalParser = do
   startIx <- decimal
   endIx <- (string "-" *> decimal <|> pure startIx)
   pure $ IS.fromList [startIx..endIx]
+
+segmentAccuracy :: [Annotation] -> [Annotation] -> Double
+segmentAccuracy expected output = (fromIntegral $ length matched) / (fromIntegral $ length expected)
+  where matched = (fromList expected) `intersection` (fromList output)
diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index feaf5d3..157aae2 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -567,6 +567,11 @@ gevalCoreOnSources TokenAccuracy _ = gevalCoreWithoutInput SATokenAccuracy
   where hitsAndTotalsAgg = CC.foldl (\(h1, t1) (h2, t2) -> (h1 + h2, t1 + t2)) (0, 0)
 
+gevalCoreOnSources SegmentAccuracy _ = gevalCoreWithoutInput SASegmentAccuracy
+                                       averageC
+                                       id
+                                       noGraph
+
 gevalCoreOnSources MultiLabelLogLoss _ = gevalCoreWithoutInput SAMultiLabelLogLoss
                                          averageC
                                          id
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index 5c2b266..b11e09e 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -298,6 +298,19 @@ in the expected file (but not in the output file).
 
 |] ++ (commonReadmeMDContents testName)
 
+readmeMDContents SegmentAccuracy testName = [i|
+Segment a sentence and tag with POS tags
+========================================
+
+This is a sample, toy challenge for SegmentAccuracy.
+
+For each sentence, give a sequence of POS tags, each one with
+its position (1-indexed). For instance, `N:1-10` means a noun
+starting from the beginning (the first character) up to the tenth
+character (inclusive).
+
+|] ++ (commonReadmeMDContents testName)
+
 readmeMDContents (ProbabilisticMultiLabelFMeasure beta) testName = readmeMDContents (MultiLabelFMeasure beta) testName
 readmeMDContents (MultiLabelFMeasure beta) testName = [i|
 Tag names and their component
@@ -474,6 +487,9 @@ B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman
 trainContents TokenAccuracy = [hereLit|* V N I like cats
 * * V * N I can see the rainbow
 |]
+trainContents SegmentAccuracy = [hereLit|Art:1-3 N:5-11 V:12-13 A:15-19 The student's smart
+N:1-6 N:8-10 V:12-13 A:15-18 Mary's dog is nice
+|]
 trainContents (ProbabilisticMultiLabelFMeasure beta) = trainContents (MultiLabelFMeasure beta)
 trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith person/3,4,5 first-name/4 surname/5
 Steven bloody Brown person/1,3 first-name/1 surname/3
@@ -541,6 +557,9 @@ Mr Jan Kowalski
 devInContents TokenAccuracy = [hereLit|The cats on the mat
 Ala has a cat
 |]
+devInContents SegmentAccuracy = [hereLit|John is smart
+Mary's intelligent
+|]
 devInContents (ProbabilisticMultiLabelFMeasure beta) = devInContents (MultiLabelFMeasure beta)
 devInContents (MultiLabelFMeasure _) = [hereLit|Jan Kowalski is here
 I see him
@@ -605,6 +624,9 @@ O B-firstname/JAN B-surname/KOWALSKI
 devExpectedContents TokenAccuracy = [hereLit|* N * *
 N N V * N
 |]
+devExpectedContents SegmentAccuracy = [hereLit|N:1-4 V:6-7 A:9-13
+N:1-4 V:6-7 A:9-19
+|]
 devExpectedContents (ProbabilisticMultiLabelFMeasure beta) = devExpectedContents (MultiLabelFMeasure beta)
 devExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,2 first-name/1 surname/2
 
@@ -625,7 +647,8 @@ devExpectedContents _ = [hereLit|0.82
 |]
 
 testInContents :: Metric -> String
-testInContents GLEU = testInContents BLEU
+testInContents GLEU = [hereLit|Alice has a black
+|]
 testInContents BLEU = [hereLit|ja jumala kutsui valkeuden päiväksi , ja pimeyden hän kutsui yöksi ja tuli ehtoo , ja tuli aamu , ensimmäinen päivä
 |]
@@ -673,6 +696,9 @@ No name here
 testInContents TokenAccuracy = [hereLit|I have cats
 I know
 |]
+testInContents SegmentAccuracy = [hereLit|Mary's cat is old
+John is young
+|]
 testInContents (ProbabilisticMultiLabelFMeasure beta) = testInContents (MultiLabelFMeasure beta)
 testInContents (MultiLabelFMeasure _) = [hereLit|John bloody Smith
 Nobody is there
@@ -691,7 +717,6 @@ testInContents _ = [hereLit|0.72
 0
 0.007
 |]
 testExpectedContents :: Metric -> String
-testExpectedContents GLEU = testExpectedContents BLEU
 testExpectedContents BLEU = [hereLit|na ka huaina e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po a ko te ahiahi , ko te ata , he ra kotahi
 |]
@@ -739,6 +764,9 @@ O O
 O
 testExpectedContents TokenAccuracy = [hereLit|* V N
 * V
 |]
+testExpectedContents SegmentAccuracy = [hereLit|N:1-6 N:8-10 V:12-13 A:15-17
+N:1-4 V:6-7 A:9-13
+|]
 testExpectedContents (ProbabilisticMultiLabelFMeasure beta) = testExpectedContents (MultiLabelFMeasure beta)
 testExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,3 first-name/1 surname/3
 
@@ -754,10 +782,13 @@ bar:1/50,50,1000,1000
 testExpectedContents ClippEU = [hereLit|3/0,0,100,100/10
 1/10,10,1000,1000/10
 |]
+testExpectedContents GLEU = [hereLit|Alice has a black cat
+|]
 testExpectedContents _ = [hereLit|0.11
 17.2
 |]
+
 gitignoreContents :: String
 gitignoreContents = [hereLit|
 *~
diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs
index 0a53a61..b87c599 100644
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@@ -26,7 +26,7 @@ import Data.Attoparsec.Text (parseOnly)
 
 data Metric = RMSE | MSE | Pearson | Spearman
             | BLEU | GLEU | WER | Accuracy | ClippEU | FMeasure Double | MacroFMeasure Double | NMI
             | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
-            | BIOF1 | BIOF1Labels | TokenAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
+            | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE | MultiLabelFMeasure Double
             | MultiLabelLogLoss | MultiLabelLikelihood | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double
             | ProbabilisticSoftFMeasure Double | Soft2DFMeasure Double
               deriving (Eq)
@@ -67,6 +67,7 @@ instance Show Metric where
   show BIOF1 = "BIO-F1"
   show BIOF1Labels = "BIO-F1-Labels"
   show TokenAccuracy = "TokenAccuracy"
+  show SegmentAccuracy = "SegmentAccuracy"
   show MAE = "MAE"
   show SMAPE = "SMAPE"
   show (MultiLabelFMeasure beta) = "MultiLabel-F" ++ (show beta)
@@ -118,6 +119,7 @@ instance Read Metric where
   readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)]
   readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
   readsPrec _ ('T':'o':'k':'e':'n':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(TokenAccuracy, theRest)]
+  readsPrec _ ('S':'e':'g':'m':'e':'n':'t':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(SegmentAccuracy, theRest)]
   readsPrec _ ('M':'A':'E':theRest) = [(MAE, theRest)]
   readsPrec _ ('S':'M':'A':'P':'E':theRest) = [(SMAPE, theRest)]
   readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'o':'g':'L':'o':'s':'s':theRest) = [(MultiLabelLogLoss, theRest)]
@@ -154,6 +156,7 @@ getMetricOrdering Likelihood = TheHigherTheBetter
 getMetricOrdering BIOF1 = TheHigherTheBetter
 getMetricOrdering BIOF1Labels = TheHigherTheBetter
 getMetricOrdering TokenAccuracy = TheHigherTheBetter
+getMetricOrdering SegmentAccuracy = TheHigherTheBetter
 getMetricOrdering MAE = TheLowerTheBetter
 getMetricOrdering SMAPE = TheLowerTheBetter
 getMetricOrdering (MultiLabelFMeasure _) = TheHigherTheBetter
diff --git a/src/GEval/MetricsMechanics.hs b/src/GEval/MetricsMechanics.hs
index dc52eb2..0301cde 100644
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@@ -33,7 +33,9 @@ import Data.Maybe (catMaybes)
 
 import Control.Monad ((<=<))
 
-import GEval.Annotation (Annotation, ObtainedAnnotation, parseAnnotations, parseObtainedAnnotations)
+import GEval.Annotation (Annotation, ObtainedAnnotation,
+                         parseAnnotations, parseObtainedAnnotations,
+                         parseSegmentAnnotations, segmentAccuracy)
 import GEval.Clippings (Clipping, ClippingSpec, LabeledClipping, lineClippingsParser, lineClippingSpecsParser, lineLabeledClippingsParser)
 import GEval.BIO (TaggedEntity, parseBioSequenceIntoEntities, parseBioSequenceIntoEntitiesWithoutNormalization)
 import GEval.LogLossHashed (parseWordSpecs, wordSpecToPair)
@@ -45,7 +47,7 @@ import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countL
 
 singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | AAccuracy | AClippEU
                            | AFMeasure | AMacroFMeasure | ANMI | ALogLossHashed | ACharMatch | AMAP | ALogLoss | ALikelihood
-                           | ABIOF1 | ABIOF1Labels | ATokenAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure
+                           | ABIOF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure
                            | AMultiLabelLogLoss | AMultiLabelLikelihood | ASoftFMeasure | AProbabilisticMultiLabelFMeasure
                            | AProbabilisticSoftFMeasure | ASoft2DFMeasure
                            deriving (Eq)
@@ -73,6 +75,7 @@ toHelper Likelihood = ALikelihood
 toHelper BIOF1 = ABIOF1
 toHelper BIOF1Labels = ABIOF1Labels
 toHelper TokenAccuracy = ATokenAccuracy
+toHelper SegmentAccuracy = ASegmentAccuracy
 toHelper (LikelihoodHashed _) = ALikelihoodHashed
 toHelper MAE = AMAE
 toHelper SMAPE = ASMAPE
@@ -114,6 +117,7 @@ type family ParsedExpectedType (t :: AMetric) :: * where
   ParsedExpectedType ABIOF1 = [TaggedEntity]
   ParsedExpectedType ABIOF1Labels = [TaggedEntity]
   ParsedExpectedType ATokenAccuracy = [Text]
+  ParsedExpectedType ASegmentAccuracy = [Annotation]
   ParsedExpectedType AMAE = Double
   ParsedExpectedType ASMAPE = Double
   ParsedExpectedType AMultiLabelFMeasure = [Text]
@@ -146,6 +150,7 @@ expectedParser SALikelihood = doubleParser
 expectedParser SABIOF1 = parseBioSequenceIntoEntities
 expectedParser SABIOF1Labels = parseBioSequenceIntoEntitiesWithoutNormalization
 expectedParser SATokenAccuracy = intoWords
+expectedParser SASegmentAccuracy = parseSegmentAnnotations
 expectedParser SAMAE = doubleParser
 expectedParser SASMAPE = doubleParser
 expectedParser SAMultiLabelFMeasure = intoWords
@@ -190,6 +195,7 @@ outputParser SALikelihood = doubleParser
 outputParser SABIOF1 = parseBioSequenceIntoEntities
 outputParser SABIOF1Labels = parseBioSequenceIntoEntitiesWithoutNormalization
 outputParser SATokenAccuracy = intoWords
+outputParser SASegmentAccuracy = parseSegmentAnnotations
 outputParser SAMAE = doubleParser
 outputParser SASMAPE = doubleParser
 outputParser SAMultiLabelFMeasure = intoWords
@@ -244,6 +250,7 @@ itemStep SALikelihood = itemLogLossError
 itemStep SABIOF1 = uncurry gatherCountsForBIO
 itemStep SABIOF1Labels = uncurry gatherCountsForBIO
 itemStep SATokenAccuracy = countHitsAndTotals
+itemStep SASegmentAccuracy = uncurry segmentAccuracy
 itemStep SAMAE = itemAbsoluteError
 itemStep SASMAPE = smape
 itemStep SAMultiLabelFMeasure = getCounts (==)
diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs
index 2158ba9..21659ab 100644
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@@ -63,6 +63,7 @@ listOfAvailableMetrics = [RMSE,
                           BIOF1,
                           BIOF1Labels,
                           TokenAccuracy,
+                          SegmentAccuracy,
                           SoftFMeasure 1.0,
                           SoftFMeasure 2.0,
                           SoftFMeasure 0.25,
@@ -93,6 +94,8 @@ isMetricDescribed :: Metric -> Bool
 isMetricDescribed (SoftFMeasure _) = True
 isMetricDescribed (Soft2DFMeasure _) = True
 isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
+isMetricDescribed GLEU = True
+isMetricDescribed SegmentAccuracy = True
 isMetricDescribed _ = False
 
 getEvaluationSchemeDescription :: EvaluationScheme -> String
@@ -118,8 +121,26 @@ where calibration measures the quality of probabilities (how well they are calib
 if we have 10 items with probability 0.5 and 5 of them are correct, then the calibration
 is perfect.
 |]
-
-
+getMetricDescription GLEU =
+  [i|For the GLEU score, we record all sub-sequences of
+1, 2, 3 or 4 tokens in the output and target sequences (n-grams). We then
+compute the recall, which is the ratio of the number of matching n-grams
+to the number of total n-grams in the target (ground-truth) sequence,
+and the precision, which is the ratio of the number of matching n-grams
+to the number of total n-grams in the generated output sequence. The
+GLEU score is then simply the minimum of recall and precision. The GLEU
+score always ranges between 0 (no matches) and 1 (all n-grams match) and
+it is symmetrical when switching output and target. According to
+the article, the GLEU score correlates quite well with the BLEU
+metric at the corpus level but does not have its drawbacks for a
+per-sentence reward objective.
+see: https://arxiv.org/pdf/1609.08144.pdf
+|]
+getMetricDescription SegmentAccuracy =
+  [i|Accuracy counted for segments, i.e. labels with positions.
+For each item, the percentage of ground-truth segments retrieved in the actual output is returned.
+Accuracy is calculated separately for each item and then averaged.
+|]
 
 outContents :: Metric -> String
 outContents (SoftFMeasure _) = [hereLit|inwords:1-4
@@ -132,6 +153,11 @@ outContents (ProbabilisticMultiLabelFMeasure _) = [hereLit|first-name/1:0.8 surn
 surname/1:0.4 first-name/3:0.9
 |]
+outContents GLEU = [hereLit|Alice has a black
+|]
+outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17
+N:1-4 V:6-7 A:9-13
+|]
 
 expectedScore :: EvaluationScheme -> MetricValue
 expectedScore (EvaluationScheme (SoftFMeasure beta) [])
@@ -146,6 +172,10 @@ expectedScore (EvaluationScheme (ProbabilisticMultiLabelFMeasure beta) [])
   = let precision = 0.6569596940847289
         recall = 0.675
     in weightedHarmonicMean beta precision recall
+expectedScore (EvaluationScheme GLEU [])
+  = 0.7142857142857143
+expectedScore (EvaluationScheme SegmentAccuracy [])
+  = 0.875
 
 helpMetricParameterMetricsList :: String
 helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@@ -194,7 +224,15 @@ the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page numbe
 formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability
 can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
 |]
-
+formatDescription GLEU = [hereLit|In each line there is a space-separated sentence of words.
+|]
+formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas the position part is a list of
+1-based indexes or spans separated by commas (spans are inclusive
+ranges, e.g. "10-14"). For instance, "foo:bar:2,4-7,10" is the
+label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that no
+overlapping segments can be returned (evaluation will fail in
+such a case).
+|]
 
 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
@@ -206,6 +244,17 @@ As far as the second item is concerned, the total area that covered by the outpu
 Hence, recall is 247500/902500=0.274 and precision - 247500/(20000+912000+240000)=0.211. Therefore, the F-score
 for the second item is 0.238 and the F-score for the whole set is (0 + 0.238)/2 = 0.119.|]
 scoreExplanation (EvaluationScheme (ProbabilisticMultiLabelFMeasure _) []) = Nothing
+scoreExplanation (EvaluationScheme GLEU [])
+  = Just [hereLit|To find the GLEU score, we first count the numbers of tp (true positives), fp (false positives) and fn (false negatives).
+We have 4 matching unigrams ("Alice", "has", "a", "black"), 3 bigrams ("Alice has", "has a", "a black"),
+2 trigrams ("Alice has a", "has a black") and 1 tetragram ("Alice has a black"),
+so tp=10. We have no fp, therefore fp=0. There are 4 fn ("cat", "black cat", "a black cat", "has a black cat").
+Now we have to calculate precision and recall:
+precision is tp / (tp+fp) = 10/(10+0) = 1,
+recall is tp / (tp+fn) = 10/(10+4) = 10/14 =~ 0.71428...
+The GLEU score is min(precision, recall) = 0.71428.|]
+scoreExplanation (EvaluationScheme SegmentAccuracy [])
+  = Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correctly (accuracy is 3/4=0.75).
+The second item was retrieved perfectly (accuracy is 1.0). Hence, the average is (0.75+1.0)/2=0.875.|]
 
 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b
diff --git a/test/Spec.hs b/test/Spec.hs
index 7eea51c..cb6291e 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -146,6 +146,9 @@ main = hspec $ do
     describe "TokenAccuracy" $ do
       it "simple example" $ do
        runGEvalTest "token-accuracy-simple" `shouldReturnAlmost` 0.5
+    describe "SegmentAccuracy" $ do
+      it "simple test" $ do
+        runGEvalTest "segment-accuracy-simple" `shouldReturnAlmost` 0.4444444
     describe "precision count" $ do
       it "simple test" $ do
         precisionCount [["Alice", "has", "a", "cat" ]] ["Ala", "has", "cat"] `shouldBe` 2
@@ -342,6 +345,11 @@ main = hspec $ do
     it "just parse" $ do
       parseAnnotations "foo:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo" (IS.fromList [3,7,8,9,10]),
                                                               Annotation "baz" (IS.fromList [4,5,6])]
+    it "just parse with colons" $ do
+      parseSegmentAnnotations "foo:x:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo:x" (IS.fromList [3,7,8,9,10]),
+                                                                       Annotation "baz" (IS.fromList [4,5,6])]
+    it "refuses overlapping segments" $ do
+      parseSegmentAnnotations "foo:x:3,7-10 baz:2-6" `shouldBe` Left "Overlapping segments"
     it "just parse 2" $ do
       parseAnnotations "inwords:1-3 indigits:5" `shouldBe` Right [Annotation "inwords" (IS.fromList [1,2,3]),
                                                                   Annotation "indigits" (IS.fromList [5])]
diff --git a/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv b/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv
new file mode 100644
index 0000000..4af8b51
--- /dev/null
+++ b/test/segment-accuracy-simple/segment-accuracy-simple-solution/test-A/out.tsv
@@ -0,0 +1,3 @@
+foo:0 baq:1-2 baz:3
+aaa:0-1
+xyz:0 bbb:x:1
diff --git a/test/segment-accuracy-simple/segment-accuracy-simple/config.txt b/test/segment-accuracy-simple/segment-accuracy-simple/config.txt
new file mode 100644
index 0000000..2f838f0
--- /dev/null
+++ b/test/segment-accuracy-simple/segment-accuracy-simple/config.txt
@@ -0,0 +1 @@
+--metric SegmentAccuracy
diff --git a/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv b/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv
new file mode 100644
index 0000000..bc95bcb
--- /dev/null
+++ b/test/segment-accuracy-simple/segment-accuracy-simple/test-A/expected.tsv
@@ -0,0 +1,3 @@
+foo:0 bar:1-2 baz:3
+aaa:0-2
+xyz:0 bbb:x:1 ccc:x:2
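
For reference, a minimal standalone sketch (not part of the diff above) of how the newly exported `parseSegmentAnnotations` and `segmentAccuracy` compose. It scores the `segment-accuracy-simple` fixture and averages the per-item scores (mirroring the `averageC` aggregation wired up in `Core.hs`), reproducing the 0.4444444 expected in `test/Spec.hs` (item scores 2/3, 0 and 2/3, i.e. 4/9):

    import qualified Data.Text as T
    import GEval.Annotation (parseSegmentAnnotations, segmentAccuracy)

    main :: IO ()
    main = do
      -- the expected.tsv and out.tsv lines from the segment-accuracy-simple fixture
      let expectedLines = ["foo:0 bar:1-2 baz:3", "aaa:0-2", "xyz:0 bbb:x:1 ccc:x:2"]
          outputLines   = ["foo:0 baq:1-2 baz:3", "aaa:0-1", "xyz:0 bbb:x:1"]
          -- per-item score: the fraction of expected segments present in the output
          scoreItem e o = segmentAccuracy <$> parseSegmentAnnotations (T.pack e)
                                          <*> parseSegmentAnnotations (T.pack o)
      case sequence (zipWith scoreItem expectedLines outputLines) of
        Left err     -> putStrLn ("parse error: " ++ err)
        -- averaging per-item scores, as GEval does for SegmentAccuracy
        Right scores -> print (sum scores / fromIntegral (length scores))
        -- prints 0.4444444444444444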