diff --git a/geval.cabal b/geval.cabal index 1ca0a95..2048d50 100644 --- a/geval.cabal +++ b/geval.cabal @@ -23,6 +23,7 @@ library , GEval.CreateChallenge , GEval.OptionsParser , GEval.BLEU + , GEval.Haversine , GEval.Clippings , GEval.PrecisionRecall , GEval.ClusteringMetrics diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs index e003f72..ad4b90e 100644 --- a/src/GEval/Core.hs +++ b/src/GEval/Core.hs @@ -185,6 +185,7 @@ isPreprocessable (MultiLabelFMeasure _ _) = True isPreprocessable MultiLabelLogLoss = False isPreprocessable MultiLabelLikelihood = False isPreprocessable (Mean metric) = isPreprocessable metric +isPreprocessable Haversine = False defaultOutDirectory = "." defaultTestName = "test-A" @@ -1001,6 +1002,7 @@ continueGEvalCalculations SAMultiLabelLogLoss MultiLabelLogLoss = defineContinua id noGraph +continueGEvalCalculations SAHaversine Haversine = defineContinuation averageC id noGraph defineContinuation :: (ConduitT c Void (ResourceT m) d) -- ^ a Conduit which aggregates all the combined values into -- a "total" value diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs index 5e283f9..a52c663 100644 --- a/src/GEval/CreateChallenge.hs +++ b/src/GEval/CreateChallenge.hs @@ -115,6 +115,15 @@ Do OCR. This is a sample fake challenge for Gonito framework. Replace it with the description of your challenge.|] ++ (commonReadmeMDContents testName) +readmeMDContents Haversine testName = [i| +GEval simple sphere distance +========================== + +Compute distance between two points on a sphere given their longitudes and latitudes. + +This is a sample fake challenge for Gonito framework. Replace it with +the description of your challenge.|] ++ (commonReadmeMDContents testName) + readmeMDContents Accuracy testName = [i| GEval sample classification challenge ===================================== @@ -462,6 +471,11 @@ configContents schemes format testName = -- for the time being we are using the original function. trainInContents :: Metric -> String +trainInContents Haversine = unlines + $ map last + $ map (splitOn "\t") + $ lines + $ trainContents Haversine trainInContents metric = unlines $ map (intercalate "\t") $ map tail @@ -470,6 +484,12 @@ trainInContents metric = unlines $ trainContents metric trainExpectedContents :: Metric -> String +trainExpectedContents Haversine = unlines + $ map (intercalate "\t") + $ map (take 2) + $ map (splitOn "\t") + $ lines + $ trainContents Haversine trainExpectedContents metric = unlines $ map head $ map (splitOn "\t") $ lines $ trainContents metric trainContents :: Metric -> String @@ -484,6 +504,9 @@ trainContents CER = [hereLit|Hannibal ad portas train1.pdf equo ne credite train2.pdf errare humanum est train3.pdf |] +trainContents Haversine = [hereLit|30.47547 -90.100911 some text +33.399478 -110.87095 Another text +|] trainContents Accuracy = [hereLit|Y 10 none yes N -2 strong no @@ -645,6 +668,8 @@ devInContents (Soft2DFMeasure _) = devInContents ClippEU devInContents ClippEU = [hereLit|file1.djvu file2.djvu |] +devInContents Haversine = [hereLit|Some dev text +|] devInContents _ = [hereLit|0.72 0 0.007 9.54 62 0.054 |] @@ -719,6 +744,8 @@ devExpectedContents (Soft2DFMeasure _) = [hereLit| devExpectedContents ClippEU = [hereLit| 10/10,20,30,100/5 3/0,50,500,500/5 |] +devExpectedContents Haversine = [hereLit|32.812883 -109.625582 +|] devExpectedContents _ = [hereLit|0.82 95.2 |] @@ -794,6 +821,9 @@ testInContents (Soft2DFMeasure _) = testInContents ClippEU testInContents ClippEU = [hereLit|file3.djvu file4.djvu |] +testInContents Haversine = [hereLit|Some test text +Another test text +|] testInContents _ = [hereLit|0.72 0 0.007 9.54 62 0.054 |] @@ -871,6 +901,9 @@ testExpectedContents ClippEU = [hereLit|3/0,0,100,100/10 |] testExpectedContents GLEU = [hereLit|Alice has a black cat |] +testExpectedContents Haversine = [hereLit|39.575264 -76.995928 +29.949932 -90.070116 +|] testExpectedContents _ = [hereLit|0.11 17.2 |] @@ -921,6 +954,7 @@ inHeaderContents MultiLabelLikelihood = inHeaderContents MultiLabelLogLoss inHeaderContents MultiLabelLogLoss = Just ["Utterance"] inHeaderContents (Soft2DFMeasure _) = inHeaderContents ClippEU inHeaderContents ClippEU = Just ["DjvuFilePath"] +inHeaderContents Haversine = Just ["Text"] inHeaderContents _ = Just ["OrbitalPeriod", "OrbitalEccentricity", "NumberOfMoons"] outHeaderContents :: Metric -> Maybe [String] @@ -951,6 +985,7 @@ outHeaderContents MultiLabelLikelihood = outHeaderContents MultiLabelLogLoss outHeaderContents MultiLabelLogLoss = Just ["Emotion"] outHeaderContents (Soft2DFMeasure _) = Just ["Rectangle"] outHeaderContents ClippEU = Just ["Rectangle"] +outHeaderContents Haversine = Just ["Longitude", "Latitude"] outHeaderContents _ = Just ["Mass"] gitignoreContents :: String diff --git a/src/GEval/Haversine.hs b/src/GEval/Haversine.hs new file mode 100644 index 0000000..0511edd --- /dev/null +++ b/src/GEval/Haversine.hs @@ -0,0 +1,28 @@ +{-# LANGUAGE OverloadedStrings #-} + +module GEval.Haversine + (haversine) + where + + +haversine :: ((Double, Double), (Double, Double)) -> Double +haversine ((longitude_1, latitude_1), (longitude_2, latitude_2)) = hav longitude_1_rad latitude_1_rad longitude_2_rad latitude_2_rad + where + longitude_1_rad = toRadians longitude_1 + latitude_1_rad = toRadians latitude_1 + longitude_2_rad = toRadians longitude_2 + latitude_2_rad = toRadians latitude_2 + +hav :: Double -> Double -> Double -> Double -> Double +hav longitude_1 latitude_1 longitude_2 latitude_2 = 2 * asin (sqrt h) * r + where + r = 6371.0 -- Radius of earth in kilometers. Use 3956 for miles + longitude = longitude_2 - longitude_1 + latitude = latitude_2 - latitude_1 + h = hav_ longitude + cos latitude * cos latitude * hav_ latitude + +hav_ :: Double -> Double +hav_ x = sin(x / 2) ** 2 + +toRadians :: Double -> Double +toRadians degrees = degrees * pi / 180.0 diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs index 77ab327..8597600 100644 --- a/src/GEval/Metric.hs +++ b/src/GEval/Metric.hs @@ -34,6 +34,7 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | CER | Accura | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double | ProbabilisticSoftFMeasure Double | Soft2DFMeasure Double | FLCFMeasure Double + | Haversine -- it would be better to avoid infinite recursion here -- `Mean (Mean BLEU)` is not useful, but as it would mean -- a larger refactor, we will postpone this @@ -88,6 +89,7 @@ instance Show Metric where show (MultiLabelFMeasure beta (Harden matchSpec)) = "Harden/" ++ (show $ MultiLabelFMeasure beta matchSpec) show MultiLabelLogLoss = "MultiLabel-Logloss" show MultiLabelLikelihood = "MultiLabel-Likelihood" + show Haversine = "Haversine" show (Mean metric) = "Mean/" ++ (show metric) applyMatchingSpecification :: (MatchingSpecification -> MatchingSpecification) @@ -166,6 +168,7 @@ instance Read Metric where readsPrec _ ('S':'M':'A':'P':'E':theRest) = [(SMAPE, theRest)] readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'o':'g':'L':'o':'s':'s':theRest) = [(MultiLabelLogLoss, theRest)] readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'i':'k':'e':'l':'i':'h':'o':'o':'d':theRest) = [(MultiLabelLikelihood, theRest)] + readsPrec _ ('H':'a':'v':'e':'r':'s':'i':'n':'e':theRest) = [(Haversine, theRest)] @@ -206,6 +209,7 @@ getMetricOrdering SMAPE = TheLowerTheBetter getMetricOrdering (MultiLabelFMeasure _ _) = TheHigherTheBetter getMetricOrdering MultiLabelLogLoss = TheLowerTheBetter getMetricOrdering MultiLabelLikelihood = TheHigherTheBetter +getMetricOrdering Haversine = TheLowerTheBetter getMetricOrdering (Mean metric) = getMetricOrdering metric metricCompare :: Metric -> MetricValue -> MetricValue -> Ordering diff --git a/src/GEval/MetricsMechanics.hs b/src/GEval/MetricsMechanics.hs index c0c4c98..6e66336 100644 --- a/src/GEval/MetricsMechanics.hs +++ b/src/GEval/MetricsMechanics.hs @@ -43,6 +43,7 @@ import GEval.BIO (TaggedEntity, parseBioSequenceIntoEntities, parseBioSequenceIn import GEval.LogLossHashed (parseWordSpecs, wordSpecToPair) import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countLogLossOnProbList) import GEval.MatchingSpecification +import GEval.Haversine -- | Helper type so that singleton can be used. -- | (The problem is that some metrics are parametrized by Double @@ -53,7 +54,7 @@ singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | ABIOF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure MatchingSpecification | AMultiLabelLogLoss | AMultiLabelLikelihood | ASoftFMeasure | AProbabilisticMultiLabelFMeasure | AProbabilisticSoftFMeasure | ASoft2DFMeasure - | AFLCFMeasure + | AFLCFMeasure | AHaversine deriving (Eq) |] @@ -92,6 +93,7 @@ toHelper (FLCFMeasure _) = AFLCFMeasure toHelper (ProbabilisticMultiLabelFMeasure _) = AProbabilisticMultiLabelFMeasure toHelper (ProbabilisticSoftFMeasure _) = AProbabilisticSoftFMeasure toHelper (Soft2DFMeasure _) = ASoft2DFMeasure +toHelper Haversine = AHaversine type family ParsedInputType (t :: AMetric) :: * where ParsedInputType ACharMatch = Text @@ -131,6 +133,7 @@ type family ParsedExpectedType (t :: AMetric) :: * where ParsedExpectedType (AMultiLabelFMeasure _) = [Text] ParsedExpectedType AMultiLabelLogLoss = [Text] ParsedExpectedType AMultiLabelLikelihood = [Text] + ParsedExpectedType AHaversine = (Double, Double) expectedParser :: SAMetric t -> Text -> Either String (ParsedExpectedType t) expectedParser SARMSE = doubleParser @@ -166,6 +169,7 @@ expectedParser SASMAPE = doubleParser expectedParser (SAMultiLabelFMeasure _) = intoWords expectedParser SAMultiLabelLogLoss = intoWords expectedParser SAMultiLabelLikelihood = intoWords +expectedParser SAHaversine = parseSpherePoints type family ParsedOutputType (t :: AMetric) :: * where ParsedOutputType ABLEU = [String] @@ -178,6 +182,7 @@ type family ParsedOutputType (t :: AMetric) :: * where ParsedOutputType AProbabilisticMultiLabelFMeasure = [WordWithProb] ParsedOutputType AMultiLabelLikelihood = ProbList ParsedOutputType AMultiLabelLogLoss = ProbList + ParsedOutputType AHaversine = (Double, Double) ParsedOutputType t = ParsedExpectedType t outputParser :: SAMetric t -> Text -> Either String (ParsedOutputType t) @@ -214,6 +219,7 @@ outputParser SASMAPE = doubleParser outputParser (SAMultiLabelFMeasure _) = intoWords outputParser SAMultiLabelLogLoss = Right . parseIntoProbList outputParser SAMultiLabelLikelihood = Right . parseIntoProbList +outputParser SAHaversine = parseSpherePoints type family ItemIntermediateRepresentationType (t :: AMetric) :: * where ItemIntermediateRepresentationType ABLEU = (Int, Int, Int, Int, Int, Int, Int, Int, Int) @@ -241,6 +247,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where ItemIntermediateRepresentationType ACharMatch = (Text, Text) ItemIntermediateRepresentationType AWER = (Int, Int) ItemIntermediateRepresentationType ACER = (Int, Int) + ItemIntermediateRepresentationType AHaversine = Double ItemIntermediateRepresentationType t = Double itemStep :: SAMetric t -> (ParsedExpectedType t, ParsedOutputType t) -> ItemIntermediateRepresentationType t @@ -278,6 +285,7 @@ itemStep SASMAPE = smape itemStep (SAMultiLabelFMeasure smatchSpec) = getWeightedCounts (getMatchingFunctionForText $ fromSing smatchSpec) itemStep SAMultiLabelLogLoss = uncurry countLogLossOnProbList itemStep SAMultiLabelLikelihood = uncurry countLogLossOnProbList +itemStep SAHaversine = haversine doubleParser = getValue . TR.double @@ -400,3 +408,12 @@ countHitsAndTotals (es, os) = | e == (pack "*") = (h, t) | o `Prelude.elem` (splitOn (pack ";") e) = (h + 1, t + 1) | otherwise = (h, t + 1) + +parseSpherePoints :: Text -> Either String (Double, Double) +parseSpherePoints t = case DLS.splitOn "\t" (unpack t) of + [longitudeStr, latitudeStr] -> case doubleParser (pack longitudeStr) of + Right longitude -> case doubleParser (pack latitudeStr) of + Right latitude -> Right (longitude, latitude) + Left _ -> Left "cannot parse line with latitude of sphere" + Left _ -> Left "cannot parse line with longitude of sphere" + _ -> Left "cannot parse line with longitude and latitude of sphere" diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs index d5a887e..6f62d5d 100644 --- a/src/GEval/MetricsMeta.hs +++ b/src/GEval/MetricsMeta.hs @@ -76,6 +76,7 @@ listOfAvailableMetrics = [RMSE, Soft2DFMeasure 1.0, Soft2DFMeasure 2.0, Soft2DFMeasure 0.25, + Haversine, CharMatch] extraInfo :: EvaluationScheme -> Maybe String @@ -103,6 +104,7 @@ isMetricDescribed GLEU = True isMetricDescribed WER = True isMetricDescribed CER = True isMetricDescribed SegmentAccuracy = True +isMetricDescribed Haversine = True isMetricDescribed _ = False getEvaluationSchemeDescription :: EvaluationScheme -> String @@ -170,6 +172,10 @@ getMetricDescription SegmentAccuracy = The percentage of labels in the ground truth retrieved in the actual output is returned. Accuracy is calculated separately for each item and then averaged. |] +getMetricDescription Haversine = + [i|The haversine formula determines the great-circle distance between +two points on a sphere given their longitudes and latitudes (in degrees). +|] outContents :: Metric -> String outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3 @@ -197,6 +203,9 @@ tabula rasai outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17 N:1-4 V:6-7 A:9-13 |] +outContents Haversine = [hereLit|39.575264 -56.995928 +29.949932 -90.070116 +|] expectedScore :: EvaluationScheme -> MetricValue expectedScore (EvaluationScheme (MultiLabelFMeasure 1.0 ExactMatch) []) = 0.6666 @@ -220,6 +229,8 @@ expectedScore (EvaluationScheme WER []) = 0.08571 expectedScore (EvaluationScheme CER []) = 0.14814 +expectedScore (EvaluationScheme Haversine []) + = 1044.2633358563135 helpMetricParameterMetricsList :: String helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of @@ -283,6 +294,9 @@ such a case). formatDescription WER = formatDescription GLEU formatDescription CER = [hereLit|Any text, whitespace and punctuation marks are also considered. |] +formatDescription Haversine = [hereLit|Each line is a latitude and longitude of sphere separated by tabulation, +e.g. "41.558153 -73.051497". +|] scoreExplanation :: EvaluationScheme -> Maybe String scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) []) diff --git a/test/Spec.hs b/test/Spec.hs index c0e7392..22834fc 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -135,6 +135,9 @@ main = hspec $ do runGEvalTest "cer-mean-simple" `shouldReturnAlmost` 0.277777777777778 it "space escaping" $ runGEvalTest "cer-space-escaping" `shouldReturnAlmost` 0.0555555 + describe "Haversine" $ do + it "simple example" $ + runGEvalTest "haversine" `shouldReturnAlmost` 1951.9351057250876 describe "Accuracy" $ do it "simple example" $ runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6 diff --git a/test/haversine/haversine-solution/test-A/out.tsv b/test/haversine/haversine-solution/test-A/out.tsv new file mode 100644 index 0000000..89a70f4 --- /dev/null +++ b/test/haversine/haversine-solution/test-A/out.tsv @@ -0,0 +1,3 @@ +40.735657 -74.172367 +21.124312 -73.051497 +10.232421 -105.734523 diff --git a/test/haversine/haversine/config.txt b/test/haversine/haversine/config.txt new file mode 100644 index 0000000..899c565 --- /dev/null +++ b/test/haversine/haversine/config.txt @@ -0,0 +1 @@ +--metric Haversine diff --git a/test/haversine/haversine/test-A/expected.tsv b/test/haversine/haversine/test-A/expected.tsv new file mode 100644 index 0000000..ca2b3e8 --- /dev/null +++ b/test/haversine/haversine/test-A/expected.tsv @@ -0,0 +1,3 @@ +40.735657 -74.172367 +41.558153 -73.051497 +40.832421 -115.763123