diff --git a/src/GEval/MatchingSpecification.hs b/src/GEval/MatchingSpecification.hs index 6efd243..b41c270 100644 --- a/src/GEval/MatchingSpecification.hs +++ b/src/GEval/MatchingSpecification.hs @@ -31,9 +31,13 @@ singletons [d|data MatchingSpecification = ExactMatch -- ^ exact match, i.e. ide -- is matched and then proceed with some matching spec. | SmartMatch MatchingSpecification -- ^ do fuzzy matching only on values -- containing letters + | Harden MatchingSpecification -- ^ harden a soft match deriving (Eq) |] +hardeningThreshold :: Double +hardeningThreshold = 0.8 + getMatchingFunctionForString :: MatchingSpecification -> String -> String -> Double getMatchingFunctionForString ExactMatch got expected | got == expected = 1.0 @@ -51,6 +55,11 @@ getMatchingFunctionForString (SmartMatch smatchSpec) got expected = getMatchingF then smatchSpec else ExactMatch +getMatchingFunctionForString (Harden smatchSpec) got expected = if softMatch >= hardeningThreshold + then 1.0 + else 0.0 + where softMatch = getMatchingFunctionForString smatchSpec got expected + -- | Whether suitable for fuzzy matching when in the "smart" match mode. -- At the moment we check whether it contains at least one letter -- (we require the exact match for, for instance, numbers written with digits. diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs index e3da6c6..d30f214 100644 --- a/src/GEval/Metric.hs +++ b/src/GEval/Metric.hs @@ -84,6 +84,7 @@ instance Show Metric where show (MultiLabelFMeasure beta FuzzyMatch) = "Fuzzy/" ++ (show $ MultiLabelFMeasure beta ExactMatch) show (MultiLabelFMeasure beta (CutLabel matchSpec)) = "CutLabel/" ++ (show $ MultiLabelFMeasure beta matchSpec) show (MultiLabelFMeasure beta (SmartMatch matchSpec)) = "Smart/" ++ (show $ MultiLabelFMeasure beta matchSpec) + show (MultiLabelFMeasure beta (Harden matchSpec)) = "Harden/" ++ (show $ MultiLabelFMeasure beta matchSpec) show MultiLabelLogLoss = "MultiLabel-Logloss" show MultiLabelLikelihood = "MultiLabel-Likelihood" show (Mean metric) = "Mean/" ++ (show metric) @@ -108,6 +109,9 @@ instance Read Metric where readsPrec p ('S':'m':'a':'r':'t':'/':theRest) = case readsPrec p theRest of [(metric, theRest)] -> [(applyMatchingSpecification SmartMatch metric, theRest)] _ -> [] + readsPrec p ('H':'a':'r':'d':'e':'n':'/':theRest) = case readsPrec p theRest of + [(metric, theRest)] -> [(applyMatchingSpecification Harden metric, theRest)] + _ -> [] readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)] readsPrec _ ('M':'S':'E':theRest) = [(MSE, theRest)] readsPrec _ ('P':'e':'a':'r':'s':'o':'n':theRest) = [(Pearson, theRest)] diff --git a/test/Spec.hs b/test/Spec.hs index b3a23e4..43b8a3d 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -351,6 +351,8 @@ main = hspec $ do runGEvalTest "multilabel-f1-ie-fuzzy" `shouldReturnAlmost` 0.681777777777 it "information extraction with smart fuzzy matching" $ do runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444 + it "information extraction with smart fuzzy matching hardened" $ do + runGEvalTest "multilabel-f1-ie-fuzzy-harden" `shouldReturnAlmost` 0.555555555 describe "Mean/MultiLabel-F" $ do it "simple" $ do runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5 diff --git a/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden-solution/test-A/out.tsv b/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden-solution/test-A/out.tsv new file mode 100644 index 0000000..753fb27 --- /dev/null +++ b/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden-solution/test-A/out.tsv @@ -0,0 +1,3 @@ +important-person=JOHN_BROWN important-person=JOHN_SMITH company-name=Axaxaxaas_Mlo profit=12031 +company-name=Foo_Bar profit=1220 +company-name=Whatever important-person=PIERRE_MENARD diff --git a/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden/config.txt b/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden/config.txt new file mode 100644 index 0000000..231de13 --- /dev/null +++ b/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden/config.txt @@ -0,0 +1 @@ +--metric Harden/CutLabel/Smart/Fuzzy/MultiLabel-F1:ls<_(inc|ltd)\.?(\s|$)><\2> diff --git a/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden/test-A/expected.tsv b/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden/test-A/expected.tsv new file mode 100644 index 0000000..ff50263 --- /dev/null +++ b/test/multilabel-f1-ie-fuzzy-harden/multilabel-f1-ie-fuzzy-harden/test-A/expected.tsv @@ -0,0 +1,3 @@ +company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown +company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith +company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard