Add hardening

This commit is contained in:
Filip Gralinski 2020-07-02 18:22:29 +02:00
parent 236712c52b
commit 6c295a3325
6 changed files with 22 additions and 0 deletions

View File

@ -31,9 +31,13 @@ singletons [d|data MatchingSpecification = ExactMatch -- ^ exact match, i.e. ide
-- is matched and then proceed with some matching spec. -- is matched and then proceed with some matching spec.
| SmartMatch MatchingSpecification -- ^ do fuzzy matching only on values | SmartMatch MatchingSpecification -- ^ do fuzzy matching only on values
-- containing letters -- containing letters
| Harden MatchingSpecification -- ^ harden a soft match
deriving (Eq) deriving (Eq)
|] |]
-- | Inclusive cut-off applied when a soft match is hardened: a score at or
-- above this value is reported as 1.0, anything lower as 0.0.
hardeningThreshold :: Double
hardeningThreshold = 0.8
getMatchingFunctionForString :: MatchingSpecification -> String -> String -> Double getMatchingFunctionForString :: MatchingSpecification -> String -> String -> Double
getMatchingFunctionForString ExactMatch got expected getMatchingFunctionForString ExactMatch got expected
| got == expected = 1.0 | got == expected = 1.0
@ -51,6 +55,11 @@ getMatchingFunctionForString (SmartMatch smatchSpec) got expected = getMatchingF
then smatchSpec then smatchSpec
else ExactMatch else ExactMatch
-- Hardening: score the pair with the wrapped specification, then collapse the
-- result to a binary outcome around 'hardeningThreshold' (inclusive).
getMatchingFunctionForString (Harden innerSpec) got expected
  | softScore >= hardeningThreshold = 1.0
  | otherwise                       = 0.0
  where
    softScore = getMatchingFunctionForString innerSpec got expected
-- | Whether suitable for fuzzy matching when in the "smart" match mode. -- | Whether suitable for fuzzy matching when in the "smart" match mode.
-- At the moment we check whether it contains at least one letter -- At the moment we check whether it contains at least one letter
-- (we require the exact match for, for instance, numbers written with digits. -- (we require the exact match for, for instance, numbers written with digits.

View File

@ -84,6 +84,7 @@ instance Show Metric where
show (MultiLabelFMeasure beta FuzzyMatch) = "Fuzzy/" ++ (show $ MultiLabelFMeasure beta ExactMatch) show (MultiLabelFMeasure beta FuzzyMatch) = "Fuzzy/" ++ (show $ MultiLabelFMeasure beta ExactMatch)
show (MultiLabelFMeasure beta (CutLabel matchSpec)) = "CutLabel/" ++ (show $ MultiLabelFMeasure beta matchSpec) show (MultiLabelFMeasure beta (CutLabel matchSpec)) = "CutLabel/" ++ (show $ MultiLabelFMeasure beta matchSpec)
show (MultiLabelFMeasure beta (SmartMatch matchSpec)) = "Smart/" ++ (show $ MultiLabelFMeasure beta matchSpec) show (MultiLabelFMeasure beta (SmartMatch matchSpec)) = "Smart/" ++ (show $ MultiLabelFMeasure beta matchSpec)
-- A hardened specification is rendered as a "Harden/" prefix in front of the
-- rendering of the same metric with the wrapped specification.
show (MultiLabelFMeasure beta (Harden innerSpec)) = "Harden/" ++ show (MultiLabelFMeasure beta innerSpec)
show MultiLabelLogLoss = "MultiLabel-Logloss" show MultiLabelLogLoss = "MultiLabel-Logloss"
show MultiLabelLikelihood = "MultiLabel-Likelihood" show MultiLabelLikelihood = "MultiLabel-Likelihood"
show (Mean metric) = "Mean/" ++ (show metric) show (Mean metric) = "Mean/" ++ (show metric)
@ -108,6 +109,9 @@ instance Read Metric where
readsPrec p ('S':'m':'a':'r':'t':'/':theRest) = case readsPrec p theRest of readsPrec p ('S':'m':'a':'r':'t':'/':theRest) = case readsPrec p theRest of
[(metric, theRest)] -> [(applyMatchingSpecification SmartMatch metric, theRest)] [(metric, theRest)] -> [(applyMatchingSpecification SmartMatch metric, theRest)]
_ -> [] _ -> []
-- "Harden/<metric>": parse the remainder as a metric and wrap its matching
-- specification with 'Harden'; anything but a single unambiguous parse fails.
readsPrec p ('H':'a':'r':'d':'e':'n':'/':theRest) =
  case readsPrec p theRest of
    [(innerMetric, remaining)] ->
      [(applyMatchingSpecification Harden innerMetric, remaining)]
    _ -> []
readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)] readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
readsPrec _ ('M':'S':'E':theRest) = [(MSE, theRest)] readsPrec _ ('M':'S':'E':theRest) = [(MSE, theRest)]
readsPrec _ ('P':'e':'a':'r':'s':'o':'n':theRest) = [(Pearson, theRest)] readsPrec _ ('P':'e':'a':'r':'s':'o':'n':theRest) = [(Pearson, theRest)]

View File

@ -351,6 +351,8 @@ main = hspec $ do
runGEvalTest "multilabel-f1-ie-fuzzy" `shouldReturnAlmost` 0.681777777777 runGEvalTest "multilabel-f1-ie-fuzzy" `shouldReturnAlmost` 0.681777777777
it "information extraction with smart fuzzy matching" $ do it "information extraction with smart fuzzy matching" $ do
runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444 runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444
it "information extraction with smart fuzzy matching hardened" $ do
runGEvalTest "multilabel-f1-ie-fuzzy-harden" `shouldReturnAlmost` 0.555555555
describe "Mean/MultiLabel-F" $ do describe "Mean/MultiLabel-F" $ do
it "simple" $ do it "simple" $ do
runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5 runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5

View File

@ -0,0 +1,3 @@
important-person=JOHN_BROWN important-person=JOHN_SMITH company-name=Axaxaxaas_Mlo profit=12031
company-name=Foo_Bar profit=1220
company-name=Whatever important-person=PIERRE_MENARD
1 important-person=JOHN_BROWN important-person=JOHN_SMITH company-name=Axaxaxaas_Mlo profit=12031
2 company-name=Foo_Bar profit=1220
3 company-name=Whatever important-person=PIERRE_MENARD

View File

@ -0,0 +1 @@
--metric Harden/CutLabel/Smart/Fuzzy/MultiLabel-F1:ls<_(inc|ltd)\.?(\s|$)><\2>

View File

@ -0,0 +1,3 @@
company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard
1 company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
2 company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
3 company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard