Add smart mode

This commit is contained in:
Filip Gralinski 2020-07-02 18:14:56 +02:00
parent 00a2fc7d19
commit 236712c52b
6 changed files with 29 additions and 1 deletions

View File

@ -18,6 +18,9 @@ module GEval.MatchingSpecification
import Data.Singletons.TH import Data.Singletons.TH
import Data.Text import Data.Text
import Data.List.Extra (breakOn) import Data.List.Extra (breakOn)
import Data.Char (isLetter)
import Data.List (find)
import Data.Maybe (isJust)
import Text.EditDistance import Text.EditDistance
@ -26,6 +29,8 @@ singletons [d|data MatchingSpecification = ExactMatch -- ^ exact match, i.e. ide
| FuzzyMatch -- ^ fuzzy match by Levenshtein distance | FuzzyMatch -- ^ fuzzy match by Levenshtein distance
| CutLabel MatchingSpecification -- ^ require that the label (part before up to `=`) | CutLabel MatchingSpecification -- ^ require that the label (part before up to `=`)
-- is matched and then proceed with some matching spec. -- is matched and then proceed with some matching spec.
| SmartMatch MatchingSpecification -- ^ do fuzzy matching only on values
-- containing letters
deriving (Eq) deriving (Eq)
|] |]
@ -41,6 +46,16 @@ getMatchingFunctionForString (CutLabel smatchSpec) a b = getMatchingFunctionForS
where a' = cutLabel a where a' = cutLabel a
b' = cutLabel b b' = cutLabel b
getMatchingFunctionForString (SmartMatch smatchSpec) got expected = getMatchingFunctionForString chosenMatch got expected
where chosenMatch = if wantedBySmartMatch expected
then smatchSpec
else ExactMatch
-- | Whether a value is suitable for fuzzy matching in the "smart" match mode.
-- At the moment we check whether it contains at least one letter
-- (we require an exact match for, for instance, numbers written with digits).
wantedBySmartMatch = isJust . (Data.List.find isLetter)
-- | Remove the label along with the separator (the equal sign) -- | Remove the label along with the separator (the equal sign)
cutLabel :: String -> String cutLabel :: String -> String
cutLabel t = case Data.List.Extra.breakOn "=" t of cutLabel t = case Data.List.Extra.breakOn "=" t of

View File

@ -83,6 +83,7 @@ instance Show Metric where
show (MultiLabelFMeasure beta ExactMatch) = "MultiLabel-F" ++ (show beta) show (MultiLabelFMeasure beta ExactMatch) = "MultiLabel-F" ++ (show beta)
show (MultiLabelFMeasure beta FuzzyMatch) = "Fuzzy/" ++ (show $ MultiLabelFMeasure beta ExactMatch) show (MultiLabelFMeasure beta FuzzyMatch) = "Fuzzy/" ++ (show $ MultiLabelFMeasure beta ExactMatch)
show (MultiLabelFMeasure beta (CutLabel matchSpec)) = "CutLabel/" ++ (show $ MultiLabelFMeasure beta matchSpec) show (MultiLabelFMeasure beta (CutLabel matchSpec)) = "CutLabel/" ++ (show $ MultiLabelFMeasure beta matchSpec)
show (MultiLabelFMeasure beta (SmartMatch matchSpec)) = "Smart/" ++ (show $ MultiLabelFMeasure beta matchSpec)
show MultiLabelLogLoss = "MultiLabel-Logloss" show MultiLabelLogLoss = "MultiLabel-Logloss"
show MultiLabelLikelihood = "MultiLabel-Likelihood" show MultiLabelLikelihood = "MultiLabel-Likelihood"
show (Mean metric) = "Mean/" ++ (show metric) show (Mean metric) = "Mean/" ++ (show metric)
@ -104,6 +105,9 @@ instance Read Metric where
readsPrec p ('C':'u':'t':'L':'a':'b':'e':'l':'/':theRest) = case readsPrec p theRest of readsPrec p ('C':'u':'t':'L':'a':'b':'e':'l':'/':theRest) = case readsPrec p theRest of
[(metric, theRest)] -> [(applyMatchingSpecification CutLabel metric, theRest)] [(metric, theRest)] -> [(applyMatchingSpecification CutLabel metric, theRest)]
_ -> [] _ -> []
readsPrec p ('S':'m':'a':'r':'t':'/':theRest) = case readsPrec p theRest of
[(metric, theRest)] -> [(applyMatchingSpecification SmartMatch metric, theRest)]
_ -> []
readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)] readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
readsPrec _ ('M':'S':'E':theRest) = [(MSE, theRest)] readsPrec _ ('M':'S':'E':theRest) = [(MSE, theRest)]
readsPrec _ ('P':'e':'a':'r':'s':'o':'n':theRest) = [(Pearson, theRest)] readsPrec _ ('P':'e':'a':'r':'s':'o':'n':theRest) = [(Pearson, theRest)]

View File

@ -349,6 +349,8 @@ main = hspec $ do
runGEvalTest "multilabel-f1-ie-flags" `shouldReturnAlmost` 0.444444444444 runGEvalTest "multilabel-f1-ie-flags" `shouldReturnAlmost` 0.444444444444
it "information extraction with fuzzy matching" $ do it "information extraction with fuzzy matching" $ do
runGEvalTest "multilabel-f1-ie-fuzzy" `shouldReturnAlmost` 0.681777777777 runGEvalTest "multilabel-f1-ie-fuzzy" `shouldReturnAlmost` 0.681777777777
it "information extraction with smart fuzzy matching" $ do
runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444
describe "Mean/MultiLabel-F" $ do describe "Mean/MultiLabel-F" $ do
it "simple" $ do it "simple" $ do
runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5 runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5

View File

@ -0,0 +1,3 @@
important-person=JOHN_BROWN important-person=JOHN_SMITH company-name=Axaxaxaxas_Mlo profit=12031
company-name=Foo_Bar profit=1220
company-name=Whatever important-person=PIERRE_MENARD
1 important-person=JOHN_BROWN important-person=JOHN_SMITH company-name=Axaxaxaxas_Mlo profit=12031
2 company-name=Foo_Bar profit=1220
3 company-name=Whatever important-person=PIERRE_MENARD

View File

@ -0,0 +1 @@
--metric CutLabel/Smart/Fuzzy/MultiLabel-F1:ls<_(inc|ltd)\.?(\s|$)><\2>

View File

@ -0,0 +1,3 @@
company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard
1 company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
2 company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
3 company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard