Consider word shapes in black-box debugging
This commit is contained in:
parent
e0cfb9c4b0
commit
212457077f
@ -38,6 +38,7 @@ library
|
|||||||
, Text.Tokenizer
|
, Text.Tokenizer
|
||||||
, GEval.Annotation
|
, GEval.Annotation
|
||||||
, GEval.BlackBoxDebugging
|
, GEval.BlackBoxDebugging
|
||||||
|
, Text.WordShape
|
||||||
, Paths_geval
|
, Paths_geval
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, cond
|
, cond
|
||||||
|
@ -3,5 +3,6 @@ module GEval.BlackBoxDebugging
|
|||||||
where
|
where
|
||||||
|
|
||||||
data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
|
data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
|
||||||
bbdoMinFrequency :: Integer
|
bbdoMinFrequency :: Integer,
|
||||||
|
bbdoWordShapes :: Bool
|
||||||
}
|
}
|
||||||
|
@ -11,6 +11,8 @@ import Data.Text
|
|||||||
import Data.List
|
import Data.List
|
||||||
import Data.Monoid ((<>))
|
import Data.Monoid ((<>))
|
||||||
import Text.Tokenizer
|
import Text.Tokenizer
|
||||||
|
import Text.WordShape
|
||||||
|
import GEval.BlackBoxDebugging
|
||||||
|
|
||||||
data Feature = SimpleFeature FeatureNamespace AtomicFeature
|
data Feature = SimpleFeature FeatureNamespace AtomicFeature
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
@ -18,11 +20,12 @@ data Feature = SimpleFeature FeatureNamespace AtomicFeature
|
|||||||
instance Show Feature where
|
instance Show Feature where
|
||||||
show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
|
show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
|
||||||
|
|
||||||
data AtomicFeature = TextFeature Text
|
data AtomicFeature = TextFeature Text | ShapeFeature WordShape
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
instance Show AtomicFeature where
|
instance Show AtomicFeature where
|
||||||
show (TextFeature t) = unpack t
|
show (TextFeature t) = unpack t
|
||||||
|
show (ShapeFeature (WordShape t)) = 'S':'H':'A':'P':'E':':':(unpack t)
|
||||||
|
|
||||||
data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int
|
data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
@ -36,16 +39,22 @@ tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split
|
|||||||
where splitPred c = c == ' ' || c == '\t' || c == ':'
|
where splitPred c = c == ' ' || c == '\t' || c == ':'
|
||||||
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
|
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
|
||||||
|
|
||||||
extractAtomicFeatures :: (Maybe Tokenizer) -> Text -> [AtomicFeature]
|
extractAtomicFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [[AtomicFeature]]
|
||||||
extractAtomicFeatures mTokenizer = nub . (Data.List.map TextFeature) . (tokenizeForFeatures mTokenizer)
|
extractAtomicFeatures mTokenizer bbdo t = [Data.List.map TextFeature tokens] ++
|
||||||
|
(if bbdoWordShapes bbdo
|
||||||
|
then [nub $ Data.List.map (ShapeFeature . shapify) tokens]
|
||||||
|
else [])
|
||||||
|
where tokens = nub $ (tokenizeForFeatures mTokenizer) t
|
||||||
|
|
||||||
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Feature]
|
|
||||||
extractUnigramFeatures mTokenizer namespace record =
|
extractUnigramFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
|
||||||
|
extractUnigramFeatures mTokenizer bbdo namespace record =
|
||||||
Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af)
|
Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af)
|
||||||
$ extractAtomicFeatures mTokenizer record
|
$ Data.List.concat
|
||||||
|
$ extractAtomicFeatures mTokenizer bbdo record
|
||||||
|
|
||||||
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Feature]
|
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
|
||||||
extractUnigramFeaturesFromTabbed mTokenizer namespace record =
|
extractUnigramFeaturesFromTabbed mTokenizer bbdo namespace record =
|
||||||
Data.List.concat
|
Data.List.concat
|
||||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractAtomicFeatures mTokenizer t)
|
$ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ Data.List.concat $ extractAtomicFeatures mTokenizer bbdo t)
|
||||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||||
|
@ -104,7 +104,7 @@ forceSomeOrdering KeepTheOriginalOrder = FirstTheWorst
|
|||||||
extractFeaturesAndPValues :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
|
extractFeaturesAndPValues :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
|
||||||
extractFeaturesAndPValues spec bbdo =
|
extractFeaturesAndPValues spec bbdo =
|
||||||
totalCounter
|
totalCounter
|
||||||
.| featureExtractor spec
|
.| featureExtractor spec bbdo
|
||||||
.| uScoresCounter (bbdoMinFrequency bbdo)
|
.| uScoresCounter (bbdoMinFrequency bbdo)
|
||||||
|
|
||||||
|
|
||||||
@ -124,14 +124,14 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
|||||||
(pack $ printf "%0.8f" avg),
|
(pack $ printf "%0.8f" avg),
|
||||||
(pack $ printf "%0.20f" p)]
|
(pack $ printf "%0.20f" p)]
|
||||||
|
|
||||||
featureExtractor :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) RankedFeature m ()
|
featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m ()
|
||||||
featureExtractor spec = CC.map extract .| CC.concat
|
featureExtractor spec bbdo = CC.map extract .| CC.concat
|
||||||
where extract (rank, LineRecord inLine expLine outLine _ score) =
|
where extract (rank, LineRecord inLine expLine outLine _ score) =
|
||||||
Prelude.map (\f -> RankedFeature f rank score)
|
Prelude.map (\f -> RankedFeature f rank score)
|
||||||
$ Data.List.concat [
|
$ Data.List.concat [
|
||||||
extractUnigramFeatures mTokenizer "exp" expLine,
|
extractUnigramFeatures mTokenizer bbdo "exp" expLine,
|
||||||
extractUnigramFeaturesFromTabbed mTokenizer "in" inLine,
|
extractUnigramFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
||||||
extractUnigramFeatures mTokenizer "out" outLine]
|
extractUnigramFeatures mTokenizer bbdo "out" outLine]
|
||||||
mTokenizer = gesTokenizer spec
|
mTokenizer = gesTokenizer spec
|
||||||
|
|
||||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
||||||
|
@ -172,6 +172,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
|
|||||||
<> help "Minimum frequency for the worst features"
|
<> help "Minimum frequency for the worst features"
|
||||||
<> value 1
|
<> value 1
|
||||||
<> showDefault)
|
<> showDefault)
|
||||||
|
<*> switch
|
||||||
|
( long "word-shapes"
|
||||||
|
<> help "Consider word shapes")
|
||||||
|
|
||||||
singletonMaybe :: Maybe a -> Maybe [a]
|
singletonMaybe :: Maybe a -> Maybe [a]
|
||||||
singletonMaybe (Just x) = Just [x]
|
singletonMaybe (Just x) = Just [x]
|
||||||
|
40
src/Text/WordShape.hs
Normal file
40
src/Text/WordShape.hs
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
module Text.WordShape
|
||||||
|
(WordShape(..), shapify)
|
||||||
|
where
|
||||||
|
|
||||||
|
import Data.Text as T
|
||||||
|
import Data.Char
|
||||||
|
|
||||||
|
newtype WordShape = WordShape Text
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
instance Show WordShape where
|
||||||
|
show (WordShape t) = unpack t
|
||||||
|
|
||||||
|
-- The idea taken from https://github.com/aleju/ner-crf/blob/master/model/features.py#L377
|
||||||
|
|
||||||
|
isBracket :: Char -> Bool
|
||||||
|
isBracket c = cat == OpenPunctuation || cat == ClosePunctuation
|
||||||
|
where cat = generalCategory c
|
||||||
|
|
||||||
|
normalizeChar :: Char -> Char
|
||||||
|
normalizeChar c
|
||||||
|
| isAlpha c && isUpper c = 'A'
|
||||||
|
| isAlpha c && isLower c = 'a'
|
||||||
|
| isDigit c = '9'
|
||||||
|
| isSpace c = ' '
|
||||||
|
| isBracket c = '('
|
||||||
|
| isPunctuation c = '.'
|
||||||
|
| otherwise = '#'
|
||||||
|
|
||||||
|
shapify :: Text -> WordShape
|
||||||
|
shapify t = WordShape $ normalize $ T.map normalizeChar t
|
||||||
|
where normalize t = T.reverse $ pack $ T.foldl step "" t
|
||||||
|
step [] c = [c]
|
||||||
|
step p '9' = '9':p
|
||||||
|
step p@('+':h:t) c
|
||||||
|
| h == c = p
|
||||||
|
| otherwise = c:p
|
||||||
|
step p@(h:t) c
|
||||||
|
| h == c = ('+':p)
|
||||||
|
| otherwise = c:p
|
10
test/Spec.hs
10
test/Spec.hs
@ -14,6 +14,7 @@ import GEval.LineByLine
|
|||||||
import GEval.ParseParams
|
import GEval.ParseParams
|
||||||
import GEval.Submit
|
import GEval.Submit
|
||||||
import Text.Tokenizer
|
import Text.Tokenizer
|
||||||
|
import Text.WordShape
|
||||||
import Data.Attoparsec.Text
|
import Data.Attoparsec.Text
|
||||||
import Options.Applicative
|
import Options.Applicative
|
||||||
import Data.Text
|
import Data.Text
|
||||||
@ -458,6 +459,15 @@ main = hspec $ do
|
|||||||
it "simple utterance with 'character-by-character' tokenizer" $ do
|
it "simple utterance with 'character-by-character' tokenizer" $ do
|
||||||
tokenize (Just CharacterByCharacter) "To be or not to be." `shouldBe`
|
tokenize (Just CharacterByCharacter) "To be or not to be." `shouldBe`
|
||||||
["T", "o", "_", "b", "e", "_", "o", "r", "_", "n", "o", "t", "_", "t", "o", "_", "b", "e", "."]
|
["T", "o", "_", "b", "e", "_", "o", "r", "_", "n", "o", "t", "_", "t", "o", "_", "b", "e", "."]
|
||||||
|
describe "shapify" $ do
|
||||||
|
it "simple tests" $ do
|
||||||
|
shapify "Poznań" `shouldBe` (WordShape "Aa+")
|
||||||
|
shapify "2019" `shouldBe` (WordShape "9999")
|
||||||
|
shapify "Ala ma (czarnego) kota?" `shouldBe` (WordShape "Aa+ a+ (a+( a+.")
|
||||||
|
shapify "" `shouldBe` (WordShape "")
|
||||||
|
shapify "PCMCIA" `shouldBe` (WordShape "A+")
|
||||||
|
shapify "a" `shouldBe` (WordShape "a")
|
||||||
|
shapify "B5" `shouldBe` (WordShape "A9")
|
||||||
describe "submit" $ do
|
describe "submit" $ do
|
||||||
it "current branch" $ do
|
it "current branch" $ do
|
||||||
runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"
|
runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"
|
||||||
|
Loading…
Reference in New Issue
Block a user