diff --git a/geval.cabal b/geval.cabal index 1ec4686..1f72508 100644 --- a/geval.cabal +++ b/geval.cabal @@ -38,6 +38,7 @@ library , Text.Tokenizer , GEval.Annotation , GEval.BlackBoxDebugging + , Text.WordShape , Paths_geval build-depends: base >= 4.7 && < 5 , cond diff --git a/src/GEval/BlackBoxDebugging.hs b/src/GEval/BlackBoxDebugging.hs index 97af1c6..7102ca8 100644 --- a/src/GEval/BlackBoxDebugging.hs +++ b/src/GEval/BlackBoxDebugging.hs @@ -3,5 +3,6 @@ module GEval.BlackBoxDebugging where data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions { - bbdoMinFrequency :: Integer + bbdoMinFrequency :: Integer, + bbdoWordShapes :: Bool } diff --git a/src/GEval/FeatureExtractor.hs b/src/GEval/FeatureExtractor.hs index d7a6299..6e85476 100644 --- a/src/GEval/FeatureExtractor.hs +++ b/src/GEval/FeatureExtractor.hs @@ -11,6 +11,8 @@ import Data.Text import Data.List import Data.Monoid ((<>)) import Text.Tokenizer +import Text.WordShape +import GEval.BlackBoxDebugging data Feature = SimpleFeature FeatureNamespace AtomicFeature deriving (Eq, Ord) @@ -18,11 +20,12 @@ data Feature = SimpleFeature FeatureNamespace AtomicFeature instance Show Feature where show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature) -data AtomicFeature = TextFeature Text +data AtomicFeature = TextFeature Text | ShapeFeature WordShape deriving (Eq, Ord) instance Show AtomicFeature where show (TextFeature t) = unpack t + show (ShapeFeature (WordShape t)) = 'S':'H':'A':'P':'E':':':(unpack t) data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int deriving (Eq, Ord) @@ -36,16 +39,22 @@ tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split where splitPred c = c == ' ' || c == '\t' || c == ':' tokenizeForFeatures mTokenizer t = tokenize mTokenizer t -extractAtomicFeatures :: (Maybe Tokenizer) -> Text -> [AtomicFeature] -extractAtomicFeatures mTokenizer = nub . (Data.List.map TextFeature) . (tokenizeForFeatures mTokenizer) +extractAtomicFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [[AtomicFeature]] +extractAtomicFeatures mTokenizer bbdo t = [Data.List.map TextFeature tokens] ++ + (if bbdoWordShapes bbdo + then [nub $ Data.List.map (ShapeFeature . shapify) tokens] + else []) + where tokens = nub $ (tokenizeForFeatures mTokenizer) t -extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Feature] -extractUnigramFeatures mTokenizer namespace record = + +extractUnigramFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] +extractUnigramFeatures mTokenizer bbdo namespace record = Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af) - $ extractAtomicFeatures mTokenizer record + $ Data.List.concat + $ extractAtomicFeatures mTokenizer bbdo record -extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Feature] -extractUnigramFeaturesFromTabbed mTokenizer namespace record = +extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] +extractUnigramFeaturesFromTabbed mTokenizer bbdo namespace record = Data.List.concat - $ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractAtomicFeatures mTokenizer t) + $ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ Data.List.concat $ extractAtomicFeatures mTokenizer bbdo t) $ Prelude.zip [1..] (splitOn "\t" record) diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs index da7bc1e..00c2cba 100644 --- a/src/GEval/LineByLine.hs +++ b/src/GEval/LineByLine.hs @@ -104,7 +104,7 @@ forceSomeOrdering KeepTheOriginalOrder = FirstTheWorst extractFeaturesAndPValues :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) () extractFeaturesAndPValues spec bbdo = totalCounter - .| featureExtractor spec + .| featureExtractor spec bbdo .| uScoresCounter (bbdoMinFrequency bbdo) @@ -124,14 +124,14 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) = (pack $ printf "%0.8f" avg), (pack $ printf "%0.20f" p)] -featureExtractor :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) RankedFeature m () -featureExtractor spec = CC.map extract .| CC.concat +featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m () +featureExtractor spec bbdo = CC.map extract .| CC.concat where extract (rank, LineRecord inLine expLine outLine _ score) = Prelude.map (\f -> RankedFeature f rank score) $ Data.List.concat [ - extractUnigramFeatures mTokenizer "exp" expLine, - extractUnigramFeaturesFromTabbed mTokenizer "in" inLine, - extractUnigramFeatures mTokenizer "out" outLine] + extractUnigramFeatures mTokenizer bbdo "exp" expLine, + extractUnigramFeaturesFromTabbed mTokenizer bbdo "in" inLine, + extractUnigramFeatures mTokenizer bbdo "out" outLine] mTokenizer = gesTokenizer spec uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) () diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs index 626cb2f..cc2380d 100644 --- a/src/GEval/OptionsParser.hs +++ b/src/GEval/OptionsParser.hs @@ -172,6 +172,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions <> help "Minimum frequency for the worst features" <> value 1 <> showDefault) + <*> switch + ( long "word-shapes" + <> help "Consider word shapes") singletonMaybe :: Maybe a -> Maybe [a] singletonMaybe (Just x) = Just [x] diff --git a/src/Text/WordShape.hs b/src/Text/WordShape.hs new file mode 100644 index 0000000..446bc62 --- /dev/null +++ b/src/Text/WordShape.hs @@ -0,0 +1,40 @@ +module Text.WordShape + (WordShape(..), shapify) + where + +import Data.Text as T +import Data.Char + +newtype WordShape = WordShape Text + deriving (Eq, Ord) + +instance Show WordShape where + show (WordShape t) = unpack t + +-- The idea taken from https://github.com/aleju/ner-crf/blob/master/model/features.py#L377 + +isBracket :: Char -> Bool +isBracket c = cat == OpenPunctuation || cat == ClosePunctuation + where cat = generalCategory c + +normalizeChar :: Char -> Char +normalizeChar c + | isAlpha c && isUpper c = 'A' + | isAlpha c && isLower c = 'a' + | isDigit c = '9' + | isSpace c = ' ' + | isBracket c = '(' + | isPunctuation c = '.' + | otherwise = '#' + +shapify :: Text -> WordShape +shapify t = WordShape $ normalize $ T.map normalizeChar t + where normalize t = T.reverse $ pack $ T.foldl step "" t + step [] c = [c] + step p '9' = '9':p + step p@('+':h:t) c + | h == c = p + | otherwise = c:p + step p@(h:t) c + | h == c = ('+':p) + | otherwise = c:p diff --git a/test/Spec.hs b/test/Spec.hs index e65fb7a..5829f73 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -14,6 +14,7 @@ import GEval.LineByLine import GEval.ParseParams import GEval.Submit import Text.Tokenizer +import Text.WordShape import Data.Attoparsec.Text import Options.Applicative import Data.Text @@ -458,6 +459,15 @@ main = hspec $ do it "simple utterance with 'character-by-character' tokenizer" $ do tokenize (Just CharacterByCharacter) "To be or not to be." `shouldBe` ["T", "o", "_", "b", "e", "_", "o", "r", "_", "n", "o", "t", "_", "t", "o", "_", "b", "e", "."] + describe "shapify" $ do + it "simple tests" $ do + shapify "PoznaƄ" `shouldBe` (WordShape "Aa+") + shapify "2019" `shouldBe` (WordShape "9999") + shapify "Ala ma (czarnego) kota?" `shouldBe` (WordShape "Aa+ a+ (a+( a+.") + shapify "" `shouldBe` (WordShape "") + shapify "PCMCIA" `shouldBe` (WordShape "A+") + shapify "a" `shouldBe` (WordShape "a") + shapify "B5" `shouldBe` (WordShape "A9") describe "submit" $ do it "current branch" $ do runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"