From 4ba61b6e6ea82b053a788ca8c68be46900890c54 Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Mon, 16 Dec 2019 13:01:41 +0100 Subject: [PATCH] Prepare helper functions for cross-tabs --- geval.cabal | 2 + src/Data/SplitIntoCrossTabs.hs | 119 +++++++++++++++++++++++++++++++++ stack.yaml | 2 +- test/Spec.hs | 21 ++++++ 4 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 src/Data/SplitIntoCrossTabs.hs diff --git a/geval.cabal b/geval.cabal index 170d37e..28fd65a 100644 --- a/geval.cabal +++ b/geval.cabal @@ -49,6 +49,7 @@ library , Data.Statistics.Loess , Data.Statistics.Calibration , Data.CartesianStrings + , Data.SplitIntoCrossTabs , Paths_geval build-depends: base >= 4.7 && < 5 , cond @@ -101,6 +102,7 @@ library , filemanip , temporary , utf8-string + , ordered-containers default-language: Haskell2010 executable geval diff --git a/src/Data/SplitIntoCrossTabs.hs b/src/Data/SplitIntoCrossTabs.hs new file mode 100644 index 0000000..1088d5f --- /dev/null +++ b/src/Data/SplitIntoCrossTabs.hs @@ -0,0 +1,119 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Data.SplitIntoCrossTabs + (splitIntoCrossTabs, + CrossTab(..), + TextFrag(..)) + where + +import qualified Data.Text as T +import Data.Text (Text) +import qualified Data.Set.Ordered as OS +import qualified Data.Map.Ordered as OM +import qualified Data.Set as S +import qualified Data.Foldable as F +import qualified Data.Map as M + +import Debug.Trace + +import Data.List (unfoldr, sortBy, maximumBy, minimumBy) + +data CrossTab = SingleItem Text | CrossTab [TextFrag] [TextFrag] + deriving (Show, Eq) + +data TextFrag = Prefix Text | Suffix Text + deriving (Show, Eq, Ord) + + +splitIntoCrossTabs :: [Text] -> [CrossTab] +splitIntoCrossTabs inputs = + map preferVertical + $ map snd + $ sortBy (\(r1,_) (r2, _) -> r1 `compare` r2) + $ map (getRank inputRanks) + $ unfoldr extractTheBestCrossTab inputs + where inputRanks = M.fromList $ zip inputs [1..] + +preferVertical :: CrossTab -> CrossTab +preferVertical s@(SingleItem _) = s +preferVertical c@(CrossTab rowNames columnNames) + | length rowNames < length columnNames = CrossTab columnNames rowNames + | otherwise = c + +getRank :: M.Map Text Int -> CrossTab -> (Int, CrossTab) +getRank ranks c = (bestRank, c) + where bestRank = minimum + $ map (ranks M.!) + $ S.toList + $ toSet c + +extractTheBestCrossTab :: [Text] -> Maybe (CrossTab, [Text]) +extractTheBestCrossTab [] = Nothing +extractTheBestCrossTab ts = Just (theBestCrossTab, rest) + where theBestCrossTab = findTheBestCrossTab ts + rest = filter (`S.notMember` (toSet theBestCrossTab)) ts + +findTheBestCrossTab :: [Text] -> CrossTab +findTheBestCrossTab ts = case orderedEntries of + [] -> SingleItem defaultSingleton + _ -> maximumBy (\t1 t2 -> crossTabSize t1 `compare` crossTabSize t2) + $ map (findTheBestCrossTabForTextPart (SingleItem defaultSingleton) orderedEntries) + $ map snd orderedEntries + where mapping = gatherTextParts ts + orderedEntries = sortBy entryComparator + $ filter (\(_, (_, tset)) -> OS.size tset >= 2) + $ zip [1..] (OM.assocs mapping) + (defaultSingleton:_) = ts + +thenCmp :: Ordering -> Ordering -> Ordering +thenCmp EQ o2 = o2 +thenCmp o1 _ = o1 + +entryComparator (r1, (_, s1)) (r2, (_, s2)) = (OS.size s2 `compare` OS.size s1) + `thenCmp` + (r1 `compare` r2) + +findTheBestCrossTabForTextPart :: CrossTab -> [(Int, (TextFrag, OS.OSet TextFrag))] -> (TextFrag, OS.OSet TextFrag) -> CrossTab +findTheBestCrossTabForTextPart defaultCrossTab entries chosenEntry@(t, tset) = if crossTabSize bestCrossTabFound > 1 + then bestCrossTabFound + else defaultCrossTab + where bestCrossTabFound = foldr step (CrossTab [] (F.toList tset)) entriesOrderedByIntersection + entriesOrderedByIntersection = + sortBy entryComparator + $ filter (\(_, (_, tset')) -> OS.size tset' >= 2) + $ map (\(r, (t', tset')) -> (r, (t', tset' OS.|/\ tset))) entries + step (_, (t', tset')) currentTab@(CrossTab frags common) = selectedTab + where newTab = CrossTab newT (F.toList newCommon) + newT = t':frags + newCommon = (OS.fromList common) OS.|/\ tset' + selectedTab = if crossTabSize newTab >= crossTabSize currentTab + then newTab + else currentTab + +crossTabSize :: CrossTab -> Int +crossTabSize (SingleItem _) = 1 +crossTabSize (CrossTab [] _) = 0 +crossTabSize (CrossTab _ []) = 0 +-- tables really start from 2x2 +crossTabSize (CrossTab [_] _) = 0 +crossTabSize (CrossTab _ [_]) = 0 +crossTabSize (CrossTab rows columns) = length rows * length columns + +toSet :: CrossTab -> S.Set Text +toSet (SingleItem t) = S.singleton t +toSet (CrossTab rowNames columnNames) = S.fromList [rName `combineFrags` cName | rName <- rowNames, cName <- columnNames] + +combineFrags :: TextFrag -> TextFrag -> Text +combineFrags (Prefix prefix) (Suffix suffix) = prefix <> suffix +combineFrags (Suffix suffix) (Prefix prefix) = prefix <> suffix +combineFrags _ _ = error $ "incompatible text fragments" + +getTextParts :: Text -> [(TextFrag, TextFrag)] +getTextParts t = [(Prefix (T.take ix t), Suffix (T.drop ix t)) | ix <- [1..(T.length t)-1]] + +gatherTextParts :: [Text] -> OM.OMap TextFrag (OS.OSet TextFrag) +gatherTextParts = (gather OS.singleton (OS.|<>)) . concat . (map getTextParts) + +gather :: Ord a => (b -> c) -> (c -> c -> c) -> [(a, b)] -> OM.OMap a c +gather createEntry combine = foldr extend OM.empty + where extend (k, v) m = OM.unionWithL (\_ v1 v2 -> combine v1 v2) (OM.singleton (k, (createEntry v))) m diff --git a/stack.yaml b/stack.yaml index b6e0361..3805ab6 100644 --- a/stack.yaml +++ b/stack.yaml @@ -1,5 +1,5 @@ flags: {} packages: - '.' -extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1] +extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1,'ordered-containers-0.2.2@sha256:ebf2be3f592d9cf148ea6b8375f8af97148d44f82d8d04476899285e965afdbf,810'] resolver: lts-12.26 diff --git a/test/Spec.hs b/test/Spec.hs index e10ebb4..b4d3511 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -65,6 +65,7 @@ import qualified Statistics.Matrix.Types as SMT import Data.Statistics.Loess (loess) import Data.Statistics.Calibration (calibration) import Data.CartesianStrings (parseCartesianString) +import Data.SplitIntoCrossTabs (splitIntoCrossTabs, CrossTab(..), TextFrag(..)) informationRetrievalBookExample :: [(String, Int)] informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3), @@ -687,6 +688,26 @@ main = hspec $ do "bar-c-0x", "bar-c-1x", "bar-c-2x", "ba-b-0x", "ba-b-1x", "ba-b-2x", "ba-c-0x", "ba-c-1x", "ba-c-2x" ] + describe "cross-tabs" $ do + it "singleton" $ do + splitIntoCrossTabs ["abababab"] `shouldBe` [SingleItem "abababab"] + it "too small" $ do + splitIntoCrossTabs ["aabb", "aacc"] `shouldBe` [SingleItem "aabb", SingleItem "aacc"] + it "two tables" $ do + splitIntoCrossTabs ["yABC", "xx00", "yABD", "ZC", "xx11", "yy00", "yy11", "ZD"] `shouldBe` [ + CrossTab [Prefix "yAB", Prefix "Z"] [Suffix "C", Suffix "D"], + CrossTab [Prefix "xx", Prefix "yy"] [Suffix "00", Suffix "11"]] + it "simple" $ do + splitIntoCrossTabs ["aabsolutely", + "aaafoo", + "other", + "aaabaz", + "aaabaq", + "bbbfoo", + "bbbbaz", + "bbbbaq"] `shouldBe` [SingleItem "aabsolutely", + CrossTab [Suffix "foo", Suffix "baz", Suffix "baq"] [Prefix "aaa", Prefix "bbb"], + SingleItem "other"] checkConduitPure conduit inList expList = do let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList