Prepare helper functions for cross-tabs
This commit is contained in:
parent
06a0b1148d
commit
4ba61b6e6e
@ -49,6 +49,7 @@ library
|
|||||||
, Data.Statistics.Loess
|
, Data.Statistics.Loess
|
||||||
, Data.Statistics.Calibration
|
, Data.Statistics.Calibration
|
||||||
, Data.CartesianStrings
|
, Data.CartesianStrings
|
||||||
|
, Data.SplitIntoCrossTabs
|
||||||
, Paths_geval
|
, Paths_geval
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, cond
|
, cond
|
||||||
@ -101,6 +102,7 @@ library
|
|||||||
, filemanip
|
, filemanip
|
||||||
, temporary
|
, temporary
|
||||||
, utf8-string
|
, utf8-string
|
||||||
|
, ordered-containers
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable geval
|
executable geval
|
||||||
|
119
src/Data/SplitIntoCrossTabs.hs
Normal file
119
src/Data/SplitIntoCrossTabs.hs
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
|
module Data.SplitIntoCrossTabs
|
||||||
|
(splitIntoCrossTabs,
|
||||||
|
CrossTab(..),
|
||||||
|
TextFrag(..))
|
||||||
|
where
|
||||||
|
|
||||||
|
import qualified Data.Text as T
|
||||||
|
import Data.Text (Text)
|
||||||
|
import qualified Data.Set.Ordered as OS
|
||||||
|
import qualified Data.Map.Ordered as OM
|
||||||
|
import qualified Data.Set as S
|
||||||
|
import qualified Data.Foldable as F
|
||||||
|
import qualified Data.Map as M
|
||||||
|
|
||||||
|
import Debug.Trace
|
||||||
|
|
||||||
|
import Data.List (unfoldr, sortBy, maximumBy, minimumBy)
|
||||||
|
|
||||||
|
data CrossTab = SingleItem Text | CrossTab [TextFrag] [TextFrag]
|
||||||
|
deriving (Show, Eq)
|
||||||
|
|
||||||
|
data TextFrag = Prefix Text | Suffix Text
|
||||||
|
deriving (Show, Eq, Ord)
|
||||||
|
|
||||||
|
|
||||||
|
splitIntoCrossTabs :: [Text] -> [CrossTab]
|
||||||
|
splitIntoCrossTabs inputs =
|
||||||
|
map preferVertical
|
||||||
|
$ map snd
|
||||||
|
$ sortBy (\(r1,_) (r2, _) -> r1 `compare` r2)
|
||||||
|
$ map (getRank inputRanks)
|
||||||
|
$ unfoldr extractTheBestCrossTab inputs
|
||||||
|
where inputRanks = M.fromList $ zip inputs [1..]
|
||||||
|
|
||||||
|
preferVertical :: CrossTab -> CrossTab
|
||||||
|
preferVertical s@(SingleItem _) = s
|
||||||
|
preferVertical c@(CrossTab rowNames columnNames)
|
||||||
|
| length rowNames < length columnNames = CrossTab columnNames rowNames
|
||||||
|
| otherwise = c
|
||||||
|
|
||||||
|
getRank :: M.Map Text Int -> CrossTab -> (Int, CrossTab)
|
||||||
|
getRank ranks c = (bestRank, c)
|
||||||
|
where bestRank = minimum
|
||||||
|
$ map (ranks M.!)
|
||||||
|
$ S.toList
|
||||||
|
$ toSet c
|
||||||
|
|
||||||
|
extractTheBestCrossTab :: [Text] -> Maybe (CrossTab, [Text])
|
||||||
|
extractTheBestCrossTab [] = Nothing
|
||||||
|
extractTheBestCrossTab ts = Just (theBestCrossTab, rest)
|
||||||
|
where theBestCrossTab = findTheBestCrossTab ts
|
||||||
|
rest = filter (`S.notMember` (toSet theBestCrossTab)) ts
|
||||||
|
|
||||||
|
findTheBestCrossTab :: [Text] -> CrossTab
|
||||||
|
findTheBestCrossTab ts = case orderedEntries of
|
||||||
|
[] -> SingleItem defaultSingleton
|
||||||
|
_ -> maximumBy (\t1 t2 -> crossTabSize t1 `compare` crossTabSize t2)
|
||||||
|
$ map (findTheBestCrossTabForTextPart (SingleItem defaultSingleton) orderedEntries)
|
||||||
|
$ map snd orderedEntries
|
||||||
|
where mapping = gatherTextParts ts
|
||||||
|
orderedEntries = sortBy entryComparator
|
||||||
|
$ filter (\(_, (_, tset)) -> OS.size tset >= 2)
|
||||||
|
$ zip [1..] (OM.assocs mapping)
|
||||||
|
(defaultSingleton:_) = ts
|
||||||
|
|
||||||
|
thenCmp :: Ordering -> Ordering -> Ordering
|
||||||
|
thenCmp EQ o2 = o2
|
||||||
|
thenCmp o1 _ = o1
|
||||||
|
|
||||||
|
entryComparator (r1, (_, s1)) (r2, (_, s2)) = (OS.size s2 `compare` OS.size s1)
|
||||||
|
`thenCmp`
|
||||||
|
(r1 `compare` r2)
|
||||||
|
|
||||||
|
findTheBestCrossTabForTextPart :: CrossTab -> [(Int, (TextFrag, OS.OSet TextFrag))] -> (TextFrag, OS.OSet TextFrag) -> CrossTab
|
||||||
|
findTheBestCrossTabForTextPart defaultCrossTab entries chosenEntry@(t, tset) = if crossTabSize bestCrossTabFound > 1
|
||||||
|
then bestCrossTabFound
|
||||||
|
else defaultCrossTab
|
||||||
|
where bestCrossTabFound = foldr step (CrossTab [] (F.toList tset)) entriesOrderedByIntersection
|
||||||
|
entriesOrderedByIntersection =
|
||||||
|
sortBy entryComparator
|
||||||
|
$ filter (\(_, (_, tset')) -> OS.size tset' >= 2)
|
||||||
|
$ map (\(r, (t', tset')) -> (r, (t', tset' OS.|/\ tset))) entries
|
||||||
|
step (_, (t', tset')) currentTab@(CrossTab frags common) = selectedTab
|
||||||
|
where newTab = CrossTab newT (F.toList newCommon)
|
||||||
|
newT = t':frags
|
||||||
|
newCommon = (OS.fromList common) OS.|/\ tset'
|
||||||
|
selectedTab = if crossTabSize newTab >= crossTabSize currentTab
|
||||||
|
then newTab
|
||||||
|
else currentTab
|
||||||
|
|
||||||
|
crossTabSize :: CrossTab -> Int
|
||||||
|
crossTabSize (SingleItem _) = 1
|
||||||
|
crossTabSize (CrossTab [] _) = 0
|
||||||
|
crossTabSize (CrossTab _ []) = 0
|
||||||
|
-- tables really start from 2x2
|
||||||
|
crossTabSize (CrossTab [_] _) = 0
|
||||||
|
crossTabSize (CrossTab _ [_]) = 0
|
||||||
|
crossTabSize (CrossTab rows columns) = length rows * length columns
|
||||||
|
|
||||||
|
toSet :: CrossTab -> S.Set Text
|
||||||
|
toSet (SingleItem t) = S.singleton t
|
||||||
|
toSet (CrossTab rowNames columnNames) = S.fromList [rName `combineFrags` cName | rName <- rowNames, cName <- columnNames]
|
||||||
|
|
||||||
|
combineFrags :: TextFrag -> TextFrag -> Text
|
||||||
|
combineFrags (Prefix prefix) (Suffix suffix) = prefix <> suffix
|
||||||
|
combineFrags (Suffix suffix) (Prefix prefix) = prefix <> suffix
|
||||||
|
combineFrags _ _ = error $ "incompatible text fragments"
|
||||||
|
|
||||||
|
getTextParts :: Text -> [(TextFrag, TextFrag)]
|
||||||
|
getTextParts t = [(Prefix (T.take ix t), Suffix (T.drop ix t)) | ix <- [1..(T.length t)-1]]
|
||||||
|
|
||||||
|
gatherTextParts :: [Text] -> OM.OMap TextFrag (OS.OSet TextFrag)
|
||||||
|
gatherTextParts = (gather OS.singleton (OS.|<>)) . concat . (map getTextParts)
|
||||||
|
|
||||||
|
gather :: Ord a => (b -> c) -> (c -> c -> c) -> [(a, b)] -> OM.OMap a c
|
||||||
|
gather createEntry combine = foldr extend OM.empty
|
||||||
|
where extend (k, v) m = OM.unionWithL (\_ v1 v2 -> combine v1 v2) (OM.singleton (k, (createEntry v))) m
|
@ -1,5 +1,5 @@
|
|||||||
flags: {}
|
flags: {}
|
||||||
packages:
|
packages:
|
||||||
- '.'
|
- '.'
|
||||||
extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1]
|
extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1,'ordered-containers-0.2.2@sha256:ebf2be3f592d9cf148ea6b8375f8af97148d44f82d8d04476899285e965afdbf,810']
|
||||||
resolver: lts-12.26
|
resolver: lts-12.26
|
||||||
|
21
test/Spec.hs
21
test/Spec.hs
@ -65,6 +65,7 @@ import qualified Statistics.Matrix.Types as SMT
|
|||||||
import Data.Statistics.Loess (loess)
|
import Data.Statistics.Loess (loess)
|
||||||
import Data.Statistics.Calibration (calibration)
|
import Data.Statistics.Calibration (calibration)
|
||||||
import Data.CartesianStrings (parseCartesianString)
|
import Data.CartesianStrings (parseCartesianString)
|
||||||
|
import Data.SplitIntoCrossTabs (splitIntoCrossTabs, CrossTab(..), TextFrag(..))
|
||||||
|
|
||||||
informationRetrievalBookExample :: [(String, Int)]
|
informationRetrievalBookExample :: [(String, Int)]
|
||||||
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
|
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
|
||||||
@ -687,6 +688,26 @@ main = hspec $ do
|
|||||||
"bar-c-0x", "bar-c-1x", "bar-c-2x",
|
"bar-c-0x", "bar-c-1x", "bar-c-2x",
|
||||||
"ba-b-0x", "ba-b-1x", "ba-b-2x",
|
"ba-b-0x", "ba-b-1x", "ba-b-2x",
|
||||||
"ba-c-0x", "ba-c-1x", "ba-c-2x" ]
|
"ba-c-0x", "ba-c-1x", "ba-c-2x" ]
|
||||||
|
describe "cross-tabs" $ do
|
||||||
|
it "singleton" $ do
|
||||||
|
splitIntoCrossTabs ["abababab"] `shouldBe` [SingleItem "abababab"]
|
||||||
|
it "too small" $ do
|
||||||
|
splitIntoCrossTabs ["aabb", "aacc"] `shouldBe` [SingleItem "aabb", SingleItem "aacc"]
|
||||||
|
it "two tables" $ do
|
||||||
|
splitIntoCrossTabs ["yABC", "xx00", "yABD", "ZC", "xx11", "yy00", "yy11", "ZD"] `shouldBe` [
|
||||||
|
CrossTab [Prefix "yAB", Prefix "Z"] [Suffix "C", Suffix "D"],
|
||||||
|
CrossTab [Prefix "xx", Prefix "yy"] [Suffix "00", Suffix "11"]]
|
||||||
|
it "simple" $ do
|
||||||
|
splitIntoCrossTabs ["aabsolutely",
|
||||||
|
"aaafoo",
|
||||||
|
"other",
|
||||||
|
"aaabaz",
|
||||||
|
"aaabaq",
|
||||||
|
"bbbfoo",
|
||||||
|
"bbbbaz",
|
||||||
|
"bbbbaq"] `shouldBe` [SingleItem "aabsolutely",
|
||||||
|
CrossTab [Suffix "foo", Suffix "baz", Suffix "baq"] [Prefix "aaa", Prefix "bbb"],
|
||||||
|
SingleItem "other"]
|
||||||
|
|
||||||
checkConduitPure conduit inList expList = do
|
checkConduitPure conduit inList expList = do
|
||||||
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
||||||
|
Loading…
Reference in New Issue
Block a user