Prepare helper functions for cross-tabs
This commit is contained in:
parent
06a0b1148d
commit
4ba61b6e6e
@ -49,6 +49,7 @@ library
|
||||
, Data.Statistics.Loess
|
||||
, Data.Statistics.Calibration
|
||||
, Data.CartesianStrings
|
||||
, Data.SplitIntoCrossTabs
|
||||
, Paths_geval
|
||||
build-depends: base >= 4.7 && < 5
|
||||
, cond
|
||||
@ -101,6 +102,7 @@ library
|
||||
, filemanip
|
||||
, temporary
|
||||
, utf8-string
|
||||
, ordered-containers
|
||||
default-language: Haskell2010
|
||||
|
||||
executable geval
|
||||
|
119
src/Data/SplitIntoCrossTabs.hs
Normal file
119
src/Data/SplitIntoCrossTabs.hs
Normal file
@ -0,0 +1,119 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Data.SplitIntoCrossTabs
|
||||
(splitIntoCrossTabs,
|
||||
CrossTab(..),
|
||||
TextFrag(..))
|
||||
where
|
||||
|
||||
import qualified Data.Text as T
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Set.Ordered as OS
|
||||
import qualified Data.Map.Ordered as OM
|
||||
import qualified Data.Set as S
|
||||
import qualified Data.Foldable as F
|
||||
import qualified Data.Map as M
|
||||
|
||||
import Debug.Trace
|
||||
|
||||
import Data.List (unfoldr, sortBy, maximumBy, minimumBy)
|
||||
|
||||
data CrossTab = SingleItem Text | CrossTab [TextFrag] [TextFrag]
|
||||
deriving (Show, Eq)
|
||||
|
||||
data TextFrag = Prefix Text | Suffix Text
|
||||
deriving (Show, Eq, Ord)
|
||||
|
||||
|
||||
splitIntoCrossTabs :: [Text] -> [CrossTab]
|
||||
splitIntoCrossTabs inputs =
|
||||
map preferVertical
|
||||
$ map snd
|
||||
$ sortBy (\(r1,_) (r2, _) -> r1 `compare` r2)
|
||||
$ map (getRank inputRanks)
|
||||
$ unfoldr extractTheBestCrossTab inputs
|
||||
where inputRanks = M.fromList $ zip inputs [1..]
|
||||
|
||||
preferVertical :: CrossTab -> CrossTab
|
||||
preferVertical s@(SingleItem _) = s
|
||||
preferVertical c@(CrossTab rowNames columnNames)
|
||||
| length rowNames < length columnNames = CrossTab columnNames rowNames
|
||||
| otherwise = c
|
||||
|
||||
getRank :: M.Map Text Int -> CrossTab -> (Int, CrossTab)
|
||||
getRank ranks c = (bestRank, c)
|
||||
where bestRank = minimum
|
||||
$ map (ranks M.!)
|
||||
$ S.toList
|
||||
$ toSet c
|
||||
|
||||
extractTheBestCrossTab :: [Text] -> Maybe (CrossTab, [Text])
|
||||
extractTheBestCrossTab [] = Nothing
|
||||
extractTheBestCrossTab ts = Just (theBestCrossTab, rest)
|
||||
where theBestCrossTab = findTheBestCrossTab ts
|
||||
rest = filter (`S.notMember` (toSet theBestCrossTab)) ts
|
||||
|
||||
findTheBestCrossTab :: [Text] -> CrossTab
|
||||
findTheBestCrossTab ts = case orderedEntries of
|
||||
[] -> SingleItem defaultSingleton
|
||||
_ -> maximumBy (\t1 t2 -> crossTabSize t1 `compare` crossTabSize t2)
|
||||
$ map (findTheBestCrossTabForTextPart (SingleItem defaultSingleton) orderedEntries)
|
||||
$ map snd orderedEntries
|
||||
where mapping = gatherTextParts ts
|
||||
orderedEntries = sortBy entryComparator
|
||||
$ filter (\(_, (_, tset)) -> OS.size tset >= 2)
|
||||
$ zip [1..] (OM.assocs mapping)
|
||||
(defaultSingleton:_) = ts
|
||||
|
||||
thenCmp :: Ordering -> Ordering -> Ordering
|
||||
thenCmp EQ o2 = o2
|
||||
thenCmp o1 _ = o1
|
||||
|
||||
entryComparator (r1, (_, s1)) (r2, (_, s2)) = (OS.size s2 `compare` OS.size s1)
|
||||
`thenCmp`
|
||||
(r1 `compare` r2)
|
||||
|
||||
findTheBestCrossTabForTextPart :: CrossTab -> [(Int, (TextFrag, OS.OSet TextFrag))] -> (TextFrag, OS.OSet TextFrag) -> CrossTab
|
||||
findTheBestCrossTabForTextPart defaultCrossTab entries chosenEntry@(t, tset) = if crossTabSize bestCrossTabFound > 1
|
||||
then bestCrossTabFound
|
||||
else defaultCrossTab
|
||||
where bestCrossTabFound = foldr step (CrossTab [] (F.toList tset)) entriesOrderedByIntersection
|
||||
entriesOrderedByIntersection =
|
||||
sortBy entryComparator
|
||||
$ filter (\(_, (_, tset')) -> OS.size tset' >= 2)
|
||||
$ map (\(r, (t', tset')) -> (r, (t', tset' OS.|/\ tset))) entries
|
||||
step (_, (t', tset')) currentTab@(CrossTab frags common) = selectedTab
|
||||
where newTab = CrossTab newT (F.toList newCommon)
|
||||
newT = t':frags
|
||||
newCommon = (OS.fromList common) OS.|/\ tset'
|
||||
selectedTab = if crossTabSize newTab >= crossTabSize currentTab
|
||||
then newTab
|
||||
else currentTab
|
||||
|
||||
crossTabSize :: CrossTab -> Int
|
||||
crossTabSize (SingleItem _) = 1
|
||||
crossTabSize (CrossTab [] _) = 0
|
||||
crossTabSize (CrossTab _ []) = 0
|
||||
-- tables really start from 2x2
|
||||
crossTabSize (CrossTab [_] _) = 0
|
||||
crossTabSize (CrossTab _ [_]) = 0
|
||||
crossTabSize (CrossTab rows columns) = length rows * length columns
|
||||
|
||||
toSet :: CrossTab -> S.Set Text
|
||||
toSet (SingleItem t) = S.singleton t
|
||||
toSet (CrossTab rowNames columnNames) = S.fromList [rName `combineFrags` cName | rName <- rowNames, cName <- columnNames]
|
||||
|
||||
combineFrags :: TextFrag -> TextFrag -> Text
|
||||
combineFrags (Prefix prefix) (Suffix suffix) = prefix <> suffix
|
||||
combineFrags (Suffix suffix) (Prefix prefix) = prefix <> suffix
|
||||
combineFrags _ _ = error $ "incompatible text fragments"
|
||||
|
||||
getTextParts :: Text -> [(TextFrag, TextFrag)]
|
||||
getTextParts t = [(Prefix (T.take ix t), Suffix (T.drop ix t)) | ix <- [1..(T.length t)-1]]
|
||||
|
||||
gatherTextParts :: [Text] -> OM.OMap TextFrag (OS.OSet TextFrag)
|
||||
gatherTextParts = (gather OS.singleton (OS.|<>)) . concat . (map getTextParts)
|
||||
|
||||
gather :: Ord a => (b -> c) -> (c -> c -> c) -> [(a, b)] -> OM.OMap a c
|
||||
gather createEntry combine = foldr extend OM.empty
|
||||
where extend (k, v) m = OM.unionWithL (\_ v1 v2 -> combine v1 v2) (OM.singleton (k, (createEntry v))) m
|
@ -1,5 +1,5 @@
|
||||
flags: {}
|
||||
packages:
|
||||
- '.'
|
||||
extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1]
|
||||
extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1,'ordered-containers-0.2.2@sha256:ebf2be3f592d9cf148ea6b8375f8af97148d44f82d8d04476899285e965afdbf,810']
|
||||
resolver: lts-12.26
|
||||
|
21
test/Spec.hs
21
test/Spec.hs
@ -65,6 +65,7 @@ import qualified Statistics.Matrix.Types as SMT
|
||||
import Data.Statistics.Loess (loess)
|
||||
import Data.Statistics.Calibration (calibration)
|
||||
import Data.CartesianStrings (parseCartesianString)
|
||||
import Data.SplitIntoCrossTabs (splitIntoCrossTabs, CrossTab(..), TextFrag(..))
|
||||
|
||||
informationRetrievalBookExample :: [(String, Int)]
|
||||
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
|
||||
@ -687,6 +688,26 @@ main = hspec $ do
|
||||
"bar-c-0x", "bar-c-1x", "bar-c-2x",
|
||||
"ba-b-0x", "ba-b-1x", "ba-b-2x",
|
||||
"ba-c-0x", "ba-c-1x", "ba-c-2x" ]
|
||||
describe "cross-tabs" $ do
|
||||
it "singleton" $ do
|
||||
splitIntoCrossTabs ["abababab"] `shouldBe` [SingleItem "abababab"]
|
||||
it "too small" $ do
|
||||
splitIntoCrossTabs ["aabb", "aacc"] `shouldBe` [SingleItem "aabb", SingleItem "aacc"]
|
||||
it "two tables" $ do
|
||||
splitIntoCrossTabs ["yABC", "xx00", "yABD", "ZC", "xx11", "yy00", "yy11", "ZD"] `shouldBe` [
|
||||
CrossTab [Prefix "yAB", Prefix "Z"] [Suffix "C", Suffix "D"],
|
||||
CrossTab [Prefix "xx", Prefix "yy"] [Suffix "00", Suffix "11"]]
|
||||
it "simple" $ do
|
||||
splitIntoCrossTabs ["aabsolutely",
|
||||
"aaafoo",
|
||||
"other",
|
||||
"aaabaz",
|
||||
"aaabaq",
|
||||
"bbbfoo",
|
||||
"bbbbaz",
|
||||
"bbbbaq"] `shouldBe` [SingleItem "aabsolutely",
|
||||
CrossTab [Suffix "foo", Suffix "baz", Suffix "baq"] [Prefix "aaa", Prefix "bbb"],
|
||||
SingleItem "other"]
|
||||
|
||||
checkConduitPure conduit inList expList = do
|
||||
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
||||
|
Loading…
Reference in New Issue
Block a user