Prepare helper functions for cross-tabs

This commit is contained in:
Filip Gralinski 2019-12-16 13:01:41 +01:00
parent 06a0b1148d
commit 4ba61b6e6e
4 changed files with 143 additions and 1 deletions

View File

@ -49,6 +49,7 @@ library
, Data.Statistics.Loess
, Data.Statistics.Calibration
, Data.CartesianStrings
, Data.SplitIntoCrossTabs
, Paths_geval
build-depends: base >= 4.7 && < 5
, cond
@ -101,6 +102,7 @@ library
, filemanip
, temporary
, utf8-string
, ordered-containers
default-language: Haskell2010
executable geval

View File

@ -0,0 +1,119 @@
{-# LANGUAGE OverloadedStrings #-}
module Data.SplitIntoCrossTabs
(splitIntoCrossTabs,
CrossTab(..),
TextFrag(..))
where
import qualified Data.Text as T
import Data.Text (Text)
import qualified Data.Set.Ordered as OS
import qualified Data.Map.Ordered as OM
import qualified Data.Set as S
import qualified Data.Foldable as F
import qualified Data.Map as M
import Debug.Trace
import Data.List (unfoldr, sortBy, maximumBy, minimumBy)
data CrossTab = SingleItem Text | CrossTab [TextFrag] [TextFrag]
deriving (Show, Eq)
data TextFrag = Prefix Text | Suffix Text
deriving (Show, Eq, Ord)
splitIntoCrossTabs :: [Text] -> [CrossTab]
splitIntoCrossTabs inputs =
map preferVertical
$ map snd
$ sortBy (\(r1,_) (r2, _) -> r1 `compare` r2)
$ map (getRank inputRanks)
$ unfoldr extractTheBestCrossTab inputs
where inputRanks = M.fromList $ zip inputs [1..]
preferVertical :: CrossTab -> CrossTab
preferVertical s@(SingleItem _) = s
preferVertical c@(CrossTab rowNames columnNames)
| length rowNames < length columnNames = CrossTab columnNames rowNames
| otherwise = c
getRank :: M.Map Text Int -> CrossTab -> (Int, CrossTab)
getRank ranks c = (bestRank, c)
where bestRank = minimum
$ map (ranks M.!)
$ S.toList
$ toSet c
extractTheBestCrossTab :: [Text] -> Maybe (CrossTab, [Text])
extractTheBestCrossTab [] = Nothing
extractTheBestCrossTab ts = Just (theBestCrossTab, rest)
where theBestCrossTab = findTheBestCrossTab ts
rest = filter (`S.notMember` (toSet theBestCrossTab)) ts
findTheBestCrossTab :: [Text] -> CrossTab
findTheBestCrossTab ts = case orderedEntries of
[] -> SingleItem defaultSingleton
_ -> maximumBy (\t1 t2 -> crossTabSize t1 `compare` crossTabSize t2)
$ map (findTheBestCrossTabForTextPart (SingleItem defaultSingleton) orderedEntries)
$ map snd orderedEntries
where mapping = gatherTextParts ts
orderedEntries = sortBy entryComparator
$ filter (\(_, (_, tset)) -> OS.size tset >= 2)
$ zip [1..] (OM.assocs mapping)
(defaultSingleton:_) = ts
thenCmp :: Ordering -> Ordering -> Ordering
thenCmp EQ o2 = o2
thenCmp o1 _ = o1
entryComparator (r1, (_, s1)) (r2, (_, s2)) = (OS.size s2 `compare` OS.size s1)
`thenCmp`
(r1 `compare` r2)
findTheBestCrossTabForTextPart :: CrossTab -> [(Int, (TextFrag, OS.OSet TextFrag))] -> (TextFrag, OS.OSet TextFrag) -> CrossTab
findTheBestCrossTabForTextPart defaultCrossTab entries chosenEntry@(t, tset) = if crossTabSize bestCrossTabFound > 1
then bestCrossTabFound
else defaultCrossTab
where bestCrossTabFound = foldr step (CrossTab [] (F.toList tset)) entriesOrderedByIntersection
entriesOrderedByIntersection =
sortBy entryComparator
$ filter (\(_, (_, tset')) -> OS.size tset' >= 2)
$ map (\(r, (t', tset')) -> (r, (t', tset' OS.|/\ tset))) entries
step (_, (t', tset')) currentTab@(CrossTab frags common) = selectedTab
where newTab = CrossTab newT (F.toList newCommon)
newT = t':frags
newCommon = (OS.fromList common) OS.|/\ tset'
selectedTab = if crossTabSize newTab >= crossTabSize currentTab
then newTab
else currentTab
crossTabSize :: CrossTab -> Int
crossTabSize (SingleItem _) = 1
crossTabSize (CrossTab [] _) = 0
crossTabSize (CrossTab _ []) = 0
-- tables really start from 2x2
crossTabSize (CrossTab [_] _) = 0
crossTabSize (CrossTab _ [_]) = 0
crossTabSize (CrossTab rows columns) = length rows * length columns
toSet :: CrossTab -> S.Set Text
toSet (SingleItem t) = S.singleton t
toSet (CrossTab rowNames columnNames) = S.fromList [rName `combineFrags` cName | rName <- rowNames, cName <- columnNames]
combineFrags :: TextFrag -> TextFrag -> Text
combineFrags (Prefix prefix) (Suffix suffix) = prefix <> suffix
combineFrags (Suffix suffix) (Prefix prefix) = prefix <> suffix
combineFrags _ _ = error $ "incompatible text fragments"
getTextParts :: Text -> [(TextFrag, TextFrag)]
getTextParts t = [(Prefix (T.take ix t), Suffix (T.drop ix t)) | ix <- [1..(T.length t)-1]]
gatherTextParts :: [Text] -> OM.OMap TextFrag (OS.OSet TextFrag)
gatherTextParts = (gather OS.singleton (OS.|<>)) . concat . (map getTextParts)
gather :: Ord a => (b -> c) -> (c -> c -> c) -> [(a, b)] -> OM.OMap a c
gather createEntry combine = foldr extend OM.empty
where extend (k, v) m = OM.unionWithL (\_ v1 v2 -> combine v1 v2) (OM.singleton (k, (createEntry v))) m

View File

@ -1,5 +1,5 @@
flags: {}
packages:
- '.'
extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1]
extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1,Chart-1.9.1,Chart-cairo-1.9.1,multiset-0.3.4.1,'ordered-containers-0.2.2@sha256:ebf2be3f592d9cf148ea6b8375f8af97148d44f82d8d04476899285e965afdbf,810']
resolver: lts-12.26

View File

@ -65,6 +65,7 @@ import qualified Statistics.Matrix.Types as SMT
import Data.Statistics.Loess (loess)
import Data.Statistics.Calibration (calibration)
import Data.CartesianStrings (parseCartesianString)
import Data.SplitIntoCrossTabs (splitIntoCrossTabs, CrossTab(..), TextFrag(..))
informationRetrievalBookExample :: [(String, Int)]
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
@ -687,6 +688,26 @@ main = hspec $ do
"bar-c-0x", "bar-c-1x", "bar-c-2x",
"ba-b-0x", "ba-b-1x", "ba-b-2x",
"ba-c-0x", "ba-c-1x", "ba-c-2x" ]
describe "cross-tabs" $ do
it "singleton" $ do
splitIntoCrossTabs ["abababab"] `shouldBe` [SingleItem "abababab"]
it "too small" $ do
splitIntoCrossTabs ["aabb", "aacc"] `shouldBe` [SingleItem "aabb", SingleItem "aacc"]
it "two tables" $ do
splitIntoCrossTabs ["yABC", "xx00", "yABD", "ZC", "xx11", "yy00", "yy11", "ZD"] `shouldBe` [
CrossTab [Prefix "yAB", Prefix "Z"] [Suffix "C", Suffix "D"],
CrossTab [Prefix "xx", Prefix "yy"] [Suffix "00", Suffix "11"]]
it "simple" $ do
splitIntoCrossTabs ["aabsolutely",
"aaafoo",
"other",
"aaabaz",
"aaabaq",
"bbbfoo",
"bbbbaz",
"bbbbaq"] `shouldBe` [SingleItem "aabsolutely",
CrossTab [Suffix "foo", Suffix "baz", Suffix "baq"] [Prefix "aaa", Prefix "bbb"],
SingleItem "other"]
checkConduitPure conduit inList expList = do
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList