From 8388ab4d27013fa174947e85fd8b60be0b233282 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Sat, 11 Aug 2018 22:59:10 +0200
Subject: [PATCH] towards tokenization

---
 geval.cabal           |  2 ++
 src/Text/Tokenizer.hs | 32 ++++++++++++++++++++++++++++++++
 test/Spec.hs          |  7 ++++++-
 3 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 src/Text/Tokenizer.hs

diff --git a/geval.cabal b/geval.cabal
index 9a1eb97..27e3f37 100644
--- a/geval.cabal
+++ b/geval.cabal
@@ -33,6 +33,7 @@ library
                      , Data.Conduit.SmartSource
                      , Data.Conduit.Rank
                      , GEval.FeatureExtractor
+                     , Text.Tokenizer
                      , Paths_geval
   build-depends:       base >= 4.7 && < 5
                      , cond
@@ -68,6 +69,7 @@ library
                      , naturalcomp
                      , containers
                      , statistics
+                     , pcre-heavy
   default-language:    Haskell2010
 
 executable geval
diff --git a/src/Text/Tokenizer.hs b/src/Text/Tokenizer.hs
new file mode 100644
index 0000000..64d609c
--- /dev/null
+++ b/src/Text/Tokenizer.hs
@@ -0,0 +1,32 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+
+module Text.Tokenizer
+       where
+
+import qualified Data.Text as T
+import Data.Monoid ((<>))
+
+import Text.Regex.PCRE.Heavy
+
+data Tokenizer = V13a
+
+tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
+tokenize Nothing t = T.words t
+tokenize (Just V13a) t = T.words tWithSpaces
+  where tWithSpaces = T.strip tTokenized
+        tTokenized =
+          gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p)
+          $ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> c <> space <> p)
+          $ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p)
+          $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] (space <>) tPadded
+        tPadded = " " <> tReplaced <> " "
+        tReplaced =
+          T.replace "&gt;" ">"
+          $ T.replace "&lt;" "<"
+          $ T.replace "&amp;" "&"
+          $ T.replace "&quot;" "\""
+          $ T.replace "\n" " "
+          $ T.replace "-\n" ""
+          $ T.replace "<skipped>" "" t
+        space = " " :: T.Text
diff --git a/test/Spec.hs b/test/Spec.hs
index aacb2dd..1270087 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -12,6 +12,7 @@ import GEval.ClusteringMetrics
 import GEval.BIO
 import GEval.LineByLine
 import GEval.ParseParams
+import Text.Tokenizer
 import Data.Attoparsec.Text
 import Options.Applicative
 import Data.Text
@@ -389,7 +390,11 @@ main = hspec $ do
                                       (1.5, 3.0),
                                       (3.0, 2.0),
                                       (4.0, 1.0)]
-
+  describe "tokenizer" $ do
+    it "simple utterance with '13a' tokenizer" $ do
+      tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
+        ["To", "be", "or", "not", "to", "be",
+         ",", "that's", "the", "question", "."]
 
 checkConduitPure conduit inList expList = do
   let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
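
Note (not part of the patch): a minimal sketch of how the new Text.Tokenizer module can be exercised once this commit is applied, mirroring the Spec.hs test above. It assumes the patched tree builds with pcre-heavy on the path; the `main` driver below is illustrative only and does not appear in the commit.

{-# LANGUAGE OverloadedStrings #-}

-- Illustrative driver (assumed, not from the commit): tokenizes the same
-- utterance as the Spec.hs test with the V13a tokenizer and prints the tokens.
import qualified Data.Text.IO as TIO
import Text.Tokenizer (Tokenizer (V13a), tokenize)

main :: IO ()
main = do
  -- Expected, per the test:
  -- ["To","be","or","not","to","be",",","that's","the","question","."]
  mapM_ TIO.putStrLn $
    tokenize (Just V13a) "To be or not to be, that's the question."
  -- With Nothing, tokenize falls back to plain whitespace splitting (T.words).
  print $ tokenize Nothing "already whitespace-tokenized input"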