From 8388ab4d27013fa174947e85fd8b60be0b233282 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Sat, 11 Aug 2018 22:59:10 +0200
Subject: [PATCH] towards tokenization

---
 geval.cabal           |  2 ++
 src/Text/Tokenizer.hs | 32 ++++++++++++++++++++++++++++++++
 test/Spec.hs          |  7 ++++++-
 3 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 src/Text/Tokenizer.hs

diff --git a/geval.cabal b/geval.cabal
index 9a1eb97..27e3f37 100644
--- a/geval.cabal
+++ b/geval.cabal
@@ -33,6 +33,7 @@ library
                      , Data.Conduit.SmartSource
                      , Data.Conduit.Rank
                      , GEval.FeatureExtractor
+                     , Text.Tokenizer
                      , Paths_geval
   build-depends:       base >= 4.7 && < 5
                      , cond
@@ -68,6 +69,7 @@ library
                      , naturalcomp
                      , containers
                      , statistics
+                     , pcre-heavy
   default-language:    Haskell2010
 
 executable geval
diff --git a/src/Text/Tokenizer.hs b/src/Text/Tokenizer.hs
new file mode 100644
index 0000000..64d609c
--- /dev/null
+++ b/src/Text/Tokenizer.hs
@@ -0,0 +1,32 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+
+module Text.Tokenizer
+       where
+
+import qualified Data.Text as T
+import Data.Monoid ((<>))
+
+import Text.Regex.PCRE.Heavy
+
+data Tokenizer = V13a
+
+tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
+tokenize Nothing t = T.words t
+tokenize (Just V13a) t = T.words tWithSpaces
+  where tWithSpaces = T.strip tTokenized
+        tTokenized =
+          gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p)
+          $ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> c <> space <> p)
+          $ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p)
+          $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] (space <>) tPadded
+        tPadded = " " <> tReplaced <> " "
+        tReplaced =
+          T.replace "&gt;" ">"
+          $ T.replace "&lt;" "<"
+          $ T.replace "&amp;" "&"
+          $ T.replace "&quot;" "\""
+          $ T.replace "\n" " "
+          $ T.replace "-\n" ""
+          $ T.replace "<skipped>" "" t
+        space = " " :: T.Text
diff --git a/test/Spec.hs b/test/Spec.hs
index aacb2dd..1270087 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -12,6 +12,7 @@ import GEval.ClusteringMetrics
 import GEval.BIO
 import GEval.LineByLine
 import GEval.ParseParams
+import Text.Tokenizer
 import Data.Attoparsec.Text
 import Options.Applicative
 import Data.Text
@@ -389,7 +390,11 @@ main = hspec $ do
                                       (1.5, 3.0),
                                       (3.0, 2.0),
                                       (4.0, 1.0)]
-
+  describe "tokenizer" $ do
+    it "simple utterance with '13a' tokenizer" $ do
+      tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
+        ["To", "be", "or", "not", "to", "be",
+         ",", "that's", "the", "question", "."]
 
 checkConduitPure conduit inList expList = do
   let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
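
Note (not part of the patch): a minimal sketch of how the new Text.Tokenizer module can be exercised once this commit is applied, mirroring the Spec.hs test above. It assumes the patched tree builds with pcre-heavy on the path; the `main` driver below is illustrative only and does not appear in the commit.

{-# LANGUAGE OverloadedStrings #-}

-- Illustrative driver (assumed, not from the commit): tokenizes the same
-- utterance as the Spec.hs test with the V13a tokenizer and prints the tokens.
import qualified Data.Text.IO as TIO
import Text.Tokenizer (Tokenizer (V13a), tokenize)

main :: IO ()
main = do
  -- Expected, per the test:
  -- ["To","be","or","not","to","be",",","that's","the","question","."]
  mapM_ TIO.putStrLn $
    tokenize (Just V13a) "To be or not to be, that's the question."
  -- With Nothing, tokenize falls back to plain whitespace splitting (T.words).
  print $ tokenize Nothing "already whitespace-tokenized input"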