towards tokenization

2018-08-11 22:59:10 +02:00 · 2018-08-11 22:59:10 +02:00 · 8388ab4d27
commit 8388ab4d27
parent de52a12b03
3 changed files with 40 additions and 1 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -33,6 +33,7 @@ library
                     , Data.Conduit.SmartSource
                     , Data.Conduit.Rank
                     , GEval.FeatureExtractor
+                     , Text.Tokenizer
                     , Paths_geval
  build-depends:       base >= 4.7 && < 5
                     , cond
@ -68,6 +69,7 @@ library
                     , naturalcomp
                     , containers
                     , statistics
+                     , pcre-heavy
  default-language:    Haskell2010

 executable geval
--- a/src/Text/Tokenizer.hs
+++ b/src/Text/Tokenizer.hs
@ -0,0 +1,32 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+
+module Text.Tokenizer
+  where
+
+import qualified Data.Text as T
+import Data.Monoid ((<>))
+
+import Text.Regex.PCRE.Heavy
+
+-- | Tokenization schemes that can be requested from 'tokenize'.
+-- @V13a@ selects the regex-based punctuation splitting implemented there.
+data Tokenizer = V13a
+
+-- | Split a text into tokens.
+--
+-- With 'Nothing' the text is simply split on whitespace.  With
+-- @Just V13a@ the text is normalised first (the @<skipped>@ marker and
+-- hyphenated line breaks are removed, newlines become spaces, four XML
+-- entities are unescaped) and then punctuation is separated from the
+-- neighbouring characters by spaces before splitting on whitespace.
+--
+-- NOTE(review): the rules appear to be modelled on the mteval-v13a
+-- tokenizer; in that scheme every separated punctuation mark gets a space
+-- on both sides, which is what the replacements below produce.
+tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
+tokenize Nothing t = T.words t
+tokenize (Just V13a) t = T.words tWithSpaces
+  where tWithSpaces = T.strip tTokenized
+        -- The substitutions are applied bottom-up (innermost first).
+        tTokenized =
+          -- a dash preceded by a digit becomes a token of its own
+          gsub [re|([0-9])(-)|] isolateSecond
+          -- period/comma followed by a non-digit is split off
+          $ gsub [re|([\.,])([^0-9])|] isolateFirst
+          -- period/comma preceded by a non-digit is split off
+          $ gsub [re|([^0-9])([\.,])|] isolateSecond
+          -- remaining punctuation and symbols are padded on both sides
+          $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] padBoth tPadded
+        -- Surround the whole match with spaces.
+        padBoth :: T.Text -> T.Text
+        padBoth m = space <> m <> space
+        -- Given the groups (context, mark), emit "context mark ".
+        -- Both groups are mandatory in the patterns above, so the fallback
+        -- is not expected to be reached; it keeps the captured text as is
+        -- instead of failing on an incomplete pattern.
+        isolateSecond :: [T.Text] -> T.Text
+        isolateSecond (c:p:_) = c <> space <> p <> space
+        isolateSecond groups = T.concat groups
+        -- Given the groups (mark, context), emit " mark context".
+        isolateFirst :: [T.Text] -> T.Text
+        isolateFirst (p:c:_) = space <> p <> space <> c
+        isolateFirst groups = T.concat groups
+        -- Padding guarantees that leading/trailing punctuation has a
+        -- neighbouring character for the two-group patterns to match.
+        tPadded = " " <> tReplaced <> " "
+        -- Applied bottom-up: "-\n" must be handled before "\n".
+        tReplaced =
+          T.replace "&gt;" ">"
+          $ T.replace "&lt;" "<"
+          $ T.replace "&amp;" "&"
+          $ T.replace "&quot;" "\""
+          $ T.replace "\n" " "
+          $ T.replace "-\n" ""
+          $ T.replace "<skipped>" "" t
+        space = " " :: T.Text
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -12,6 +12,7 @@ import GEval.ClusteringMetrics
 import GEval.BIO
 import GEval.LineByLine
 import GEval.ParseParams
+import Text.Tokenizer
 import Data.Attoparsec.Text
 import Options.Applicative
 import Data.Text
@ -389,7 +390,11 @@ main = hspec $ do
                                 (1.5, 3.0),
                                 (3.0, 2.0),
                                 (4.0, 1.0)]
-
+  describe "tokenizer" $ do
+    it "simple utterance with '13a' tokenizer" $ do
+      tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
+        ["To", "be", "or", "not", "to", "be",
+         ",", "that's", "the", "question", "."]

 checkConduitPure conduit inList expList = do
  let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList