towards tokenization

2018-08-11 22:59:10 +02:00 · 2018-08-11 22:59:10 +02:00 · 8388ab4d27
commit 8388ab4d27
parent de52a12b03
3 changed files with 40 additions and 1 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -33,6 +33,7 @@ library
                     , Data.Conduit.SmartSource
                     , Data.Conduit.Rank
                     , GEval.FeatureExtractor
                     , Text.Tokenizer
                     , Paths_geval
  build-depends:       base >= 4.7 && < 5
                     , cond
@ -68,6 +69,7 @@ library
                     , naturalcomp
                     , containers
                     , statistics
                     , pcre-heavy
  default-language:    Haskell2010
 executable geval
--- a/src/Text/Tokenizer.hs
+++ b/src/Text/Tokenizer.hs
@ -0,0 +1,32 @@
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE QuasiQuotes #-}
 module Text.Tokenizer
  where
 import qualified Data.Text as T
 import Data.Monoid ((<>))
 import Text.Regex.PCRE.Heavy
 -- | Tokenization schemes that can be requested from 'tokenize'.
 -- 'V13a' is the only scheme defined so far; passing 'Nothing' to
 -- 'tokenize' selects plain white-space splitting instead.
 data Tokenizer = V13a
 -- | Split a text into tokens.
 --
 -- * 'Nothing': split on white space only.
 --
 -- * @'Just' 'V13a'@: drop @\<skipped\>@ tags, join hyphenated and plain
 --   line breaks, un-escape a few SGML entities, then put spaces around
 --   punctuation before splitting on white space. The rules are meant to
 --   follow the tokenizer of NIST's @mteval-v13a.pl@ (letter case is left
 --   untouched).
 --
 -- A period or comma is kept inside a token only when it has a digit on
 -- both sides (@1,000@, @3.5@); a dash is split off when it follows a digit.
 tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
 tokenize Nothing t = T.words t
 tokenize (Just V13a) t = T.words tTokenized
   where
     -- 'T.words' ignores leading, trailing and repeated white space, so the
     -- padded text needs no explicit stripping or squeezing afterwards.
     tTokenized =
       gsub [re|([0-9])(-)|] spaceAfter
       $ gsub [re|([\.,])([^0-9])|] spaceBefore
       $ gsub [re|([^0-9])([\.,])|] spaceAfter
       $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] spaceAround tPadded
     -- The padding lets the two-character rules also fire at the very
     -- beginning and end of the input.
     tPadded = " " <> tReplaced <> " "
     -- Applied innermost first: tags, line joins, then entities.
     tReplaced =
       T.replace "&gt;" ">"
       $ T.replace "&lt;" "<"
       $ T.replace "&amp;" "&"
       $ T.replace "&quot;" "\""
       $ T.replace "\n" " "
       $ T.replace "-\n" ""
       $ T.replace "<skipped>" "" t
     -- Whole-match callback: isolate a punctuation character on both sides.
     spaceAround :: T.Text -> T.Text
     spaceAround m = space <> m <> space
     -- Group callbacks: receive the captured groups of a two-group pattern.
     -- The fallback branch is unreachable for those patterns; it only keeps
     -- the functions total by returning the captured text unchanged.
     spaceAfter :: [T.Text] -> T.Text
     spaceAfter groups = case groups of
       (c:p:_) -> c <> space <> p <> space
       _       -> T.concat groups
     spaceBefore :: [T.Text] -> T.Text
     spaceBefore groups = case groups of
       (c:p:_) -> space <> c <> space <> p
       _       -> T.concat groups
     space = " " :: T.Text
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -12,6 +12,7 @@ import GEval.ClusteringMetrics
 import GEval.BIO
 import GEval.LineByLine
 import GEval.ParseParams
 import Text.Tokenizer
 import Data.Attoparsec.Text
 import Options.Applicative
 import Data.Text
@ -389,7 +390,11 @@ main = hspec $ do
                                 (1.5, 3.0),
                                 (3.0, 2.0),
                                 (4.0, 1.0)]
-
+  describe "tokenizer" $ do
    it "simple utterance with '13a' tokenizer" $ do
      tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
        ["To", "be", "or", "not", "to", "be",
         ",", "that's", "the", "question", "."]
 checkConduitPure conduit inList expList = do
  let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList