towards tokenization

This commit is contained in:
Filip Gralinski 2018-08-11 22:59:10 +02:00
parent de52a12b03
commit 8388ab4d27
3 changed files with 40 additions and 1 deletions

View File

@ -33,6 +33,7 @@ library
, Data.Conduit.SmartSource
, Data.Conduit.Rank
, GEval.FeatureExtractor
, Text.Tokenizer
, Paths_geval
build-depends: base >= 4.7 && < 5
, cond
@ -68,6 +69,7 @@ library
, naturalcomp
, containers
, statistics
, pcre-heavy
default-language: Haskell2010
executable geval

32
src/Text/Tokenizer.hs Normal file
View File

@ -0,0 +1,32 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Text.Tokenizer
where
import qualified Data.Text as T
import Data.Monoid ((<>))
import Text.Regex.PCRE.Heavy
-- | Tokenization schemes that 'tokenize' can apply.
--
-- 'V13a' selects the punctuation-splitting rules implemented in 'tokenize'
-- (presumably modelled on the NIST mteval-v13a tokenizer -- confirm).
--
-- 'Eq' and 'Show' are derived so that values can be compared and printed,
-- e.g. in test expectations and diagnostics.
data Tokenizer = V13a
  deriving (Eq, Show)
-- | Split a text into tokens.
--
-- * With 'Nothing' the text is split on whitespace only ('T.words').
-- * With @Just V13a@ the text is first normalised and then punctuation is
--   separated from the neighbouring characters before splitting on
--   whitespace (presumably following the NIST mteval-v13a rules, with case
--   preserved -- confirm against the reference script).
--
-- The @V13a@ pipeline, in order of application:
--
-- 1. drop @\<skipped\>@ markers, join hyphenated line breaks (@"-\\n"@),
--    turn remaining newlines into spaces and decode the entities
--    @&quot;@, @&amp;@, @&lt;@ and @&gt;@;
-- 2. pad the text with one space on each side, so that the rules below
--    also apply at the very beginning and end of the text;
-- 3. put a space on BOTH sides of every character in the punctuation class;
-- 4. detach a period or comma that is not preceded by a digit;
-- 5. detach a period or comma that is not followed by a digit;
-- 6. detach a dash that is preceded by a digit.
--
-- Periods and commas between two digits (as in @1,000.5@) are kept inside
-- the token.
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize Nothing t = T.words t
-- 'T.words' already ignores leading/trailing whitespace and collapses runs
-- of whitespace, so no explicit stripping is needed afterwards.
tokenize (Just V13a) t = T.words tTokenized
  where
    tTokenized =
        gsub [re|([0-9])(-)|] (rejoin "" space)
      $ gsub [re|([\.,])([^0-9])|] (rejoin space "")
      $ gsub [re|([^0-9])([\.,])|] (rejoin "" space)
      $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] padBoth tPadded

    tPadded = space <> tReplaced <> space

    tReplaced =
        T.replace "&gt;" ">"
      $ T.replace "&lt;" "<"
      $ T.replace "&amp;" "&"
      $ T.replace "&quot;" "\""
      $ T.replace "\n" " "
      $ T.replace "-\n" ""
      $ T.replace "<skipped>" "" t

    -- Surround a matched punctuation character with spaces on both sides;
    -- a space only in front would leave it glued to the following word.
    padBoth :: T.Text -> T.Text
    padBoth matched = space <> matched <> space

    -- Rebuild a match from its two capture groups, always separating the
    -- groups by a space and adding the given text before/after the pair.
    -- The fallback equation keeps the function total; every regex above has
    -- exactly two groups covering the whole match, so concatenating the
    -- groups reproduces the matched text unchanged.
    rejoin :: T.Text -> T.Text -> [T.Text] -> T.Text
    rejoin before after (c:p:_) = before <> c <> space <> p <> after
    rejoin _ _ groups = T.concat groups

    space :: T.Text
    space = " "

View File

@ -12,6 +12,7 @@ import GEval.ClusteringMetrics
import GEval.BIO
import GEval.LineByLine
import GEval.ParseParams
import Text.Tokenizer
import Data.Attoparsec.Text
import Options.Applicative
import Data.Text
@ -389,7 +390,11 @@ main = hspec $ do
(1.5, 3.0),
(3.0, 2.0),
(4.0, 1.0)]
describe "tokenizer" $ do
it "simple utterance with '13a' tokenizer" $ do
tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
["To", "be", "or", "not", "to", "be",
",", "that's", "the", "question", "."]
checkConduitPure conduit inList expList = do
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList