towards tokenization
This commit is contained in:
parent
de52a12b03
commit
8388ab4d27
@ -33,6 +33,7 @@ library
|
||||
, Data.Conduit.SmartSource
|
||||
, Data.Conduit.Rank
|
||||
, GEval.FeatureExtractor
|
||||
, Text.Tokenizer
|
||||
, Paths_geval
|
||||
build-depends: base >= 4.7 && < 5
|
||||
, cond
|
||||
@ -68,6 +69,7 @@ library
|
||||
, naturalcomp
|
||||
, containers
|
||||
, statistics
|
||||
, pcre-heavy
|
||||
default-language: Haskell2010
|
||||
|
||||
executable geval
|
||||
|
32
src/Text/Tokenizer.hs
Normal file
32
src/Text/Tokenizer.hs
Normal file
@ -0,0 +1,32 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE QuasiQuotes #-}
|
||||
|
||||
module Text.Tokenizer
|
||||
where
|
||||
|
||||
import qualified Data.Text as T
|
||||
import Data.Monoid ((<>))
|
||||
|
||||
import Text.Regex.PCRE.Heavy
|
||||
|
||||
-- | Tokenization schemes that can be requested from 'tokenize'.
--
-- 'Eq' and 'Show' are derived so that a selected scheme can be compared and
-- printed (e.g. in tests and diagnostics).
data Tokenizer
  = V13a
    -- ^ Punctuation-separating scheme; NOTE(review): the name presumably
    -- refers to the NIST @mteval-v13a@ scoring script - confirm.
  deriving (Eq, Show)
|
||||
|
||||
-- | Split a text into tokens.
--
-- * With 'Nothing' the text is split on white space only ('T.words').
-- * With @'Just' 'V13a'@ the text is normalized first (SGML entities are
--   decoded, line breaks are joined) and punctuation is then separated from
--   the neighbouring characters, following the @tokenization@ routine of the
--   NIST @mteval-v13a.pl@ script. Unlike that script, case is never folded.
--
-- Periods and commas are kept attached when they sit between two digits
-- (e.g. @3.14@), and a dash is split off only when a digit precedes it.
--
-- >>> tokenize (Just V13a) "To be, or not."
-- ["To","be",",","or","not","."]
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize Nothing t = T.words t
tokenize (Just V13a) t = T.words tTokenized
  where
    -- 'T.words' ignores leading, trailing and repeated white space, so the
    -- rules below may insert spaces generously without producing empty
    -- tokens (and no separate stripping step is needed).
    tTokenized =
      -- a dash preceded by a digit becomes a token of its own
      gsub [re|([0-9])(-)|] (onPair $ \digit dash -> digit <> space <> dash <> space)
      -- period/comma not followed by a digit
      $ gsub [re|([\.,])([^0-9])|] (onPair $ \punct next -> space <> punct <> space <> next)
      -- period/comma not preceded by a digit
      $ gsub [re|([^0-9])([\.,])|] (onPair $ \prev punct -> prev <> space <> punct <> space)
      -- general punctuation: separated on both sides
      $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] surround tPadded

    -- Padding guarantees that the two-character rules above can also match
    -- at the very beginning and end of the input.
    tPadded = space <> tReplaced <> space

    -- Applied innermost first: drop "<skipped>" markers, undo end-of-line
    -- hyphenation, join lines, then decode the SGML entities.
    tReplaced =
      T.replace "&gt;" ">"
      $ T.replace "&lt;" "<"
      $ T.replace "&amp;" "&"
      $ T.replace "&quot;" "\""
      $ T.replace "\n" " "
      $ T.replace "-\n" ""
      $ T.replace "<skipped>" "" t

    -- Put a space on both sides of a whole match.
    surround :: T.Text -> T.Text
    surround matched = space <> matched <> space

    -- Total adapter for the two-group patterns: should the capture list ever
    -- be shorter than expected, the match is reproduced unchanged (the
    -- patterns consist of nothing but their groups).
    onPair :: (T.Text -> T.Text -> T.Text) -> [T.Text] -> T.Text
    onPair combine (first : second : _) = combine first second
    onPair _ groups = T.concat groups

    space :: T.Text
    space = " "
|
@ -12,6 +12,7 @@ import GEval.ClusteringMetrics
|
||||
import GEval.BIO
|
||||
import GEval.LineByLine
|
||||
import GEval.ParseParams
|
||||
import Text.Tokenizer
|
||||
import Data.Attoparsec.Text
|
||||
import Options.Applicative
|
||||
import Data.Text
|
||||
@ -389,7 +390,11 @@ main = hspec $ do
|
||||
(1.5, 3.0),
|
||||
(3.0, 2.0),
|
||||
(4.0, 1.0)]
|
||||
|
||||
describe "tokenizer" $ do
|
||||
it "simple utterance with '13a' tokenizer" $ do
|
||||
tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
|
||||
["To", "be", "or", "not", "to", "be",
|
||||
",", "that's", "the", "question", "."]
|
||||
|
||||
checkConduitPure conduit inList expList = do
|
||||
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
||||
|
Loading…
Reference in New Issue
Block a user