towards tokenization
This commit is contained in:
parent
de52a12b03
commit
8388ab4d27
@ -33,6 +33,7 @@ library
|
|||||||
, Data.Conduit.SmartSource
|
, Data.Conduit.SmartSource
|
||||||
, Data.Conduit.Rank
|
, Data.Conduit.Rank
|
||||||
, GEval.FeatureExtractor
|
, GEval.FeatureExtractor
|
||||||
|
, Text.Tokenizer
|
||||||
, Paths_geval
|
, Paths_geval
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, cond
|
, cond
|
||||||
@ -68,6 +69,7 @@ library
|
|||||||
, naturalcomp
|
, naturalcomp
|
||||||
, containers
|
, containers
|
||||||
, statistics
|
, statistics
|
||||||
|
, pcre-heavy
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable geval
|
executable geval
|
||||||
|
32
src/Text/Tokenizer.hs
Normal file
32
src/Text/Tokenizer.hs
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE QuasiQuotes #-}
|
||||||
|
|
||||||
|
module Text.Tokenizer
|
||||||
|
where
|
||||||
|
|
||||||
|
import qualified Data.Text as T
|
||||||
|
import Data.Monoid ((<>))
|
||||||
|
|
||||||
|
import Text.Regex.PCRE.Heavy
|
||||||
|
|
||||||
|
-- | Tokenization schemes that can be requested from 'tokenize'.
--
-- 'V13a' is the only scheme defined here; it is selected by passing
-- @Just V13a@ to 'tokenize', while 'Nothing' means plain whitespace
-- splitting.
--
-- NOTE(review): the name presumably refers to the mteval-v13a tokenization
-- used for BLEU scoring - confirm.
data Tokenizer = V13a
|
||||||
|
|
||||||
|
-- | Split a text into tokens.
--
-- * With 'Nothing' the text is split on whitespace only ('T.words').
-- * With @Just V13a@ the text is first cleaned up (markup removed, line
--   breaks joined, a few HTML entities decoded), then spaces are inserted
--   next to punctuation by a series of regex substitutions, and finally the
--   result is split on whitespace.
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize Nothing t = T.words t
tokenize (Just V13a) t = T.words tWithSpaces
  where
    tWithSpaces = T.strip tTokenized

    -- The substitutions are applied bottom-up (innermost '$' first):
    --   1. prepend a space to every character of the punctuation class
    --      (which also contains the space character itself),
    --   2. separate a period/comma from a preceding non-digit,
    --   3. separate a period/comma from a following non-digit,
    --   4. separate a dash from a preceding digit.
    --
    -- NOTE(review): every step only inserts a single space (before the
    -- punctuation in step 1, between the two captured groups in steps 2-4);
    -- nothing is inserted after the punctuation, so e.g. an opening bracket
    -- stays glued to the following word. Confirm this is intended.
    tTokenized =
      gsub [re|([0-9])(-)|] spaceBetween
        $ gsub [re|([\.,])([^0-9])|] spaceBetween
        $ gsub [re|([^0-9])([\.,])|] spaceBetween
        $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] (space <>) tPadded

    -- Pad with spaces so that the two-character patterns above can also
    -- match at the very beginning and end of the text.
    tPadded = " " <> tReplaced <> " "

    -- Clean-up, again applied bottom-up: drop the "<skipped>" marker, join
    -- words hyphenated across a line break, turn remaining line breaks into
    -- spaces, then decode HTML entities. The entity names must stay spelled
    -- out ("&quot;", "&amp;", ...): writing the decoded character on both
    -- sides would turn the replacement into a no-op.
    tReplaced =
      T.replace "&gt;" ">"
        $ T.replace "&lt;" "<"
        $ T.replace "&amp;" "&"
        $ T.replace "&quot;" "\""
        $ T.replace "\n" " "
        $ T.replace "-\n" ""
        $ T.replace "<skipped>" "" t

    -- Replacement for the two-group patterns: re-emit both captured groups
    -- with a space between them. Total on purpose; the fallback is never
    -- reached for the patterns above, which always capture two groups.
    spaceBetween :: [T.Text] -> T.Text
    spaceBetween (c:p:_) = c <> space <> p
    spaceBetween groups = T.concat groups

    space = " " :: T.Text
|
@ -12,6 +12,7 @@ import GEval.ClusteringMetrics
|
|||||||
import GEval.BIO
|
import GEval.BIO
|
||||||
import GEval.LineByLine
|
import GEval.LineByLine
|
||||||
import GEval.ParseParams
|
import GEval.ParseParams
|
||||||
|
import Text.Tokenizer
|
||||||
import Data.Attoparsec.Text
|
import Data.Attoparsec.Text
|
||||||
import Options.Applicative
|
import Options.Applicative
|
||||||
import Data.Text
|
import Data.Text
|
||||||
@ -389,7 +390,11 @@ main = hspec $ do
|
|||||||
(1.5, 3.0),
|
(1.5, 3.0),
|
||||||
(3.0, 2.0),
|
(3.0, 2.0),
|
||||||
(4.0, 1.0)]
|
(4.0, 1.0)]
|
||||||
|
describe "tokenizer" $ do
|
||||||
|
it "simple utterance with '13a' tokenizer" $ do
|
||||||
|
tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`
|
||||||
|
["To", "be", "or", "not", "to", "be",
|
||||||
|
",", "that's", "the", "question", "."]
|
||||||
|
|
||||||
checkConduitPure conduit inList expList = do
|
checkConduitPure conduit inList expList = do
|
||||||
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
||||||
|
Loading…
Reference in New Issue
Block a user