{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}

module Text.Tokenizer
       where

import qualified Data.Text as T
import Data.Monoid ((<>))

import Text.Regex.PCRE.Heavy

data Tokenizer = Minimalistic | V13a | V14International | CharacterByCharacter
                 deriving (Eq)

instance Show Tokenizer where
  show Minimalistic = "minimalistic"
  show V13a = "13a"
  show V14International = "v14"
  show CharacterByCharacter = "character-by-character"

instance Read Tokenizer where
  readsPrec _ ('m':'i':'n':'i':'m':'a':'l':'i':'s':'t':'i':'c':theRest) =
    [(Minimalistic, theRest)]
  readsPrec _ ('1':'3':'a':theRest) = [(V13a, theRest)]
  readsPrec _ ('v':'1':'4':theRest) = [(V14International, theRest)]
  readsPrec _ ('c':'h':'a':'r':'a':'c':'t':'e':'r':'-':'b':'y':'-':'c':'h':'a':'r':'a':'c':'t':'e':'r':theRest) =
    [(CharacterByCharacter, theRest)]

tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer)
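-- Illustrative examples for 'tokenize' (expected results, assuming the
-- tokenization rules defined below):
--   tokenize (Just V13a) "Hello, world!"  ==  ["Hello", ",", "world", "!"]
--   tokenize Nothing "Hello, world!"      ==  ["Hello,", "world!"]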
tokenizeTabSeparatedWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
tokenizeTabSeparatedWithSpaces Nothing t = t -- special case for efficiency
tokenizeTabSeparatedWithSpaces tokenizer@(Just _) t =
  T.intercalate "\t"
  $ map (tokenizeWithSpaces tokenizer)
  $ T.splitOn "\t" t
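-- Illustrative example for 'tokenizeTabSeparatedWithSpaces' (expected result):
-- each tab-separated field is tokenized independently, so
--   tokenizeTabSeparatedWithSpaces (Just V13a) "Hi, there\tOK!"
-- should yield "Hi , there\tOK !".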
space :: T.Text
space = " "

tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
tokenizeWithSpaces Nothing t = t
-- very simple tokenization, punctuation marks are separated
-- only at the beginning and end of a word
tokenizeWithSpaces (Just Minimalistic) t = T.strip tTokenized
  where tTokenized =
          gsub [re|\s{2,}|] ((const space) :: T.Text -> T.Text)
          $ gsub [re|[\w\d]+\S*[\w\d]+|[\w\d]|[^\w\s]+|]
                 (\tok -> space <> tok <> space)
                 t
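-- Illustrative example for the Minimalistic case (expected result): word-internal
-- punctuation is kept, only leading/trailing marks are detached, so
--   tokenizeWithSpaces (Just Minimalistic) "don't panic!!"
-- should yield "don't panic !!".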
-- tokenization following the official BLEU implementation
-- https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
-- cf. tokenize_v14_international function in sacrebleu evaluator
tokenizeWithSpaces (Just V14International) t =
  T.strip tTokenized
  where tTokenized =
          gsub [re|\s+|] toSpace
          $ gsub [re|\p{S}|] (\s -> space <> s <> space)
          $ gsub [re|(\p{P})([^\d])|] (\(p:n:_) -> space <> p <> space <> n)
          $ gsub [re|([^\d])(\p{P})|] (\(n:p:_) -> n <> space <> p <> space) t
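-- Illustrative example for the V14International case (expected result):
-- Unicode punctuation (\p{P}) and symbols (\p{S}) are split off, so
--   tokenizeWithSpaces (Just V14International) "price: 5€"
-- should yield "price : 5 €".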
-- tokenization equivalent to mteval-v13a
-- cf. tokenize_13a function in sacrebleu evaluator
tokenizeWithSpaces (Just V13a) t = T.strip tTokenized
  where tTokenized =
          gsub [re|\s+|] toSpace
          $ gsub [re|([0-9])(-)|] (\(c:p:_) -> c <> space <> p <> space)
          $ gsub [re|([\.,])([^0-9])|] (\(c:p:_) -> space <> c <> space <> p)
          $ gsub [re|([^0-9])([\.,])|] (\(c:p:_) -> c <> space <> p <> space)
          $ gsub [re|[\{-\~\[-\` -\&\(-\+\:-\@\/]|] (\s -> space <> s <> space) tPadded
        tPadded = " " <> tReplaced <> " "
        -- undo the SGML entity escaping used in mteval/sacrebleu input
        tReplaced =
          T.replace "&gt;" ">"
          $ T.replace "&lt;" "<"
          $ T.replace "&amp;" "&"
          $ T.replace "&quot;" "\""
          $ T.replace "\n" " "
          $ T.replace "-\n" ""
          $ T.replace "<skipped>" "" t
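-- Illustrative example for the V13a case (expected result): entities are
-- unescaped and a hyphen after a digit is split, so
--   tokenizeWithSpaces (Just V13a) "A&amp;B costs 3-4 euros."
-- should yield "A & B costs 3 - 4 euros ."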
tokenizeWithSpaces (Just CharacterByCharacter) t = T.intercalate " "
                                                   $ map T.singleton
                                                   $ map escapeSpace
                                                   $ T.unpack t
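-- Illustrative example for the CharacterByCharacter case (expected result):
-- every character becomes a token and literal spaces are escaped to
-- underscores, so
--   tokenizeWithSpaces (Just CharacterByCharacter) "ab c"
-- should yield "a b _ c".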
toSpace :: T.Text -> T.Text
toSpace _ = space

escapeSpace :: Char -> Char
escapeSpace ' ' = '_'
escapeSpace c = c