From bbf3853d2a05e252e56645c4e51ae9b5d99f70aa Mon Sep 17 00:00:00 2001
From: rjawor
Date: Tue, 29 Dec 2015 21:44:46 +0100
Subject: [PATCH] added lowercasing when tokenizing by space

---
 concordia/sentence_tokenizer.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp
index 9c7b3b6..2ab44e3 100644
--- a/concordia/sentence_tokenizer.cpp
+++ b/concordia/sentence_tokenizer.cpp
@@ -28,7 +28,9 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
                                               bool byWhitespace) {
     TokenizedSentence result(sentence);
 
-    if(byWhitespace) {
+    if (byWhitespace) {
+        result.toLowerCase();
+
         boost::shared_ptr<RegexRule> whitespaceRule(
             new RegexRule("\\S+", TokenAnnotation::WORD, ""));
 
@@ -54,7 +56,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
             new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
         singleLetterWordsRule->apply(result);
     }
-    
+
     return result;
 }
 