diff --git a/concordia/sentence_tokenizer.cpp b/concordia/sentence_tokenizer.cpp index 9c7b3b6..2ab44e3 100644 --- a/concordia/sentence_tokenizer.cpp +++ b/concordia/sentence_tokenizer.cpp @@ -28,7 +28,9 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence, bool byWhitespace) { TokenizedSentence result(sentence); - if(byWhitespace) { + if (byWhitespace) { + result.toLowerCase(); + boost::shared_ptr<RegexRule> whitespaceRule( new RegexRule("\\S+", TokenAnnotation::WORD, "")); @@ -54,7 +56,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence, new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); singleLetterWordsRule->apply(result); } - + return result; }