Added lowercasing when tokenizing by whitespace

This commit is contained in:
rjawor 2015-12-29 21:44:46 +01:00
parent 0a8d2fdd39
commit bbf3853d2a

View File

@@ -28,7 +28,9 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
bool byWhitespace) { bool byWhitespace) {
TokenizedSentence result(sentence); TokenizedSentence result(sentence);
if(byWhitespace) { if (byWhitespace) {
result.toLowerCase();
boost::shared_ptr<RegexRule> whitespaceRule( boost::shared_ptr<RegexRule> whitespaceRule(
new RegexRule("\\S+", new RegexRule("\\S+",
TokenAnnotation::WORD, "")); TokenAnnotation::WORD, ""));
@@ -54,7 +56,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
new RegexRule("\\p{L}", TokenAnnotation::WORD, "")); new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
singleLetterWordsRule->apply(result); singleLetterWordsRule->apply(result);
} }
return result; return result;
} }