Added lowercasing when tokenizing by whitespace

This commit is contained in:
rjawor 2015-12-29 21:44:46 +01:00
parent 0a8d2fdd39
commit bbf3853d2a

View File

@@ -29,6 +29,8 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
TokenizedSentence result(sentence); TokenizedSentence result(sentence);
if (byWhitespace) { if (byWhitespace) {
result.toLowerCase();
boost::shared_ptr<RegexRule> whitespaceRule( boost::shared_ptr<RegexRule> whitespaceRule(
new RegexRule("\\S+", new RegexRule("\\S+",
TokenAnnotation::WORD, "")); TokenAnnotation::WORD, ""));