Added lowercasing when tokenizing by whitespace

This commit is contained in:
rjawor 2015-12-29 21:44:46 +01:00
parent 0a8d2fdd39
commit bbf3853d2a

View File

@ -28,7 +28,9 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
bool byWhitespace) { bool byWhitespace) {
TokenizedSentence result(sentence); TokenizedSentence result(sentence);
if(byWhitespace) { if (byWhitespace) {
result.toLowerCase();
boost::shared_ptr<RegexRule> whitespaceRule( boost::shared_ptr<RegexRule> whitespaceRule(
new RegexRule("\\S+", new RegexRule("\\S+",
TokenAnnotation::WORD, "")); TokenAnnotation::WORD, ""));