Added lowercasing of the sentence when tokenizing by whitespace
This commit is contained in:
parent
0a8d2fdd39
commit
bbf3853d2a
@ -28,7 +28,9 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
|
||||
bool byWhitespace) {
|
||||
TokenizedSentence result(sentence);
|
||||
|
||||
if(byWhitespace) {
|
||||
if (byWhitespace) {
|
||||
result.toLowerCase();
|
||||
|
||||
boost::shared_ptr<RegexRule> whitespaceRule(
|
||||
new RegexRule("\\S+",
|
||||
TokenAnnotation::WORD, ""));
|
||||
@ -54,7 +56,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
|
||||
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
||||
singleLetterWordsRule->apply(result);
|
||||
}
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user