added lowercasing when tokenizing by space
parent 0a8d2fdd39
commit bbf3853d2a
@@ -29,6 +29,8 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
     TokenizedSentence result(sentence);
 
     if (byWhitespace) {
+        result.toLowerCase();
+
         boost::shared_ptr<RegexRule> whitespaceRule(
             new RegexRule("\\S+",
                 TokenAnnotation::WORD, ""));
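For reference, a minimal standalone sketch of what this change does: the sentence is lowercased first, then tokens are extracted by matching the "\S+" pattern (maximal runs of non-whitespace), mirroring the whitespaceRule above. The sketch below uses std::regex and plain std::string instead of the project's TokenizedSentence, RegexRule, and TokenAnnotation classes, so it only illustrates the behavior, not the actual implementation.

// Illustrative sketch only: "lowercase, then split on whitespace"
// using the standard library instead of the project's classes.
#include <algorithm>
#include <cctype>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

std::vector<std::string> tokenizeByWhitespace(std::string sentence) {
    // Lowercase the whole sentence before tokenizing,
    // as result.toLowerCase() does in the patched code.
    std::transform(sentence.begin(), sentence.end(), sentence.begin(),
                   [](unsigned char c) { return std::tolower(c); });

    // "\S+" matches whitespace-delimited tokens.
    std::regex whitespaceRule("\\S+");
    std::vector<std::string> tokens;
    for (std::sregex_iterator it(sentence.begin(), sentence.end(), whitespaceRule), end;
         it != end; ++it) {
        tokens.push_back(it->str());
    }
    return tokens;
}

int main() {
    for (const std::string & token : tokenizeByWhitespace("This IS an Example")) {
        std::cout << token << "\n";   // prints: this, is, an, example
    }
}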