added lowercasing when tokenizing by space
This commit is contained in:
parent
0a8d2fdd39
commit
bbf3853d2a
@ -28,7 +28,9 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
|
|||||||
bool byWhitespace) {
|
bool byWhitespace) {
|
||||||
TokenizedSentence result(sentence);
|
TokenizedSentence result(sentence);
|
||||||
|
|
||||||
if(byWhitespace) {
|
if (byWhitespace) {
|
||||||
|
result.toLowerCase();
|
||||||
|
|
||||||
boost::shared_ptr<RegexRule> whitespaceRule(
|
boost::shared_ptr<RegexRule> whitespaceRule(
|
||||||
new RegexRule("\\S+",
|
new RegexRule("\\S+",
|
||||||
TokenAnnotation::WORD, ""));
|
TokenAnnotation::WORD, ""));
|
||||||
@ -54,7 +56,7 @@ TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
|
|||||||
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
||||||
singleLetterWordsRule->apply(result);
|
singleLetterWordsRule->apply(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user