tokenize by whitespace option
This commit is contained in:
parent
873d7c300c
commit
0a8d2fdd39
@ -49,20 +49,22 @@ std::string _createLibraryVersion() {
|
||||
}
|
||||
|
||||
TokenizedSentence
|
||||
Concordia::tokenize(const std::string & sentence)
|
||||
Concordia::tokenize(const std::string & sentence,
|
||||
bool byWhitespace)
|
||||
throw(ConcordiaException) {
|
||||
TokenizedSentence result =
|
||||
_hashGenerator->generateHash(sentence);
|
||||
_hashGenerator->generateHash(sentence, byWhitespace);
|
||||
_hashGenerator->serializeWordMap();
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<TokenizedSentence> Concordia::tokenizeAll(
|
||||
const std::vector<std::string> & sentences)
|
||||
const std::vector<std::string> & sentences,
|
||||
bool byWhitespace)
|
||||
throw(ConcordiaException) {
|
||||
std::vector<TokenizedSentence> result;
|
||||
BOOST_FOREACH(std::string sentence, sentences) {
|
||||
result.push_back(_hashGenerator->generateHash(sentence));
|
||||
result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
|
||||
}
|
||||
|
||||
_hashGenerator->serializeWordMap();
|
||||
|
@ -60,20 +60,24 @@ public:
|
||||
|
||||
/*! Tokenizes the given sentence.
|
||||
\param sentence sentence to be tokenized
|
||||
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||
\returns tokenized sentence object,
|
||||
containing information about original word positions
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
TokenizedSentence tokenize(const std::string & sentence)
|
||||
TokenizedSentence tokenize(const std::string & sentence,
|
||||
bool byWhitespace = false)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Tokenizes all the given sentences.
|
||||
\param sentences vector of sentences to be tokenized
|
||||
\param byWhitespace whether to tokenize the sentences by whitespace
|
||||
\returns vector of tokenized sentence objects
|
||||
\throws ConcordiaException
|
||||
*/
|
||||
std::vector<TokenizedSentence> tokenizeAll(
|
||||
const std::vector<std::string> & sentences)
|
||||
const std::vector<std::string> & sentences,
|
||||
bool byWhitespace = false)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*! Adds an Example to the index.
|
||||
|
@ -29,8 +29,9 @@ HashGenerator::~HashGenerator() {
|
||||
}
|
||||
|
||||
TokenizedSentence HashGenerator::generateHash(
|
||||
const std::string & sentence) throw(ConcordiaException) {
|
||||
TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
|
||||
const std::string & sentence,
|
||||
bool byWhitespace) throw(ConcordiaException) {
|
||||
TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
|
||||
ts.generateHash(_wordMap);
|
||||
|
||||
if (ts.getTokens().size() > Utils::maxSentenceSize) {
|
||||
|
@ -44,9 +44,11 @@ public:
|
||||
/*!
|
||||
Generates hash of a sentence.
|
||||
\param sentence sentence to generate hash from
|
||||
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||
\returns tokenized sentence, containing the hash
|
||||
*/
|
||||
TokenizedSentence generateHash(const std::string & sentence)
|
||||
TokenizedSentence generateHash(const std::string & sentence,
|
||||
bool byWhitespace = false)
|
||||
throw(ConcordiaException);
|
||||
|
||||
/*!
|
||||
|
@ -24,29 +24,37 @@ SentenceTokenizer::SentenceTokenizer(
|
||||
SentenceTokenizer::~SentenceTokenizer() {
|
||||
}
|
||||
|
||||
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
|
||||
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
|
||||
bool byWhitespace) {
|
||||
TokenizedSentence result(sentence);
|
||||
|
||||
_htmlTags->apply(result);
|
||||
if(byWhitespace) {
|
||||
boost::shared_ptr<RegexRule> whitespaceRule(
|
||||
new RegexRule("\\S+",
|
||||
TokenAnnotation::WORD, ""));
|
||||
whitespaceRule->apply(result);
|
||||
} else {
|
||||
_htmlTags->apply(result);
|
||||
|
||||
BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
|
||||
neRule.apply(result);
|
||||
BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
|
||||
neRule.apply(result);
|
||||
}
|
||||
|
||||
result.toLowerCase();
|
||||
|
||||
if (_stopWordsEnabled) {
|
||||
_stopWords->apply(result);
|
||||
}
|
||||
|
||||
boost::shared_ptr<RegexRule> wordsRule(
|
||||
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
|
||||
TokenAnnotation::WORD, ""));
|
||||
wordsRule->apply(result);
|
||||
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
||||
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
||||
singleLetterWordsRule->apply(result);
|
||||
}
|
||||
|
||||
result.toLowerCase();
|
||||
|
||||
if (_stopWordsEnabled) {
|
||||
_stopWords->apply(result);
|
||||
}
|
||||
|
||||
boost::shared_ptr<RegexRule> wordsRule(
|
||||
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
|
||||
TokenAnnotation::WORD, ""));
|
||||
wordsRule->apply(result);
|
||||
boost::shared_ptr<RegexRule> singleLetterWordsRule(
|
||||
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
|
||||
singleLetterWordsRule->apply(result);
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -34,9 +34,11 @@ public:
|
||||
|
||||
/*! Tokenizes the sentence.
|
||||
\param sentence input sentence
|
||||
\param byWhitespace whether to tokenize the sentence by whitespace
|
||||
\returns tokenized sentence object built on the input sentence
|
||||
*/
|
||||
TokenizedSentence tokenize(const std::string & sentence);
|
||||
TokenizedSentence tokenize(const std::string & sentence,
|
||||
bool byWhitespace = false);
|
||||
|
||||
private:
|
||||
void _createNeRules(std::string & namedEntitiesPath);
|
||||
|
@ -435,4 +435,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
|
||||
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
|
||||
{
|
||||
Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
|
||||
TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||
TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true);
|
||||
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
|
||||
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
|
||||
BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
|
||||
|
||||
concordia.clearIndex();
|
||||
|
||||
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
Loading…
Reference in New Issue
Block a user