tokenize by whitespace option
parent 873d7c300c
commit 0a8d2fdd39
@@ -49,20 +49,22 @@ std::string _createLibraryVersion() {
 }
 
 TokenizedSentence
-    Concordia::tokenize(const std::string & sentence)
+    Concordia::tokenize(const std::string & sentence,
+                        bool byWhitespace)
     throw(ConcordiaException) {
     TokenizedSentence result =
-        _hashGenerator->generateHash(sentence);
+        _hashGenerator->generateHash(sentence, byWhitespace);
     _hashGenerator->serializeWordMap();
     return result;
 }
 
 std::vector<TokenizedSentence> Concordia::tokenizeAll(
-    const std::vector<std::string> & sentences)
+    const std::vector<std::string> & sentences,
+    bool byWhitespace)
     throw(ConcordiaException) {
     std::vector<TokenizedSentence> result;
     BOOST_FOREACH(std::string sentence, sentences) {
-        result.push_back(_hashGenerator->generateHash(sentence));
+        result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
     }
 
     _hashGenerator->serializeWordMap();
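For orientation, a minimal caller-side sketch of the extended API. The include path, index directory and config file below are hypothetical placeholders, not taken from this commit:

    #include <concordia/concordia.hpp>  // include path assumed

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical resource locations.
        Concordia concordia("/tmp/concordia", "/tmp/concordia/concordia.cfg");

        // byWhitespace defaults to false, so existing callers keep the
        // full rule-based pipeline (HTML tags, named entities, stop words).
        TokenizedSentence ruleBased = concordia.tokenize("Ala posiada kota");

        // New option: every maximal run of non-whitespace becomes a token.
        TokenizedSentence byWs = concordia.tokenize("Ala posiada kota", true);
        std::cout << byWs.getTokens().size() << std::endl;  // expect 3

        // The same flag is threaded through the batch variant.
        std::vector<std::string> sentences;
        sentences.push_back("Ala posiada kota");
        std::vector<TokenizedSentence> all = concordia.tokenizeAll(sentences, true);

        return 0;
    }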
@@ -60,20 +60,24 @@ public:
 
     /*! Tokenizes the given sentence.
      \param sentence sentence to be tokenized
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence object,
              containing information about original word positions
      \throws ConcordiaException
     */
-    TokenizedSentence tokenize(const std::string & sentence)
+    TokenizedSentence tokenize(const std::string & sentence,
+                               bool byWhitespace = false)
                                      throw(ConcordiaException);
 
     /*! Tokenizes all the given sentences.
      \param sentences vector of sentences to be tokenized
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns vector of tokenized sentence objects
      \throws ConcordiaException
     */
     std::vector<TokenizedSentence> tokenizeAll(
-                      const std::vector<std::string> & sentences)
+                      const std::vector<std::string> & sentences,
+                      bool byWhitespace = false)
                                      throw(ConcordiaException);
 
     /*! Adds an Example to the index.
@@ -29,8 +29,9 @@ HashGenerator::~HashGenerator() {
 }
 
 TokenizedSentence HashGenerator::generateHash(
-    const std::string & sentence) throw(ConcordiaException) {
-    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
+    const std::string & sentence,
+    bool byWhitespace) throw(ConcordiaException) {
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
     ts.generateHash(_wordMap);
 
     if (ts.getTokens().size() > Utils::maxSentenceSize) {
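The hash produced by ts.generateHash(_wordMap) is, in essence, the sequence of integer codes the word map assigns to the tokens; serializeWordMap() later persists the (possibly grown) map. A simplified, self-contained stand-in for that idea, not Concordia's actual WordMap interface:

    #include <map>
    #include <string>
    #include <vector>

    // Simplified illustration: each previously unseen token receives the
    // next free integer code, so equal tokens always map to equal codes.
    std::vector<int> codeSequence(const std::vector<std::string> & tokens,
                                  std::map<std::string, int> & wordMap) {
        std::vector<int> codes;
        for (std::size_t i = 0; i < tokens.size(); ++i) {
            std::map<std::string, int>::iterator it = wordMap.find(tokens[i]);
            if (it == wordMap.end()) {
                int code = static_cast<int>(wordMap.size());
                it = wordMap.insert(std::make_pair(tokens[i], code)).first;
            }
            codes.push_back(it->second);
        }
        return codes;
    }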
@@ -44,9 +44,11 @@ public:
     /*!
      Generates hash of a sentence.
      \param sentence sentence to generate hash from
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence, containing the hash
     */
-    TokenizedSentence generateHash(const std::string & sentence)
+    TokenizedSentence generateHash(const std::string & sentence,
+                                   bool byWhitespace = false)
                                        throw(ConcordiaException);
 
     /*!
@@ -24,29 +24,37 @@ SentenceTokenizer::SentenceTokenizer(
 SentenceTokenizer::~SentenceTokenizer() {
 }
 
-TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
+TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
+                                              bool byWhitespace) {
     TokenizedSentence result(sentence);
 
-    _htmlTags->apply(result);
+    if(byWhitespace) {
+        boost::shared_ptr<RegexRule> whitespaceRule(
+            new RegexRule("\\S+",
+                TokenAnnotation::WORD, ""));
+        whitespaceRule->apply(result);
+    } else {
+        _htmlTags->apply(result);
 
-    BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
-        neRule.apply(result);
-    }
+        BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
+            neRule.apply(result);
+        }
 
-    result.toLowerCase();
+        result.toLowerCase();
 
-    if (_stopWordsEnabled) {
-        _stopWords->apply(result);
-    }
+        if (_stopWordsEnabled) {
+            _stopWords->apply(result);
+        }
 
-    boost::shared_ptr<RegexRule> wordsRule(
-        new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
-                      TokenAnnotation::WORD, ""));
-    wordsRule->apply(result);
-    boost::shared_ptr<RegexRule> singleLetterWordsRule(
-        new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
-    singleLetterWordsRule->apply(result);
+        boost::shared_ptr<RegexRule> wordsRule(
+            new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
+                          TokenAnnotation::WORD, ""));
+        wordsRule->apply(result);
+        boost::shared_ptr<RegexRule> singleLetterWordsRule(
+            new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
+        singleLetterWordsRule->apply(result);
+    }
 
     return result;
 }
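The byWhitespace branch reduces tokenization to one rule: every maximal run of non-whitespace characters (\S+) is annotated as a WORD token, while the HTML-tag, named-entity, lowercasing and stop-word steps are skipped. A self-contained sketch of what that rule matches, using boost::regex directly rather than Concordia's RegexRule wrapper:

    #include <boost/regex.hpp>
    #include <iostream>
    #include <string>

    int main() {
        std::string sentence = "Ala 23 --- ..//,./ '''8902347 posiada kota";
        boost::regex whitespaceRule("\\S+");

        boost::sregex_iterator it(sentence.begin(), sentence.end(), whitespaceRule);
        boost::sregex_iterator end;
        for (; it != end; ++it) {
            // Prints each token with its [start, end) character offsets,
            // e.g. "23 [4, 6)" and "..//,./ [11, 18)".
            std::cout << it->str() << " [" << it->position() << ", "
                      << it->position() + it->length() << ")" << std::endl;
        }
        return 0;
    }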
@@ -34,9 +34,11 @@ public:
 
     /*! Tokenizes the sentence.
      \param sentence input sentence
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence object build on the input sentence
     */
-    TokenizedSentence tokenize(const std::string & sentence);
+    TokenizedSentence tokenize(const std::string & sentence,
+                               bool byWhitespace = false);
 
 private:
     void _createNeRules(std::string & namedEntitiesPath);
@@ -435,4 +435,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
 
 }
 
+BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+                    TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true);
+
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
+
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
+
+    concordia.clearIndex();
+
+}
+
+
 BOOST_AUTO_TEST_SUITE_END()
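The expected values in TokenizeWhitespace follow from simple character-offset arithmetic (start inclusive, end exclusive): in "Ala 23 --- ..//,./ '''8902347 posiada kota" the seven whitespace-separated tokens are "Ala", "23", "---", "..//,./", "'''8902347", "posiada" and "kota". Token 1, "23", spans offsets 4-5, so getStart() is 4 and getEnd() is 6; token 3, "..//,./", spans offsets 11-17, so 11 and 18. Type 1 is the WORD annotation assigned by the \S+ rule. Note that under byWhitespace the numeric and punctuation-only strings survive as tokens; the \p{L}-based word rules in the default branch would not match them.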