tokenize by whitespace option

This commit is contained in:
rjawor 2015-12-27 20:54:40 +01:00
parent 873d7c300c
commit 0a8d2fdd39
7 changed files with 70 additions and 29 deletions

View File

@@ -49,20 +49,22 @@ std::string _createLibraryVersion() {
} }
TokenizedSentence TokenizedSentence
Concordia::tokenize(const std::string & sentence) Concordia::tokenize(const std::string & sentence,
bool byWhitespace)
throw(ConcordiaException) { throw(ConcordiaException) {
TokenizedSentence result = TokenizedSentence result =
_hashGenerator->generateHash(sentence); _hashGenerator->generateHash(sentence, byWhitespace);
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();
return result; return result;
} }
std::vector<TokenizedSentence> Concordia::tokenizeAll( std::vector<TokenizedSentence> Concordia::tokenizeAll(
const std::vector<std::string> & sentences) const std::vector<std::string> & sentences,
bool byWhitespace)
throw(ConcordiaException) { throw(ConcordiaException) {
std::vector<TokenizedSentence> result; std::vector<TokenizedSentence> result;
BOOST_FOREACH(std::string sentence, sentences) { BOOST_FOREACH(std::string sentence, sentences) {
result.push_back(_hashGenerator->generateHash(sentence)); result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
} }
_hashGenerator->serializeWordMap(); _hashGenerator->serializeWordMap();

View File

@@ -60,20 +60,24 @@ public:
/*! Tokenizes the given sentence. /*! Tokenizes the given sentence.
\param sentence sentence to be tokenized \param sentence sentence to be tokenized
\param byWhitespace whether to tokenize the sentence by whitespace
\returns tokenized sentence object, \returns tokenized sentence object,
containing information about original word positions containing information about original word positions
\throws ConcordiaException \throws ConcordiaException
*/ */
TokenizedSentence tokenize(const std::string & sentence) TokenizedSentence tokenize(const std::string & sentence,
bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
/*! Tokenizes all the given sentences. /*! Tokenizes all the given sentences.
\param sentences vector of sentences to be tokenized \param sentences vector of sentences to be tokenized
\param byWhitespace whether to tokenize the sentence by whitespace
\returns vector of tokenized sentence objects \returns vector of tokenized sentence objects
\throws ConcordiaException \throws ConcordiaException
*/ */
std::vector<TokenizedSentence> tokenizeAll( std::vector<TokenizedSentence> tokenizeAll(
const std::vector<std::string> & sentences) const std::vector<std::string> & sentences,
bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
/*! Adds an Example to the index. /*! Adds an Example to the index.

View File

@@ -29,8 +29,9 @@ HashGenerator::~HashGenerator() {
} }
TokenizedSentence HashGenerator::generateHash( TokenizedSentence HashGenerator::generateHash(
const std::string & sentence) throw(ConcordiaException) { const std::string & sentence,
TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence); bool byWhitespace) throw(ConcordiaException) {
TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
ts.generateHash(_wordMap); ts.generateHash(_wordMap);
if (ts.getTokens().size() > Utils::maxSentenceSize) { if (ts.getTokens().size() > Utils::maxSentenceSize) {

View File

@@ -44,9 +44,11 @@ public:
/*! /*!
Generates hash of a sentence. Generates hash of a sentence.
\param sentence sentence to generate hash from \param sentence sentence to generate hash from
\param byWhitespace whether to tokenize the sentence by whitespace
\returns tokenized sentence, containing the hash \returns tokenized sentence, containing the hash
*/ */
TokenizedSentence generateHash(const std::string & sentence) TokenizedSentence generateHash(const std::string & sentence,
bool byWhitespace = false)
throw(ConcordiaException); throw(ConcordiaException);
/*! /*!

View File

@@ -24,29 +24,37 @@ SentenceTokenizer::SentenceTokenizer(
SentenceTokenizer::~SentenceTokenizer() { SentenceTokenizer::~SentenceTokenizer() {
} }
TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) { TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
bool byWhitespace) {
TokenizedSentence result(sentence); TokenizedSentence result(sentence);
_htmlTags->apply(result); if(byWhitespace) {
boost::shared_ptr<RegexRule> whitespaceRule(
new RegexRule("\\S+",
TokenAnnotation::WORD, ""));
whitespaceRule->apply(result);
} else {
_htmlTags->apply(result);
BOOST_FOREACH(RegexRule & neRule, _namedEntities) { BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
neRule.apply(result); neRule.apply(result);
}
result.toLowerCase();
if (_stopWordsEnabled) {
_stopWords->apply(result);
}
boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
TokenAnnotation::WORD, ""));
wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule(
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
singleLetterWordsRule->apply(result);
} }
result.toLowerCase();
if (_stopWordsEnabled) {
_stopWords->apply(result);
}
boost::shared_ptr<RegexRule> wordsRule(
new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
TokenAnnotation::WORD, ""));
wordsRule->apply(result);
boost::shared_ptr<RegexRule> singleLetterWordsRule(
new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
singleLetterWordsRule->apply(result);
return result; return result;
} }

View File

@@ -34,9 +34,11 @@ public:
/*! Tokenizes the sentence. /*! Tokenizes the sentence.
\param sentence input sentence \param sentence input sentence
\param byWhitespace whether to tokenize the sentence by whitespace
\returns tokenized sentence object build on the input sentence \returns tokenized sentence object build on the input sentence
*/ */
TokenizedSentence tokenize(const std::string & sentence); TokenizedSentence tokenize(const std::string & sentence,
bool byWhitespace = false);
private: private:
void _createNeRules(std::string & namedEntitiesPath); void _createNeRules(std::string & namedEntitiesPath);

View File

@@ -435,4 +435,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
} }
// Verifies the new byWhitespace tokenization mode added in this commit:
// when tokenize() is called with byWhitespace = true, the sentence is split
// purely on whitespace runs (every non-whitespace chunk becomes a token),
// bypassing the default HTML-tag / named-entity / word-regex pipeline.
BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
{
    // Build a Concordia instance against the test temp dir and test config.
    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
    // Input deliberately mixes digits and punctuation-only chunks that the
    // default word-regex tokenizer would discard; whitespace mode keeps them.
    TokenizedSentence ts = concordia.tokenize("Ala 23 --- ..//,./ '''8902347 posiada kota", true);
    // Seven whitespace-separated chunks -> seven tokens.
    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
    // Token 1 is "23", occupying original character positions [4, 6).
    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
    // Type 1 — presumably TokenAnnotation::WORD, matching the annotation the
    // whitespace rule applies in SentenceTokenizer::tokenize; verify enum value.
    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
    // Token 3 is the punctuation-only chunk "..//,./" at positions [11, 18) —
    // kept verbatim (and lowercasing has no effect on it).
    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
    // Clean up the on-disk index so later test cases start from an empty state.
    concordia.clearIndex();
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()