tokenize by whitespace option

rjawor committed 2015-12-27 20:54:40 +01:00
parent 873d7c300c
commit 0a8d2fdd39
7 changed files with 70 additions and 29 deletions

View File

@@ -49,20 +49,22 @@ std::string _createLibraryVersion() {
 }
 
 TokenizedSentence
-Concordia::tokenize(const std::string & sentence)
+Concordia::tokenize(const std::string & sentence,
+                    bool byWhitespace)
                         throw(ConcordiaException) {
     TokenizedSentence result =
-        _hashGenerator->generateHash(sentence);
+        _hashGenerator->generateHash(sentence, byWhitespace);
     _hashGenerator->serializeWordMap();
     return result;
 }
 
 std::vector<TokenizedSentence> Concordia::tokenizeAll(
-    const std::vector<std::string> & sentences)
+    const std::vector<std::string> & sentences,
+    bool byWhitespace)
                         throw(ConcordiaException) {
     std::vector<TokenizedSentence> result;
     BOOST_FOREACH(std::string sentence, sentences) {
-        result.push_back(_hashGenerator->generateHash(sentence));
+        result.push_back(_hashGenerator->generateHash(sentence, byWhitespace));
     }
     _hashGenerator->serializeWordMap();
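
For context, a minimal usage sketch of the changed API. This is hypothetical driver code, not part of the commit: the include path and constructor arguments are placeholders, and since byWhitespace defaults to false (see the header below), existing callers keep their current behavior.

#include <iostream>
#include <string>
#include "concordia/concordia.hpp"

int main() {
    // Placeholder paths: Concordia needs a working directory and a config file.
    Concordia concordia("/tmp/concordia", "concordia.cfg");

    // Default: the full rule-based pipeline (HTML tags, named entities,
    // lower-casing, optional stop words, word rules).
    TokenizedSentence full = concordia.tokenize("Alice has a cat");

    // New option: split on runs of non-whitespace only.
    TokenizedSentence byWs = concordia.tokenize("Alice has a cat", true);

    std::cout << byWs.getTokens().size() << std::endl;  // prints 4
    return 0;
}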

View File

@@ -60,20 +60,24 @@ public:
     /*! Tokenizes the given sentence.
       \param sentence sentence to be tokenized
+      \param byWhitespace whether to tokenize the sentence by whitespace
       \returns tokenized sentence object,
                containing information about original word positions
       \throws ConcordiaException
     */
-    TokenizedSentence tokenize(const std::string & sentence)
+    TokenizedSentence tokenize(const std::string & sentence,
+                               bool byWhitespace = false)
                                   throw(ConcordiaException);
 
     /*! Tokenizes all the given sentences.
       \param sentences vector of sentences to be tokenized
+      \param byWhitespace whether to tokenize the sentences by whitespace
       \returns vector of tokenized sentence objects
       \throws ConcordiaException
     */
     std::vector<TokenizedSentence> tokenizeAll(
-        const std::vector<std::string> & sentences)
+        const std::vector<std::string> & sentences,
+        bool byWhitespace = false)
                                   throw(ConcordiaException);
 
     /*! Adds an Example to the index.
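
Because byWhitespace defaults to false in these declarations, the change is source-compatible. A hedged sketch of the batch variant (helper name and include paths are illustrative, not from the commit):

#include <string>
#include <vector>
#include "concordia/concordia.hpp"

// Illustrative helper: whitespace-tokenize a batch of sentences.
// tokenizeAll serializes the word map once for the whole batch,
// as the implementation above shows.
std::vector<TokenizedSentence> tokenizeBatch(Concordia & concordia) {
    std::vector<std::string> sentences;
    sentences.push_back("Alice has a cat");
    sentences.push_back("a cat has Alice");
    return concordia.tokenizeAll(sentences, true);
}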

View File

@@ -29,8 +29,9 @@ HashGenerator::~HashGenerator() {
 }
 
 TokenizedSentence HashGenerator::generateHash(
-        const std::string & sentence) throw(ConcordiaException) {
-    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence);
+        const std::string & sentence,
+        bool byWhitespace) throw(ConcordiaException) {
+    TokenizedSentence ts = _sentenceTokenizer->tokenize(sentence, byWhitespace);
     ts.generateHash(_wordMap);
     if (ts.getTokens().size() > Utils::maxSentenceSize) {

View File

@@ -44,9 +44,11 @@ public:
     /*!
      Generates hash of a sentence.
      \param sentence sentence to generate hash from
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence, containing the hash
    */
-    TokenizedSentence generateHash(const std::string & sentence)
+    TokenizedSentence generateHash(const std::string & sentence,
+                                   bool byWhitespace = false)
                                   throw(ConcordiaException);
 
     /*!

View File

@@ -24,29 +24,37 @@ SentenceTokenizer::SentenceTokenizer(
 SentenceTokenizer::~SentenceTokenizer() {
 }
 
-TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence) {
+TokenizedSentence SentenceTokenizer::tokenize(const std::string & sentence,
+                                              bool byWhitespace) {
     TokenizedSentence result(sentence);
-    _htmlTags->apply(result);
-    BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
-        neRule.apply(result);
-    }
-    result.toLowerCase();
-    if (_stopWordsEnabled) {
-        _stopWords->apply(result);
-    }
-    boost::shared_ptr<RegexRule> wordsRule(
-        new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
-            TokenAnnotation::WORD, ""));
-    wordsRule->apply(result);
-    boost::shared_ptr<RegexRule> singleLetterWordsRule(
-        new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
-    singleLetterWordsRule->apply(result);
+    if (byWhitespace) {
+        boost::shared_ptr<RegexRule> whitespaceRule(
+            new RegexRule("\\S+",
+                TokenAnnotation::WORD, ""));
+        whitespaceRule->apply(result);
+    } else {
+        _htmlTags->apply(result);
+        BOOST_FOREACH(RegexRule & neRule, _namedEntities) {
+            neRule.apply(result);
+        }
+        result.toLowerCase();
+        if (_stopWordsEnabled) {
+            _stopWords->apply(result);
+        }
+        boost::shared_ptr<RegexRule> wordsRule(
+            new RegexRule("\\p{L}(\\p{L}|'|\\-)*\\p{L}",
+                TokenAnnotation::WORD, ""));
+        wordsRule->apply(result);
+        boost::shared_ptr<RegexRule> singleLetterWordsRule(
+            new RegexRule("\\p{L}", TokenAnnotation::WORD, ""));
+        singleLetterWordsRule->apply(result);
+    }
     return result;
 }
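
The whitespace branch reduces the whole pipeline to a single \S+ rule: each maximal run of non-whitespace characters becomes a WORD token, with no lower-casing, stop-word removal, or named-entity handling. A standalone sketch of what that pattern matches, using plain Boost.Regex rather than Concordia's RegexRule (the offsets mirror the getStart()/getEnd() values checked in the test case below):

#include <iostream>
#include <string>
#include <boost/regex.hpp>

int main() {
    std::string sentence = "Ala 23 --- ..//,./ '''8902347 posiada kota";
    boost::regex whitespaceTokens("\\S+");  // maximal runs of non-whitespace

    boost::sregex_iterator it(sentence.begin(), sentence.end(), whitespaceTokens);
    boost::sregex_iterator end;
    for (; it != end; ++it) {
        // Prints e.g. "4-6: 23" and "11-18: ..//,./" for this sentence.
        std::cout << it->position() << "-"
                  << it->position() + it->length() << ": "
                  << it->str() << std::endl;
    }
    return 0;
}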

View File

@@ -34,9 +34,11 @@ public:
     /*! Tokenizes the sentence.
      \param sentence input sentence
+     \param byWhitespace whether to tokenize the sentence by whitespace
      \returns tokenized sentence object built on the input sentence
    */
-    TokenizedSentence tokenize(const std::string & sentence);
+    TokenizedSentence tokenize(const std::string & sentence,
+                               bool byWhitespace = false);
 
 private:
    void _createNeRules(std::string & namedEntitiesPath);

View File

@@ -435,4 +435,26 @@ BOOST_AUTO_TEST_CASE( ConcordiaCountOccurences )
 }
 
+BOOST_AUTO_TEST_CASE( TokenizeWhitespace )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTempPath(),
+        TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    TokenizedSentence ts = concordia.tokenize(
+        "Ala 23 --- ..//,./ '''8902347 posiada kota", true);
+    BOOST_CHECK_EQUAL(ts.getTokens().size(), 7);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getStart(), 4);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getEnd(), 6);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(1).getValue(), "23");
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getStart(), 11);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getEnd(), 18);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getType(), 1);
+    BOOST_CHECK_EQUAL(ts.getTokens().at(3).getValue(), "..//,./");
+    concordia.clearIndex();
+}
+
 BOOST_AUTO_TEST_SUITE_END()
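
The assertions above pin down the contract: token offsets index into the original string as half-open spans ("23" spans [4,6) and "..//,./" spans [11,18)), and getType() == 1 is the value the whitespace rule annotates as TokenAnnotation::WORD. A sketch of inspecting those spans directly, with the same placeholder paths as earlier:

#include <iostream>
#include <string>
#include "concordia/concordia.hpp"

int main() {
    Concordia concordia("/tmp/concordia", "concordia.cfg");  // placeholders
    TokenizedSentence ts = concordia.tokenize(
        "Ala 23 --- ..//,./ '''8902347 posiada kota", true);

    // Print each token's original character span and surface form.
    for (size_t i = 0; i < ts.getTokens().size(); ++i) {
        std::cout << ts.getTokens().at(i).getStart() << "-"
                  << ts.getTokens().at(i).getEnd() << ": "
                  << ts.getTokens().at(i).getValue() << std::endl;
    }
    return 0;
}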